From 8828d7a1033b62ad5c65b0ffc1fe095aa5a37a29 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 14 Oct 2025 11:54:59 -0400 Subject: [PATCH 01/84] Continue draft --- ...1105-a8f36f185694_add_url_scheme_column.py | 63 +++++++++++++++++++ src/api/endpoints/annotate/_shared/extract.py | 2 +- src/api/endpoints/collector/manual/query.py | 7 ++- src/api/endpoints/submit/url/queries/core.py | 12 ++-- src/api/endpoints/url/get/query.py | 2 +- src/collectors/queries/insert/url.py | 6 +- src/collectors/queries/insert/urls/query.py | 2 +- src/core/tasks/handler.py | 1 - .../impl/huggingface/queries/get/core.py | 2 +- .../internet_archives/probe/queries/cte.py | 2 +- .../save/queries/shared/get_valid_entries.py | 2 +- .../queries/ctes/whitelisted_root_urls.py | 2 +- .../tasks/url/operators/html/queries/get.py | 2 +- .../queries/urls/not_probed/get/query.py | 4 +- .../url/operators/screenshot/queries/cte.py | 2 +- .../operators/submit_approved/queries/get.py | 2 +- .../operators/submit_meta_urls/queries/cte.py | 2 +- src/db/client/async_.py | 7 ++- src/db/client/sync.py | 6 +- src/db/models/impl/url/core/sqlalchemy.py | 9 +++ src/util/clean.py | 10 --- src/util/models/__init__.py | 0 src/util/models/url_and_scheme.py | 6 ++ src/util/url.py | 28 +++++++++ .../impl/huggingface/setup/queries/setup.py | 1 + tests/helpers/setup/populate.py | 3 +- 26 files changed, 151 insertions(+), 34 deletions(-) create mode 100644 alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py delete mode 100644 src/util/clean.py create mode 100644 src/util/models/__init__.py create mode 100644 src/util/models/url_and_scheme.py create mode 100644 src/util/url.py diff --git a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py new file mode 100644 index 00000000..3e302ea4 --- /dev/null +++ b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py @@ -0,0 +1,63 @@ +"""Add url scheme column + +Revision ID: a8f36f185694 +Revises: 7aace6587d1a +Create Date: 2025-10-14 11:05:28.686940 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'a8f36f185694' +down_revision: Union[str, None] = '7aace6587d1a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def _populate_column(): + op.execute( + """ + UPDATE urls + SET scheme = lower(split_part(url, '://', 1)) + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _remove_schemes_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = regexp_replace(url, '^(?i)[a-z][a-z0-9+.-]*://', '') + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _add_check_constraint_to_url_column(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT check_url_does_not_have_schema CHECK (url !~* '^[a-z][a-z0-9+.-]*://'); + """ + ) + + +def upgrade() -> None: + _add_column() + _populate_column() + _remove_schemes_from_url_column() + _add_check_constraint_to_url_column() + +def _add_column(): + op.add_column( + "urls", + sa.Column("scheme", sa.String(), nullable=True) + ) + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 390579d9..3534c997 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -46,7 +46,7 @@ async def extract_and_format_get_annotation_result( next_annotation=GetNextURLForAllAnnotationInnerResponse( url_info=URLMapping( url_id=url.id, - url=url.url + url=url.full_url ), html_info=html_response_info, url_type_suggestions=url_type_suggestions, diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 4f8956dc..029b5ecb 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -12,6 +12,8 @@ from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class UploadManualBatchQueryBuilder(QueryBuilderBase): @@ -43,8 +45,11 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: duplicate_urls: list[str] = [] for entry in self.dto.entries: + url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url) + url = URL( - url=entry.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 081b5456..4d0269dd 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -19,7 +19,8 @@ from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.validate import is_valid_url -from src.util.clean import clean_url +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import clean_url, get_url_and_scheme class SubmitURLQueryBuilder(QueryBuilderBase): @@ -41,11 +42,13 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: if not valid: return convert_invalid_url_to_url_response(url_original) - # Clean URLs + # Clean URL url_clean: str = clean_url(url_original) + url_and_scheme: URLAndScheme = get_url_and_scheme(url_clean) + # Check if duplicate - is_duplicate: bool = await 
DeduplicateURLQueryBuilder(url=url_clean).run(session) + is_duplicate: bool = await DeduplicateURLQueryBuilder(url=url_and_scheme.url).run(session) if is_duplicate: return convert_duplicate_urls_to_url_response( clean_url=url_clean, @@ -56,7 +59,8 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: # Add URL url_insert = URL( - url=url_clean, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, ) diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index d7198612..6885ef64 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -50,7 +50,7 @@ async def run(self, session: AsyncSession) -> GetURLsResponseInfo: GetURLsResponseInnerInfo( id=result.id, batch_id=result.batch.id if result.batch is not None else None, - url=result.url, + url=result.full_url, status=URLStatus(result.status), collector_metadata=result.collector_metadata, updated_at=result.updated_at, diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index af72a3aa..8e9e75d3 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -4,6 +4,8 @@ from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class InsertURLQueryBuilder(QueryBuilderBase): @@ -15,8 +17,10 @@ def __init__(self, url_info: URLInfo): async def run(self, session: AsyncSession) -> int: """Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url) url_entry = URL( - url=self.url_info.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, status=self.url_info.status.value, source=self.url_info.source diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index 75176158..d4165001 100644 --- a/src/collectors/queries/insert/urls/query.py +++ b/src/collectors/queries/insert/urls/query.py @@ -2,7 +2,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager -from src.util.clean import clean_url +from src.util.url import clean_url from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 92b96103..7ed0d230 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -2,7 +2,6 @@ from discord_poster import DiscordPoster -from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.enums import TaskOperatorOutcome from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 5b6bd08d..802e8ea5 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -33,7 +33,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut query = ( select( URL.id.label(label_url_id), - URL.url.label(label_url), + 
URL.full_url.label(label_url), URLRecordType.record_type.label(label_record_type_fine), URLCompressedHTML.compressed_html.label(label_html), FlagURLValidated.type.label(label_type) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py index 7de8b290..e6886134 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py @@ -12,7 +12,7 @@ def __init__(self): self._cte = ( select( URL.id.label("url_id"), - URL.url + URL.full_url.label("url") ) .where( or_( diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py index b0f9eeea..1ce9c1d9 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py @@ -9,7 +9,7 @@ IA_SAVE_VALID_ENTRIES_QUERY = ( select( URL.id, - URL.url, + URL.full_url.label("url"), (URLInternetArchivesSaveMetadata.url_id.is_(None)).label("is_new"), ) # URL must have been previously probed for its online status. diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py index 272717b5..dd7a5a8c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -34,7 +34,7 @@ # The connected URLs must be Meta URLs FlagURLValidated.type == URLType.META_URL, # Root URL can't be "https://catalog.data.gov" - URL.url != "https://catalog.data.gov" + URL.url != "catalog.data.gov" ) .group_by( URL.id diff --git a/src/core/tasks/url/operators/html/queries/get.py b/src/core/tasks/url/operators/html/queries/get.py index 832d9917..a6cbe4a8 100644 --- a/src/core/tasks/url/operators/html/queries/get.py +++ b/src/core/tasks/url/operators/html/queries/get.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> list[URLInfo]: url_info = URLInfo( id=url.id, batch_id=url.batch.id if url.batch is not None else None, - url=url.url, + url=url.full_url, collector_metadata=url.collector_metadata, status=url.status, created_at=url.created_at, diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 36450252..0ecc50b3 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -4,7 +4,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final -from src.util.clean import clean_url +from src.util.url import clean_url from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata @@ -20,7 +20,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: query = ( select( URL.id.label("url_id"), - URL.url + URL.full_url.label("url") ) .outerjoin( URLWebMetadata, diff --git 
a/src/core/tasks/url/operators/screenshot/queries/cte.py b/src/core/tasks/url/operators/screenshot/queries/cte.py index d961aabf..f1b3b1d2 100644 --- a/src/core/tasks/url/operators/screenshot/queries/cte.py +++ b/src/core/tasks/url/operators/screenshot/queries/cte.py @@ -13,7 +13,7 @@ def __init__(self): self._cte: CTE = ( select( URL.id.label("url_id"), - URL.url, + URL.full_url.label("url"), ) .join( URLWebMetadata, diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index d4138f9a..fb43dd34 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -55,7 +55,7 @@ async def _process_result(url: URL) -> SubmitApprovedURLTDO: supplying_entity = optional_metadata.supplying_entity tdo = SubmitApprovedURLTDO( url_id=url.id, - url=url.url, + url=url.full_url, name=url.name, agency_ids=agency_ids, description=url.description, diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py index d350258c..54b1edf8 100644 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py +++ b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py @@ -16,7 +16,7 @@ def __init__(self): self._cte = ( select( URL.id.label("url_id"), - URL.url, + URL.full_url.label("url"), LinkURLAgency.agency_id, ) # Validated as Meta URL diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93c36544..2a15267e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -102,6 +102,8 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class AsyncDatabaseClient: @@ -828,9 +830,10 @@ async def upload_manual_batch( @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: - query = select(URL).where(URL.url == url) + url_and_scheme: URLAndScheme = get_url_and_scheme(url) + query = select(URL).where(URL.url == url_and_scheme.url) raw_results = await session.execute(query) - url = raw_results.scalars().one_or_none() + url: URL | None = raw_results.scalars().one_or_none() if url is None: return SearchURLResponse( found=False, diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 006d6f0e..407cb3f4 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -23,6 +23,8 @@ from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme # Database Client @@ -116,8 +118,10 @@ def get_url_info_by_url( @session_manager def insert_url(self, session, url_info: URLInfo) -> int: """Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(url_info.url) url_entry = URL( - url=url_info.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, collector_metadata=url_info.collector_metadata, status=url_info.status, name=url_info.name, diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 3582dd56..98035bbf 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ 
b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,5 +1,7 @@ from sqlalchemy import Column, Text, String, JSON +from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship +from sqlalchemy.util import hybridproperty from src.collectors.enums import URLStatus from src.db.models.helpers import enum_column @@ -19,6 +21,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The batch this URL is associated with url = Column(Text, unique=True) + scheme = Column(String) name = Column(String) description = Column(Text) # The metadata from the collector @@ -30,6 +33,12 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): nullable=False ) + @hybrid_property + def full_url(self) -> str: + if self.scheme is None: + return self.url + return f"{self.scheme}://{self.url}" + source = enum_column( URLSource, name='url_source', diff --git a/src/util/clean.py b/src/util/clean.py deleted file mode 100644 index 3c0a0f92..00000000 --- a/src/util/clean.py +++ /dev/null @@ -1,10 +0,0 @@ - - -def clean_url(url: str) -> str: - # Remove Non-breaking spaces - url = url.strip(" ") - - # Remove any fragments and everything after them - url = url.split("#")[0] - return url - diff --git a/src/util/models/__init__.py b/src/util/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/models/url_and_scheme.py b/src/util/models/url_and_scheme.py new file mode 100644 index 00000000..494acd49 --- /dev/null +++ b/src/util/models/url_and_scheme.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class URLAndScheme(BaseModel): + url: str + scheme: str | None \ No newline at end of file diff --git a/src/util/url.py b/src/util/url.py new file mode 100644 index 00000000..ac4f73ca --- /dev/null +++ b/src/util/url.py @@ -0,0 +1,28 @@ +from urllib.parse import urlparse + +from src.util.models.url_and_scheme import URLAndScheme + + +def clean_url(url: str) -> str: + # Remove Non-breaking spaces + url = url.strip(" ") + + # Remove any fragments and everything after them + url = url.split("#")[0] + return url + +def get_url_and_scheme( + url: str +) -> URLAndScheme: + parsed = urlparse(url) + if parsed.scheme: + remainder = url.replace(f"{parsed.scheme}://", "", 1) + return URLAndScheme( + url=remainder, + scheme=parsed.scheme + ) + # Handle URLs without scheme + return URLAndScheme( + url=url, + scheme=None + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 417677df..55dbeb76 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -37,6 +37,7 @@ async def run(self, session: AsyncSession) -> list[int]: description = None url = URL( url=get_test_url(i), + scheme=None, status=URLStatus.OK, name=name, description=description, diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 02c364d6..d0ce5869 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -5,7 +5,8 @@ async def populate_database(adb_client: AsyncDatabaseClient) -> None: """Populate database with test data.""" url = URL( - url="https://www.test-data.com/static-test-data", + url="www.test-data.com/static-test-data", + scheme="https", name="Fake test data", description="Test data populated as a result of `reset_database`, " "which imitates a validated URL synchronized from 
the Data Sources App.", From 00248c4fd7a4d5cf047f3620a89ea0e138784eb8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 14 Oct 2025 13:28:38 -0400 Subject: [PATCH 02/84] Add schema column to `urls` table and update associated logic --- src/collectors/impl/example/core.py | 2 +- src/core/tasks/url/operators/root_url/extract.py | 5 +++-- src/db/models/impl/url/core/pydantic/insert.py | 1 + src/db/models/impl/url/core/sqlalchemy.py | 9 ++++++++- tests/automated/integration/api/test_manual_batch.py | 10 +++++----- .../integration/db/client/test_insert_urls.py | 10 +++++----- .../integration/db/structure/test_root_url.py | 2 +- .../impl/internet_archives/probe/constants.py | 4 ++-- .../scheduled/impl/internet_archives/save/constants.py | 4 ++-- .../integration/tasks/url/impl/html/setup/data.py | 10 +++++----- .../integration/tasks/url/impl/probe/constants.py | 4 ++-- .../tasks/url/impl/probe/no_redirect/test_two_urls.py | 4 ++-- .../url/impl/probe/redirect/test_two_urls_same_dest.py | 4 ++-- .../integration/tasks/url/impl/root_url/constants.py | 6 +++--- .../integration/tasks/url/impl/screenshot/test_core.py | 4 ++-- .../tasks/url/impl/submit_meta_urls/test_core.py | 2 +- tests/helpers/data_creator/generate.py | 3 ++- tests/helpers/simple_test_data_functions.py | 4 ++-- tests/manual/external/pdap/test_check_for_duplicate.py | 2 +- .../manual/external/url_request/test_url_screenshot.py | 2 +- 20 files changed, 51 insertions(+), 41 deletions(-) diff --git a/src/collectors/impl/example/core.py b/src/collectors/impl/example/core.py index 4bccf242..be0b8e07 100644 --- a/src/collectors/impl/example/core.py +++ b/src/collectors/impl/example/core.py @@ -24,7 +24,7 @@ async def run_implementation(self) -> None: await self.sleep() self.data = ExampleOutputDTO( message=f"Data collected by {self.batch_id}", - urls=["https://example.com", "https://example.com/2"], + urls=["example.com", "example.com/2"], parameters=self.dto.model_dump(), ) diff --git a/src/core/tasks/url/operators/root_url/extract.py b/src/core/tasks/url/operators/root_url/extract.py index e384fd15..9cb05c5a 100644 --- a/src/core/tasks/url/operators/root_url/extract.py +++ b/src/core/tasks/url/operators/root_url/extract.py @@ -2,6 +2,7 @@ def extract_root_url(url: str) -> str: - parsed_url: ParseResult = urlparse(url) - root_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + # URLs in DB should not have HTTPS -- add to enable url parse to function properly + parsed_url: ParseResult = urlparse(f"https://{url}") + root_url = parsed_url.netloc return root_url \ No newline at end of file diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index f04dd3df..08480b6b 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -14,6 +14,7 @@ def sa_model(cls) -> type[Base]: return URL url: str + scheme: str | None = None collector_metadata: dict | None = None name: str | None = None status: URLStatus = URLStatus.OK diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 98035bbf..e5bca30d 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, Text, String, JSON +from sqlalchemy import Column, Text, String, JSON, case, literal from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship from sqlalchemy.util import hybridproperty @@ -39,6 +39,13 @@ def full_url(self) 
-> str: return self.url return f"{self.scheme}://{self.url}" + @full_url.expression + def full_url(cls): + return case( + (cls.scheme != None, (cls.scheme + literal("://") + cls.url)), + else_=cls.url + ) + source = enum_column( URLSource, name='url_source', diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index dae5ee4f..9be80c25 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -20,14 +20,14 @@ async def test_manual_batch(api_test_helper): dtos = [] for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i}", + url=f"example.com/{i}", ) dtos.append(dto) # Create 50 entries with URL and all optional fields for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+50}", + url=f"example.com/{i+50}", name=manual_batch_name, description=f"Description {i}", collector_metadata={ @@ -142,13 +142,13 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo more_dtos = [] for i in range(49): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+100}", + url=f"example.com/{i+100}", ) more_dtos.append(dto) for i in range(2): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+1}", + url=f"example.com/{i+1}", ) more_dtos.append(dto) @@ -162,7 +162,7 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) # Check duplicate URLs assert len(response.duplicate_urls) == 2 - assert response.duplicate_urls == ['https://example.com/1', 'https://example.com/2'] + assert response.duplicate_urls == ['example.com/1', 'example.com/2'] assert len(response.urls) == 49 # Check 149 URLs in database diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index f2d73f00..852da385 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -24,17 +24,17 @@ async def test_insert_urls( urls = [ URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_1"}, source=URLSource.COLLECTOR ), URLInfo( - url="https://example.com/2", + url="example.com/2", source=URLSource.COLLECTOR ), # Duplicate URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_duplicate"}, source=URLSource.COLLECTOR ) @@ -46,8 +46,8 @@ async def test_insert_urls( url_mappings = insert_urls_info.url_mappings assert len(url_mappings) == 2 - assert url_mappings[0].url == "https://example.com/1" - assert url_mappings[1].url == "https://example.com/2" + assert url_mappings[0].url == "example.com/1" + assert url_mappings[1].url == "example.com/2" assert insert_urls_info.original_count == 2 diff --git a/tests/automated/integration/db/structure/test_root_url.py b/tests/automated/integration/db/structure/test_root_url.py index 8f8be80b..62755b00 100644 --- a/tests/automated/integration/db/structure/test_root_url.py +++ b/tests/automated/integration/db/structure/test_root_url.py @@ -13,7 +13,7 @@ def test_root_url(db_data_creator: DBDataCreator): ColumnTester( column_name="url", type_=sa.String, - allowed_values=["https://example.com"] + allowed_values=["example.com"] ), ColumnTester( column_name="page_title", diff --git 
a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py index d41ffb48..60f762e7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py @@ -1,4 +1,4 @@ -TEST_URL_1 = "https://test-ia-metadata.com/1" -TEST_URL_2 = "https://test-ia-metadata.com/2" \ No newline at end of file +TEST_URL_1 = "test-ia-metadata.com/1" +TEST_URL_2 = "test-ia-metadata.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py index bc1b5a2e..658d8cb9 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py @@ -1,5 +1,5 @@ -TEST_URL_1 = "https://ia-save-test.com/1" -TEST_URL_2 = "https://ia-save-test.com/2" \ No newline at end of file +TEST_URL_1 = "ia-save-test.com/1" +TEST_URL_2 = "ia-save-test.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 5615392c..203eb34b 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -10,7 +10,7 @@ # and their html should be stored TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://happy-path.com/pending", + url="happy-path.com/pending", status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -28,7 +28,7 @@ # and their web metadata status should be updated to 404 TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-found-path.com/submitted", + url="not-found-path.com/submitted", status=URLStatus.ERROR ), web_metadata_info=TestWebMetadataInfo( @@ -47,7 +47,7 @@ # URLs that give errors should be updated with the appropriate scrape status TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://error-path.com/submitted", + url="error-path.com/submitted", status=URLStatus.ERROR ), web_metadata_info=TestWebMetadataInfo( @@ -65,7 +65,7 @@ # URLs with non-200 web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-200-path.com/submitted", + url="not-200-path.com/submitted", status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -82,7 +82,7 @@ # URLs with no web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://no-web-metadata.com/submitted", + url="no-web-metadata.com/submitted", status=URLStatus.OK ), web_metadata_info=None, diff --git a/tests/automated/integration/tasks/url/impl/probe/constants.py b/tests/automated/integration/tasks/url/impl/probe/constants.py index 6c218e25..07ebbcc3 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -1,6 +1,6 @@ from src.db.models.impl.url.core.enums import URLSource PATCH_ROOT = "src.external.url_request.core.URLProbeManager" -TEST_URL = "https://www.example.com" -TEST_DEST_URL = "https://www.example.com/redirect" +TEST_URL = "www.example.com" +TEST_DEST_URL = "www.example.com/redirect" TEST_SOURCE = URLSource.COLLECTOR diff --git 
a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index cfd1f68f..c3b0c6c4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -12,8 +12,8 @@ async def test_two_urls( setup_manager: TestURLProbeSetupManager, check_manager: TestURLProbeCheckManager ): - url_1 = "https://example.com/1" - url_2 = "https://example.com/2" + url_1 = "example.com/1" + url_2 = "example.com/2" operator = setup_manager.setup_operator( response_or_responses=[ setup_manager.setup_no_redirect_probe_response( diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index f0e113ff..bf5dab9f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -30,12 +30,12 @@ async def test_url_probe_task_redirect_two_urls_same_dest( dest_status_code=200, dest_content_type=None, dest_error=None, - source_url="https://example.com/2", + source_url="example.com/2", ), ] ) source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) - source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2") + source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( diff --git a/tests/automated/integration/tasks/url/impl/root_url/constants.py b/tests/automated/integration/tasks/url/impl/root_url/constants.py index dc688797..d5e38e8f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/constants.py +++ b/tests/automated/integration/tasks/url/impl/root_url/constants.py @@ -1,5 +1,5 @@ -ROOT_URL = "https://root.com" -BRANCH_URL = "https://root.com/branch" -SECOND_BRANCH_URL = "https://root.com/second-branch" \ No newline at end of file +ROOT_URL = "root.com" +BRANCH_URL = "root.com/branch" +SECOND_BRANCH_URL = "root.com/second-branch" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py index 6f54fbf9..f65aa40d 100644 --- a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py +++ b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py @@ -40,11 +40,11 @@ async def test_core( mock_get_screenshots = AsyncMock(return_value=[ URLScreenshotResponse( - url=screenshot_mapping.url, + url=f"https://{screenshot_mapping.url}", screenshot=bytes(124536), ), URLScreenshotResponse( - url=error_mapping.url, + url=f"https://{error_mapping.url}", screenshot=None, error="error", ) diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py index 37d6e00f..92287454 100644 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py @@ -51,7 +51,7 @@ async def test_submit_meta_urls( data={ "meta_urls": [ { - "url": mapping.url, + "url": f"https://{mapping.url}", "agency_id": agency_id, "status": SubmitMetaURLsStatus.SUCCESS.value, "meta_url_id": 2, diff --git 
a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index 1cf0a806..bee0993f 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -48,7 +48,8 @@ def generate_urls( for i in range(count): val: int = next_int() results.append(URLInsertModel( - url=f"http://example.com/{val}", + url=f"example.com/{val}", + scheme="https", status=status, source=source, name=f"Example {val}", diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index 4d321dc5..b250dc83 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -10,14 +10,14 @@ def generate_test_urls(count: int) -> list[str]: results = [] for i in range(count): - url = f"https://example.com/{uuid.uuid4().hex}" + url = f"example.com/{uuid.uuid4().hex}" results.append(url) return results def generate_test_url(i: int) -> str: - return f"https://test.com/{i}" + return f"test.com/{i}" def generate_test_name(i: int | None = None) -> str: if i is None: diff --git a/tests/manual/external/pdap/test_check_for_duplicate.py b/tests/manual/external/pdap/test_check_for_duplicate.py index 34bbc317..25a8bc52 100644 --- a/tests/manual/external/pdap/test_check_for_duplicate.py +++ b/tests/manual/external/pdap/test_check_for_duplicate.py @@ -4,6 +4,6 @@ @pytest.mark.asyncio async def test_check_for_duplicate(pdap_client): - response = await pdap_client.is_url_duplicate(url_to_check="https://example.com") + response = await pdap_client.is_url_duplicate(url_to_check="example.com") print(response) diff --git a/tests/manual/external/url_request/test_url_screenshot.py b/tests/manual/external/url_request/test_url_screenshot.py index b16535d6..3388c09f 100644 --- a/tests/manual/external/url_request/test_url_screenshot.py +++ b/tests/manual/external/url_request/test_url_screenshot.py @@ -12,7 +12,7 @@ async def test_url_screenshot(): """ urls: list[str] = [ - "https://www.example.com" + "www.example.com" ] responses: list[URLScreenshotResponse] = await get_screenshots(urls=urls) From 16f2b662436edfd9d562384ed82652116b2cb333 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 14 Oct 2025 17:59:56 -0400 Subject: [PATCH 03/84] Address alembic duplicates --- ...1105-a8f36f185694_add_url_scheme_column.py | 289 +++++++++++++++++- 1 file changed, 282 insertions(+), 7 deletions(-) diff --git a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py index 3e302ea4..aa73e268 100644 --- a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py +++ b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py @@ -18,6 +18,287 @@ depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + _update_foreign_key_constraints() + + _delete_duplicate_urls() + _add_column() + _populate_column() + _remove_schemes_from_url_column() + _add_check_constraint_to_url_column() + +def _update_foreign_key_constraints(): + # URL Optional Data Source Metadata + op.execute(""" + ALTER TABLE url_optional_data_source_metadata + DROP CONSTRAINT IF EXISTS url_optional_data_source_metadata_url_id_fkey; + """) + + op.create_foreign_key( + "url_optional_data_source_metadata_url_id_fkey", + "url_optional_data_source_metadata", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Link URLs Redirect URL + # (Source URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF 
EXISTS link_urls_redirect_url_source_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_source_url_id_fkey", + "link_urls_redirect_url", + "urls", + ["source_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # (Destination URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF EXISTS link_urls_redirect_url_destination_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_destination_url_id_fkey", + "link_urls_redirect_url", + "urls", + ["destination_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Reviewing User URL + op.execute(""" + ALTER TABLE reviewing_user_url + DROP CONSTRAINT IF EXISTS approving_user_url_url_id_fkey; + """) + + op.create_foreign_key( + "approving_user_url_url_id_fkey", + "reviewing_user_url", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # user_url_agency_suggestions + op.execute(""" + ALTER TABLE user_url_agency_suggestions + DROP CONSTRAINT IF EXISTS user_url_agency_suggestions_url_id_fkey; + """) + + op.create_foreign_key( + "user_url_agency_suggestions_url_id_fkey", + "user_url_agency_suggestions", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Duplicates + op.execute(""" + ALTER TABLE duplicates + DROP CONSTRAINT IF EXISTS duplicates_original_url_id_fkey; + """) + + op.create_foreign_key( + "duplicates_original_url_id_fkey", + "duplicates", + "urls", + ["original_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # link_user_name_suggestions + op.execute(""" + ALTER TABLE link_user_name_suggestions + DROP CONSTRAINT IF EXISTS link_user_name_suggestions_suggestion_id_fkey; + """) + + op.create_foreign_key( + "link_user_name_suggestions_suggestion_id_fkey", + "link_user_name_suggestions", + "url_name_suggestions", + ["suggestion_id"], + ["id"], + ondelete="CASCADE" + ) + +def _delete_duplicate_urls(): + op.execute(""" + DELETE FROM urls + WHERE id IN ( + 4217, + 15902, + 3472, + 17387, + 24256, + 17617, + 17414, + 15259, + 17952, + 17651, + 18010, + 18496, + 18563, + 18587, + 18592, + 18092, + 18046, + 20467, + 24346, + 28241, + 25075, + 22508, + 22391, + 24256, + 22486, + 28109, + 26336, + 30701, + 17387, + 19348, + 18080, + 27863, + 18855, + 28830, + 18824, + 17414, + 15259, + 20676, + 27716, + 21475, + 23442, + 28553, + 8176, + 22270, + 19161, + 21250, + 15659, + 18821, + 27067, + 27567, + 27318, + 20640, + 21840, + 3472, + 28982, + 28910, + 19527, + 28776, + 15902, + 18468, + 29557, + 22977, + 27694, + 22678, + 19094, + 27203, + 26436, + 18868, + 22813, + 25007, + 7548, + 30088, + 20924, + 22575, + 28149, + 30705, + 28179, + 30660, + 2988, + 17182, + 18893, + 30317, + 19215, + 17651, + 21117, + 17617, + 23742, + 19620, + 16865, + 19320, + 20516, + 25248, + 26122, + 30158, + 30522, + 23307, + 18621, + 27855, + 26922, + 21397, + 18010, + 18592, + 2527, + 26279, + 18563, + 18242, + 21550, + 28288, + 22361, + 24660, + 2989, + 28765, + 10627, + 19625, + 12191, + 27523, + 18373, + 28565, + 25437, + 26077, + 28554, + 23229, + 25631, + 25528, + 18092, + 10765, + 26126, + 51499, + 27375, + 24177, + 22734, + 22459, + 22439, + 18532, + 29064, + 20504, + 21643, + 21551, + 27698, + 19234, + 24308, + 22559, + 26227, + 19080, + 16010, + 3515, + 22658, + 20673, + 21854, + 19361, + 21768, + 26903, + 21253, + 23085, + 3761, + 3565 + ) + """) + def _populate_column(): op.execute( """ @@ -32,7 +313,7 @@ def _remove_schemes_from_url_column(): op.execute( """ UPDATE urls - SET url = regexp_replace(url, '^(?i)[a-z][a-z0-9+.-]*://', '') + SET url = regexp_replace(url, 
'^[a-z][a-z0-9+.-]*://', '', 'i') WHERE url ~* '^[a-z][a-z0-9+.-]*://'; """ ) @@ -47,12 +328,6 @@ def _add_check_constraint_to_url_column(): ) -def upgrade() -> None: - _add_column() - _populate_column() - _remove_schemes_from_url_column() - _add_check_constraint_to_url_column() - def _add_column(): op.add_column( "urls", From dabc10bf444e77cbfb3efb220253426098029e4a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 14 Oct 2025 19:02:03 -0400 Subject: [PATCH 04/84] Add logic for adding `updated_at` triggers, add triggers to relevant columns --- ...37-ff4e8b2f6348_add_updated_at_triggers.py | 46 +++++++++++++++++++ src/db/client/async_.py | 8 +--- src/util/alembic_helpers.py | 33 ++++++++++++- .../db/structure/test_updated_at.py | 38 +++++++++++++++ 4 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py create mode 100644 tests/automated/integration/db/structure/test_updated_at.py diff --git a/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py b/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py new file mode 100644 index 00000000..faf10f91 --- /dev/null +++ b/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py @@ -0,0 +1,46 @@ +"""Add updated_at triggers + +Revision ID: ff4e8b2f6348 +Revises: a8f36f185694 +Create Date: 2025-10-14 18:37:07.121323 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import create_updated_at_trigger + +# revision identifiers, used by Alembic. +revision: str = 'ff4e8b2f6348' +down_revision: Union[str, None] = 'a8f36f185694' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + for table in [ + "agencies", + "auto_record_type_suggestions", + "auto_relevant_suggestions", + "flag_url_validated", + "link_batch_urls", + "link_urls_agency", + "link_urls_redirect_url", + "link_urls_root_url", + "tasks", + "url_compressed_html", + "url_internet_archives_probe_metadata", + "url_scrape_info", + "url_screenshot", + "url_web_metadata", + "urls", + "user_record_type_suggestions", + "user_url_type_suggestions", + ]: + create_updated_at_trigger(table) + + +def downgrade() -> None: + pass diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 2a15267e..87fcb057 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -169,14 +169,10 @@ async def add_all( async def bulk_update( self, session: AsyncSession, - model: Base, - mappings: list[dict], + models: list[Base], ): # Note, mapping must include primary key - await session.execute( - update(model), - mappings - ) + await sh.bulk_update(session=session, models=models) @session_manager async def bulk_upsert( diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index cb9d8d67..f711136d 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -295,4 +295,35 @@ def remove_enum_value( f"ALTER TYPE {_q_ident(schema)}.{_q_ident(tmp_name)} " f"RENAME TO {_q_ident(enum_name)}" ) - ) \ No newline at end of file + ) + + +def create_updated_at_trigger(table_name: str) -> None: + """ + Adds a trigger to the given table that automatically updates the + 'updated_at' column to the current timestamp on UPDATE. + + Parameters: + table_name (str): Name of the table to attach the trigger to. 
+ """ + + # Step 1: Define the trigger function (only needs to exist once) + op.execute(""" + CREATE OR REPLACE FUNCTION set_updated_at() + RETURNS TRIGGER AS $$ + BEGIN + NEW.updated_at = NOW(); + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """) + + # Step 2: Create the trigger for this specific table + trigger_name = f"{table_name}_updated_at_trigger" + op.execute(f""" + DROP TRIGGER IF EXISTS {trigger_name} ON {table_name}; + CREATE TRIGGER {trigger_name} + BEFORE UPDATE ON {table_name} + FOR EACH ROW + EXECUTE FUNCTION set_updated_at(); + """) diff --git a/tests/automated/integration/db/structure/test_updated_at.py b/tests/automated/integration/db/structure/test_updated_at.py new file mode 100644 index 00000000..281e6ee8 --- /dev/null +++ b/tests/automated/integration/db/structure/test_updated_at.py @@ -0,0 +1,38 @@ +import asyncio +from datetime import datetime + +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_updated_at(db_data_creator: DBDataCreator): + + _ = await db_data_creator.create_urls( + count=1, + status=URLStatus.OK + ) + + urls: list[URL] = await db_data_creator.adb_client.get_all(URL) + url = urls[0] + assert url.updated_at is not None + updated_at: datetime = url.updated_at + + url_upsert = URLUpsertModel( + id=url.id, + name="New Name" + ) + + await db_data_creator.adb_client.bulk_update([url_upsert]) + + new_urls: list[URL] = await db_data_creator.adb_client.get_all(URL) + new_url = new_urls[0] + + new_updated_at = new_url.updated_at + assert new_updated_at > updated_at + + From 105eefa71d09d0a47341712c7e573863d6002b75 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 17 Oct 2025 20:09:52 -0400 Subject: [PATCH 05/84] Update root URL and redirect URL logic --- ...-7fc6502f1fa3_add_trailing_slash_column.py | 147 ++++++++++++++++++ src/api/endpoints/annotate/_shared/extract.py | 4 +- .../annotate/dtos/shared/base/response.py | 6 +- src/api/endpoints/collector/manual/query.py | 5 +- src/api/endpoints/submit/url/queries/core.py | 1 + src/collectors/queries/insert/url.py | 5 +- src/collectors/queries/insert/urls/query.py | 4 +- .../impl/internet_archives/probe/convert.py | 4 +- .../impl/internet_archives/probe/operator.py | 16 +- .../internet_archives/probe/queries/get.py | 11 +- .../internet_archives/save/models/entry.py | 6 +- src/core/tasks/url/operators/probe/convert.py | 17 ++ src/core/tasks/url/operators/probe/core.py | 83 ++++++++-- src/core/tasks/url/operators/probe/filter.py | 24 ++- .../url/operators/probe/models/__init__.py | 0 .../url/operators/probe/models/subsets.py | 8 + .../models/upsert_functional_equivalents.py | 22 +++ .../probe/queries/insert_redirects/convert.py | 25 ++- .../probe/queries/insert_redirects/extract.py | 1 - .../probe/queries/insert_redirects/filter.py | 14 -- .../probe/queries/insert_redirects/map.py | 9 +- .../queries/insert_redirects/models/subset.py | 9 ++ .../models/url_response_map.py | 5 +- .../probe/queries/insert_redirects/query.py | 47 +++--- .../insert_redirects/request_manager.py | 35 ++--- .../probe/queries/urls/exist/model.py | 18 ++- .../probe/queries/urls/exist/query.py | 50 ++++-- .../queries/urls/not_probed/get/query.py | 14 +- src/core/tasks/url/operators/probe/tdo.py | 7 +- .../tasks/url/operators/root_url/convert.py | 19 +-- src/core/tasks/url/operators/root_url/core.py | 24 +-- 
.../tasks/url/operators/root_url/extract.py | 2 +- .../operators/root_url/models/root_mapping.py | 3 +- .../url/operators/root_url/queries/get.py | 6 +- .../tasks/url/operators/screenshot/core.py | 6 +- .../tasks/url/operators/screenshot/get.py | 8 +- .../url/operators/screenshot/queries/get.py | 8 +- .../url/operators/submit_meta_urls/core.py | 10 +- .../url/operators/validate/queries/insert.py | 4 +- src/db/client/async_.py | 1 - src/db/client/sync.py | 8 +- src/db/dtos/url/insert.py | 4 +- src/db/dtos/url/mapping_/__init__.py | 0 src/db/dtos/url/mapping_/full.py | 14 ++ .../url/{mapping.py => mapping_/simple.py} | 2 +- src/db/helpers/session/session_helper.py | 16 +- .../models/impl/url/core/pydantic/insert.py | 3 +- .../models/impl/url/core/pydantic/upsert.py | 4 +- src/db/models/impl/url/core/sqlalchemy.py | 17 +- src/external/url_request/core.py | 3 +- src/external/url_request/probe/convert.py | 11 +- src/external/url_request/probe/core.py | 13 +- .../url_request/probe/models/response.py | 6 +- .../url_request/probe/models/wrapper.py | 6 +- src/util/models/full_url.py | 84 ++++++++++ src/util/url.py | 6 + src/util/url_mapper_/__init__.py | 0 src/util/url_mapper_/full.py | 49 ++++++ .../{url_mapper.py => url_mapper_/simple.py} | 14 +- .../api/annotate/anonymous/test_core.py | 6 +- .../integration/api/annotate/helpers.py | 6 +- .../summaries/test_pending_url_filter.py | 6 +- .../api/metrics/batches/test_aggregated.py | 10 +- .../api/metrics/batches/test_breakdown.py | 9 +- .../integration/api/metrics/test_backlog.py | 10 +- .../api/metrics/urls/aggregated/test_core.py | 19 ++- .../integration/api/submit/test_duplicate.py | 4 +- .../api/url/by_id/snapshot/test_success.py | 4 +- .../db/structure/test_updated_at.py | 4 +- .../impl/huggingface/setup/queries/setup.py | 3 +- .../impl/internet_archives/probe/setup.py | 6 +- .../impl/internet_archives/save/setup.py | 6 +- .../ineligible_cases/test_blacklist.py | 4 +- .../homepage_match/test_happy_path.py | 8 +- .../tasks/url/impl/html/setup/manager.py | 3 +- .../end_to_end/conftest.py | 4 +- .../tasks/url/impl/probe/constants.py | 2 +- .../impl/probe/mocks/url_request_interface.py | 7 +- .../probe/redirect/test_dest_exists_in_db.py | 2 +- .../redirect/test_functional_equivalent.py | 46 ++++++ .../probe/redirect/test_redirect_infinite.py | 46 ------ .../probe/redirect/test_two_urls_same_dest.py | 3 +- .../tasks/url/impl/probe/setup/manager.py | 12 +- .../root_url/test_branch_root_url_in_db.py | 6 +- .../test_branch_root_url_not_in_db.py | 3 +- .../url/impl/root_url/test_is_root_url.py | 3 +- .../test_two_branches_one_root_in_db.py | 9 +- ...two_branches_one_root_in_db_not_flagged.py | 9 +- .../test_two_branches_one_root_not_in_db.py | 6 +- .../tasks/url/impl/screenshot/test_core.py | 8 +- .../url/impl/submit_meta_urls/test_core.py | 5 +- tests/helpers/data_creator/core.py | 14 +- tests/helpers/data_creator/create.py | 8 +- tests/helpers/data_creator/generate.py | 1 + .../data_creator/models/creation_info/url.py | 5 +- tests/helpers/setup/final_review/core.py | 2 +- tests/helpers/setup/final_review/model.py | 6 +- 97 files changed, 888 insertions(+), 375 deletions(-) create mode 100644 alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py create mode 100644 src/core/tasks/url/operators/probe/models/__init__.py create mode 100644 src/core/tasks/url/operators/probe/models/subsets.py create mode 100644 src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py delete mode 100644 
src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py create mode 100644 src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py create mode 100644 src/db/dtos/url/mapping_/__init__.py create mode 100644 src/db/dtos/url/mapping_/full.py rename src/db/dtos/url/{mapping.py => mapping_/simple.py} (84%) create mode 100644 src/util/models/full_url.py create mode 100644 src/util/url_mapper_/__init__.py create mode 100644 src/util/url_mapper_/full.py rename src/util/{url_mapper.py => url_mapper_/simple.py} (72%) create mode 100644 tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py delete mode 100644 tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py diff --git a/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py new file mode 100644 index 00000000..69faae2e --- /dev/null +++ b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py @@ -0,0 +1,147 @@ +"""Add trailing slash column + +Revision ID: 7fc6502f1fa3 +Revises: ff4e8b2f6348 +Create Date: 2025-10-17 18:26:56.756915 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '7fc6502f1fa3' +down_revision: Union[str, None] = 'ff4e8b2f6348' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + _remove_duplicates() + _add_trailing_slash_column() + _migrate_trailing_slash_to_column() + _remove_trailing_slash_from_url_column() + _add_check_constraint_forbidding_trailing_slash_in_url() + +def _remove_duplicates(): + op.execute( + """ + DELETE FROM urls + WHERE id IN ( + 23504, + 29401, + 21032, + 23687, + 15760, + 17574, + 17669, + 21382, + 11697, + 18076, + 27764, + 11395, + 17702, + 26857, + 30843, + 21850, + 29471, + 26789, + 19428, + 18452, + 30547, + 24004, + 27857, + 30260, + 26968, + 27065, + 29073, + 21827, + 25615, + 28644, + 24417, + 29801, + 27625, + 15708, + 23517, + 26415, + 26081, + 7478, + 20368, + 19494, + 26624, + 3817, + 3597, + 3568, + 16113, + 24125, + 30625, + 29965, + 23134, + 19207, + 12158, + 3835, + 24730, + 17113, + 29987, + 21452, + 24605, + 5043, + 17237, + 25522, + 11065, + 12387, + 12210, + 11185, + 11961, + 4935, + 24200, + 29028, + 24371, + 28355, + 17620, + 19546, + 3598 + ) + """ + ) + +def _add_trailing_slash_column(): + op.add_column( + 'urls', + sa.Column( + 'trailing_slash', + sa.Boolean(), + nullable=False, + server_default=sa.text('false') + ) + ) + +def _migrate_trailing_slash_to_column(): + op.execute( + """ + UPDATE urls + SET trailing_slash = url ~ '/$' + """ + ) + +def _remove_trailing_slash_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = rtrim(url, '/') + WHERE url like '%/'; + """ + ) + +def _add_check_constraint_forbidding_trailing_slash_in_url(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT no_trailing_slash CHECK (url !~ '/$') + """ + ) + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 3534c997..61e92c35 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -15,7 +15,7 @@ from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder from 
src.api.endpoints.annotate.all.get.queries.name.core import GetNameSuggestionsQueryBuilder from src.db.dto_converter import DTOConverter -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion @@ -44,7 +44,7 @@ async def extract_and_format_get_annotation_result( await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session) return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( - url_info=URLMapping( + url_info=SimpleURLMapping( url_id=url.id, url=url.full_url ), diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index edcc80e1..0d3ae253 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -1,14 +1,12 @@ -from typing import Optional - from pydantic import BaseModel, Field from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class AnnotationInnerResponseInfoBase(BaseModel): - url_info: URLMapping = Field( + url_info: SimpleURLMapping = Field( title="Information about the URL" ) html_info: ResponseHTMLInfo = Field( diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 029b5ecb..6cd7d7b8 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -48,13 +48,14 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url) url = URL( - url=url_and_scheme.url, + url=url_and_scheme.url.rstrip('/'), scheme=url_and_scheme.scheme, name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, status=URLStatus.OK.value, - source=URLSource.MANUAL + source=URLSource.MANUAL, + trailing_slash=url_and_scheme.url.endswith('/'), ) diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 4d0269dd..513d26ad 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -63,6 +63,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, + trailing_slash=url_and_scheme.url.endswith('/'), ) session.add(url_insert) await session.flush() diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index 8e9e75d3..60f39a2c 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -19,11 +19,12 @@ async def run(self, session: AsyncSession) -> int: """Insert a new URL into the database.""" url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url) url_entry = URL( - url=url_and_scheme.url, + url=url_and_scheme.url.rstrip('/'), scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, status=self.url_info.status.value, - source=self.url_info.source + source=self.url_info.source, + trailing_slash=url_and_scheme.url.endswith('/'), ) if self.url_info.created_at is not None: url_entry.created_at = 
self.url_info.created_at diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index d4165001..77f3fe1b 100644 --- a/src/collectors/queries/insert/urls/query.py +++ b/src/collectors/queries/insert/urls/query.py @@ -4,7 +4,7 @@ from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager from src.util.url import clean_url from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.queries.base.builder import QueryBuilderBase @@ -32,7 +32,7 @@ async def run(self, session: AsyncSession) -> InsertURLsInfo: async with session.begin_nested() as sp: url_id = await rm.insert_url(url_info) url_mappings.append( - URLMapping( + SimpleURLMapping( url_id=url_id, url=url_info.url ) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py index efd5e45c..4d4be86d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py @@ -1,10 +1,10 @@ from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def convert_ia_url_mapping_to_ia_metadata( - url_mapper: URLMapper, + url_mapper: SimpleURLMapper, ia_mapping: InternetArchivesURLMapping ) -> URLInternetArchiveMetadataPydantic: iam = ia_mapping.ia_metadata diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py index f4773417..4c58df00 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py @@ -12,7 +12,7 @@ CheckURLInternetArchivesTaskPrerequisitesQueryBuilder from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic @@ -20,7 +20,7 @@ from src.external.internet_archives.client import InternetArchivesClient from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping from src.util.progress_bar import get_progress_bar_disabled -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper class InternetArchivesProbeTaskOperator( @@ -51,10 +51,10 @@ async def inner_task_logic(self) -> None: DeleteOldUnsuccessfulIACheckedFlagsQueryBuilder() ) - url_mappings: list[URLMapping] = await self._get_url_mappings() + url_mappings: list[SimpleURLMapping] = await self._get_url_mappings() if len(url_mappings) == 0: return - mapper = URLMapper(url_mappings) + mapper = SimpleURLMapper(url_mappings) await self.link_urls_to_task(mapper.get_all_ids()) @@ -65,7 +65,7 @@ async def 
inner_task_logic(self) -> None: await self._add_errors_to_db(mapper, ia_mappings=subsets.error) await self._add_ia_metadata_to_db(mapper, ia_mappings=subsets.has_metadata) - async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + async def _add_errors_to_db(self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: url_error_info_list: list[URLTaskErrorSmall] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) @@ -76,7 +76,7 @@ async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetA url_error_info_list.append(url_error_info) await self.add_task_errors(url_error_info_list) - async def _get_url_mappings(self) -> list[URLMapping]: + async def _get_url_mappings(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForInternetArchivesTaskQueryBuilder() ) @@ -93,7 +93,7 @@ async def _search_for_internet_archive_links(self, urls: list[str]) -> list[Inte async def _add_ia_metadata_to_db( self, - url_mapper: URLMapper, + url_mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping], ) -> None: insert_objects: list[URLInternetArchiveMetadataPydantic] = [ @@ -106,7 +106,7 @@ async def _add_ia_metadata_to_db( await self.adb_client.bulk_insert(insert_objects) async def _add_ia_flags_to_db( - self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: flags: list[FlagURLCheckedForInternetArchivesPydantic] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py index 3306943a..a806b691 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py @@ -1,18 +1,15 @@ -from sqlalchemy import select, or_, exists, text, func +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.internet_archives.probe.queries.cte import CheckURLInternetArchivesCTEContainer -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.query import not_exists_url -from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives -from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForInternetArchivesTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = CheckURLInternetArchivesCTEContainer() query = ( select( @@ -24,7 +21,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + SimpleURLMapping( url_id=mapping["url_id"], url=mapping["url"] ) for mapping in db_mappings diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py index 6e4ae84e..280aa51d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py +++ 
b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InternetArchivesSaveTaskEntry(BaseModel): @@ -8,8 +8,8 @@ class InternetArchivesSaveTaskEntry(BaseModel): url_id: int is_new: bool - def to_url_mapping(self) -> URLMapping: - return URLMapping( + def to_url_mapping(self) -> SimpleURLMapping: + return SimpleURLMapping( url_id=self.url_id, url=self.url ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/convert.py b/src/core/tasks/url/operators/probe/convert.py index dcb211f0..e568de91 100644 --- a/src/core/tasks/url/operators/probe/convert.py +++ b/src/core/tasks/url/operators/probe/convert.py @@ -1,5 +1,6 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]: @@ -16,3 +17,19 @@ def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMeta results.append(web_metadata_object) return results +def convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos: list[URLProbeTDO] +) -> list[URLWebMetadataPydantic]: + results: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response: URLProbeRedirectResponsePair = tdo.response.response + dest = response.destination + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=dest.status_code != 404, + status_code=dest.status_code, + content_type=dest.content_type, + error_message=dest.error + ) + results.append(web_metadata_object) + return results diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py index 1c961155..4f38c1d9 100644 --- a/src/core/tasks/url/operators/probe/core.py +++ b/src/core/tasks/url/operators/probe/core.py @@ -1,18 +1,25 @@ from typing import final + from typing_extensions import override from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list -from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos +from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list, \ + convert_tdos_with_functional_equivalents_to_web_metadata_list +from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos, \ + filter_functionally_equivalent_urls +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets +from src.core.tasks.url.operators.probe.models.upsert_functional_equivalents import URLFunctionalEquivalentsUpsertModel from src.core.tasks.url.operators.probe.queries.insert_redirects.query import InsertRedirectsQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic -from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping 
import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.enums import TaskType +from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.core import URLRequestInterface + @final class URLProbeTaskOperator(URLTaskOperatorBase): @@ -36,7 +43,7 @@ async def meets_task_prerequisites(self) -> bool: return await self.has_urls_without_probe() async def get_urls_without_probe(self) -> list[URLProbeTDO]: - url_mappings: list[URLMapping] = await self.adb_client.run_query_builder( + url_mappings: list[FullURLMapping] = await self.adb_client.run_query_builder( GetURLsWithoutProbeQueryBuilder() ) return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings] @@ -57,26 +64,76 @@ async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: URLProbeTDO.response """ url_to_tdo: dict[str, URLProbeTDO] = { - tdo.url_mapping.url: tdo for tdo in tdos + tdo.url_mapping.full_url.id_form: tdo for tdo in tdos } responses = await self.url_request_interface.probe_urls( - urls=[tdo.url_mapping.url for tdo in tdos] + urls=[tdo.url_mapping.full_url for tdo in tdos] ) # Re-associate the responses with the URL mappings for response in responses: - tdo = url_to_tdo[response.original_url] + tdo = url_to_tdo[response.original_url.id_form] tdo.response = response async def update_database(self, tdos: list[URLProbeTDO]) -> None: - non_redirect_tdos = filter_non_redirect_tdos(tdos) + none_tdos: list[URLProbeTDO] = [ + tdo for tdo in tdos if tdo.response is None + ] + await self.upload_none_errors(none_tdos) + + non_error_tdos = [ + tdo for tdo in tdos if tdo.response is not None + ] + + non_redirect_tdos = filter_non_redirect_tdos(non_error_tdos) web_metadata_objects: list[URLWebMetadataPydantic] = convert_tdo_to_web_metadata_list(non_redirect_tdos) await self.adb_client.bulk_upsert(web_metadata_objects) - redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(tdos) + redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(non_error_tdos) + + # Filter redirects into true redirects and functional equivalents + redirect_subsets: RedirectTDOSubsets = filter_functionally_equivalent_urls(redirect_tdos) + + await self._insert_true_redirects(redirect_subsets.true_redirects) - query_builder = InsertRedirectsQueryBuilder(tdos=redirect_tdos) - await self.adb_client.run_query_builder(query_builder) + await self._update_functional_equivalents(redirect_subsets.functional_equivalents) + + async def upload_none_errors( + self, + tdos: list[URLProbeTDO] + ) -> None: + error_url_ids: list[int] = [tdo.url_mapping.url_id for tdo in tdos] + task_errors = [ + URLTaskErrorSmall( + url_id=url_id, + error="TDO response is None" + ) + for url_id in error_url_ids + ] + await self.add_task_errors(task_errors) + + + async def _insert_true_redirects(self, tdos: list[URLProbeTDO]) -> None: + await self.adb_client.run_query_builder( + InsertRedirectsQueryBuilder(tdos=tdos) + ) + async def _update_functional_equivalents(self, tdos: list[URLProbeTDO]) -> None: + # For non-true redirects, treat the redirected URL as the true URL and update database + url_updates = [ + URLFunctionalEquivalentsUpsertModel( + id=tdo.url_mapping.url_id, + url=tdo.response.response.destination.url.without_scheme.rstrip('/'), + trailing_slash=tdo.response.response.destination.url.without_scheme.endswith('/') + ) + for tdo in tdos + ] + await self.adb_client.bulk_update(url_updates) + # For these URLs, also 
update web metadata + func_equiv_web_metadata_objects: list[URLWebMetadataPydantic] = \ + convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos + ) + await self.adb_client.bulk_upsert(func_equiv_web_metadata_objects) async def has_urls_without_probe(self) -> bool: return await self.adb_client.run_query_builder( diff --git a/src/core/tasks/url/operators/probe/filter.py b/src/core/tasks/url/operators/probe/filter.py index 4a129676..2f9313e8 100644 --- a/src/core/tasks/url/operators/probe/filter.py +++ b/src/core/tasks/url/operators/probe/filter.py @@ -1,8 +1,30 @@ +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.util.models.full_url import FullURL def filter_non_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: return [tdo for tdo in tdos if not tdo.response.is_redirect] def filter_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: - return [tdo for tdo in tdos if tdo.response.is_redirect] \ No newline at end of file + return [tdo for tdo in tdos if tdo.response.is_redirect] + +def filter_functionally_equivalent_urls(tdos: list[URLProbeTDO]) -> RedirectTDOSubsets: + true_redirects: list[URLProbeTDO] = [] + functional_equivalents: list[URLProbeTDO] = [] + for tdo in tdos: + og_url: FullURL = tdo.url_mapping.full_url + response: URLProbeRedirectResponsePair = tdo.response.response + redirect_url: FullURL = response.destination.url + + if og_url.id_form != redirect_url.id_form: + true_redirects.append(tdo) + # Otherwise, they are functional equivalents. + else: + functional_equivalents.append(tdo) + + return RedirectTDOSubsets( + true_redirects=true_redirects, + functional_equivalents=functional_equivalents + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/models/__init__.py b/src/core/tasks/url/operators/probe/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/models/subsets.py b/src/core/tasks/url/operators/probe/models/subsets.py new file mode 100644 index 00000000..8cad6434 --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/subsets.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO + + +class RedirectTDOSubsets(BaseModel): + true_redirects: list[URLProbeTDO] + functional_equivalents: list[URLProbeTDO] diff --git a/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py new file mode 100644 index 00000000..434f43af --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py @@ -0,0 +1,22 @@ +from pydantic import BaseModel + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + + +class URLFunctionalEquivalentsUpsertModel(BulkUpsertableModel): + + @classmethod + def id_field(cls) -> str: + return "id" + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URL + + id: int + url: str + trailing_slash: bool + diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py index eb0597ba..80d58110 100644 --- 
a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py @@ -1,10 +1,11 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.util.models.full_url import FullURL +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme def convert_url_response_mapping_to_web_metadata_list( @@ -23,23 +24,15 @@ def convert_url_response_mapping_to_web_metadata_list( results.append(web_metadata_object) return results - -def convert_to_url_mappings(url_exists_results: list[UrlExistsResult]) -> list[URLMapping]: - return [ - URLMapping( - url=url_exists_result.url, - url_id=url_exists_result.url_id - ) for url_exists_result in url_exists_results - ] - - -def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: - results = [] +def convert_to_url_insert_models(urls: list[FullURL]) -> list[URLInsertModel]: + results: list[URLInsertModel] = [] for url in urls: results.append( URLInsertModel( - url=url, - source=URLSource.REDIRECT + url=url.without_scheme.rstrip('/'), + scheme=url.scheme, + source=URLSource.REDIRECT, + trailing_slash=url.without_scheme.endswith('/') ) ) return results diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py index 3de66e85..1f6d83e5 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py @@ -1,5 +1,4 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py deleted file mode 100644 index 1f36893d..00000000 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py +++ /dev/null @@ -1,14 +0,0 @@ -from src.db.dtos.url.mapping import URLMapping - - -def filter_new_dest_urls( - url_mappings_in_db: list[URLMapping], - all_dest_urls: list[str] -) -> list[str]: - extant_destination_urls: set[str] = set([url_mapping.url for url_mapping in url_mappings_in_db]) - new_dest_urls: list[str] = [ - url - for url in all_dest_urls - if url not in extant_destination_urls - ] - return new_dest_urls \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py index 53f2b2e1..3f83e941 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py @@ -1,15 +1,16 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import 
FullURLMapping from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL def map_url_mappings_to_probe_responses( - url_mappings: list[URLMapping], - url_to_probe_responses: dict[str, URLProbeResponse] + url_mappings: list[FullURLMapping], + url_to_probe_responses: dict[FullURL, URLProbeResponse] ) -> list[URLResponseMapping]: results = [] for url_mapping in url_mappings: - response = url_to_probe_responses[url_mapping.url] + response = url_to_probe_responses[url_mapping.full_url] results.append( URLResponseMapping( url_mapping=url_mapping, diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py new file mode 100644 index 00000000..c5b26c24 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.util.models.full_url import FullURL + + +class DestinationURLSubsets(BaseModel): + new_urls: list[FullURL] + exist_with_alterations: list[FullURL] + exist_as_is: list[FullURL] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py index efbd5db8..fd90ab65 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py @@ -1,9 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.probe.models.response import URLProbeResponse class URLResponseMapping(BaseModel): - url_mapping: URLMapping + url_mapping: FullURLMapping response: URLProbeResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py index 0ba70c47..8dd4f693 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py @@ -1,14 +1,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.probe.queries.insert_redirects.extract import extract_response_pairs -from src.core.tasks.url.operators.probe.queries.insert_redirects.filter import filter_new_dest_urls from src.core.tasks.url.operators.probe.queries.insert_redirects.request_manager import InsertRedirectsRequestManager +from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import FullURLMapper class InsertRedirectsQueryBuilder(QueryBuilderBase): @@ -19,7 +20,7 @@ def __init__( super().__init__() self.tdos = tdos self.source_url_mappings = [tdo.url_mapping for tdo in 
self.tdos] - self._mapper = URLMapper(self.source_url_mappings) + self._mapper = FullURLMapper(self.source_url_mappings) self._response_pairs: list[URLProbeRedirectResponsePair] = extract_response_pairs(self.tdos) @@ -27,12 +28,12 @@ def __init__( pair.destination for pair in self._response_pairs ] - self._destination_urls: list[str] = [ + self._destination_urls: list[FullURL] = [ response.url for response in self._destination_probe_responses ] - self._destination_url_to_probe_response_mapping: dict[str, URLProbeResponse] = { + self._destination_url_to_probe_response_mapping: dict[FullURL, URLProbeResponse] = { response.url: response for response in self._destination_probe_responses } @@ -50,29 +51,39 @@ async def run(self, session: AsyncSession) -> None: session=session ) - # Get all destination URLs already in the database - dest_url_mappings_in_db: list[URLMapping] = await rm.get_url_mappings_in_db( + url_exist_results: list[URLExistsResult] = await rm.check_if_urls_exist_in_db( urls=self._destination_urls ) - # Filter out to only have those URLs that are new in the database - new_dest_urls: list[str] = filter_new_dest_urls( - url_mappings_in_db=dest_url_mappings_in_db, - all_dest_urls=self._destination_urls - ) + # Two Options: + # - URLs that do not exist in any form in the database + # - URLs that exist as-is or in slightly modified version (url scheme or trailing slash differs) + new_urls: list[FullURL] = [] + extant_url_mappings: list[FullURLMapping] = [] + for result in url_exist_results: + if not result.exists: + new_urls.append(result.query_url) + else: + extant_url_mappings.append( + FullURLMapping( + full_url=result.query_url, + url_id=result.url_id + ) + ) # Add the new URLs - new_dest_url_mappings: list[URLMapping] = await rm.insert_new_urls( - urls=new_dest_urls + new_dest_url_mappings: list[FullURLMapping] = await rm.insert_new_urls( + urls=new_urls ) - all_dest_url_mappings: list[URLMapping] = dest_url_mappings_in_db + new_dest_url_mappings - self._mapper.add_mappings(all_dest_url_mappings) + all_url_mappings: list[FullURLMapping] = extant_url_mappings + new_dest_url_mappings + + self._mapper.add_mappings(all_url_mappings) # Add web metadata for new URLs await rm.add_web_metadata( - all_dest_url_mappings=all_dest_url_mappings, + all_dest_url_mappings=all_url_mappings, dest_url_to_probe_response_mappings=self._destination_url_to_probe_response_mapping, tdos=self.tdos ) diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 35dfded5..45eaa8e3 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -3,23 +3,23 @@ from sqlalchemy import select, tuple_, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_mappings, \ - convert_to_url_insert_models, convert_tdo_to_url_response_mappings, \ +from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_insert_models, \ + convert_tdo_to_url_response_mappings, \ convert_url_response_mapping_to_web_metadata_list from src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from 
src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult from src.core.tasks.url.operators.probe.queries.urls.exist.query import URLsExistInDBQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import FullURLMapper class InsertRedirectsRequestManager: @@ -27,24 +27,23 @@ class InsertRedirectsRequestManager: def __init__(self, session: AsyncSession): self.session = session - async def get_url_mappings_in_db( + async def check_if_urls_exist_in_db( self, - urls: list[str], - ): - results: list[UrlExistsResult] = await URLsExistInDBQueryBuilder( - urls=urls + urls: list[FullURL], + ) -> list[URLExistsResult]: + results: list[URLExistsResult] = await URLsExistInDBQueryBuilder( + full_urls=urls ).run(self.session) - extant_urls = [result for result in results if result.exists] - return convert_to_url_mappings(extant_urls) + return results - async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def insert_new_urls(self, urls: list[FullURL]) -> list[FullURLMapping]: if len(urls) == 0: return [] deduplicated_urls = list(set(urls)) insert_models = convert_to_url_insert_models(deduplicated_urls) url_ids = await sh.bulk_insert(self.session, models=insert_models, return_ids=True) url_mappings = [ - URLMapping(url=url, url_id=url_id) + FullURLMapping(full_url=url, url_id=url_id) for url, url_id in zip(deduplicated_urls, url_ids) ] @@ -52,8 +51,8 @@ async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: async def add_web_metadata( self, - all_dest_url_mappings: list[URLMapping], - dest_url_to_probe_response_mappings: dict[str, URLProbeResponse], + all_dest_url_mappings: list[FullURLMapping], + dest_url_to_probe_response_mappings: dict[FullURL, URLProbeResponse], tdos: list[URLProbeTDO], ) -> None: dest_url_response_mappings = map_url_mappings_to_probe_responses( @@ -72,7 +71,7 @@ async def add_web_metadata( async def add_redirect_links( self, response_pairs: list[URLProbeRedirectResponsePair], - mapper: URLMapper + mapper: FullURLMapper ) -> None: # Get all existing links and exclude link_tuples: list[tuple[int, int]] = [] diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py b/src/core/tasks/url/operators/probe/queries/urls/exist/model.py index 1245044c..72e20cfa 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py +++ b/src/core/tasks/url/operators/probe/queries/urls/exist/model.py @@ -1,10 +1,20 @@ from pydantic import BaseModel +from src.util.models.full_url import FullURL -class UrlExistsResult(BaseModel): - url: str + +class URLExistsResult(BaseModel): + class Config: + arbitrary_types_allowed = True + + query_url: FullURL + db_url: FullURL 
| None url_id: int | None @property - def exists(self): - return self.url_id is not None \ No newline at end of file + def exists(self) -> bool: + return self.url_id is not None + + @property + def urls_match(self) -> bool: + return self.query_url.id_form == self.db_url.id_form \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py index 5176add9..4e9d3173 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py @@ -1,29 +1,53 @@ -from sqlalchemy import select +from sqlalchemy import select, func from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult +from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult +from src.db.helpers.session.session_helper import results_exist from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh +from src.util.models.full_url import FullURL + class URLsExistInDBQueryBuilder(QueryBuilderBase): """Checks if URLs exist in the database.""" - def __init__(self, urls: list[str]): + def __init__(self, full_urls: list[FullURL]): super().__init__() - self.urls = urls + self.full_urls = full_urls + self.id_form_urls = [ + url.id_form + for url in full_urls + ] + + async def run(self, session: AsyncSession) -> list[URLExistsResult]: + norm_url = func.rtrim(URL.url, '/').label("norm_url") - async def run(self, session: AsyncSession) -> list[UrlExistsResult]: - query = select(URL.id, URL.url).where(URL.url.in_(self.urls)) + query = select( + URL.id, + norm_url + ).where( + norm_url.in_(self.id_form_urls) + ) db_mappings = await sh.mappings(session, query=query) url_to_id_map: dict[str, int] = { - row["url"]: row["id"] + row["norm_url"]: row["id"] for row in db_mappings } - return [ - UrlExistsResult( - url=url, - url_id=url_to_id_map.get(url) - ) for url in self.urls - ] \ No newline at end of file + id_to_db_url_map: dict[int, FullURL] = { + row["id"]: FullURL(row["norm_url"]) + for row in db_mappings + } + results: list[URLExistsResult] = [] + for full_url in self.full_urls: + url_id: int | None = url_to_id_map.get(full_url.id_form) + db_url: FullURL | None = id_to_db_url_map.get(url_id) + result = URLExistsResult( + query_url=full_url, + db_url=db_url, + url_id=url_id + ) + results.append(result) + + return results \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 0ecc50b3..7011a8de 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -4,23 +4,23 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final -from src.util.url import clean_url -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata -from src.db.helpers.session import session_helper as sh from src.db.queries.base.builder import QueryBuilderBase +from 
src.util.models.full_url import FullURL @final class GetURLsWithoutProbeQueryBuilder(QueryBuilderBase): @override - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[FullURLMapping]: query = ( select( URL.id.label("url_id"), - URL.full_url.label("url") + URL.full_url ) .outerjoin( URLWebMetadata, @@ -36,8 +36,8 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + FullURLMapping( url_id=mapping["url_id"], - url=clean_url(mapping["url"]) + full_url=FullURL(mapping["full_url"]) ) for mapping in db_mappings ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py index 5208fd80..0fcb806c 100644 --- a/src/core/tasks/url/operators/probe/tdo.py +++ b/src/core/tasks/url/operators/probe/tdo.py @@ -1,9 +1,12 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper class URLProbeTDO(BaseModel): - url_mapping: URLMapping + class Config: + arbitrary_types_allowed = True + + url_mapping: FullURLMapping response: URLProbeResponseOuterWrapper | None = None diff --git a/src/core/tasks/url/operators/root_url/convert.py b/src/core/tasks/url/operators/root_url/convert.py index 405cbc49..1c7a3cdc 100644 --- a/src/core/tasks/url/operators/root_url/convert.py +++ b/src/core/tasks/url/operators/root_url/convert.py @@ -1,17 +1,17 @@ from src.core.tasks.url.operators.root_url.extract import extract_root_url from src.core.tasks.url.operators.root_url.models.root_mapping import URLRootURLMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def convert_to_flag_root_url_pydantic(url_ids: list[int]) -> list[FlagRootURLPydantic]: return [FlagRootURLPydantic(url_id=url_id) for url_id in url_ids] -def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLRootURLMapping]: +def convert_to_url_root_url_mapping(url_mappings: list[SimpleURLMapping]) -> list[URLRootURLMapping]: return [ URLRootURLMapping( url=mapping.url, @@ -22,18 +22,19 @@ def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLR def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: return [ URLInsertModel( - url=url, - source=URLSource.ROOT_URL + url=url.rstrip('/'), + source=URLSource.ROOT_URL, + trailing_slash=url.endswith('/') ) for url in urls ] def convert_to_root_url_links( - root_db_mappings: list[URLMapping], - branch_db_mappings: list[URLMapping], + root_db_mappings: list[SimpleURLMapping], + branch_db_mappings: list[SimpleURLMapping], url_root_url_mappings: list[URLRootURLMapping] ) -> list[LinkURLRootURLPydantic]: - root_mapper = URLMapper(root_db_mappings) - branch_mapper = URLMapper(branch_db_mappings) + root_mapper = SimpleURLMapper(root_db_mappings) + branch_mapper = SimpleURLMapper(branch_db_mappings) results: list[LinkURLRootURLPydantic] = [] 
for url_root_url_mapping in url_root_url_mappings: diff --git a/src/core/tasks/url/operators/root_url/core.py b/src/core/tasks/url/operators/root_url/core.py index e32654da..ece5929f 100644 --- a/src/core/tasks/url/operators/root_url/core.py +++ b/src/core/tasks/url/operators/root_url/core.py @@ -11,12 +11,12 @@ from src.core.tasks.url.operators.root_url.queries.lookup.response import LookupRootsURLResponse from src.core.tasks.url.operators.root_url.queries.prereq import CheckPrereqsForRootURLTaskQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper @final @@ -37,14 +37,14 @@ def task_type(self) -> TaskType: @override async def inner_task_logic(self) -> None: - all_task_mappings: list[URLMapping] = await self._get_urls_for_root_url_task() + all_task_mappings: list[SimpleURLMapping] = await self._get_urls_for_root_url_task() await self.link_urls_to_task( url_ids=[mapping.url_id for mapping in all_task_mappings] ) # Get the Root URLs for all URLs - mapper = URLMapper(all_task_mappings) + mapper = SimpleURLMapper(all_task_mappings) # -- Identify and Derive Root URLs -- @@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None: for response in derived_root_url_lookup_responses if response.url_id is None ] - new_derived_root_url_mappings: list[URLMapping] = await self._add_new_urls(derived_root_urls_not_in_db) + new_derived_root_url_mappings: list[SimpleURLMapping] = await self._add_new_urls(derived_root_urls_not_in_db) # Add these to the mapper mapper.add_mappings(new_derived_root_url_mappings) @@ -105,7 +105,7 @@ async def inner_task_logic(self) -> None: async def _add_root_url_links( self, - mapper: URLMapper, + mapper: SimpleURLMapper, root_url_mappings: list[URLRootURLMapping], ): # For all task URLs that are not root URLs (i.e. 
'branch' URLs): @@ -115,8 +115,8 @@ async def _add_root_url_links( branch_urls: list[str] = [mapping.url for mapping in root_url_mappings] root_urls: list[str] = [mapping.root_url for mapping in root_url_mappings] - root_url_db_mappings: list[URLMapping] = await self._lookup_root_urls(root_urls) - task_url_db_mappings: list[URLMapping] = mapper.get_mappings_by_url(branch_urls) + root_url_db_mappings: list[SimpleURLMapping] = await self._lookup_root_urls(root_urls) + task_url_db_mappings: list[SimpleURLMapping] = mapper.get_mappings_by_url(branch_urls) links: list[LinkURLRootURLPydantic] = convert_to_root_url_links( root_db_mappings=root_url_db_mappings, @@ -131,7 +131,7 @@ async def _flag_root_urls( ): await self._flag_as_root_urls(url_ids) - async def _get_urls_for_root_url_task(self) -> list[URLMapping]: + async def _get_urls_for_root_url_task(self) -> list[SimpleURLMapping]: builder = GetURLsForRootURLTaskQueryBuilder() return await self.adb_client.run_query_builder(builder) @@ -139,15 +139,15 @@ async def _lookup_root_urls(self, urls: list[str]) -> list[LookupRootsURLRespons builder = LookupRootURLsQueryBuilder(urls=list(set(urls))) return await self.adb_client.run_query_builder(builder) - async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def _add_new_urls(self, urls: list[str]) -> list[SimpleURLMapping]: if len(urls) == 0: return [] insert_models: list[URLInsertModel] = convert_to_url_insert_models(urls) url_ids: list[int] = await self.adb_client.bulk_insert(insert_models, return_ids=True) - mappings: list[URLMapping] = [] + mappings: list[SimpleURLMapping] = [] for url, url_id in zip(urls, url_ids): mappings.append( - URLMapping( + SimpleURLMapping( url=url, url_id=url_id ) diff --git a/src/core/tasks/url/operators/root_url/extract.py b/src/core/tasks/url/operators/root_url/extract.py index 9cb05c5a..67a66c6f 100644 --- a/src/core/tasks/url/operators/root_url/extract.py +++ b/src/core/tasks/url/operators/root_url/extract.py @@ -5,4 +5,4 @@ def extract_root_url(url: str) -> str: # URLs in DB should not have HTTPS -- add to enable url parse to function properly parsed_url: ParseResult = urlparse(f"https://{url}") root_url = parsed_url.netloc - return root_url \ No newline at end of file + return root_url.rstrip("/") \ No newline at end of file diff --git a/src/core/tasks/url/operators/root_url/models/root_mapping.py b/src/core/tasks/url/operators/root_url/models/root_mapping.py index 7b115f36..03f87f66 100644 --- a/src/core/tasks/url/operators/root_url/models/root_mapping.py +++ b/src/core/tasks/url/operators/root_url/models/root_mapping.py @@ -7,4 +7,5 @@ class URLRootURLMapping(BaseModel): @property def is_root_url(self) -> bool: - return self.url == self.root_url \ No newline at end of file + # Add rstrip to handle trailing slashes + return self.url.rstrip("/") == self.root_url.rstrip("/") \ No newline at end of file diff --git a/src/core/tasks/url/operators/root_url/queries/get.py b/src/core/tasks/url/operators/root_url/queries/get.py index 3643f343..e02651b3 100644 --- a/src/core/tasks/url/operators/root_url/queries/get.py +++ b/src/core/tasks/url/operators/root_url/queries/get.py @@ -2,7 +2,7 @@ from typing_extensions import override from src.core.tasks.url.operators.root_url.queries._shared.urls_without_root_id import URLS_WITHOUT_ROOT_ID_QUERY -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.helpers.session import session_helper as sh from src.db.queries.base.builder import 
QueryBuilderBase @@ -10,13 +10,13 @@ class GetURLsForRootURLTaskQueryBuilder(QueryBuilderBase): @override - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: query = ( URLS_WITHOUT_ROOT_ID_QUERY ) mappings = await sh.mappings(session, query=query) return [ - URLMapping( + SimpleURLMapping( url_id=mapping["id"], url=mapping["url"] ) for mapping in mappings diff --git a/src/core/tasks/url/operators/screenshot/core.py b/src/core/tasks/url/operators/screenshot/core.py index 96627ab8..2afea9ed 100644 --- a/src/core/tasks/url/operators/screenshot/core.py +++ b/src/core/tasks/url/operators/screenshot/core.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.screenshot.queries.get import GetURLsForScreenshotTaskQueryBuilder from src.core.tasks.url.operators.screenshot.queries.prereq import URLsForScreenshotTaskPrerequisitesQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.url.screenshot.pydantic import URLScreenshotPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -31,7 +31,7 @@ async def meets_task_prerequisites(self) -> bool: URLsForScreenshotTaskPrerequisitesQueryBuilder() ) - async def get_urls_without_screenshot(self) -> list[URLMapping]: + async def get_urls_without_screenshot(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForScreenshotTaskQueryBuilder() ) @@ -47,7 +47,7 @@ async def upload_errors(self, outcomes: list[URLScreenshotOutcome]) -> None: await self.add_task_errors(insert_models) async def inner_task_logic(self) -> None: - url_mappings: list[URLMapping] = await self.get_urls_without_screenshot() + url_mappings: list[SimpleURLMapping] = await self.get_urls_without_screenshot() await self.link_urls_to_task( url_ids=[url_mapping.url_id for url_mapping in url_mappings] ) diff --git a/src/core/tasks/url/operators/screenshot/get.py b/src/core/tasks/url/operators/screenshot/get.py index 7c0d6a42..7598c43e 100644 --- a/src/core/tasks/url/operators/screenshot/get.py +++ b/src/core/tasks/url/operators/screenshot/get.py @@ -1,12 +1,12 @@ from src.core.tasks.url.operators.screenshot.models.outcome import URLScreenshotOutcome -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse from src.external.url_request.screenshot_.core import get_screenshots -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper -async def get_url_screenshots(mappings: list[URLMapping]) -> list[URLScreenshotOutcome]: - mapper = URLMapper(mappings) +async def get_url_screenshots(mappings: list[SimpleURLMapping]) -> list[URLScreenshotOutcome]: + mapper = SimpleURLMapper(mappings) responses: list[URLScreenshotResponse] = await get_screenshots( urls=mapper.get_all_urls() ) diff --git a/src/core/tasks/url/operators/screenshot/queries/get.py b/src/core/tasks/url/operators/screenshot/queries/get.py index e2dd94df..f3bf2839 100644 --- a/src/core/tasks/url/operators/screenshot/queries/get.py +++ b/src/core/tasks/url/operators/screenshot/queries/get.py @@ -1,18 +1,18 @@ -from typing import Any, Sequence +from typing import Sequence from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio 
import AsyncSession from src.core.tasks.url.operators.screenshot.constants import TASK_URL_LIMIT from src.core.tasks.url.operators.screenshot.queries.cte import URLScreenshotPrerequisitesCTEContainer -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForScreenshotTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = URLScreenshotPrerequisitesCTEContainer() query = select( @@ -22,4 +22,4 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - return [URLMapping(**mapping) for mapping in mappings] + return [SimpleURLMapping(**mapping) for mapping in mappings] diff --git a/src/core/tasks/url/operators/submit_meta_urls/core.py b/src/core/tasks/url/operators/submit_meta_urls/core.py index e06901da..ae41d56b 100644 --- a/src/core/tasks/url/operators/submit_meta_urls/core.py +++ b/src/core/tasks/url/operators/submit_meta_urls/core.py @@ -3,7 +3,7 @@ from src.core.tasks.url.operators.submit_meta_urls.queries.prereq import \ MeetsMetaURLSSubmissionPrerequisitesQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.url.ds_meta_url.pydantic import URLDSMetaURLPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -11,7 +11,7 @@ from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper class SubmitMetaURLsTaskOperator(URLTaskOperatorBase): @@ -38,15 +38,15 @@ async def inner_task_logic(self) -> None: GetMetaURLsForSubmissionQueryBuilder() ) - url_mappings: list[URLMapping] = [ - URLMapping( + url_mappings: list[SimpleURLMapping] = [ + SimpleURLMapping( url=request.url, url_id=request.url_id, ) for request in requests ] - mapper = URLMapper(url_mappings) + mapper = SimpleURLMapper(url_mappings) await self.link_urls_to_task(mapper.get_all_ids()) diff --git a/src/core/tasks/url/operators/validate/queries/insert.py b/src/core/tasks/url/operators/validate/queries/insert.py index 31bdfa74..00dc36ac 100644 --- a/src/core/tasks/url/operators/validate/queries/insert.py +++ b/src/core/tasks/url/operators/validate/queries/insert.py @@ -4,14 +4,14 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.validate.queries.get.models.response import GetURLsForAutoValidationResponse +from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.auto_validated.pydantic import FlagURLAutoValidatedPydantic from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic from src.db.queries.base.builder import 
QueryBuilderBase -from src.db.helpers.session import session_helper as sh + class InsertURLAutoValidationsQueryBuilder(QueryBuilderBase): diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 87fcb057..d1d093a8 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -171,7 +171,6 @@ async def bulk_update( session: AsyncSession, models: list[Base], ): - # Note, mapping must include primary key await sh.bulk_update(session=session, models=models) @session_manager diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 407cb3f4..966d4bbd 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,17 +1,16 @@ from functools import wraps from typing import List -from sqlalchemy import create_engine, update, Select +from sqlalchemy import create_engine, Select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, Session -from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.log.pydantic.info import LogInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.templates_.base import Base @@ -125,6 +124,7 @@ def insert_url(self, session, url_info: URLInfo) -> int: collector_metadata=url_info.collector_metadata, status=url_info.status, name=url_info.name, + trailing_slash=url_and_scheme.url.endswith('/'), source=url_info.source ) if url_info.created_at is not None: @@ -147,7 +147,7 @@ def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo url_info.batch_id = batch_id try: url_id = self.insert_url(url_info) - url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) + url_mappings.append(SimpleURLMapping(url_id=url_id, url=url_info.url)) except IntegrityError as e: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( diff --git a/src/db/dtos/url/insert.py b/src/db/dtos/url/insert.py index f3143668..672cbb9f 100644 --- a/src/db/dtos/url/insert.py +++ b/src/db/dtos/url/insert.py @@ -1,10 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InsertURLsInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] url_ids: list[int] total_count: int = 0 original_count: int = 0 diff --git a/src/db/dtos/url/mapping_/__init__.py b/src/db/dtos/url/mapping_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/mapping_/full.py b/src/db/dtos/url/mapping_/full.py new file mode 100644 index 00000000..c60f367c --- /dev/null +++ b/src/db/dtos/url/mapping_/full.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel, ConfigDict + +from src.util.models.full_url import FullURL + + +class FullURLMapping(BaseModel): + """Mapping between full URL and url_id""" + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True # <- makes it immutable & hashable + ) + + full_url: FullURL + url_id: int \ No newline at end of file diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping_/simple.py similarity index 84% rename from 
src/db/dtos/url/mapping.py rename to src/db/dtos/url/mapping_/simple.py index d48a4649..ff2e4f6b 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping_/simple.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, ConfigDict -class URLMapping(BaseModel): +class SimpleURLMapping(BaseModel): """Mapping between url and url_id.""" model_config = ConfigDict(frozen=True) # <- makes it immutable & hashable diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 43369ff3..f451f30c 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -52,6 +52,12 @@ async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], ) -> None: + """Bulk upsert sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated on conflict. Do not include + attributes in the BulkUpsertableModel unless you intend to update them. + + """ if len(models) == 0: return # Parse models to get sa_model and id_field @@ -205,15 +211,19 @@ async def bulk_update( session: AsyncSession, models: list[BulkUpdatableModel], ): - """Bulk update sqlalchemy models via their pydantic counterparts.""" + """Bulk update sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated. Do not include + attributes in the BulkUpdatableModel unless you intend to update them. + """ if len(models) == 0: return parser = BulkActionParser(models) sa_model = parser.sa_model - id_field = parser.id_field - update_fields = parser.get_non_id_fields() + id_field: str = parser.id_field + update_fields: list[str] = parser.get_non_id_fields() for model in models: diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index 08480b6b..ed73b6c1 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -18,4 +18,5 @@ def sa_model(cls) -> type[Base]: collector_metadata: dict | None = None name: str | None = None status: URLStatus = URLStatus.OK - source: URLSource \ No newline at end of file + source: URLSource + trailing_slash: bool \ No newline at end of file diff --git a/src/db/models/impl/url/core/pydantic/upsert.py b/src/db/models/impl/url/core/pydantic/upsert.py index 8a101c70..0ee5695a 100644 --- a/src/db/models/impl/url/core/pydantic/upsert.py +++ b/src/db/models/impl/url/core/pydantic/upsert.py @@ -15,4 +15,6 @@ def sa_model(cls) -> type[Base]: return URL id: int - name: str | None + name: str | None = None + url: str | None = None + trailing_slash: bool | None = None diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index e5bca30d..d4d8e7c2 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, Text, String, JSON, case, literal +from sqlalchemy import Column, Text, String, JSON, case, literal, Boolean from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import relationship from sqlalchemy.util import hybridproperty @@ -32,17 +32,28 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): name='url_status', nullable=False ) + trailing_slash = Column(Boolean, nullable=False) @hybrid_property def full_url(self) -> str: if self.scheme is None: return self.url - return f"{self.scheme}://{self.url}" + url: str = f"{self.scheme}://{self.url}" + if self.trailing_slash: + url += "/" + 
return url @full_url.expression def full_url(cls): return case( - (cls.scheme != None, (cls.scheme + literal("://") + cls.url)), + ( + (cls.scheme != None) & (cls.trailing_slash == True), + (cls.scheme + literal("://") + cls.url + literal("/")) + ), + ( + (cls.scheme != None) & (cls.trailing_slash == False), + (cls.scheme + literal("://") + cls.url) + ), else_=cls.url ) diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py index 7a6920fe..d49b2649 100644 --- a/src/external/url_request/core.py +++ b/src/external/url_request/core.py @@ -4,6 +4,7 @@ from src.external.url_request.probe.core import URLProbeManager from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper from src.external.url_request.request import fetch_urls +from src.util.models.full_url import FullURL class URLRequestInterface: @@ -15,7 +16,7 @@ async def make_requests_with_html( return await fetch_urls(urls) @staticmethod - async def probe_urls(urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: async with ClientSession(timeout=ClientTimeout(total=30)) as session: manager = URLProbeManager(session=session) return await manager.probe_urls(urls=urls) diff --git a/src/external/url_request/probe/convert.py b/src/external/url_request/probe/convert.py index 3b15268a..16258cdb 100644 --- a/src/external/url_request/probe/convert.py +++ b/src/external/url_request/probe/convert.py @@ -6,6 +6,7 @@ from src.external.url_request.probe.models.response import URLProbeResponse from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL def _process_client_response_history(history: Sequence[ClientResponse]) -> list[str]: @@ -29,7 +30,7 @@ def _extract_redirect_probe_response(cr: ClientResponse) -> URLProbeResponse | N first_url = all_urls[0] return URLProbeResponse( - url=first_url, + url=FullURL(first_url), status_code=HTTPStatus.FOUND.value, content_type=None, error=None, @@ -53,14 +54,14 @@ def _extract_destination_url(cr: ClientResponse) -> str: return str(cr.url) def convert_client_response_to_probe_response( - url: str, + url: FullURL, cr: ClientResponse ) -> URLProbeResponse | URLProbeRedirectResponsePair: error = _extract_error(cr) content_type = _extract_content_type(cr, error=error) if not _has_redirect(cr): return URLProbeResponse( - url=str(cr.url), + url=FullURL(str(cr.url)), status_code=cr.status, content_type=content_type, error=error, @@ -85,7 +86,7 @@ def convert_client_response_to_probe_response( destination_error = _extract_error(destination_cr) destination_content_type = _extract_content_type(destination_cr, error=destination_error) destination_probe_response = URLProbeResponse( - url=destination_url, + url=FullURL(destination_url), status_code=destination_cr.status, content_type=destination_content_type, error=destination_error, @@ -97,7 +98,7 @@ def convert_client_response_to_probe_response( ) def convert_to_error_response( - url: str, + url: FullURL, error: str, status_code: int | None = None ) -> URLProbeResponseOuterWrapper: diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py index 48009381..120e1b66 100644 --- a/src/external/url_request/probe/core.py +++ b/src/external/url_request/probe/core.py @@ -9,6 +9,7 @@ from src.external.url_request.probe.convert import 
convert_client_response_to_probe_response, convert_to_error_response from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from src.util.progress_bar import get_progress_bar_disabled @@ -20,14 +21,14 @@ def __init__( ): self.session = session - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return await tqdm_asyncio.gather( *[self._probe(url) for url in urls], timeout=60 * 10, # 10 minutes, disable=get_progress_bar_disabled() ) - async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: + async def _probe(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: response = await self._head(url) if not response.is_redirect and response.response.status_code == HTTPStatus.OK: @@ -52,9 +53,9 @@ async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: except ClientOSError as e: return convert_to_error_response(url, error=f"Client OS Error: {e.errno}. {str(e)}") - async def _head(self, url: str) -> URLProbeResponseOuterWrapper: + async def _head(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.head(url, allow_redirects=True) as response: + async with self.session.head(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( @@ -74,9 +75,9 @@ async def _head(self, url: str) -> URLProbeResponseOuterWrapper: status_code=e.status ) - async def _get(self, url: str) -> URLProbeResponseOuterWrapper: + async def _get(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.get(url, allow_redirects=True) as response: + async with self.session.get(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( diff --git a/src/external/url_request/probe/models/response.py b/src/external/url_request/probe/models/response.py index 967f1c4f..ad6eb588 100644 --- a/src/external/url_request/probe/models/response.py +++ b/src/external/url_request/probe/models/response.py @@ -1,9 +1,13 @@ from pydantic import BaseModel, Field, model_validator +from src.util.models.full_url import FullURL class URLProbeResponse(BaseModel): - url: str + class Config: + arbitrary_types_allowed = True + + url: FullURL status_code: int | None = Field(le=999, ge=100) content_type: str | None error: str | None = None diff --git a/src/external/url_request/probe/models/wrapper.py b/src/external/url_request/probe/models/wrapper.py index 04dbc9c4..27fd7be8 100644 --- a/src/external/url_request/probe/models/wrapper.py +++ b/src/external/url_request/probe/models/wrapper.py @@ -2,10 +2,14 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL class URLProbeResponseOuterWrapper(BaseModel): - original_url: str + class Config: + arbitrary_types_allowed = True + + original_url: FullURL response: URLProbeResponse | URLProbeRedirectResponsePair @property diff --git a/src/util/models/full_url.py b/src/util/models/full_url.py new file mode 100644 index 00000000..1118040e --- /dev/null +++ b/src/util/models/full_url.py @@ -0,0 +1,84 @@ +from urllib.parse import urlparse + +from src.util.url import clean_url + + +class FullURL: + __slots__ = 
( + "_full_url", + "_scheme", + "_url_without_scheme" + ) + + def __init__( + self, + full_url: str + ): + if not isinstance(full_url, str): + raise ValueError("full_url must be a string") + self._full_url = full_url + self._scheme = None + self._url_without_scheme = None + + @property + def full_url(self) -> str: + return self._full_url + + def __str__(self): + return self.full_url + + def __repr__(self): + return self.id_form + + def __hash__(self): + return hash(self.id_form) + + def __eq__(self, other): + return isinstance(other, FullURL) and self.id_form == other.id_form + + def _set_url_parts(self): + """ + Modifies: + self._scheme + self._url + + """ + parse_result = urlparse(self.full_url) + self._scheme = parse_result.scheme + if parse_result.scheme is not None: + self._url_without_scheme = self.full_url.replace(f"{parse_result.scheme}://", "", 1) + else: + self._url_without_scheme = self.full_url + + + @property + def scheme(self) -> str | None: + if self._scheme is None: + self._set_url_parts() + return self._scheme + + @property + def without_scheme(self) -> str: + if self._url_without_scheme is None: + self._set_url_parts() + return self._url_without_scheme + + @property + def id_form(self) -> str: + """Retrieves URL in 'Identification Form' + + These are meant to be used to compare URLs with one another. + + They have the following properties: + No Scheme + No Trailing Slash + Cleaned of fragments and query parameters. + """ + no_scheme: str = self.without_scheme + no_trailing_slash: str = no_scheme.rstrip("/") + clean: str = clean_url(no_trailing_slash) + return clean + + def clean(self) -> str: + return clean_url(self.full_url) + diff --git a/src/util/url.py b/src/util/url.py index ac4f73ca..0fdf7d0b 100644 --- a/src/util/url.py +++ b/src/util/url.py @@ -26,3 +26,9 @@ def get_url_and_scheme( url=url, scheme=None ) + +def remove_url_scheme(url: str) -> str: + parsed = urlparse(url) + if parsed.scheme: + return url.replace(f"{parsed.scheme}://", "", 1) + return url \ No newline at end of file diff --git a/src/util/url_mapper_/__init__.py b/src/util/url_mapper_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/url_mapper_/full.py b/src/util/url_mapper_/full.py new file mode 100644 index 00000000..8f6272c2 --- /dev/null +++ b/src/util/url_mapper_/full.py @@ -0,0 +1,49 @@ +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.util.models.full_url import FullURL + + +class FullURLMapper: + + def __init__(self, mappings: list[FullURLMapping]): + self._url_to_id = { + mapping.full_url.id_form: mapping.url_id + for mapping in mappings + } + self._id_to_url = { + mapping.url_id: mapping.full_url + for mapping in mappings + } + + def get_id(self, full_url: FullURL) -> int: + return self._url_to_id[full_url.id_form] + + def get_ids(self, full_urls: list[FullURL]) -> list[int]: + return [ + self._url_to_id[full_url.id_form] + for full_url in full_urls + ] + + def get_all_ids(self) -> list[int]: + return list(self._url_to_id.values()) + + def get_all_urls(self) -> list[FullURL]: + return list(self._id_to_url.values()) + + def get_url(self, url_id: int) -> FullURL: + return self._id_to_url[url_id] + + def get_mappings_by_url(self, full_urls: list[FullURL]) -> list[FullURLMapping]: + return [ + FullURLMapping( + url_id=self._url_to_id[full_url.id_form], + full_url=full_url + ) for full_url in full_urls + ] + + def add_mapping(self, mapping: FullURLMapping) -> None: + self._url_to_id[mapping.full_url.id_form] = mapping.url_id + 
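+        # The url->id index above is keyed by id_form, so equivalent spellings such as
+        # "https://example.com/" and "example.com" (illustrative values) resolve to the
+        # same url_id; the exact normalization depends on clean_url.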
self._id_to_url[mapping.url_id] = mapping.full_url + + def add_mappings(self, mappings: list[FullURLMapping]) -> None: + for mapping in mappings: + self.add_mapping(mapping) \ No newline at end of file diff --git a/src/util/url_mapper.py b/src/util/url_mapper_/simple.py similarity index 72% rename from src/util/url_mapper.py rename to src/util/url_mapper_/simple.py index 3a399d77..2a7f7353 100644 --- a/src/util/url_mapper.py +++ b/src/util/url_mapper_/simple.py @@ -1,9 +1,9 @@ -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping -class URLMapper: +class SimpleURLMapper: - def __init__(self, mappings: list[URLMapping]): + def __init__(self, mappings: list[SimpleURLMapping]): self._url_to_id = { mapping.url: mapping.url_id for mapping in mappings @@ -31,18 +31,18 @@ def get_all_urls(self) -> list[str]: def get_url(self, url_id: int) -> str: return self._id_to_url[url_id] - def get_mappings_by_url(self, urls: list[str]) -> list[URLMapping]: + def get_mappings_by_url(self, urls: list[str]) -> list[SimpleURLMapping]: return [ - URLMapping( + SimpleURLMapping( url_id=self._url_to_id[url], url=url ) for url in urls ] - def add_mapping(self, mapping: URLMapping) -> None: + def add_mapping(self, mapping: SimpleURLMapping) -> None: self._url_to_id[mapping.url] = mapping.url_id self._id_to_url[mapping.url_id] = mapping.url - def add_mappings(self, mappings: list[URLMapping]) -> None: + def add_mappings(self, mappings: list[SimpleURLMapping]) -> None: for mapping in mappings: self.add_mapping(mapping) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index 4b747363..d2b9f691 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -7,7 +7,7 @@ from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import RecordType -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation @@ -34,11 +34,11 @@ async def test_annotate_anonymous( setup_info_1 = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_1: URLMapping = setup_info_1.url_mapping + url_mapping_1: SimpleURLMapping = setup_info_1.url_mapping setup_info_2: FinalReviewSetupInfo = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_2: URLMapping = setup_info_2.url_mapping + url_mapping_2: SimpleURLMapping = setup_info_2.url_mapping get_response_1: GetNextURLForAllAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) assert get_response_1.next_annotation is not None diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py index 39cfedab..92392ab1 100644 --- a/tests/automated/integration/api/annotate/helpers.py +++ b/tests/automated/integration/api/annotate/helpers.py @@ -1,10 +1,10 @@ from 
src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping def check_url_mappings_match( - map_1: URLMapping, - map_2: URLMapping + map_1: SimpleURLMapping, + map_2: SimpleURLMapping ): assert map_1.url_id == map_2.url_id assert map_2.url == map_2.url diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py index c471b6fa..f4181629 100644 --- a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -2,7 +2,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with submitted URLs batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_mappings: list[SimpleURLMapping] = await dbdc.create_submitted_urls(count=2) submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] await dbdc.create_batch_url_links( batch_id=batch_submitted, @@ -39,7 +39,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with validated URLs batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( + validated_url_mappings: list[SimpleURLMapping] = await dbdc.create_validated_urls( count=2 ) validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 090896e8..97cd805e 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -3,13 +3,11 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.connect import get_postgres_connection_string +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ create_batch_url_links, create_validated_flags -from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio @@ -25,17 +23,17 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_error: list[URLMapping] = await create_urls( + url_mappings_error: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) - url_mappings_ok: 
list[URLMapping] = await create_urls( + url_mappings_ok: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.OK, count=11, ) - url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_mappings_all: list[SimpleURLMapping] = url_mappings_error + url_mappings_ok url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index c6ef6e0b..ca05eaa1 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,12 +1,11 @@ from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ create_url_data_sources @@ -23,7 +22,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_1: list[URLMapping] = await create_urls( + url_mappings_1: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=3, ) @@ -50,13 +49,13 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_mappings: list[URLMapping] = await create_urls( + error_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.ERROR, count=4, ) error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] - validated_url_mappings: list[URLMapping] = await create_urls( + validated_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=8, ) diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index da8dccd6..09f687f5 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -2,7 +2,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -20,7 +20,7 @@ async def test_get_backlog_metrics(api_test_helper): # Ensure that multiple days in each month are added to the backlog table, with different values batch_1_id: int = await ddc.create_batch() - url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_mappings_1: list[SimpleURLMapping] = await ddc.create_urls(count=3) url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) submitted_url_ids_1: list[int] = url_ids_1[:2] @@ -39,14 +39,14 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_2_id: int = await ddc.create_batch() - not_relevant_url_mappings_2: list[URLMapping] = await 
ddc.create_urls(count=6) + not_relevant_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls(count=6) not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) await ddc.create_validated_flags( url_ids=not_relevant_url_ids_2[:4], validation_type=URLType.NOT_RELEVANT ) - error_url_mappings_2: list[URLMapping] = await ddc.create_urls( + error_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls( status=URLStatus.ERROR, count=2 ) @@ -62,7 +62,7 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_3_id: int = await ddc.create_batch() - url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_mappings_3: list[SimpleURLMapping] = await ddc.create_urls(count=12) url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) await ddc.create_validated_flags( diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 64ae5ae4..1d8eb947 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,10 +1,9 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -33,24 +32,24 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.MANUAL, date_generated=today - timedelta(days=1) ) - url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + url_mappings_0: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_0) oldest_url_id: int = url_mappings_0[0].url_id batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, ) - url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) - url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_mappings_1_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[SimpleURLMapping] = await ddc.create_submitted_urls(count=2) url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, ) - url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) - url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) - url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) + url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + 
url_mappings_2_error: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) + url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] await ddc.create_batch_url_links( diff --git a/tests/automated/integration/api/submit/test_duplicate.py b/tests/automated/integration/api/submit/test_duplicate.py index c1ccfd29..0bef1091 100644 --- a/tests/automated/integration/api/submit/test_duplicate.py +++ b/tests/automated/integration/api/submit/test_duplicate.py @@ -3,7 +3,7 @@ from src.api.endpoints.submit.url.enums import URLSubmissionStatus from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator @@ -13,7 +13,7 @@ async def test_duplicate( api_test_helper: APITestHelper, db_data_creator: DBDataCreator ): - url_mapping: URLMapping = (await db_data_creator.create_urls(count=1))[0] + url_mapping: SimpleURLMapping = (await db_data_creator.create_urls(count=1))[0] response: URLSubmissionResponse = await api_test_helper.request_validator.submit_url( request=URLSubmissionRequest( diff --git a/tests/automated/integration/api/url/by_id/snapshot/test_success.py b/tests/automated/integration/api/url/by_id/snapshot/test_success.py index e3ea9d73..3109706d 100644 --- a/tests/automated/integration/api/url/by_id/snapshot/test_success.py +++ b/tests/automated/integration/api/url/by_id/snapshot/test_success.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from tests.helpers.api_test_helper import APITestHelper @@ -15,7 +15,7 @@ async def test_get_url_screenshot_success( ddc: DBDataCreator = api_test_helper.db_data_creator rv: RequestValidator = ath.request_validator - url_mapping: URLMapping = (await ddc.create_urls())[0] + url_mapping: SimpleURLMapping = (await ddc.create_urls())[0] url_id: int = url_mapping.url_id url_screenshot = URLScreenshot( diff --git a/tests/automated/integration/db/structure/test_updated_at.py b/tests/automated/integration/db/structure/test_updated_at.py index 281e6ee8..0a4c18a4 100644 --- a/tests/automated/integration/db/structure/test_updated_at.py +++ b/tests/automated/integration/db/structure/test_updated_at.py @@ -24,7 +24,9 @@ async def test_updated_at(db_data_creator: DBDataCreator): url_upsert = URLUpsertModel( id=url.id, - name="New Name" + name="New Name", + url=url.url, + trailing_slash=url.trailing_slash, ) await db_data_creator.adb_client.bulk_update([url_upsert]) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 55dbeb76..1d1085a5 
100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -41,7 +41,8 @@ async def run(self, session: AsyncSession) -> list[int]: status=URLStatus.OK, name=name, description=description, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False, ) session.add(url) await session.flush() diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py index 59b2d77c..7bc33222 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py @@ -11,11 +11,13 @@ async def add_urls(dbc: AsyncDatabaseClient) -> list[int]: insert_models: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] return await dbc.bulk_insert(insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py index 36b1bcb9..836ee678 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py @@ -72,11 +72,13 @@ async def add_test_urls(adb_client: AsyncDatabaseClient) -> list[int]: url_inserts: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] url_ids = await adb_client.bulk_insert(url_inserts, return_ids=True) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py index 2334aa17..a592002f 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py @@ -1,7 +1,7 @@ import pytest from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_blacklist( await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) # Create Meta URLs - meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=3, validation_type=URLType.META_URL ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py 
index 10e3f711..7575f37e 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py @@ -5,7 +5,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask @@ -26,7 +26,7 @@ async def test_homepage_match( """ # Create 2 root URLs - root_url_mappings: list[URLMapping] = ( + root_url_mappings: list[SimpleURLMapping] = ( await db_data_creator.create_urls(count=2) ) root_url_ids: list[int] = [url_mapping.url_id for url_mapping in root_url_mappings] @@ -60,7 +60,7 @@ async def test_homepage_match( # Create 2 Meta URLs and agencies for multi agency case - multi_meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + multi_meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=2, validation_type=URLType.META_URL ) @@ -84,7 +84,7 @@ async def test_homepage_match( assert not await operator.meets_task_prerequisites() # Set up eligible URLs - eligible_urls: list[URLMapping] = await db_data_creator.create_urls( + eligible_urls: list[SimpleURLMapping] = await db_data_creator.create_urls( count=2, ) single_url_id: int = eligible_urls[0].url_id diff --git a/tests/automated/integration/tasks/url/impl/html/setup/manager.py b/tests/automated/integration/tasks/url/impl/html/setup/manager.py index 986a9f7e..e01f7b6d 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/manager.py @@ -33,7 +33,8 @@ async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: url=entry.url_info.url, name=f"Test for {entry.url_info.url}", record_type=RecordType.RESOURCES, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_insert_models.append(url_insert_model) url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py index 766a7ca5..e3d39db5 100644 --- a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py @@ -1,6 +1,6 @@ import pytest_asyncio -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.data_creator.core import DBDataCreator @@ -9,7 +9,7 @@ async def url_ids( db_data_creator: DBDataCreator, ) -> list[int]: # Create 2 URLs with compressed HTML - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_mappings: list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) 
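+    # Each SimpleURLMapping pairs the generated url string with its database url_id;
+    # only the ids are needed below when attaching the compressed HTML.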
url_ids: list[int] = [url.url_id for url in url_mappings] await db_data_creator.html_data(url_ids=url_ids) return url_ids diff --git a/tests/automated/integration/tasks/url/impl/probe/constants.py b/tests/automated/integration/tasks/url/impl/probe/constants.py index 07ebbcc3..93988afb 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -2,5 +2,5 @@ PATCH_ROOT = "src.external.url_request.core.URLProbeManager" TEST_URL = "www.example.com" -TEST_DEST_URL = "www.example.com/redirect" +TEST_DEST_URL = "https://www.example.com/redirect" TEST_SOURCE = URLSource.COLLECTOR diff --git a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py index cc493274..2eb6a5d7 100644 --- a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py +++ b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py @@ -1,4 +1,5 @@ from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL class MockURLRequestInterface: @@ -13,10 +14,10 @@ def __init__( responses = response_or_responses self._url_to_response = { - response.original_url: response for response in responses + response.original_url.id_form: response for response in responses } - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return [ - self._url_to_response[url] for url in urls + self._url_to_response[url.id_form] for url in urls ] diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index b52dce6b..7aeeb1f8 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -30,7 +30,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( ) ) source_url_id = await setup_manager.setup_url(URLStatus.OK) - dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL.replace("https://", "")) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py new file mode 100644 index 00000000..a8cb51f7 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py @@ -0,0 +1,46 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.util.models.full_url import FullURL +from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL +from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async 
def test_url_probe_task_functional_equivalent(
+    setup_manager: TestURLProbeSetupManager,
+    check_manager: TestURLProbeCheckManager
+):
+    """
+    If a URL:
+    - redirects to a functionally equivalent version of itself (e.g. a trailing-slash variant)
+    The existing URL should be updated to the functional equivalent
+    and web metadata added for it.
+    """
+
+    operator = setup_manager.setup_operator(
+        response_or_responses=setup_manager.setup_redirect_probe_response(
+            redirect_status_code=303,
+            dest_status_code=303,
+            dest_content_type=None,
+            dest_error=None,
+            redirect_url=FullURL(TEST_URL + "/")
+        )
+    )
+    url_id = await setup_manager.setup_url(URLStatus.OK)
+    await run_task_and_confirm_success(operator)
+
+    urls: list[URL] = await setup_manager.adb_client.get_all(URL)
+    assert len(urls) == 1
+    url: URL = urls[0]
+
+    assert url.url == TEST_URL
+    assert url.trailing_slash is True
+
+    # Web metadata should be added
+    web_metadata: list[URLWebMetadata] = await setup_manager.adb_client.get_all(URLWebMetadata)
+    assert len(web_metadata) == 1
diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py
deleted file mode 100644
index 5a66af3d..00000000
--- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import pytest
-
-from src.collectors.enums import URLStatus
-from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager
-from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL
-from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager
-
-
-@pytest.mark.asyncio
-async def test_url_probe_task_redirect_infinite(
-    setup_manager: TestURLProbeSetupManager,
-    check_manager: TestURLProbeCheckManager
-):
-    """
-    If a URL:
-    - returns a redirect response to itself
-    The task should add a link that points to itself
-    as well as web metadata response to the database URL
-    """
-
-    operator = setup_manager.setup_operator(
-        response_or_responses=setup_manager.setup_redirect_probe_response(
-            redirect_status_code=303,
-            dest_status_code=303,
-            dest_content_type=None,
-            dest_error=None,
-            redirect_url=TEST_URL
-        )
-    )
-    url_id = await setup_manager.setup_url(URLStatus.OK)
-    run_info = await operator.run_task()
-    await check_manager.check_url(
-        url_id=url_id,
-        expected_status=URLStatus.OK
-    )
-    await check_manager.check_web_metadata(
-        url_id=url_id,
-        status_code=303,
-        content_type=None,
-        error=None,
-        accessed=True
-    )
-    redirect_url_id = await check_manager.check_redirect(
-        source_url_id=url_id,
-    )
-    assert redirect_url_id == url_id
diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
index bf5dab9f..1dcd98d9 100644
--- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
+++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
@@ -1,6 +1,7 @@
 import pytest
 
 from src.collectors.enums import URLStatus
+from src.util.models.full_url import FullURL
 from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error
 from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager
 from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager
@@ -30,7 +31,7 @@ async def
test_url_probe_task_redirect_two_urls_same_dest( dest_status_code=200, dest_content_type=None, dest_error=None, - source_url="example.com/2", + source_url=FullURL("example.com/2"), ), ] ) diff --git a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py index 50405970..44b5bd54 100644 --- a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py @@ -8,6 +8,7 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL, TEST_DEST_URL, TEST_SOURCE from tests.automated.integration.tasks.url.impl.probe.mocks.url_request_interface import MockURLRequestInterface @@ -28,7 +29,8 @@ async def setup_url( url_insert_model = URLInsertModel( url=url, status=url_status, - source=TEST_SOURCE + source=TEST_SOURCE, + trailing_slash=False ) return ( await self.adb_client.bulk_insert( @@ -60,9 +62,9 @@ def setup_no_redirect_probe_response( url: str = TEST_URL ) -> URLProbeResponseOuterWrapper: return URLProbeResponseOuterWrapper( - original_url=url, + original_url=FullURL(url), response=URLProbeResponse( - url=url, + url=FullURL(url), status_code=status_code, content_type=content_type, error=error @@ -75,8 +77,8 @@ def setup_redirect_probe_response( dest_status_code: int, dest_content_type: str | None, dest_error: str | None, - source_url: str = TEST_URL, - redirect_url: str = TEST_DEST_URL + source_url: FullURL = FullURL(TEST_URL), + redirect_url: FullURL = FullURL(TEST_DEST_URL) ) -> URLProbeResponseOuterWrapper: if redirect_status_code not in (301, 302, 303, 307, 308): raise ValueError('Redirect response must be one of 301, 302, 303, 307, 308') diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py index 7e8af066..75b7f68f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py @@ -25,7 +25,8 @@ async def test_branch_root_url_in_db( # Add URL that is a root URL, and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) root_url_id = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -36,7 +37,8 @@ async def test_branch_root_url_in_db( # Add URL that is a branch of the root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py index 6c00f8f9..a0a43d3c 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py @@ 
-26,7 +26,8 @@ async def test_branch_root_url_not_in_db( # Add URL that is a branch of a root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py index a6a56c7c..f129b582 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py @@ -23,7 +23,8 @@ async def test_is_root_url( # Add URL that is a root URL url_insert_model = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py index be67d23e..6fe57721 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py @@ -23,7 +23,8 @@ async def test_two_branches_one_root_in_db( # Add root URL and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -34,13 +35,15 @@ async def test_two_branches_one_root_in_db( # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py index 614796e9..8a40a476 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py @@ -26,20 +26,23 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add root URL but do not mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = 
URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py index f68786b9..8839905b 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py @@ -23,13 +23,15 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add two URLs that are branches of a root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=BRANCH_URL.endswith('/') ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=SECOND_BRANCH_URL.endswith('/') ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py index f65aa40d..9acffd0e 100644 --- a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py +++ b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py @@ -3,7 +3,7 @@ import pytest from src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse @@ -24,9 +24,9 @@ async def test_core( assert not await operator.meets_task_prerequisites() # Add two URLs to database - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) - screenshot_mapping: URLMapping = url_mappings[0] - error_mapping: URLMapping = url_mappings[1] + url_mappings: list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) + screenshot_mapping: SimpleURLMapping = url_mappings[0] + error_mapping: SimpleURLMapping = url_mappings[1] url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] # Add web metadata for 200 responses diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py index 92287454..08914bed 100644 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py @@ -5,9 +5,8 @@ from pdap_access_manager import ResponseInfo from src.collectors.enums import URLStatus -from src.core.enums import SubmitResponseStatus from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.core.sqlalchemy import URL from 
src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL @@ -37,7 +36,7 @@ async def test_submit_meta_urls( # Create validated meta url agency_id: int = (await db_data_creator.create_agencies(count=1))[0] - mapping: URLMapping = (await db_data_creator.create_validated_urls( + mapping: SimpleURLMapping = (await db_data_creator.create_validated_urls( validation_type=URLType.META_URL ))[0] await db_data_creator.link_urls_to_agencies( diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index cbeb207f..6bf7df5f 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -10,7 +10,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.agency.enums import AgencyType from src.db.models.impl.agency.sqlalchemy import Agency @@ -398,8 +398,8 @@ async def create_validated_urls( record_type: RecordType = RecordType.RESOURCES, validation_type: URLType = URLType.DATA_SOURCE, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -414,8 +414,8 @@ async def create_submitted_urls( self, record_type: RecordType = RecordType.RESOURCES, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -436,9 +436,9 @@ async def create_urls( collector_metadata: dict | None = None, count: int = 1, batch_id: int | None = None - ) -> list[URLMapping]: + ) -> list[SimpleURLMapping]: - url_mappings: list[URLMapping] = await create_urls( + url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=self.adb_client, status=status, source=source, diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 200a34cd..57c9f9da 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -4,7 +4,7 @@ from src.core.enums import BatchStatus, RecordType from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic @@ -13,7 +13,7 @@ from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic -from tests.helpers.counter import COUNTER, next_int +from tests.helpers.counter import next_int from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo @@ -37,7 +37,7 @@ async def create_urls( record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | 
None = None, count: int = 1 -) -> list[URLMapping]: +) -> list[SimpleURLMapping]: urls: list[URLInsertModel] = generate_urls( status=status, source=source, @@ -55,7 +55,7 @@ async def create_urls( ] await adb_client.bulk_insert(record_types) - return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] + return [SimpleURLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] async def create_validated_flags( adb_client: AsyncDatabaseClient, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index bee0993f..f1eefce2 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -54,6 +54,7 @@ def generate_urls( source=source, name=f"Example {val}", collector_metadata=collector_metadata, + trailing_slash=False )) return results diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py index 16c45a0a..67e148c0 100644 --- a/tests/helpers/data_creator/models/creation_info/url.py +++ b/tests/helpers/data_creator/models/creation_info/url.py @@ -2,14 +2,13 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class URLCreationInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] outcome: URLCreationEnum annotation_info: Optional[AnnotationInfo] = None diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index ababae82..a3a3d42c 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -9,7 +9,7 @@ async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, - annotation_count: Optional[int] = None, + annotation_count: int | None = None, include_user_annotations: bool = True, include_miscellaneous_metadata: bool = True ) -> FinalReviewSetupInfo: diff --git a/tests/helpers/setup/final_review/model.py b/tests/helpers/setup/final_review/model.py index a3e57a3c..1eac963e 100644 --- a/tests/helpers/setup/final_review/model.py +++ b/tests/helpers/setup/final_review/model.py @@ -1,12 +1,10 @@ -from typing import Optional - from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class FinalReviewSetupInfo(BaseModel): batch_id: int - url_mapping: URLMapping + url_mapping: SimpleURLMapping user_agency_id: int | None name_suggestion_id: int | None From 6cf4e5f3db4cc1cccfadf62cbdf9c763ba4f340e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 18 Oct 2025 14:22:18 -0400 Subject: [PATCH 06/84] Begin draft --- src/api/endpoints/submit/url/queries/core.py | 3 +- .../endpoints/suggest}/__init__.py | 0 src/api/endpoints/suggest/routes.py | 19 +++++ src/api/endpoints/suggest/url/__init__.py | 0 .../endpoints/suggest/url/models/__init__.py | 0 .../endpoints/suggest/url/models/request.py | 13 ++++ .../suggest/url/models/response/__init__.py | 0 .../suggest/url/models/response/enums.py | 7 ++ .../suggest/url/models/response/model.py | 9 +++ .../endpoints/suggest/url/queries/__init__.py | 0 src/api/endpoints/suggest/url/queries/core.py | 73 +++++++++++++++++++ src/api/endpoints/suggest/url/wrapper.py | 24 ++++++ 
.../probe/queries/insert_redirects/query.py | 2 +- .../insert_redirects/request_manager.py | 4 +- src/db/queries/urls_exist/__init__.py | 0 .../exist => db/queries/urls_exist}/model.py | 0 .../exist => db/queries/urls_exist}/query.py | 5 +- src/db/queries/urls_exist/requester.py | 41 +++++++++++ src/db/utils/validate.py | 15 ---- src/util/models/full_url.py | 4 + src/util/url.py | 16 +++- 21 files changed, 211 insertions(+), 24 deletions(-) rename src/{core/tasks/url/operators/probe/queries/urls/exist => api/endpoints/suggest}/__init__.py (100%) create mode 100644 src/api/endpoints/suggest/routes.py create mode 100644 src/api/endpoints/suggest/url/__init__.py create mode 100644 src/api/endpoints/suggest/url/models/__init__.py create mode 100644 src/api/endpoints/suggest/url/models/request.py create mode 100644 src/api/endpoints/suggest/url/models/response/__init__.py create mode 100644 src/api/endpoints/suggest/url/models/response/enums.py create mode 100644 src/api/endpoints/suggest/url/models/response/model.py create mode 100644 src/api/endpoints/suggest/url/queries/__init__.py create mode 100644 src/api/endpoints/suggest/url/queries/core.py create mode 100644 src/api/endpoints/suggest/url/wrapper.py create mode 100644 src/db/queries/urls_exist/__init__.py rename src/{core/tasks/url/operators/probe/queries/urls/exist => db/queries/urls_exist}/model.py (100%) rename src/{core/tasks/url/operators/probe/queries/urls/exist => db/queries/urls_exist}/query.py (91%) create mode 100644 src/db/queries/urls_exist/requester.py diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 513d26ad..f65f81d0 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -18,9 +18,8 @@ from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase -from src.db.utils.validate import is_valid_url from src.util.models.url_and_scheme import URLAndScheme -from src.util.url import clean_url, get_url_and_scheme +from src.util.url import clean_url, get_url_and_scheme, is_valid_url class SubmitURLQueryBuilder(QueryBuilderBase): diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py b/src/api/endpoints/suggest/__init__.py similarity index 100% rename from src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py rename to src/api/endpoints/suggest/__init__.py diff --git a/src/api/endpoints/suggest/routes.py b/src/api/endpoints/suggest/routes.py new file mode 100644 index 00000000..8caeb8ac --- /dev/null +++ b/src/api/endpoints/suggest/routes.py @@ -0,0 +1,19 @@ +from fastapi import APIRouter, Depends + +from src.api.dependencies import get_async_core +from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest +from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse +from src.api.endpoints.suggest.url.wrapper import suggest_url_wrapper +from src.core.core import AsyncCore + +suggest_router = APIRouter(prefix="/suggest", tags=["suggest"]) + +@suggest_router.post("/url") +async def suggest_url( + request: URLSuggestionRequest, + async_core: AsyncCore = Depends(get_async_core), +) -> URLSuggestResponse: + return await suggest_url_wrapper( + request=request, + adb_client=async_core.adb_client, + ) diff --git a/src/api/endpoints/suggest/url/__init__.py 
b/src/api/endpoints/suggest/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/suggest/url/models/__init__.py b/src/api/endpoints/suggest/url/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/suggest/url/models/request.py b/src/api/endpoints/suggest/url/models/request.py new file mode 100644 index 00000000..4deec1d5 --- /dev/null +++ b/src/api/endpoints/suggest/url/models/request.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLType + + +class URLSuggestionRequest(BaseModel): + url: str + url_type: URLType | None = None + record_type: RecordType | None = None + agency_ids: list[int] = [] + location_ids: list[int] = [] + name: str | None = None \ No newline at end of file diff --git a/src/api/endpoints/suggest/url/models/response/__init__.py b/src/api/endpoints/suggest/url/models/response/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/suggest/url/models/response/enums.py b/src/api/endpoints/suggest/url/models/response/enums.py new file mode 100644 index 00000000..337d759a --- /dev/null +++ b/src/api/endpoints/suggest/url/models/response/enums.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class URLSuggestResultEnum(Enum): + ACCEPTED = "accepted" + ACCEPTED_WITH_ERRORS = "accepted_with_errors" + DUPLICATE = "duplicate" diff --git a/src/api/endpoints/suggest/url/models/response/model.py b/src/api/endpoints/suggest/url/models/response/model.py new file mode 100644 index 00000000..091734bb --- /dev/null +++ b/src/api/endpoints/suggest/url/models/response/model.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.api.endpoints.suggest.url.models.response.enums import URLSuggestResultEnum + + +class URLSuggestResponse(BaseModel): + result: URLSuggestResultEnum + url_id: int | None + msg: str \ No newline at end of file diff --git a/src/api/endpoints/suggest/url/queries/__init__.py b/src/api/endpoints/suggest/url/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/suggest/url/queries/core.py b/src/api/endpoints/suggest/url/queries/core.py new file mode 100644 index 00000000..77b90128 --- /dev/null +++ b/src/api/endpoints/suggest/url/queries/core.py @@ -0,0 +1,73 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest +from src.api.endpoints.suggest.url.models.response.enums import URLSuggestResultEnum +from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder +from src.db.queries.urls_exist.requester import URLSuggestRequester +from src.util.models.full_url import FullURL + + +class URLSuggestQueryBuilder(QueryBuilderBase): + + def __init__( + self, + request: URLSuggestionRequest + ): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> URLSuggestResponse: + # Clean URL + full_url = FullURL(self.request.url) + + # Check if already exists in database + url_exists_result: URLExistsResult = (await URLsExistInDBQueryBuilder( + [full_url] + ).run(session))[0] + if url_exists_result.url_id is not None: + return URLSuggestResponse( + 
url_id=url_exists_result.url_id, + result=URLSuggestResultEnum.DUPLICATE, + msg=f"URL Already Exists In Database with ID {url_exists_result.url_id}" + ) + + # Add URL + url = URL( + scheme=full_url.scheme, + url=full_url.id_form, + trailing_slash=full_url.has_trailing_slash, + ) + session.add(url) + await session.flush() + url_id: int = url.id + + try: + requester = URLSuggestRequester(session=session, url_id=url_id) + + # Optionally add other annotations + await requester.optionally_add_url_type_suggestion(self.request.url_type) + + await requester.optionally_add_record_type_suggestion(self.request.record_type) + + await requester.optionally_add_agency_id_suggestions(self.request.agency_ids) + + await requester.optionally_add_name_suggestion(self.request.name) + + # If cleaned URL matches original URL, return as ACCEPTED + return URLSuggestResponse( + url_id=url_id, + result=URLSuggestResultEnum.ACCEPTED, + msg="URL was accepted" + ) + + except Exception as e: + return URLSuggestResponse( + url_id=url_id, + result=URLSuggestResultEnum.ACCEPTED_WITH_ERRORS, + msg=f"The URL was accepted, but there were errors in adding provided annotations: {e}" + ) + diff --git a/src/api/endpoints/suggest/url/wrapper.py b/src/api/endpoints/suggest/url/wrapper.py new file mode 100644 index 00000000..7927db25 --- /dev/null +++ b/src/api/endpoints/suggest/url/wrapper.py @@ -0,0 +1,24 @@ +from http import HTTPStatus + +from fastapi import HTTPException + +from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest +from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse +from src.api.endpoints.suggest.url.queries.core import URLSuggestQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient +from src.util.url import is_valid_url + + +async def suggest_url_wrapper( + request: URLSuggestionRequest, + adb_client: AsyncDatabaseClient, +) -> URLSuggestResponse: + if not is_valid_url(request.url): + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST, + detail="Invalid URL" + ) + + return await adb_client.run_query_builder( + URLSuggestQueryBuilder(request) + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py index 8dd4f693..79dd7d9a 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py @@ -2,7 +2,7 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.extract import extract_response_pairs from src.core.tasks.url.operators.probe.queries.insert_redirects.request_manager import InsertRedirectsRequestManager -from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult +from src.db.queries.urls_exist.model import URLExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 45eaa8e3..614a8bca 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -8,8 +8,8 @@ convert_url_response_mapping_to_web_metadata_list from 
src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult -from src.core.tasks.url.operators.probe.queries.urls.exist.query import URLsExistInDBQueryBuilder +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist import URLsExistInDBQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.helpers.session import session_helper as sh diff --git a/src/db/queries/urls_exist/__init__.py b/src/db/queries/urls_exist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py b/src/db/queries/urls_exist/model.py similarity index 100% rename from src/core/tasks/url/operators/probe/queries/urls/exist/model.py rename to src/db/queries/urls_exist/model.py diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py b/src/db/queries/urls_exist/query.py similarity index 91% rename from src/core/tasks/url/operators/probe/queries/urls/exist/query.py rename to src/db/queries/urls_exist/query.py index 4e9d3173..510cf78f 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py +++ b/src/db/queries/urls_exist/query.py @@ -1,11 +1,10 @@ from sqlalchemy import select, func from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.probe.queries.urls.exist.model import URLExistsResult -from src.db.helpers.session.session_helper import results_exist +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh from src.util.models.full_url import FullURL diff --git a/src/db/queries/urls_exist/requester.py b/src/db/queries/urls_exist/requester.py new file mode 100644 index 00000000..45335b87 --- /dev/null +++ b/src/db/queries/urls_exist/requester.py @@ -0,0 +1,41 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.templates.requester import RequesterBase + + +class URLSuggestRequester(RequesterBase): + + def __init__( + self, + session: AsyncSession, + url_id: int + ): + super().__init__(session=session) + self.url_id = url_id + + async def optionally_add_url_type_suggestion( + self, + url_type: URLType | None + ) -> None: + if url_type is None: + return + # TODO + + async def optionally_add_record_type_suggestion(self, record_type: RecordType | None): + if record_type is None: + return + # TODO + + async def optionally_add_agency_id_suggestions(self, agency_ids: list[int]): + if len(agency_ids) == 0: + return + # TODO + + async def optionally_add_name_suggestion(self, name: str | None): + if name is None: + return + # TODO + + diff --git a/src/db/utils/validate.py b/src/db/utils/validate.py index 4837e12c..9d77f910 100644 --- a/src/db/utils/validate.py +++ b/src/db/utils/validate.py @@ -1,7 +1,4 @@ from typing import Protocol -from urllib.parse import urlparse - -from pydantic import BaseModel def validate_has_protocol(obj: object, protocol: type[Protocol]): @@ -13,15 +10,3 
@@ def validate_all_models_of_same_type(objects: list[object]): if not all(isinstance(model, type(first_model)) for model in objects): raise TypeError("Models must be of the same type") -def is_valid_url(url: str) -> bool: - try: - result = urlparse(url) - # If scheme is missing, `netloc` will be empty, so we check path too - if result.scheme in ("http", "https") and result.netloc: - return True - if not result.scheme and result.path: - # no scheme, treat path as potential domain - return "." in result.path - return False - except ValueError: - return False diff --git a/src/util/models/full_url.py b/src/util/models/full_url.py index 1118040e..9b3fc694 100644 --- a/src/util/models/full_url.py +++ b/src/util/models/full_url.py @@ -79,6 +79,10 @@ def id_form(self) -> str: clean: str = clean_url(no_trailing_slash) return clean + @property + def has_trailing_slash(self) -> bool: + return self.full_url.endswith("/") + def clean(self) -> str: return clean_url(self.full_url) diff --git a/src/util/url.py b/src/util/url.py index 0fdf7d0b..88c8959d 100644 --- a/src/util/url.py +++ b/src/util/url.py @@ -31,4 +31,18 @@ def remove_url_scheme(url: str) -> str: parsed = urlparse(url) if parsed.scheme: return url.replace(f"{parsed.scheme}://", "", 1) - return url \ No newline at end of file + return url + + +def is_valid_url(url: str) -> bool: + try: + result = urlparse(url) + # If scheme is missing, `netloc` will be empty, so we check path too + if result.scheme in ("http", "https") and result.netloc: + return True + if not result.scheme and result.path: + # no scheme, treat path as potential domain + return "." in result.path + return False + except ValueError: + return False From 4340d4ac1286fa3e48c4758ca9e793b2f2ca6d5d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 18 Oct 2025 15:26:07 -0400 Subject: [PATCH 07/84] Update URL Status View Enum --- README.md | 1 + ...d35_update_url_status_materialized_view.py | 104 ++++++++++++++++++ .../query/subqueries/oldest_pending_url.py | 3 +- src/db/models/views/url_status/enums.py | 3 +- 4 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py diff --git a/README.md b/README.md index ae2263dc..4fa95b40 100644 --- a/README.md +++ b/README.md @@ -156,3 +156,4 @@ if it detects any missing docstrings or type hints in files that you have modifi These will *not* block any Pull request, but exist primarily as advisory comments to encourage good coding standards. Note that `python_checks.yml` will only function on pull requests made from within the repo, not from a forked repo. + diff --git a/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py new file mode 100644 index 00000000..2a7db8e5 --- /dev/null +++ b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py @@ -0,0 +1,104 @@ +"""Update URL Status Materialized View + +Revision ID: 9d57b3b79d35 +Revises: 7fc6502f1fa3 +Create Date: 2025-10-18 15:17:23.653448 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '9d57b3b79d35' +down_revision: Union[str, None] = '7fc6502f1fa3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS url_status_mat_view") + op.execute(""" + CREATE MATERIALIZED VIEW url_status_mat_view as + with + urls_with_relevant_errors as ( + select + ute.url_id + from + url_task_error ute + where + ute.task_type in ( + 'Screenshot', + 'HTML', + 'URL Probe' + ) + ) + , status_text as ( + select + u.id as url_id, + case + when ( + -- Validated as not relevant, individual record, or not found + fuv.type in ('not relevant', 'individual record', 'not found') + ) Then 'Accepted' + when ( + (fuv.type = 'data source' and uds.url_id is null) + OR + (fuv.type = 'meta url' and udmu.url_id is null) + ) Then 'Awaiting Submission' + when ( + (fuv.type = 'data source' and uds.url_id is not null) + OR + (fuv.type = 'meta url' and udmu.url_id is not null) + ) Then 'Submitted' + when ( + -- Has compressed HTML + uch.url_id is not null + AND + -- Has web metadata + uwm.url_id is not null + AND + -- Has screenshot + us.url_id is not null + ) THEN 'Community Labeling' + when uwre.url_id is not null then 'Error' + ELSE 'Intake' + END as status + + from + urls u + left join urls_with_relevant_errors uwre + on u.id = uwre.url_id + left join url_screenshot us + on u.id = us.url_id + left join url_compressed_html uch + on u.id = uch.url_id + left join url_web_metadata uwm + on u.id = uwm.url_id + left join flag_url_validated fuv + on u.id = fuv.url_id + left join url_ds_meta_url udmu + on u.id = udmu.url_id + left join url_data_source uds + on u.id = uds.url_id + ) + select + url_id, + status, + CASE status + WHEN 'Intake' THEN 100 + WHEN 'Error' THEN 110 + WHEN 'Community Labeling' THEN 200 + WHEN 'Accepted' THEN 300 + WHEN 'Awaiting Submission' THEN 380 + WHEN 'Submitted' THEN 390 + ELSE -1 + END as code + from status_text + """) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py index 2a951b4a..e086b752 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py @@ -27,8 +27,9 @@ async def run( ).where( URLStatusMatView.status.not_in( [ - URLStatusViewEnum.SUBMITTED_PIPELINE_COMPLETE.value, + URLStatusViewEnum.SUBMITTED.value, URLStatusViewEnum.ACCEPTED.value, + URLStatusViewEnum.AWAITING_SUBMISSION.value, ] ) ).order_by( diff --git a/src/db/models/views/url_status/enums.py b/src/db/models/views/url_status/enums.py index 82995812..a467a33d 100644 --- a/src/db/models/views/url_status/enums.py +++ b/src/db/models/views/url_status/enums.py @@ -4,6 +4,7 @@ class URLStatusViewEnum(Enum): INTAKE = "Intake" ACCEPTED = "Accepted" - SUBMITTED_PIPELINE_COMPLETE = "Submitted/Pipeline Complete" + AWAITING_SUBMISSION = "Awaiting Submission" + SUBMITTED = "Submitted" ERROR = "Error" COMMUNITY_LABELING = "Community Labeling" \ No newline at end of file From d93d90a193a9c864ba79b8ea6b09ca71688eb37a Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 20 Oct 2025 16:40:53 -0400 Subject: [PATCH 08/84] Add agency endpoints --- ...adf9d894180_add_autogenerated_agency_id.py | 32 ++++++ pytest.ini | 1 + .../{batch/dtos/post => agencies}/__init__.py | 0 src/api/endpoints/agencies/by_id/__init__.py | 0 
.../agencies/by_id/delete/__init__.py | 0 .../endpoints/agencies/by_id/delete/query.py | 21 ++++ .../agencies/by_id/delete/request.py | 0 .../agencies/by_id/locations/__init__.py | 0 .../by_id/locations/delete/__init__.py | 0 .../agencies/by_id/locations/delete/query.py | 29 +++++ .../agencies/by_id/locations/get/__init__.py | 0 .../agencies/by_id/locations/get/query.py | 37 ++++++ .../agencies/by_id/locations/get/response.py | 6 + .../agencies/by_id/locations/post/__init__.py | 0 .../agencies/by_id/locations/post/query.py | 23 ++++ .../endpoints/agencies/by_id/put/__init__.py | 0 src/api/endpoints/agencies/by_id/put/query.py | 42 +++++++ .../endpoints/agencies/by_id/put/request.py | 9 ++ src/api/endpoints/agencies/root/__init__.py | 0 .../endpoints/agencies/root/get/__init__.py | 0 src/api/endpoints/agencies/root/get/query.py | 52 +++++++++ .../endpoints/agencies/root/get/response.py | 12 ++ .../endpoints/agencies/root/post/__init__.py | 0 src/api/endpoints/agencies/root/post/query.py | 44 +++++++ .../endpoints/agencies/root/post/request.py | 10 ++ .../endpoints/agencies/root/post/response.py | 5 + src/api/endpoints/agencies/routes.py | 107 ++++++++++++++++++ src/api/endpoints/batch/routes.py | 5 +- src/api/main.py | 4 +- src/api/shared/__init__.py | 0 src/api/shared/models/__init__.py | 0 .../models/message_response.py} | 0 src/core/core.py | 10 +- .../insert_redirects/request_manager.py | 2 +- src/db/client/async_.py | 8 ++ src/db/models/impl/agency/sqlalchemy.py | 20 +++- src/db/models/views/location_expanded.py | 1 - src/db/queries/base/builder.py | 3 +- .../api/_helpers/RequestValidator.py | 80 ++++++++++++- .../integration/api/agencies/__init__.py | 0 .../integration/api/agencies/test_core.py | 75 ++++++++++++ tests/automated/integration/api/conftest.py | 69 ----------- tests/automated/integration/conftest.py | 69 ++++++++++- .../automated/integration/readonly/README.md | 1 + .../integration/readonly/__init__.py | 0 .../integration/readonly/api/__init__.py | 0 .../readonly/api/agencies/__init__.py | 0 .../readonly/api/agencies/get/__init__.py | 0 .../api/agencies/get/test_locations.py | 16 +++ .../readonly/api/agencies/get/test_root.py | 20 ++++ .../integration/readonly/conftest.py | 101 +++++++++++++++++ tests/helpers/data_creator/core.py | 11 +- 52 files changed, 829 insertions(+), 96 deletions(-) create mode 100644 alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py rename src/api/endpoints/{batch/dtos/post => agencies}/__init__.py (100%) create mode 100644 src/api/endpoints/agencies/by_id/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/delete/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/delete/query.py create mode 100644 src/api/endpoints/agencies/by_id/delete/request.py create mode 100644 src/api/endpoints/agencies/by_id/locations/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/locations/delete/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/locations/delete/query.py create mode 100644 src/api/endpoints/agencies/by_id/locations/get/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/locations/get/query.py create mode 100644 src/api/endpoints/agencies/by_id/locations/get/response.py create mode 100644 src/api/endpoints/agencies/by_id/locations/post/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/locations/post/query.py create mode 100644 src/api/endpoints/agencies/by_id/put/__init__.py create mode 100644 src/api/endpoints/agencies/by_id/put/query.py create mode 
100644 src/api/endpoints/agencies/by_id/put/request.py create mode 100644 src/api/endpoints/agencies/root/__init__.py create mode 100644 src/api/endpoints/agencies/root/get/__init__.py create mode 100644 src/api/endpoints/agencies/root/get/query.py create mode 100644 src/api/endpoints/agencies/root/get/response.py create mode 100644 src/api/endpoints/agencies/root/post/__init__.py create mode 100644 src/api/endpoints/agencies/root/post/query.py create mode 100644 src/api/endpoints/agencies/root/post/request.py create mode 100644 src/api/endpoints/agencies/root/post/response.py create mode 100644 src/api/endpoints/agencies/routes.py create mode 100644 src/api/shared/__init__.py create mode 100644 src/api/shared/models/__init__.py rename src/api/{endpoints/batch/dtos/post/abort.py => shared/models/message_response.py} (100%) create mode 100644 tests/automated/integration/api/agencies/__init__.py create mode 100644 tests/automated/integration/api/agencies/test_core.py delete mode 100644 tests/automated/integration/api/conftest.py create mode 100644 tests/automated/integration/readonly/README.md create mode 100644 tests/automated/integration/readonly/__init__.py create mode 100644 tests/automated/integration/readonly/api/__init__.py create mode 100644 tests/automated/integration/readonly/api/agencies/__init__.py create mode 100644 tests/automated/integration/readonly/api/agencies/get/__init__.py create mode 100644 tests/automated/integration/readonly/api/agencies/get/test_locations.py create mode 100644 tests/automated/integration/readonly/api/agencies/get/test_root.py create mode 100644 tests/automated/integration/readonly/conftest.py diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py b/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py new file mode 100644 index 00000000..37fed1aa --- /dev/null +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py @@ -0,0 +1,32 @@ +"""Add autogenerated agency id + +Revision ID: 6adf9d894180 +Revises: 7fc6502f1fa3 +Create Date: 2025-10-20 16:20:44.081736 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '6adf9d894180' +down_revision: Union[str, None] = '7fc6502f1fa3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + CREATE SEQUENCE agencies_agency_id START WITH 23191; + """) + + op.execute(""" + ALTER TABLE agencies ALTER COLUMN agency_id SET DEFAULT nextval('agencies_agency_id'); + """) + + +def downgrade() -> None: + pass diff --git a/pytest.ini b/pytest.ini index ceaa093c..5c39d47c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,3 +3,4 @@ timeout = 300 asyncio_default_fixture_loop_scope=function markers = manual: mark test as manual-only (excluded from default test runs) +asyncio_mode = auto \ No newline at end of file diff --git a/src/api/endpoints/batch/dtos/post/__init__.py b/src/api/endpoints/agencies/__init__.py similarity index 100% rename from src/api/endpoints/batch/dtos/post/__init__.py rename to src/api/endpoints/agencies/__init__.py diff --git a/src/api/endpoints/agencies/by_id/__init__.py b/src/api/endpoints/agencies/by_id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/delete/__init__.py b/src/api/endpoints/agencies/by_id/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/delete/query.py b/src/api/endpoints/agencies/by_id/delete/query.py new file mode 100644 index 00000000..800b7cde --- /dev/null +++ b/src/api/endpoints/agencies/by_id/delete/query.py @@ -0,0 +1,21 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class DeleteAgencyQueryBuilder(QueryBuilderBase): + def __init__( + self, + agency_id: int, + ): + super().__init__() + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(Agency) + .where(Agency.agency_id == self.agency_id) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/api/endpoints/agencies/by_id/delete/request.py b/src/api/endpoints/agencies/by_id/delete/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/locations/__init__.py b/src/api/endpoints/agencies/by_id/locations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/locations/delete/__init__.py b/src/api/endpoints/agencies/by_id/locations/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/locations/delete/query.py b/src/api/endpoints/agencies/by_id/locations/delete/query.py new file mode 100644 index 00000000..9c96c65b --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/delete/query.py @@ -0,0 +1,29 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase + + +class DeleteAgencyLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + location_id: int, + ): + super().__init__() + self.agency_id = agency_id + self.location_id = location_id + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(LinkAgencyLocation) + .where( + (LinkAgencyLocation.agency_id == self.agency_id) + & (LinkAgencyLocation.location_id == self.location_id) + ) + ) + + 
await session.execute(statement) + diff --git a/src/api/endpoints/agencies/by_id/locations/get/__init__.py b/src/api/endpoints/agencies/by_id/locations/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/locations/get/query.py b/src/api/endpoints/agencies/by_id/locations/get/query.py new file mode 100644 index 00000000..e7ad22d5 --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/get/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.models.views.location_expanded import LocationExpandedView +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgencyLocationsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + ): + super().__init__() + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> list[AgencyGetLocationsResponse]: + query = ( + select( + LinkAgencyLocation.location_id, + LocationExpandedView.full_display_name + ) + .where( + LinkAgencyLocation.agency_id == self.agency_id + ) + .join( + LocationExpandedView, + LocationExpandedView.id == LinkAgencyLocation.location_id + ) + ) + + result: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + return [AgencyGetLocationsResponse(**row) for row in result] \ No newline at end of file diff --git a/src/api/endpoints/agencies/by_id/locations/get/response.py b/src/api/endpoints/agencies/by_id/locations/get/response.py new file mode 100644 index 00000000..1e4a3078 --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/get/response.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AgencyGetLocationsResponse(BaseModel): + location_id: int + full_display_name: str diff --git a/src/api/endpoints/agencies/by_id/locations/post/__init__.py b/src/api/endpoints/agencies/by_id/locations/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/locations/post/query.py b/src/api/endpoints/agencies/by_id/locations/post/query.py new file mode 100644 index 00000000..fd1bdf2f --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/post/query.py @@ -0,0 +1,23 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase + + +class AddAgencyLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + location_id: int + ): + super().__init__() + self.agency_id = agency_id + self.location_id = location_id + + async def run(self, session: AsyncSession) -> None: + lal = LinkAgencyLocation( + agency_id=self.agency_id, + location_id=self.location_id, + ) + session.add(lal) \ No newline at end of file diff --git a/src/api/endpoints/agencies/by_id/put/__init__.py b/src/api/endpoints/agencies/by_id/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/by_id/put/query.py b/src/api/endpoints/agencies/by_id/put/query.py new file mode 100644 index 00000000..0f58a7db --- /dev/null +++ b/src/api/endpoints/agencies/by_id/put/query.py @@ -0,0 +1,42 @@ +from fastapi import HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + 
+from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + request: AgencyPutRequest, + ): + super().__init__() + self.agency_id = agency_id + self.request = request + + async def run(self, session: AsyncSession) -> None: + + query = ( + select( + Agency + ) + .where( + Agency.agency_id == self.agency_id + ) + ) + + agency = await self.sh.one_or_none(session, query=query) + if not agency: + raise HTTPException(status_code=400, detail="Agency not found") + + if self.request.name is not None: + agency.name = self.request.name + if self.request.type is not None: + agency.type = self.request.type + if self.request.jurisdiction_type is not None: + agency.jurisdiction_type = self.request.jurisdiction_type + diff --git a/src/api/endpoints/agencies/by_id/put/request.py b/src/api/endpoints/agencies/by_id/put/request.py new file mode 100644 index 00000000..b485e43c --- /dev/null +++ b/src/api/endpoints/agencies/by_id/put/request.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyPutRequest(BaseModel): + name: str | None = None + type: AgencyType | None = None + jurisdiction_type: JurisdictionType | None = None diff --git a/src/api/endpoints/agencies/root/__init__.py b/src/api/endpoints/agencies/root/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/root/get/__init__.py b/src/api/endpoints/agencies/root/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/root/get/query.py b/src/api/endpoints/agencies/root/get/query.py new file mode 100644 index 00000000..9452f12e --- /dev/null +++ b/src/api/endpoints/agencies/root/get/query.py @@ -0,0 +1,52 @@ +from sqlalchemy import select, Result +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, selectinload + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgenciesQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> list[AgencyGetResponse]: + + query = ( + select( + Agency + ) + .options( + selectinload(Agency.locations) + ) + .offset((self.page - 1) * 100) + .limit(100) + ) + + results: Result[tuple[Agency]] = await session.execute(query) + responses: list[AgencyGetResponse] = [] + for result in results: + agency: Agency = result[0] + locations: list[AgencyGetLocationsResponse] = [ + AgencyGetLocationsResponse( + location_id=location.id, + full_display_name=location.full_display_name, + ) + for location in agency.locations + ] + responses.append(AgencyGetResponse( + id=agency.agency_id, + name=agency.name, + type=agency.agency_type, + jurisdiction_type=agency.jurisdiction_type, + locations=locations, + )) + + return responses diff --git a/src/api/endpoints/agencies/root/get/response.py b/src/api/endpoints/agencies/root/get/response.py new file mode 100644 index 00000000..b9d374eb --- /dev/null +++ b/src/api/endpoints/agencies/root/get/response.py @@ 
-0,0 +1,12 @@ +from pydantic import BaseModel + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyGetResponse(BaseModel): + id: int + name: str + type: AgencyType + jurisdiction_type: JurisdictionType + locations: list[AgencyGetLocationsResponse] \ No newline at end of file diff --git a/src/api/endpoints/agencies/root/post/__init__.py b/src/api/endpoints/agencies/root/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/agencies/root/post/query.py b/src/api/endpoints/agencies/root/post/query.py new file mode 100644 index 00000000..29ff9823 --- /dev/null +++ b/src/api/endpoints/agencies/root/post/query.py @@ -0,0 +1,44 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.api.endpoints.agencies.root.post.response import AgencyPostResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase + + +class AddAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + request: AgencyPostRequest, + ): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> AgencyPostResponse: + agency = Agency( + name=self.request.name, + agency_type=self.request.type, + jurisdiction_type=self.request.jurisdiction_type, + ) + + session.add(agency) + await session.flush() + await session.refresh(agency) + agency_id: int = agency.agency_id + + try: + + for location_id in self.request.location_ids: + lal = LinkAgencyLocation( + agency_id=agency_id, + location_id=location_id, + ) + session.add(lal) + + except Exception as e: + await session.rollback() + raise e + + return AgencyPostResponse(agency_id=agency_id) \ No newline at end of file diff --git a/src/api/endpoints/agencies/root/post/request.py b/src/api/endpoints/agencies/root/post/request.py new file mode 100644 index 00000000..6d95eaf2 --- /dev/null +++ b/src/api/endpoints/agencies/root/post/request.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyPostRequest(BaseModel): + name: str + type: AgencyType + jurisdiction_type: JurisdictionType + location_ids: list[int] \ No newline at end of file diff --git a/src/api/endpoints/agencies/root/post/response.py b/src/api/endpoints/agencies/root/post/response.py new file mode 100644 index 00000000..dfba5261 --- /dev/null +++ b/src/api/endpoints/agencies/root/post/response.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class AgencyPostResponse(BaseModel): + agency_id: int \ No newline at end of file diff --git a/src/api/endpoints/agencies/routes.py b/src/api/endpoints/agencies/routes.py new file mode 100644 index 00000000..6edfdf03 --- /dev/null +++ b/src/api/endpoints/agencies/routes.py @@ -0,0 +1,107 @@ +from fastapi import APIRouter +from fastapi.params import Query, Depends, Path + +from src.api.dependencies import get_async_core +from src.api.endpoints.agencies.by_id.delete.query import DeleteAgencyQueryBuilder +from src.api.endpoints.agencies.by_id.locations.delete.query import DeleteAgencyLocationQueryBuilder +from src.api.endpoints.agencies.by_id.locations.get.query import GetAgencyLocationsQueryBuilder +from src.api.endpoints.agencies.by_id.locations.get.response 
import AgencyGetLocationsResponse +from src.api.endpoints.agencies.by_id.locations.post.query import AddAgencyLocationQueryBuilder +from src.api.endpoints.agencies.by_id.put.query import UpdateAgencyQueryBuilder +from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.api.endpoints.agencies.root.get.query import GetAgenciesQueryBuilder +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse +from src.api.endpoints.agencies.root.post.query import AddAgencyQueryBuilder +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.api.endpoints.agencies.root.post.response import AgencyPostResponse +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +agencies_router = APIRouter(prefix="/agencies", tags=["agencies"]) + +@agencies_router.get("") +async def get_agencies( + async_core: AsyncCore = Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> list[AgencyGetResponse]: + return await async_core.adb_client.run_query_builder( + GetAgenciesQueryBuilder(page=page) + ) + +@agencies_router.post("") +async def create_agency( + request: AgencyPostRequest, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyPostResponse: + return await async_core.adb_client.run_query_builder( + AddAgencyQueryBuilder(request=request) + ) + +@agencies_router.delete("/{agency_id}") +async def delete_agency( + agency_id: int = Path( + description="Agency ID to delete" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + DeleteAgencyQueryBuilder(agency_id=agency_id) + ) + return MessageResponse(message="Agency deleted.") + +@agencies_router.put("/{agency_id}") +async def update_agency( + request: AgencyPutRequest, + agency_id: int = Path( + description="Agency ID to update" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + UpdateAgencyQueryBuilder(agency_id=agency_id, request=request) + ) + return MessageResponse(message="Agency updated.") + +@agencies_router.get("/{agency_id}/locations") +async def get_agency_locations( + agency_id: int = Path( + description="Agency ID to get locations for" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> list[AgencyGetLocationsResponse]: + return await async_core.adb_client.run_query_builder( + GetAgencyLocationsQueryBuilder(agency_id=agency_id) + ) + +@agencies_router.post("/{agency_id}/locations/{location_id}") +async def add_location_to_agency( + agency_id: int = Path( + description="Agency ID to add location to" + ), + location_id: int = Path( + description="Location ID to add" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + AddAgencyLocationQueryBuilder(agency_id=agency_id, location_id=location_id) + ) + return MessageResponse(message="Location added to agency.") + +@agencies_router.delete("/{agency_id}/locations/{location_id}") +async def remove_location_from_agency( + agency_id: int = Path( + description="Agency ID to remove location from" + ), + location_id: int = Path( + description="Location ID to remove" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + DeleteAgencyLocationQueryBuilder(agency_id=agency_id, location_id=location_id) + ) + return 
MessageResponse(message="Location removed from agency.") diff --git a/src/api/endpoints/batch/routes.py b/src/api/endpoints/batch/routes.py index bd7bbf61..87839fb7 100644 --- a/src/api/endpoints/batch/routes.py +++ b/src/api/endpoints/batch/routes.py @@ -1,5 +1,3 @@ -from typing import Optional - from fastapi import Path, APIRouter from fastapi.params import Query, Depends @@ -7,12 +5,11 @@ from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.collectors.enums import CollectorType from src.core.core import AsyncCore -from src.core.enums import BatchStatus from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info diff --git a/src/api/main.py b/src/api/main.py index 2d31dc1f..0026fda3 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -7,6 +7,7 @@ from pdap_access_manager import AccessManager from starlette.responses import RedirectResponse +from src.api.endpoints.agencies.routes import agencies_router from src.api.endpoints.annotate.routes import annotate_router from src.api.endpoints.batch.routes import batch_router from src.api.endpoints.collector.routes import collector_router @@ -177,7 +178,8 @@ async def redirect_docs(): search_router, metrics_router, submit_router, - contributions_router + contributions_router, + agencies_router ] for router in routers: diff --git a/src/api/shared/__init__.py b/src/api/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/models/__init__.py b/src/api/shared/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/batch/dtos/post/abort.py b/src/api/shared/models/message_response.py similarity index 100% rename from src/api/endpoints/batch/dtos/post/abort.py rename to src/api/shared/models/message_response.py diff --git a/src/core/core.py b/src/core/core.py index 7d4ac083..ad2f20d5 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -1,16 +1,12 @@ from http import HTTPStatus -from typing import Optional from fastapi import HTTPException from pydantic import BaseModel -from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo -from src.api.endpoints.annotate.all.post.query import AddAllAnnotationsToURLQueryBuilder from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.api.endpoints.collector.dtos.collector_start import CollectorStartInfo @@ -23,9 +19,6 @@ from 
src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.api.endpoints.review.enums import RejectionReason -from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo @@ -39,7 +32,6 @@ from src.db.enums import TaskType from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum -from src.security.dtos.access_info import AccessInfo class AsyncCore: diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 614a8bca..64e6299a 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -9,13 +9,13 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping from src.db.queries.urls_exist.model import URLExistsResult -from src.db.queries.urls_exist import URLsExistInDBQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse from src.util.models.full_url import FullURL diff --git a/src/db/client/async_.py b/src/db/client/async_.py index d1d093a8..11e49472 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -437,6 +437,14 @@ async def has_urls_with_html_data_and_without_auto_record_type_suggestion(self, model=AutoRecordTypeSuggestion ) + @session_manager + async def one_or_none( + self, + session: AsyncSession, + model: Base + ) -> Row | None: + return await sh.one_or_none(session=session, query=select(model)) + @session_manager async def get_all( self, diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index 002b0255..28717bfd 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -2,8 +2,8 @@ References an agency in the data sources database. 
""" -from sqlalchemy import Column, Integer, String, DateTime -from sqlalchemy.orm import relationship +from sqlalchemy import Column, Integer, String, DateTime, Sequence +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.agency.enums import AgencyType, JurisdictionType @@ -20,10 +20,13 @@ class Agency( # TODO: Rename agency_id to ds_agency_id - agency_id = Column(Integer, primary_key=True) + agency_id = Column( + Integer, + Sequence("agencies_agency_id"), + primary_key=True) name = Column(String, nullable=False) - agency_type = enum_column(AgencyType, name="agency_type_enum") - jurisdiction_type = enum_column( + agency_type: Mapped[AgencyType] = enum_column(AgencyType, name="agency_type_enum") + jurisdiction_type: Mapped[JurisdictionType] = enum_column( JurisdictionType, name="jurisdiction_type_enum", nullable=True, @@ -33,3 +36,10 @@ class Agency( automated_suggestions = relationship("AgencyIDSubtaskSuggestion") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") + + locations = relationship( + "LocationExpandedView", + primaryjoin="Agency.agency_id == LinkAgencyLocation.agency_id", + secondaryjoin="LocationExpandedView.id == LinkAgencyLocation.location_id", + secondary="link_agencies_locations", + ) diff --git a/src/db/models/views/location_expanded.py b/src/db/models/views/location_expanded.py index 1eb973aa..cf60005b 100644 --- a/src/db/models/views/location_expanded.py +++ b/src/db/models/views/location_expanded.py @@ -45,7 +45,6 @@ class LocationExpandedView( WithIDBase, ViewMixin, - LocationDependentMixin ): __tablename__ = "locations_expanded" diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index f0ef345c..8a1829d0 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -1,4 +1,4 @@ -from typing import Any, Generic, Optional +from typing import Any, Generic from sqlalchemy import FromClause, ColumnClause from sqlalchemy.ext.asyncio import AsyncSession @@ -12,6 +12,7 @@ class QueryBuilderBase(Generic[LabelsType]): def __init__(self, labels: LabelsType | None = None): self.query: FromClause | None = None self.labels = labels + self.sh = sh def get(self, key: str) -> ColumnClause: return getattr(self.query.c, key) diff --git a/tests/automated/integration/api/_helpers/RequestValidator.py b/tests/automated/integration/api/_helpers/RequestValidator.py index 73293522..0db00cb3 100644 --- a/tests/automated/integration/api/_helpers/RequestValidator.py +++ b/tests/automated/integration/api/_helpers/RequestValidator.py @@ -10,7 +10,7 @@ from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO @@ -102,6 +102,24 @@ def open_v2( ) return response.json() + def open_v3( + self, + method: str, + url: str, + params: dict | None = None, + expected_model: type[BaseModel] | None = None, + **kwargs + ) -> BaseModel | dict: + response = 
self.open_v2( + method=method, + url=url, + params=params, + **kwargs + ) + if expected_model: + return expected_model(**response) + return response + def get( self, url: str, @@ -158,6 +176,66 @@ def get_v2( **kwargs ) + def get_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="GET", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def post_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="POST", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def put_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="PUT", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def delete_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="DELETE", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + def put( self, diff --git a/tests/automated/integration/api/agencies/__init__.py b/tests/automated/integration/api/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/agencies/test_core.py b/tests/automated/integration/api/agencies/test_core.py new file mode 100644 index 00000000..cc2ddefc --- /dev/null +++ b/tests/automated/integration/api/agencies/test_core.py @@ -0,0 +1,75 @@ +import pytest + +from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo + + +@pytest.mark.asyncio +async def test_agencies( + api_test_helper: APITestHelper, + california: USStateCreationInfo, + pennsylvania: USStateCreationInfo +): + ath = api_test_helper + rv = ath.request_validator + + rv.post_v3( + url=f"/agencies", + json=AgencyPostRequest( + name="Test Agency", + type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + location_ids=[california.location_id] + ).model_dump(mode="json") + ) + + agency: Agency = await ath.adb_client().one_or_none(model=Agency) + assert agency.name == "Test Agency" + assert agency.agency_type == AgencyType.LAW_ENFORCEMENT + assert agency.jurisdiction_type == JurisdictionType.STATE + + link: LinkAgencyLocation = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + assert link is not None + assert link.agency_id == agency.agency_id + assert link.location_id == california.location_id + + rv.delete_v3( + url=f"/agencies/{agency.agency_id}/locations/{california.location_id}", + ) + + link: LinkAgencyLocation | None = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + assert link is None + + rv.post_v3( + url=f"/agencies/{agency.agency_id}/locations/{pennsylvania.location_id}", + ) + + link: LinkAgencyLocation = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + assert link is not None + assert link.agency_id == agency.agency_id + assert link.location_id == 
pennsylvania.location_id + + rv.put_v3( + url=f"/agencies/{agency.agency_id}", + json=AgencyPutRequest( + name="Test Agency Updated", + ).model_dump(mode="json") + ) + + agency: Agency = await ath.adb_client().one_or_none(model=Agency) + assert agency.name == "Test Agency Updated" + assert agency.agency_type == AgencyType.LAW_ENFORCEMENT + assert agency.jurisdiction_type == JurisdictionType.STATE + + + rv.delete_v3( + url=f"/agencies/{agency.agency_id}", + ) + + agency: Agency | None = await ath.adb_client().one_or_none(model=Agency) + assert agency is None diff --git a/tests/automated/integration/api/conftest.py b/tests/automated/integration/api/conftest.py deleted file mode 100644 index fa019469..00000000 --- a/tests/automated/integration/api/conftest.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Generator, Any, AsyncGenerator -from unittest.mock import AsyncMock - -import pytest -import pytest_asyncio -from starlette.testclient import TestClient - -from src.api.main import app -from src.core.core import AsyncCore -from src.security.dtos.access_info import AccessInfo -from src.security.enums import Permissions -from src.security.manager import get_access_info -from tests.automated.integration.api._helpers.RequestValidator import RequestValidator -from tests.helpers.api_test_helper import APITestHelper - -MOCK_USER_ID = 1 - -def disable_task_trigger(ath: APITestHelper) -> None: - ath.async_core.collector_manager.post_collection_function_trigger = AsyncMock() - - - -async def fail_task_trigger() -> None: - raise Exception( - "Task Trigger is set to fail in tests by default, to catch unintentional calls." - "If this is not intended, either replace with a Mock or the expected task function." - ) - -def override_access_info() -> AccessInfo: - return AccessInfo( - user_id=MOCK_USER_ID, - permissions=[ - Permissions.SOURCE_COLLECTOR, - Permissions.SOURCE_COLLECTOR_FINAL_REVIEW - ] - ) - - -@pytest.fixture(scope="session") -def client(disable_task_flags) -> Generator[TestClient, None, None]: - with TestClient(app) as c: - app.dependency_overrides[get_access_info] = override_access_info - async_core: AsyncCore = c.app.state.async_core - - # Interfaces to the web should be mocked - task_manager = async_core.task_manager - task_manager.url_request_interface = AsyncMock() - task_manager.discord_poster = AsyncMock() - # Disable Logger - task_manager.logger.disabled = True - # Set trigger to fail immediately if called, to force it to be manually specified in tests - task_manager.task_trigger._func = fail_task_trigger - yield c - - # Reset environment variables back to original state - - -@pytest_asyncio.fixture -async def api_test_helper( - client: TestClient, - db_data_creator, - monkeypatch -) -> AsyncGenerator[APITestHelper, Any]: - yield APITestHelper( - request_validator=RequestValidator(client=client), - async_core=client.app.state.async_core, - db_data_creator=db_data_creator, - ) - await client.app.state.async_core.collector_manager.logger.clear_log_queue() diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 574f35f4..cca17136 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -1,12 +1,21 @@ -from unittest.mock import MagicMock +from typing import Generator, AsyncGenerator, Any +from unittest.mock import MagicMock, AsyncMock import pytest import pytest_asyncio +from starlette.testclient import TestClient +from src.api.main import app from src.collectors.manager import 
AsyncCollectorManager from src.core.core import AsyncCore from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient +from src.db.client.sync import DatabaseClient +from src.security.dtos.access_info import AccessInfo +from src.security.enums import Permissions +from src.security.manager import get_access_info +from tests.automated.integration.api._helpers.RequestValidator import RequestValidator +from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -93,4 +102,60 @@ async def los_angeles_locality( state_id=california.us_state_id, county_id=los_angeles_county.county_id, name="Los Angeles" - ) \ No newline at end of file + ) + + +MOCK_USER_ID = 1 + + +async def fail_task_trigger() -> None: + raise Exception( + "Task Trigger is set to fail in tests by default, to catch unintentional calls." + "If this is not intended, either replace with a Mock or the expected task function." + ) + + +def override_access_info() -> AccessInfo: + return AccessInfo( + user_id=MOCK_USER_ID, + permissions=[ + Permissions.SOURCE_COLLECTOR, + Permissions.SOURCE_COLLECTOR_FINAL_REVIEW + ] + ) + + +@pytest.fixture(scope="session") +def client(disable_task_flags) -> Generator[TestClient, None, None]: + with TestClient(app) as c: + app.dependency_overrides[get_access_info] = override_access_info + async_core: AsyncCore = c.app.state.async_core + + # Interfaces to the web should be mocked + task_manager = async_core.task_manager + task_manager.url_request_interface = AsyncMock() + task_manager.discord_poster = AsyncMock() + # Disable Logger + task_manager.logger.disabled = True + # Set trigger to fail immediately if called, to force it to be manually specified in tests + task_manager.task_trigger._func = fail_task_trigger + yield c + + # Reset environment variables back to original state + + +@pytest_asyncio.fixture +async def api_test_helper( + client: TestClient, + db_client_test: DatabaseClient, + adb_client_test: AsyncDatabaseClient +) -> AsyncGenerator[APITestHelper, Any]: + yield APITestHelper( + request_validator=RequestValidator(client=client), + async_core=client.app.state.async_core, + db_data_creator=DBDataCreator( + db_client=db_client_test, + adb_client=adb_client_test + ), + ) + await client.app.state.async_core.collector_manager.logger.clear_log_queue() diff --git a/tests/automated/integration/readonly/README.md b/tests/automated/integration/readonly/README.md new file mode 100644 index 00000000..3c72830f --- /dev/null +++ b/tests/automated/integration/readonly/README.md @@ -0,0 +1 @@ +Read Only tests are tests that work on a variant of the database populated with static test data. These tests are designed to not modify the database in any way. 
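A read-only test in this suite is expected to consume the module-scoped readonly_helper fixture added in tests/automated/integration/readonly/conftest.py below and to exercise only GET endpoints against the statically seeded data; a minimal sketch, assuming that fixture and the seeded "Agency 1" row:

import pytest

from tests.automated.integration.readonly.conftest import ReadOnlyTestHelper


@pytest.mark.asyncio
async def test_example_readonly_lookup(
    readonly_helper: ReadOnlyTestHelper,
) -> None:
    # Read-only: only GET requests are issued; no rows are inserted, updated, or deleted.
    responses: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3(
        url="/agencies",
    )
    assert len(responses) == 1
    assert responses[0]["name"] == "Agency 1"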
diff --git a/tests/automated/integration/readonly/__init__.py b/tests/automated/integration/readonly/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/__init__.py b/tests/automated/integration/readonly/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/__init__.py b/tests/automated/integration/readonly/api/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/get/__init__.py b/tests/automated/integration/readonly/api/agencies/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/get/test_locations.py b/tests/automated/integration/readonly/api/agencies/get/test_locations.py new file mode 100644 index 00000000..13481c58 --- /dev/null +++ b/tests/automated/integration/readonly/api/agencies/get/test_locations.py @@ -0,0 +1,16 @@ +import pytest + +from tests.automated.integration.readonly.conftest import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_agency_get_locations( + readonly_helper: ReadOnlyTestHelper, +) -> None: + + response_raw: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/agencies/{readonly_helper.agency_1_id}/locations", + ) + assert len(response_raw) == 1 + assert response_raw[0]["location_id"] == readonly_helper.agency_1_location_id + assert response_raw[0]["full_display_name"] == "Pittsburgh, Allegheny, Pennsylvania" diff --git a/tests/automated/integration/readonly/api/agencies/get/test_root.py b/tests/automated/integration/readonly/api/agencies/get/test_root.py new file mode 100644 index 00000000..fa390abd --- /dev/null +++ b/tests/automated/integration/readonly/api/agencies/get/test_root.py @@ -0,0 +1,20 @@ +import pytest + +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from tests.automated.integration.readonly.conftest import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_agency_get( + readonly_helper: ReadOnlyTestHelper +): + + responses_raw: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/agencies", + ) + assert len(responses_raw) == 1 + response_raw = responses_raw[0] + assert response_raw["id"] == readonly_helper.agency_1_id + assert response_raw["name"] == "Agency 1" + assert response_raw["type"] == AgencyType.LAW_ENFORCEMENT.value + assert response_raw["jurisdiction_type"] == JurisdictionType.STATE.value \ No newline at end of file diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py new file mode 100644 index 00000000..1085c184 --- /dev/null +++ b/tests/automated/integration/readonly/conftest.py @@ -0,0 +1,101 @@ +import asyncio +from typing import Any, AsyncGenerator + +import pytest +import pytest_asyncio +from pydantic import BaseModel +from starlette.testclient import TestClient + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from tests.automated.integration.api._helpers.RequestValidator import RequestValidator +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.counter import next_int +from 
tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo +from tests.helpers.setup.wipe import wipe_database + + +class ReadOnlyTestHelper(BaseModel): + class Config: + arbitrary_types_allowed = True + + adb_client: AsyncDatabaseClient + api_test_helper: APITestHelper + + agency_1_id: int + agency_1_location_id: int + + +@pytest.fixture(scope="module") +def event_loop(): + loop = asyncio.new_event_loop() + yield loop + loop.close() + +@pytest_asyncio.fixture(scope='module') +async def california_readonly( +) -> USStateCreationInfo: + return await DBDataCreator().create_us_state( + name="California", + iso="CA" + ) + +@pytest_asyncio.fixture(scope="module") +async def readonly_helper( + event_loop, + client: TestClient, +) -> AsyncGenerator[ReadOnlyTestHelper, Any]: + wipe_database(get_postgres_connection_string()) + conn = get_postgres_connection_string(is_async=True) + adb_client = AsyncDatabaseClient(db_url=conn) + db_data_creator = DBDataCreator() + api_test_helper = APITestHelper( + request_validator=RequestValidator(client=client), + async_core=client.app.state.async_core, + db_data_creator=db_data_creator, + ) + + # Pennsylvania + pennsylvania = await DBDataCreator().create_us_state( + name="Pennsylvania", + iso="PA" + ) + + allegheny_county = await DBDataCreator().create_county( + state_id=pennsylvania.us_state_id, + name="Allegheny" + ) + pittsburgh = await DBDataCreator().create_locality( + state_id=pennsylvania.us_state_id, + county_id=allegheny_county.county_id, + name="Pittsburgh" + ) + + + # Add Agencies + agency_1 = Agency( + agency_id=next_int(), + name="Agency 1", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + await adb_client.add(agency_1) + + # Add Agency location + agency_1_location = LinkAgencyLocation( + agency_id=agency_1.agency_id, + location_id=pittsburgh.location_id, + ) + await adb_client.add(agency_1_location) + + yield ReadOnlyTestHelper( + adb_client=adb_client, + api_test_helper=api_test_helper, + + agency_1_id=agency_1.agency_id, + agency_1_location_id=pittsburgh.location_id, + ) \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 6bf7df5f..575c594f 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -68,12 +68,19 @@ class DBDataCreator: """ Assists in the creation of test data """ - def __init__(self, db_client: Optional[DatabaseClient] = None): + def __init__( + self, + db_client: DatabaseClient | None = None, + adb_client: AsyncDatabaseClient | None = None + ): if db_client is not None: self.db_client = db_client else: self.db_client = DatabaseClient() - self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() + if adb_client is not None: + self.adb_client = adb_client + else: + self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() self.clients = DBDataCreatorClientContainer( adb=self.adb_client, db=self.db_client From 7c86759204429e7ef53aee527ed5b79dee146d0d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 08:07:08 -0400 Subject: [PATCH 09/84] Add URL suggestion endpoint --- ...adf9d894180_add_autogenerated_agency_id.py | 32 ----- ...80_enable_data_source_agency_submission.py | 110 ++++++++++++++++ .../endpoints/agencies/by_id/delete/query.py | 1 + src/api/endpoints/collector/manual/query.py | 2 +- .../endpoints/review/approve/query_/core.py | 4 +- .../endpoints/submit/data_source/__init__.py | 0 
src/api/endpoints/submit/data_source/query.py | 108 ++++++++++++++++ .../endpoints/submit/data_source/request.py | 37 ++++++ .../endpoints/submit/data_source/response.py | 5 + .../endpoints/submit/data_source/wrapper.py | 39 ++++++ src/api/endpoints/submit/routes.py | 15 ++- src/db/client/async_.py | 4 +- src/db/models/impl/url/core/sqlalchemy.py | 6 +- .../impl/url/optional_data_source_metadata.py | 16 --- .../impl/url/optional_ds_metadata/__init__.py | 0 .../impl/url/optional_ds_metadata/enums.py | 29 +++++ .../url/optional_ds_metadata/sqlalchemy.py | 40 ++++++ src/db/statement_composer.py | 4 +- .../integration/api/agencies/test_core.py | 12 +- .../api/submit/data_source/__init__.py | 0 .../api/submit/data_source/test_core.py | 118 ++++++++++++++++++ .../integration/api/test_manual_batch.py | 2 +- tests/automated/integration/conftest.py | 8 ++ .../db/client/approve_url/test_basic.py | 2 +- .../tasks/url/impl/probe/check/manager.py | 4 +- .../test_url_miscellaneous_metadata_task.py | 2 +- 26 files changed, 529 insertions(+), 71 deletions(-) delete mode 100644 alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py create mode 100644 alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py create mode 100644 src/api/endpoints/submit/data_source/__init__.py create mode 100644 src/api/endpoints/submit/data_source/query.py create mode 100644 src/api/endpoints/submit/data_source/request.py create mode 100644 src/api/endpoints/submit/data_source/response.py create mode 100644 src/api/endpoints/submit/data_source/wrapper.py delete mode 100644 src/db/models/impl/url/optional_data_source_metadata.py create mode 100644 src/db/models/impl/url/optional_ds_metadata/__init__.py create mode 100644 src/db/models/impl/url/optional_ds_metadata/enums.py create mode 100644 src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py create mode 100644 tests/automated/integration/api/submit/data_source/__init__.py create mode 100644 tests/automated/integration/api/submit/data_source/test_core.py diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py b/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py deleted file mode 100644 index 37fed1aa..00000000 --- a/alembic/versions/2025_10_20_1620-6adf9d894180_add_autogenerated_agency_id.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Add autogenerated agency id - -Revision ID: 6adf9d894180 -Revises: 7fc6502f1fa3 -Create Date: 2025-10-20 16:20:44.081736 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. 
-revision: str = '6adf9d894180' -down_revision: Union[str, None] = '7fc6502f1fa3' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - op.execute(""" - CREATE SEQUENCE agencies_agency_id START WITH 23191; - """) - - op.execute(""" - ALTER TABLE agencies ALTER COLUMN agency_id SET DEFAULT nextval('agencies_agency_id'); - """) - - -def downgrade() -> None: - pass diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py new file mode 100644 index 00000000..d1a72f7e --- /dev/null +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py @@ -0,0 +1,110 @@ +"""Enable data source/agency submission + +Revision ID: 6adf9d894180 +Revises: 7fc6502f1fa3 +Create Date: 2025-10-20 16:20:44.081736 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import ENUM, ARRAY + + +# revision identifiers, used by Alembic. +revision: str = '6adf9d894180' +down_revision: Union[str, None] = '7fc6502f1fa3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +def upgrade() -> None: + _add_autogenerated_agency_id() + _add_new_columns_to_optional_ds_metadata() + +def _add_new_columns_to_optional_ds_metadata(): + table_name: str = "url_optional_data_source_metadata" + + agency_aggregation_enum = ENUM( + 'federal', + 'state', + 'county', + 'local', + name='agency_aggregation_enum', + create_type=True, + ) + agency_aggregation_enum.create(op.get_bind()) + + update_method_enum = ENUM( + 'Overwrite', + 'Insert', + 'No updates', + name='update_method_enum', + create_type=True + ) + update_method_enum.create(op.get_bind()) + + retention_schedule_enum = ENUM( + 'Future only', + '1 month', + '1 day', + '1 week', + '1-10 years', + '< 1 day', + '< 1 week', + '< 1 year', + '> 10 years', + name='retention_schedule_enum', + create_type=True + ) + retention_schedule_enum.create(op.get_bind()) + + access_type_enum = ENUM( + 'Webpage', + 'Download', + 'API', + name='access_type_enum', + create_type=True, + ) + access_type_enum.create(op.get_bind()) + + for column in [ + sa.Column('coverage_start', sa.Date(), nullable=True), + sa.Column('coverage_end', sa.Date(), nullable=True), + sa.Column("agency_supplied", sa.Boolean(), nullable=True), + sa.Column('agency_originated', sa.Boolean(), nullable=True), + sa.Column('agency_aggregation', agency_aggregation_enum), + sa.Column('agency_described_not_in_database', sa.Text(), nullable=True), + sa.Column('update_method', update_method_enum, nullable=True), + sa.Column('readme_url', sa.Text(), nullable=True), + sa.Column('originating_entity', sa.Text(), nullable=True), + sa.Column('retention_schedule', retention_schedule_enum, nullable=True), + sa.Column('scraper_url', sa.Text(), nullable=True), + sa.Column('submission_notes', sa.Text(), nullable=True), + sa.Column('access_notes', sa.Text(), nullable=True), + sa.Column('data_portal', sa.Text(), nullable=True), + sa.Column('access_types', ARRAY( + access_type_enum + ), nullable=True), + ]: + op.add_column( + table_name, + column, + ) + +def _add_autogenerated_agency_id(): + op.execute( + """ + CREATE SEQUENCE agencies_agency_id START WITH 23191; + """ + ) + + op.execute( + """ + ALTER TABLE agencies + ALTER COLUMN agency_id SET DEFAULT nextval('agencies_agency_id'); + """ + 
) + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/agencies/by_id/delete/query.py b/src/api/endpoints/agencies/by_id/delete/query.py index 800b7cde..61ce2653 100644 --- a/src/api/endpoints/agencies/by_id/delete/query.py +++ b/src/api/endpoints/agencies/by_id/delete/query.py @@ -6,6 +6,7 @@ class DeleteAgencyQueryBuilder(QueryBuilderBase): + def __init__( self, agency_id: int, diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 6cd7d7b8..dff2cbed 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -9,7 +9,7 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase from src.util.models.url_and_scheme import URLAndScheme diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index 15641764..b7abec5a 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -6,14 +6,12 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.api.endpoints.review.approve.query_.util import update_if_not_none -from src.collectors.enums import URLStatus -from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/submit/data_source/__init__.py b/src/api/endpoints/submit/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/query.py b/src/api/endpoints/submit/data_source/query.py new file mode 100644 index 00000000..2d3d685b --- /dev/null +++ b/src/api/endpoints/submit/data_source/query.py @@ -0,0 +1,108 @@ +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.api.endpoints.submit.data_source.response import SubmitDataSourceURLProposalResponse +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation 
+from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL + + +class SubmitDataSourceURLProposalQueryBuilder(QueryBuilderBase): + + def __init__(self, request: DataSourceSubmissionRequest): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> Any: + full_url = FullURL(full_url=self.request.source_url) + + url = URL( + url=full_url.id_form, + scheme=full_url.scheme, + trailing_slash=full_url.has_trailing_slash, + name=self.request.name, + status=URLStatus.OK, + source=URLSource.MANUAL, + ) + + session.add(url) + await session.flush() + + url_id: int = url.id + + # Optionally add Record Type as suggestion + if self.request.record_type is not None: + record_type_suggestion = AnonymousAnnotationRecordType( + url_id=url_id, + record_type=self.request.record_type.value + ) + session.add(record_type_suggestion) + + # Optionally add Agency ID suggestions + if self.request.agency_ids is not None: + agency_id_suggestions = [ + AnonymousAnnotationAgency( + url_id=url_id, + agency_id=agency_id + ) + for agency_id in self.request.agency_ids + ] + session.add_all(agency_id_suggestions) + + # Optionally add Location ID suggestions + if self.request.location_ids is not None: + location_id_suggestions = [ + AnonymousAnnotationLocation( + url_id=url_id, + location_id=location_id + ) + for location_id in self.request.location_ids + ] + session.add_all(location_id_suggestions) + + # Optionally add name suggestion + if self.request.name is not None: + name_suggestion = URLNameSuggestion( + url_id=url_id, + suggestion=self.request.name, + source=NameSuggestionSource.USER + ) + session.add(name_suggestion) + + # Add data source metadata + ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + coverage_start=self.request.coverage_start, + coverage_end=self.request.coverage_end, + supplying_entity=self.request.supplying_entity, + agency_supplied=self.request.agency_supplied, + agency_originated=self.request.agency_originated, + agency_aggregation=self.request.agency_aggregation, + agency_described_not_in_database=self.request.agency_described_not_in_database, + data_portal=self.request.data_portal, + update_method=self.request.update_method, + readme_url=self.request.readme_url, + originating_entity=self.request.originating_entity, + retention_schedule=self.request.retention_schedule, + scraper_url=self.request.scraper_url, + submission_notes=self.request.submission_notes, + access_notes=self.request.access_notes, + access_types=self.request.access_types, + record_formats=self.request.record_formats, + ) + session.add(ds_metadata) + await session.flush() + + return SubmitDataSourceURLProposalResponse( + url_id=url_id, + ) \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/request.py b/src/api/endpoints/submit/data_source/request.py new file mode 100644 index 00000000..011ebe2a --- /dev/null +++ b/src/api/endpoints/submit/data_source/request.py @@ -0,0 +1,37 @@ +from datetime import date + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum + + +class 
DataSourceSubmissionRequest(BaseModel): + # Required + name: str + record_type: RecordType + source_url: str + + # Optional URL DS Metadata + coverage_start: date | None = None + coverage_end: date | None = None + supplying_entity: str | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + submission_notes: str | None = None + data_portal: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] = [] + record_formats: list[str] = [] + + # Links to other entities + agency_ids: list[int] = [] + location_ids: list[int] = [] diff --git a/src/api/endpoints/submit/data_source/response.py b/src/api/endpoints/submit/data_source/response.py new file mode 100644 index 00000000..b2d7ba3f --- /dev/null +++ b/src/api/endpoints/submit/data_source/response.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class SubmitDataSourceURLProposalResponse(BaseModel): + url_id: int \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/wrapper.py b/src/api/endpoints/submit/data_source/wrapper.py new file mode 100644 index 00000000..32794150 --- /dev/null +++ b/src/api/endpoints/submit/data_source/wrapper.py @@ -0,0 +1,39 @@ +from fastapi import HTTPException + +from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.api.endpoints.submit.data_source.response import SubmitDataSourceURLProposalResponse +from src.db.client.async_ import AsyncDatabaseClient +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder +from src.util.models.full_url import FullURL +from src.util.url import is_valid_url + + +async def submit_data_source_url_proposal( + request: DataSourceSubmissionRequest, + adb_client: AsyncDatabaseClient +) -> SubmitDataSourceURLProposalResponse: + + if not is_valid_url(request.source_url): + raise HTTPException( + status_code=400, + detail="Invalid URL" + ) + + url_exists_results: URLExistsResult = (await adb_client.run_query_builder( + URLsExistInDBQueryBuilder( + full_urls=[FullURL(request.source_url)] + ) + ))[0] + if url_exists_results.exists: + raise HTTPException( + status_code=400, + detail="URL already exists in database." 
+ ) + + return await adb_client.run_query_builder( + SubmitDataSourceURLProposalQueryBuilder( + request=request + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/submit/routes.py b/src/api/endpoints/submit/routes.py index d91d1821..ee315493 100644 --- a/src/api/endpoints/submit/routes.py +++ b/src/api/endpoints/submit/routes.py @@ -1,6 +1,8 @@ from fastapi import APIRouter, Depends from src.api.dependencies import get_async_core +from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse from src.api.endpoints.submit.url.queries.core import SubmitURLQueryBuilder @@ -21,4 +23,15 @@ async def submit_url( request=request, user_id=access_info.user_id ) - ) \ No newline at end of file + ) + +@submit_router.post("/data-source") +async def submit_data_source( + request: DataSourceSubmissionRequest, + async_core: AsyncCore = Depends(get_async_core), +): + return await async_core.adb_client.run_query_builder( + SubmitDataSourceURLProposalQueryBuilder( + request=request, + ) + ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 11e49472..93af63f9 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -80,7 +80,7 @@ from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -438,7 +438,7 @@ async def has_urls_with_html_data_and_without_auto_record_type_suggestion(self, ) @session_manager - async def one_or_none( + async def one_or_none_model( self, session: AsyncSession, model: Base diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index d4d8e7c2..50fa1676 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,6 +1,6 @@ from sqlalchemy import Column, Text, String, JSON, case, literal, Boolean from sqlalchemy.ext.hybrid import hybrid_property -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from sqlalchemy.util import hybridproperty from src.collectors.enums import URLStatus @@ -27,7 +27,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. 
- status = enum_column( + status: Mapped[URLStatus] = enum_column( URLStatus, name='url_status', nullable=False @@ -57,7 +57,7 @@ def full_url(cls): else_=cls.url ) - source = enum_column( + source: Mapped[URLSource] = enum_column( URLSource, name='url_source', nullable=False diff --git a/src/db/models/impl/url/optional_data_source_metadata.py b/src/db/models/impl/url/optional_data_source_metadata.py deleted file mode 100644 index bb2a95e5..00000000 --- a/src/db/models/impl/url/optional_data_source_metadata.py +++ /dev/null @@ -1,16 +0,0 @@ -from sqlalchemy import Column, ARRAY, String -from sqlalchemy.orm import relationship - -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase - - -class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): - __tablename__ = 'url_optional_data_source_metadata' - - record_formats = Column(ARRAY(String), nullable=True) - data_portal_type = Column(String, nullable=True) - supplying_entity = Column(String, nullable=True) - - # Relationships - url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") diff --git a/src/db/models/impl/url/optional_ds_metadata/__init__.py b/src/db/models/impl/url/optional_ds_metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/optional_ds_metadata/enums.py b/src/db/models/impl/url/optional_ds_metadata/enums.py new file mode 100644 index 00000000..3b08e6f0 --- /dev/null +++ b/src/db/models/impl/url/optional_ds_metadata/enums.py @@ -0,0 +1,29 @@ +from enum import Enum + + +class AgencyAggregationEnum(Enum): + FEDERAL = "federal" + STATE = "state" + COUNTY = "county" + LOCALITY = "local" + +class UpdateMethodEnum(Enum): + OVERWRITE = "Overwrite" + INSERT = "Insert" + NO_UPDATES = "No updates" + +class RetentionScheduleEnum(Enum): + FUTURE_ONLY = "Future only" + ONE_MONTH = "1 month" + ONE_DAY = "1 day" + ONE_WEEK = "1 week" + ONE_TO_TEN_YEARS = "1-10 years" + LT_1_DAY = "< 1 day" + LT_1_WEEK = "< 1 week" + LT_1_YEAR = "< 1 year" + GT_10_YEARS = "> 10 years" + +class AccessTypeEnum(Enum): + WEBPAGE = "Webpage" + DOWNLOAD = "Download" + API = "API" \ No newline at end of file diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py new file mode 100644 index 00000000..37d4638d --- /dev/null +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -0,0 +1,40 @@ +from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum +from sqlalchemy.orm import relationship, Mapped + +from src.db.models.helpers import enum_column +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, \ + RetentionScheduleEnum, UpdateMethodEnum +from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.with_id import WithIDBase + + +class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): + __tablename__ = 'url_optional_data_source_metadata' + + record_formats = Column(ARRAY(String), nullable=True) + data_portal = Column(String, nullable=True) + supplying_entity = Column(String, nullable=True) + coverage_start = Column(Date, nullable=True) + coverage_end = Column(Date, nullable=True) + agency_supplied = Column(Boolean, nullable=True) + agency_originated = Column(Boolean, nullable=True) + agency_aggregation: Mapped[AgencyAggregationEnum] = enum_column(AgencyAggregationEnum, name="agency_aggregation_enum") + agency_described_not_in_database = 
Column(String, nullable=True) + update_method: Mapped[UpdateMethodEnum] = enum_column(UpdateMethodEnum, name="update_method_enum") + readme_url = Column(String, nullable=True) + originating_entity = Column(String, nullable=True) + retention_schedule: Mapped[RetentionScheduleEnum] = enum_column(RetentionScheduleEnum, name="retention_schedule_enum") + scraper_url = Column(String, nullable=True) + submission_notes = Column(String, nullable=True) + access_notes = Column(String, nullable=True) + access_types: Mapped[list[AccessTypeEnum]] = Column(ARRAY( + Enum( + AccessTypeEnum, + name="access_type_enum", + native_enum=True, + values_callable=lambda AccessTypeEnum: [e.value for e in AccessTypeEnum] + ) + ), nullable=True) + + # Relationships + url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 0ae843b3..31d6c7f9 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -5,8 +5,6 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus -from src.core.enums import BatchStatus -from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -14,7 +12,7 @@ from src.db.models.impl.task.core import Task from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType diff --git a/tests/automated/integration/api/agencies/test_core.py b/tests/automated/integration/api/agencies/test_core.py index cc2ddefc..a986cacc 100644 --- a/tests/automated/integration/api/agencies/test_core.py +++ b/tests/automated/integration/api/agencies/test_core.py @@ -28,12 +28,12 @@ async def test_agencies( ).model_dump(mode="json") ) - agency: Agency = await ath.adb_client().one_or_none(model=Agency) + agency: Agency = await ath.adb_client().one_or_none_model(model=Agency) assert agency.name == "Test Agency" assert agency.agency_type == AgencyType.LAW_ENFORCEMENT assert agency.jurisdiction_type == JurisdictionType.STATE - link: LinkAgencyLocation = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is not None assert link.agency_id == agency.agency_id assert link.location_id == california.location_id @@ -42,14 +42,14 @@ async def test_agencies( url=f"/agencies/{agency.agency_id}/locations/{california.location_id}", ) - link: LinkAgencyLocation | None = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + link: LinkAgencyLocation | None = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is None rv.post_v3( url=f"/agencies/{agency.agency_id}/locations/{pennsylvania.location_id}", ) - link: LinkAgencyLocation = await ath.adb_client().one_or_none(model=LinkAgencyLocation) + link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is not None assert link.agency_id == agency.agency_id assert 
link.location_id == pennsylvania.location_id @@ -61,7 +61,7 @@ async def test_agencies( ).model_dump(mode="json") ) - agency: Agency = await ath.adb_client().one_or_none(model=Agency) + agency: Agency = await ath.adb_client().one_or_none_model(model=Agency) assert agency.name == "Test Agency Updated" assert agency.agency_type == AgencyType.LAW_ENFORCEMENT assert agency.jurisdiction_type == JurisdictionType.STATE @@ -71,5 +71,5 @@ async def test_agencies( url=f"/agencies/{agency.agency_id}", ) - agency: Agency | None = await ath.adb_client().one_or_none(model=Agency) + agency: Agency | None = await ath.adb_client().one_or_none_model(model=Agency) assert agency is None diff --git a/tests/automated/integration/api/submit/data_source/__init__.py b/tests/automated/integration/api/submit/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py new file mode 100644 index 00000000..6b8a9022 --- /dev/null +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -0,0 +1,118 @@ +from datetime import date + +import pytest + +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_submit_data_source( + api_test_helper: APITestHelper, + test_agency_id: int, + pittsburgh_locality: LocalityCreationInfo, +): + ath = api_test_helper + ath.request_validator.post_v3( + url="submit/data-source", + json=DataSourceSubmissionRequest( + source_url="https://example.com/", + name="Example name", + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + coverage_start=date(year=2025, month=8, day=9), + coverage_end=date(year=2025, month=8, day=10), + supplying_entity="Test supplying entity", + agency_supplied=True, + agency_originated=False, + agency_aggregation=AgencyAggregationEnum.STATE, + agency_described_not_in_database="Test agency described not in database", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://example.com/readme", + originating_entity="Test Originating Entity", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://example.com/scraper", + submission_notes="Test submission notes", + data_portal="Test data portal", + access_notes="Test access notes", + access_types=[ + AccessTypeEnum.API, + AccessTypeEnum.DOWNLOAD, + AccessTypeEnum.WEBPAGE + ], + record_formats=[ + "Test record format", + "Test record format 2" + ], + + agency_ids=[test_agency_id], + location_ids=[pittsburgh_locality.location_id] + + 
).model_dump(mode='json') + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + # Check URL + url: URL = await adb_client.one_or_none_model(URL) + assert url is not None + assert url.url == "example.com" + assert url.scheme == "https" + assert url.trailing_slash == True + assert url.source == URLSource.MANUAL + assert url.status == URLStatus.OK + + # Check for Location Suggestion + location_suggestion: AnonymousAnnotationLocation = await adb_client.one_or_none_model(AnonymousAnnotationLocation) + assert location_suggestion is not None + assert location_suggestion.location_id == pittsburgh_locality.location_id + + # Check for Agency Suggestion + agency_suggestion: AnonymousAnnotationAgency = await adb_client.one_or_none_model(AnonymousAnnotationAgency) + assert agency_suggestion is not None + assert agency_suggestion.agency_id == test_agency_id + + # Check for Name Suggestion + name_suggestion: URLNameSuggestion = await adb_client.one_or_none_model(URLNameSuggestion) + assert name_suggestion is not None + assert name_suggestion.suggestion == "Example name" + + # Check for URL DS Optional Metadata + optional_ds: URLOptionalDataSourceMetadata = await adb_client.one_or_none_model(URLOptionalDataSourceMetadata) + assert optional_ds is not None + assert optional_ds.coverage_start == date(year=2025, month=8, day=9) + assert optional_ds.coverage_end == date(year=2025, month=8, day=10) + assert optional_ds.supplying_entity == "Test supplying entity" + assert optional_ds.agency_supplied + assert not optional_ds.agency_originated + assert optional_ds.agency_aggregation == AgencyAggregationEnum.STATE + assert optional_ds.agency_described_not_in_database == "Test agency described not in database" + assert optional_ds.data_portal == "Test data portal" + assert optional_ds.update_method == UpdateMethodEnum.NO_UPDATES + assert optional_ds.readme_url == "https://example.com/readme" + assert optional_ds.originating_entity == "Test Originating Entity" + assert optional_ds.retention_schedule == RetentionScheduleEnum.GT_10_YEARS + assert optional_ds.scraper_url == "https://example.com/scraper" + assert optional_ds.submission_notes == "Test submission notes" + assert optional_ds.access_notes == "Test access notes" + assert optional_ds.access_types == [ + AccessTypeEnum.API, + AccessTypeEnum.DOWNLOAD, + AccessTypeEnum.WEBPAGE + ] + assert optional_ds.record_formats == [ + "Test record format", + "Test record format 2" + ] + diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9be80c25..9e52d358 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -3,7 +3,7 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index cca17136..b4466424 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -159,3 +159,11 @@ async def api_test_helper( ), ) 
await client.app.state.async_core.collector_manager.logger.clear_log_queue() + +@pytest_asyncio.fixture +async def test_agency_id( + db_data_creator: DBDataCreator +) -> int: + return await db_data_creator.agency( + name="Test Agency" + ) \ No newline at end of file diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index c9eb62b1..f090a4ea 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -6,7 +6,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/tasks/url/impl/probe/check/manager.py b/tests/automated/integration/tasks/url/impl/probe/check/manager.py index a8d89ba5..200f428a 100644 --- a/tests/automated/integration/tasks/url/impl/probe/check/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/check/manager.py @@ -20,7 +20,9 @@ async def check_url( url_id: int, expected_status: URLStatus ): - url: URL = await self.adb_client.one_or_none(select(URL).where(URL.id == url_id)) + url: URL = await self.adb_client.one_or_none( + statement=select(URL).where(URL.id == url_id) + ) assert url is not None assert url.status == expected_status diff --git a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index 0af83bff..93878562 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -3,7 +3,7 @@ import pytest from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome From 20568a420e5d9959641351e7d424abde5acccbc1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 08:16:11 -0400 Subject: [PATCH 10/84] Fix tests --- ..._1620-6adf9d894180_enable_data_source_agency_submission.py | 1 - src/api/endpoints/submit/data_source/query.py | 2 +- src/api/endpoints/submit/data_source/request.py | 2 +- src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py | 2 +- .../automated/integration/api/submit/data_source/test_core.py | 4 ++-- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py index d1a72f7e..78e88dc0 100644 --- 
a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py @@ -82,7 +82,6 @@ def _add_new_columns_to_optional_ds_metadata(): sa.Column('scraper_url', sa.Text(), nullable=True), sa.Column('submission_notes', sa.Text(), nullable=True), sa.Column('access_notes', sa.Text(), nullable=True), - sa.Column('data_portal', sa.Text(), nullable=True), sa.Column('access_types', ARRAY( access_type_enum ), nullable=True), diff --git a/src/api/endpoints/submit/data_source/query.py b/src/api/endpoints/submit/data_source/query.py index 2d3d685b..405390d3 100644 --- a/src/api/endpoints/submit/data_source/query.py +++ b/src/api/endpoints/submit/data_source/query.py @@ -89,7 +89,7 @@ async def run(self, session: AsyncSession) -> Any: agency_originated=self.request.agency_originated, agency_aggregation=self.request.agency_aggregation, agency_described_not_in_database=self.request.agency_described_not_in_database, - data_portal=self.request.data_portal, + data_portal_type=self.request.data_portal_type, update_method=self.request.update_method, readme_url=self.request.readme_url, originating_entity=self.request.originating_entity, diff --git a/src/api/endpoints/submit/data_source/request.py b/src/api/endpoints/submit/data_source/request.py index 011ebe2a..fc649f74 100644 --- a/src/api/endpoints/submit/data_source/request.py +++ b/src/api/endpoints/submit/data_source/request.py @@ -21,13 +21,13 @@ class DataSourceSubmissionRequest(BaseModel): agency_originated: bool | None = None agency_aggregation: AgencyAggregationEnum | None = None agency_described_not_in_database: str | None = None + data_portal_type: str | None = None update_method: UpdateMethodEnum | None = None readme_url: str | None = None originating_entity: str | None = None retention_schedule: RetentionScheduleEnum | None = None scraper_url: str | None = None submission_notes: str | None = None - data_portal: str | None = None access_notes: str | None = None access_types: list[AccessTypeEnum] = [] record_formats: list[str] = [] diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 37d4638d..3f6e239b 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -12,7 +12,7 @@ class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) - data_portal = Column(String, nullable=True) + data_portal_type = Column(String, nullable=True) supplying_entity = Column(String, nullable=True) coverage_start = Column(Date, nullable=True) coverage_end = Column(Date, nullable=True) diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py index 6b8a9022..566ff60a 100644 --- a/tests/automated/integration/api/submit/data_source/test_core.py +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -44,7 +44,7 @@ async def test_submit_data_source( retention_schedule=RetentionScheduleEnum.GT_10_YEARS, scraper_url="https://example.com/scraper", submission_notes="Test submission notes", - data_portal="Test data portal", + data_portal_type="Test data portal", access_notes="Test access notes", access_types=[ AccessTypeEnum.API, @@ -98,7 +98,7 @@ async def test_submit_data_source( assert not 
optional_ds.agency_originated assert optional_ds.agency_aggregation == AgencyAggregationEnum.STATE assert optional_ds.agency_described_not_in_database == "Test agency described not in database" - assert optional_ds.data_portal == "Test data portal" + assert optional_ds.data_portal_type == "Test data portal" assert optional_ds.update_method == UpdateMethodEnum.NO_UPDATES assert optional_ds.readme_url == "https://example.com/readme" assert optional_ds.originating_entity == "Test Originating Entity" From 1fe2235e6cefee4d76b347165c3c93b5f78e2217 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 08:20:01 -0400 Subject: [PATCH 11/84] Merge from dev --- ...20_1620-6adf9d894180_enable_data_source_agency_submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py index 78e88dc0..ffe51314 100644 --- a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py @@ -1,7 +1,7 @@ """Enable data source/agency submission Revision ID: 6adf9d894180 -Revises: 7fc6502f1fa3 +Revises: 9d57b3b79d35 Create Date: 2025-10-20 16:20:44.081736 """ From 08c0c4a2410c6f5e831856fe571045978546f5b2 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 08:24:57 -0400 Subject: [PATCH 12/84] Fix alembic migration bug --- ...20_1620-6adf9d894180_enable_data_source_agency_submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py index ffe51314..c45f4f28 100644 --- a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py @@ -14,7 +14,7 @@ # revision identifiers, used by Alembic. 
revision: str = '6adf9d894180' -down_revision: Union[str, None] = '7fc6502f1fa3' +down_revision: Union[str, None] = '9d57b3b79d35' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None From 4fd01c2e8a4d5326b2f8d49482e2844075f63e39 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 08:54:20 -0400 Subject: [PATCH 13/84] Have Request Models forbid extra parameters --- .../agencies/by_id/delete/request.py | 0 .../endpoints/agencies/by_id/put/request.py | 5 +- .../endpoints/agencies/root/post/request.py | 5 +- src/api/endpoints/annotate/agency/post/dto.py | 4 +- .../annotate/all/post/models/request.py | 6 +- .../collector/dtos/manual_batch/post.py | 3 +- .../endpoints/submit/data_source/request.py | 5 +- .../endpoints/submit/url/models/request.py | 3 +- src/api/endpoints/suggest/__init__.py | 0 src/api/endpoints/suggest/routes.py | 19 ----- src/api/endpoints/suggest/url/__init__.py | 0 .../endpoints/suggest/url/models/__init__.py | 0 .../endpoints/suggest/url/models/request.py | 13 ---- .../suggest/url/models/response/__init__.py | 0 .../suggest/url/models/response/enums.py | 7 -- .../suggest/url/models/response/model.py | 9 --- .../endpoints/suggest/url/queries/__init__.py | 0 src/api/endpoints/suggest/url/queries/core.py | 73 ------------------- src/api/endpoints/suggest/url/wrapper.py | 24 ------ src/api/shared/models/request_base.py | 8 ++ 20 files changed, 24 insertions(+), 160 deletions(-) delete mode 100644 src/api/endpoints/agencies/by_id/delete/request.py delete mode 100644 src/api/endpoints/suggest/__init__.py delete mode 100644 src/api/endpoints/suggest/routes.py delete mode 100644 src/api/endpoints/suggest/url/__init__.py delete mode 100644 src/api/endpoints/suggest/url/models/__init__.py delete mode 100644 src/api/endpoints/suggest/url/models/request.py delete mode 100644 src/api/endpoints/suggest/url/models/response/__init__.py delete mode 100644 src/api/endpoints/suggest/url/models/response/enums.py delete mode 100644 src/api/endpoints/suggest/url/models/response/model.py delete mode 100644 src/api/endpoints/suggest/url/queries/__init__.py delete mode 100644 src/api/endpoints/suggest/url/queries/core.py delete mode 100644 src/api/endpoints/suggest/url/wrapper.py create mode 100644 src/api/shared/models/request_base.py diff --git a/src/api/endpoints/agencies/by_id/delete/request.py b/src/api/endpoints/agencies/by_id/delete/request.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/agencies/by_id/put/request.py b/src/api/endpoints/agencies/by_id/put/request.py index b485e43c..8d1457fb 100644 --- a/src/api/endpoints/agencies/by_id/put/request.py +++ b/src/api/endpoints/agencies/by_id/put/request.py @@ -1,9 +1,8 @@ -from pydantic import BaseModel - +from src.api.shared.models.request_base import RequestBase from src.db.models.impl.agency.enums import AgencyType, JurisdictionType -class AgencyPutRequest(BaseModel): +class AgencyPutRequest(RequestBase): name: str | None = None type: AgencyType | None = None jurisdiction_type: JurisdictionType | None = None diff --git a/src/api/endpoints/agencies/root/post/request.py b/src/api/endpoints/agencies/root/post/request.py index 6d95eaf2..009c863c 100644 --- a/src/api/endpoints/agencies/root/post/request.py +++ b/src/api/endpoints/agencies/root/post/request.py @@ -1,9 +1,8 @@ -from pydantic import BaseModel - +from src.api.shared.models.request_base import RequestBase from src.db.models.impl.agency.enums import AgencyType, JurisdictionType -class 
AgencyPostRequest(BaseModel): +class AgencyPostRequest(RequestBase): name: str type: AgencyType jurisdiction_type: JurisdictionType diff --git a/src/api/endpoints/annotate/agency/post/dto.py b/src/api/endpoints/annotate/agency/post/dto.py index dc41720a..1a13f073 100644 --- a/src/api/endpoints/annotate/agency/post/dto.py +++ b/src/api/endpoints/annotate/agency/post/dto.py @@ -2,7 +2,9 @@ from pydantic import BaseModel +from src.api.shared.models.request_base import RequestBase -class URLAgencyAnnotationPostInfo(BaseModel): + +class URLAgencyAnnotationPostInfo(RequestBase): is_new: bool = False suggested_agency: int | None = None diff --git a/src/api/endpoints/annotate/all/post/models/request.py b/src/api/endpoints/annotate/all/post/models/request.py index 8de222de..32228bac 100644 --- a/src/api/endpoints/annotate/all/post/models/request.py +++ b/src/api/endpoints/annotate/all/post/models/request.py @@ -1,15 +1,15 @@ -from pydantic import BaseModel, model_validator, ConfigDict +from pydantic import model_validator from src.api.endpoints.annotate.all.post.models.agency import AnnotationPostAgencyInfo from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType from src.core.exceptions import FailedValidationException from src.db.models.impl.flag.url_validated.enums import URLType -class AllAnnotationPostInfo(BaseModel): - model_config = ConfigDict(extra='forbid') +class AllAnnotationPostInfo(RequestBase): suggested_status: URLType record_type: RecordType | None = None diff --git a/src/api/endpoints/collector/dtos/manual_batch/post.py b/src/api/endpoints/collector/dtos/manual_batch/post.py index 6ec62579..ce00e40b 100644 --- a/src/api/endpoints/collector/dtos/manual_batch/post.py +++ b/src/api/endpoints/collector/dtos/manual_batch/post.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, Field +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType @@ -16,7 +17,7 @@ class ManualBatchInnerInputDTO(BaseModel): supplying_entity: str | None = None -class ManualBatchInputDTO(BaseModel): +class ManualBatchInputDTO(RequestBase): name: str entries: list[ManualBatchInnerInputDTO] = Field( min_length=1, diff --git a/src/api/endpoints/submit/data_source/request.py b/src/api/endpoints/submit/data_source/request.py index fc649f74..409fe254 100644 --- a/src/api/endpoints/submit/data_source/request.py +++ b/src/api/endpoints/submit/data_source/request.py @@ -1,13 +1,12 @@ from datetime import date -from pydantic import BaseModel - +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ RetentionScheduleEnum, AccessTypeEnum -class DataSourceSubmissionRequest(BaseModel): +class DataSourceSubmissionRequest(RequestBase): # Required name: str record_type: RecordType diff --git a/src/api/endpoints/submit/url/models/request.py b/src/api/endpoints/submit/url/models/request.py index 5b52d761..34ec9df9 100644 --- a/src/api/endpoints/submit/url/models/request.py +++ b/src/api/endpoints/submit/url/models/request.py @@ -1,9 +1,10 @@ from pydantic import BaseModel +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType -class URLSubmissionRequest(BaseModel): +class 
URLSubmissionRequest(RequestBase): url: str record_type: RecordType | None = None name: str | None = None diff --git a/src/api/endpoints/suggest/__init__.py b/src/api/endpoints/suggest/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/suggest/routes.py b/src/api/endpoints/suggest/routes.py deleted file mode 100644 index 8caeb8ac..00000000 --- a/src/api/endpoints/suggest/routes.py +++ /dev/null @@ -1,19 +0,0 @@ -from fastapi import APIRouter, Depends - -from src.api.dependencies import get_async_core -from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest -from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse -from src.api.endpoints.suggest.url.wrapper import suggest_url_wrapper -from src.core.core import AsyncCore - -suggest_router = APIRouter(prefix="/suggest", tags=["suggest"]) - -@suggest_router.post("/url") -async def suggest_url( - request: URLSuggestionRequest, - async_core: AsyncCore = Depends(get_async_core), -) -> URLSuggestResponse: - return await suggest_url_wrapper( - request=request, - adb_client=async_core.adb_client, - ) diff --git a/src/api/endpoints/suggest/url/__init__.py b/src/api/endpoints/suggest/url/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/suggest/url/models/__init__.py b/src/api/endpoints/suggest/url/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/suggest/url/models/request.py b/src/api/endpoints/suggest/url/models/request.py deleted file mode 100644 index 4deec1d5..00000000 --- a/src/api/endpoints/suggest/url/models/request.py +++ /dev/null @@ -1,13 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from src.db.models.impl.flag.url_validated.enums import URLType - - -class URLSuggestionRequest(BaseModel): - url: str - url_type: URLType | None = None - record_type: RecordType | None = None - agency_ids: list[int] = [] - location_ids: list[int] = [] - name: str | None = None \ No newline at end of file diff --git a/src/api/endpoints/suggest/url/models/response/__init__.py b/src/api/endpoints/suggest/url/models/response/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/suggest/url/models/response/enums.py b/src/api/endpoints/suggest/url/models/response/enums.py deleted file mode 100644 index 337d759a..00000000 --- a/src/api/endpoints/suggest/url/models/response/enums.py +++ /dev/null @@ -1,7 +0,0 @@ -from enum import Enum - - -class URLSuggestResultEnum(Enum): - ACCEPTED = "accepted" - ACCEPTED_WITH_ERRORS = "accepted_with_errors" - DUPLICATE = "duplicate" diff --git a/src/api/endpoints/suggest/url/models/response/model.py b/src/api/endpoints/suggest/url/models/response/model.py deleted file mode 100644 index 091734bb..00000000 --- a/src/api/endpoints/suggest/url/models/response/model.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from src.api.endpoints.suggest.url.models.response.enums import URLSuggestResultEnum - - -class URLSuggestResponse(BaseModel): - result: URLSuggestResultEnum - url_id: int | None - msg: str \ No newline at end of file diff --git a/src/api/endpoints/suggest/url/queries/__init__.py b/src/api/endpoints/suggest/url/queries/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/api/endpoints/suggest/url/queries/core.py b/src/api/endpoints/suggest/url/queries/core.py deleted file mode 100644 index 77b90128..00000000 --- 
a/src/api/endpoints/suggest/url/queries/core.py +++ /dev/null @@ -1,73 +0,0 @@ -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest -from src.api.endpoints.suggest.url.models.response.enums import URLSuggestResultEnum -from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.urls_exist.model import URLExistsResult -from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder -from src.db.queries.urls_exist.requester import URLSuggestRequester -from src.util.models.full_url import FullURL - - -class URLSuggestQueryBuilder(QueryBuilderBase): - - def __init__( - self, - request: URLSuggestionRequest - ): - super().__init__() - self.request = request - - async def run(self, session: AsyncSession) -> URLSuggestResponse: - # Clean URL - full_url = FullURL(self.request.url) - - # Check if already exists in database - url_exists_result: URLExistsResult = (await URLsExistInDBQueryBuilder( - [full_url] - ).run(session))[0] - if url_exists_result.url_id is not None: - return URLSuggestResponse( - url_id=url_exists_result.url_id, - result=URLSuggestResultEnum.DUPLICATE, - msg=f"URL Already Exists In Database with ID {url_exists_result.url_id}" - ) - - # Add URL - url = URL( - scheme=full_url.scheme, - url=full_url.id_form, - trailing_slash=full_url.has_trailing_slash, - ) - session.add(url) - await session.flush() - url_id: int = url.id - - try: - requester = URLSuggestRequester(session=session, url_id=url_id) - - # Optionally add other annotations - await requester.optionally_add_url_type_suggestion(self.request.url_type) - - await requester.optionally_add_record_type_suggestion(self.request.record_type) - - await requester.optionally_add_agency_id_suggestions(self.request.agency_ids) - - await requester.optionally_add_name_suggestion(self.request.name) - - # If cleaned URL matches original URL, return as ACCEPTED - return URLSuggestResponse( - url_id=url_id, - result=URLSuggestResultEnum.ACCEPTED, - msg="URL was accepted" - ) - - except Exception as e: - return URLSuggestResponse( - url_id=url_id, - result=URLSuggestResultEnum.ACCEPTED_WITH_ERRORS, - msg=f"The URL was accepted, but there were errors in adding provided annotations: {e}" - ) - diff --git a/src/api/endpoints/suggest/url/wrapper.py b/src/api/endpoints/suggest/url/wrapper.py deleted file mode 100644 index 7927db25..00000000 --- a/src/api/endpoints/suggest/url/wrapper.py +++ /dev/null @@ -1,24 +0,0 @@ -from http import HTTPStatus - -from fastapi import HTTPException - -from src.api.endpoints.suggest.url.models.request import URLSuggestionRequest -from src.api.endpoints.suggest.url.models.response.model import URLSuggestResponse -from src.api.endpoints.suggest.url.queries.core import URLSuggestQueryBuilder -from src.db.client.async_ import AsyncDatabaseClient -from src.util.url import is_valid_url - - -async def suggest_url_wrapper( - request: URLSuggestionRequest, - adb_client: AsyncDatabaseClient, -) -> URLSuggestResponse: - if not is_valid_url(request.url): - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST, - detail="Invalid URL" - ) - - return await adb_client.run_query_builder( - URLSuggestQueryBuilder(request) - ) \ No newline at end of file diff --git a/src/api/shared/models/request_base.py b/src/api/shared/models/request_base.py new file mode 100644 index 00000000..816cc226 --- 
/dev/null +++ b/src/api/shared/models/request_base.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel, ConfigDict + + +class RequestBase(BaseModel): + model_config = ConfigDict( + extra="forbid", + frozen=True + ) \ No newline at end of file From f24f6c4d74a153b21b87ec915b3788ae1de3c21d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 21 Oct 2025 11:25:58 -0400 Subject: [PATCH 14/84] Add linking to batch logic, remove required user id for batches --- ...4e9f_set_batches_user_id_to_be_nullable.py | 30 +++++++++++++++++++ src/api/endpoints/submit/data_source/query.py | 20 +++++++++++++ src/db/models/impl/batch/sqlalchemy.py | 7 +++-- .../integration/api/batch/test_batch.py | 5 +--- .../api/submit/data_source/test_core.py | 18 ++++++++++- 5 files changed, 72 insertions(+), 8 deletions(-) create mode 100644 alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py diff --git a/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py b/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py new file mode 100644 index 00000000..d6076e7a --- /dev/null +++ b/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py @@ -0,0 +1,30 @@ +"""Set batches.user_id to be nullable + +Revision ID: f32ba7664e9f +Revises: 6adf9d894180 +Create Date: 2025-10-21 11:23:35.611484 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f32ba7664e9f' +down_revision: Union[str, None] = '6adf9d894180' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + table_name='batches', + column_name='user_id', + nullable=True + ) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/submit/data_source/query.py b/src/api/endpoints/submit/data_source/query.py index 405390d3..6d7360f5 100644 --- a/src/api/endpoints/submit/data_source/query.py +++ b/src/api/endpoints/submit/data_source/query.py @@ -5,6 +5,9 @@ from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest from src.api.endpoints.submit.data_source.response import SubmitDataSourceURLProposalResponse from src.collectors.enums import URLStatus +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata @@ -40,6 +43,23 @@ async def run(self, session: AsyncSession) -> Any: url_id: int = url.id + # Add Batch + batch = Batch( + strategy='manual', + status=BatchStatus.READY_TO_LABEL, + parameters={} + ) + session.add(batch) + await session.flush() + batch_id: int = batch.id + + # Add Batch URL link + batch_url_link = LinkBatchURL( + batch_id=batch_id, + url_id=url_id + ) + session.add(batch_url_link) + # Optionally add Record Type as suggestion if self.request.record_type is not None: record_type_suggestion = AnonymousAnnotationRecordType( diff --git a/src/db/models/impl/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py index 564ce163..fb44396b 100644 --- a/src/db/models/impl/batch/sqlalchemy.py +++ b/src/db/models/impl/batch/sqlalchemy.py @@ -1,7 +1,8 @@ from sqlalchemy import Column, Integer, TIMESTAMP, Float, 
JSON from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped +from src.core.enums import BatchStatus from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT from src.db.models.impl.log.sqlalchemy import Log from src.db.models.templates_.with_id import WithIDBase @@ -23,9 +24,9 @@ class Batch(WithIDBase): 'manual', name='batch_strategy'), nullable=False) - user_id = Column(Integer, nullable=False) + user_id = Column(Integer, nullable=True) # Gives the status of the batch - status = Column( + status: Mapped[BatchStatus] = Column( batch_status_enum, nullable=False ) diff --git a/tests/automated/integration/api/batch/test_batch.py b/tests/automated/integration/api/batch/test_batch.py index f1e3d4f2..f34928d6 100644 --- a/tests/automated/integration/api/batch/test_batch.py +++ b/tests/automated/integration/api/batch/test_batch.py @@ -1,8 +1,5 @@ -from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.impl.example.dtos.input import ExampleInputDTO -from src.core.enums import BatchStatus + def test_get_batch_urls(api_test_helper): diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py index 566ff60a..49df1dd4 100644 --- a/tests/automated/integration/api/submit/data_source/test_core.py +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -4,8 +4,10 @@ from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest from src.collectors.enums import URLStatus -from src.core.enums import RecordType +from src.core.enums import RecordType, BatchStatus from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ @@ -73,6 +75,20 @@ async def test_submit_data_source( assert url.source == URLSource.MANUAL assert url.status == URLStatus.OK + # Check for Batch + batch: Batch = await adb_client.one_or_none_model(Batch) + assert batch is not None + assert batch.user_id is None + assert batch.strategy == 'manual' + assert batch.status == BatchStatus.READY_TO_LABEL.value + assert batch.parameters == {} + + # Check for Batch URL Link + batch_url_link: LinkBatchURL = await adb_client.one_or_none_model(LinkBatchURL) + assert batch_url_link is not None + assert batch_url_link.batch_id == batch.id + assert batch_url_link.url_id == url.id + # Check for Location Suggestion location_suggestion: AnonymousAnnotationLocation = await adb_client.one_or_none_model(AnonymousAnnotationLocation) assert location_suggestion is not None From 83d458e3b41ce9b870df016c98bc0bd455aa34be Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 24 Oct 2025 08:26:00 -0400 Subject: [PATCH 15/84] Add linking to batch logic, remove required user id for batches --- src/api/endpoints/data_source/__init__.py | 0 src/api/endpoints/data_source/get/__init__.py | 0 src/api/endpoints/data_source/get/query.py | 0 src/api/endpoints/data_source/get/response.py | 4 +++ src/api/endpoints/data_source/put/__init__.py | 0 src/api/endpoints/data_source/put/query.py | 0 
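The submission flow changed in the commit above now wraps each manually submitted data source in a Batch and ties the URL to it through LinkBatchURL, which is what the new test asserts. A condensed sketch of that pattern using the project's own models (the helper function name is illustrative, not code from the patch):

from sqlalchemy.ext.asyncio import AsyncSession

from src.core.enums import BatchStatus
from src.db.models.impl.batch.sqlalchemy import Batch
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL


async def link_url_to_manual_batch(session: AsyncSession, url_id: int) -> int:
    # user_id stays None, which is why the migration makes batches.user_id nullable.
    batch = Batch(strategy="manual", status=BatchStatus.READY_TO_LABEL, parameters={})
    session.add(batch)
    await session.flush()  # populates batch.id without committing the transaction
    session.add(LinkBatchURL(batch_id=batch.id, url_id=url_id))
    return batch.id
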
src/api/endpoints/data_source/put/request.py | 0 src/api/endpoints/data_source/routes.py | 34 +++++++++++++++++++ src/api/endpoints/meta_url/__init__.py | 0 src/api/endpoints/meta_url/get/__init__.py | 0 src/api/endpoints/meta_url/get/query.py | 0 src/api/endpoints/meta_url/get/response.py | 0 src/api/endpoints/meta_url/put/__init__.py | 0 src/api/endpoints/meta_url/put/query.py | 0 src/api/endpoints/meta_url/put/request.py | 0 src/api/endpoints/meta_url/routes.py | 34 +++++++++++++++++++ src/api/main.py | 4 ++- 17 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 src/api/endpoints/data_source/__init__.py create mode 100644 src/api/endpoints/data_source/get/__init__.py create mode 100644 src/api/endpoints/data_source/get/query.py create mode 100644 src/api/endpoints/data_source/get/response.py create mode 100644 src/api/endpoints/data_source/put/__init__.py create mode 100644 src/api/endpoints/data_source/put/query.py create mode 100644 src/api/endpoints/data_source/put/request.py create mode 100644 src/api/endpoints/data_source/routes.py create mode 100644 src/api/endpoints/meta_url/__init__.py create mode 100644 src/api/endpoints/meta_url/get/__init__.py create mode 100644 src/api/endpoints/meta_url/get/query.py create mode 100644 src/api/endpoints/meta_url/get/response.py create mode 100644 src/api/endpoints/meta_url/put/__init__.py create mode 100644 src/api/endpoints/meta_url/put/query.py create mode 100644 src/api/endpoints/meta_url/put/request.py create mode 100644 src/api/endpoints/meta_url/routes.py diff --git a/src/api/endpoints/data_source/__init__.py b/src/api/endpoints/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/get/__init__.py b/src/api/endpoints/data_source/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/get/response.py b/src/api/endpoints/data_source/get/response.py new file mode 100644 index 00000000..51134ffc --- /dev/null +++ b/src/api/endpoints/data_source/get/response.py @@ -0,0 +1,4 @@ +from pydantic import BaseModel + + +class DataSourceGetResponse(BaseModel): diff --git a/src/api/endpoints/data_source/put/__init__.py b/src/api/endpoints/data_source/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/put/query.py b/src/api/endpoints/data_source/put/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/put/request.py b/src/api/endpoints/data_source/put/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/routes.py b/src/api/endpoints/data_source/routes.py new file mode 100644 index 00000000..770013f6 --- /dev/null +++ b/src/api/endpoints/data_source/routes.py @@ -0,0 +1,34 @@ +from fastapi import APIRouter, Depends, Query + +from src.api.dependencies import get_async_core +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +data_source_router = APIRouter( + prefix="/data-source", + tags=["data-source"] +) + + +@data_source_router.get("") +async def get_data_sources( + async_core: AsyncCore = Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> 
list[DataSourceGetResponse]: + return await async_core.adb_client.run_query_builder( + GetDataSourceQueryBuilder(page=page) + ) + +@data_source_router.put("/{data_source_id}") +async def update_data_source( + data_source_id: int, + async_core: AsyncCore = Depends(get_async_core), + request: DataSourceUpdateRequest, +) -> MessageResponse: + return await async_core.adb_client.run_query_builder( + UpdateDataSourceQueryBuilder(data_source_id=data_source_id, data_source_update=data_source_update) + ) diff --git a/src/api/endpoints/meta_url/__init__.py b/src/api/endpoints/meta_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/get/__init__.py b/src/api/endpoints/meta_url/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/get/query.py b/src/api/endpoints/meta_url/get/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/get/response.py b/src/api/endpoints/meta_url/get/response.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/put/__init__.py b/src/api/endpoints/meta_url/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/put/query.py b/src/api/endpoints/meta_url/put/query.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/put/request.py b/src/api/endpoints/meta_url/put/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/routes.py b/src/api/endpoints/meta_url/routes.py new file mode 100644 index 00000000..5d3f2d76 --- /dev/null +++ b/src/api/endpoints/meta_url/routes.py @@ -0,0 +1,34 @@ +from fastapi import APIRouter, Depends, Query + +from src.api.dependencies import get_async_core +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +meta_url_router = APIRouter( + prefix="/meta-url", + tags=["meta-url"] +) + + +@meta_url_router.get("") +async def get_meta_urls( + async_core: AsyncCore = Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> MetaURLGetResponse: + return await async_core.adb_client.run_query_builder(GetMetaURLQueryBuilder()) + + +@meta_url_router.put("/{meta_url_id}") +async def update_meta_url( + meta_url_id: int, + async_core: AsyncCore = Depends(get_async_core), + request: MetaURLUpdateRequest, +) -> MessageResponse: + return await async_core.adb_client.run_query_builder( + UpdateMetaURLQueryBuilder(meta_url_id=meta_url_id, meta_url_update=meta_url_update) + ) + + diff --git a/src/api/main.py b/src/api/main.py index 0026fda3..076b8108 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -12,6 +12,7 @@ from src.api.endpoints.batch.routes import batch_router from src.api.endpoints.collector.routes import collector_router from src.api.endpoints.contributions.routes import contributions_router +from src.api.endpoints.data_source.routes import data_source_router from src.api.endpoints.metrics.routes import metrics_router from src.api.endpoints.root import root_router from src.api.endpoints.search.routes import search_router @@ -179,7 +180,8 @@ async def redirect_docs(): metrics_router, submit_router, contributions_router, - agencies_router + agencies_router, + data_source_router ] for router in routers: From fcd183d06c5ed6b283a879c97793d4ea521c9846 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 25 Oct 2025 15:23:42 -0400 Subject: [PATCH 16/84] Add meta url and data source 
endpoints --- .../endpoints/agencies/root/get/response.py | 7 +- .../data_source/{put => by_id}/__init__.py | 0 .../by_id/agency}/__init__.py | 0 .../agency/delete/__init__.py} | 0 .../by_id/agency/delete/wrapper.py | 17 ++ .../agency/get/__init__.py} | 0 .../data_source/by_id/agency/get/wrapper.py | 14 ++ .../by_id/agency/post/__init__.py} | 0 .../data_source/by_id/agency/post/wrapper.py | 17 ++ .../by_id/agency/shared/__init__.py} | 0 .../data_source/by_id/agency/shared/check.py | 17 ++ .../data_source/by_id/put/__init__.py | 0 .../endpoints/data_source/by_id/put/query.py | 123 +++++++++++++ .../data_source/by_id/put/request.py | 59 ++++++ src/api/endpoints/data_source/get/query.py | 155 ++++++++++++++++ src/api/endpoints/data_source/get/response.py | 39 ++++ src/api/endpoints/data_source/routes.py | 69 ++++++- src/api/endpoints/meta_url/by_id/__init__.py | 0 .../meta_url/by_id/agencies/__init__.py | 0 .../by_id/agencies/delete/__init__.py | 0 .../meta_url/by_id/agencies/delete/wrapper.py | 17 ++ .../meta_url/by_id/agencies/get/__init__.py | 0 .../meta_url/by_id/agencies/get/wrapper.py | 14 ++ .../meta_url/by_id/agencies/put/__init__.py | 0 .../meta_url/by_id/agencies/put/query.py | 47 +++++ .../meta_url/by_id/agencies/put/request.py | 10 + .../by_id/agencies/shared/__init__.py | 0 .../meta_url/by_id/agencies/shared/check.py | 17 ++ .../endpoints/meta_url/by_id/post/__init__.py | 0 .../endpoints/meta_url/by_id/post/wrapper.py | 17 ++ src/api/endpoints/meta_url/get/query.py | 83 +++++++++ src/api/endpoints/meta_url/get/response.py | 17 ++ src/api/endpoints/meta_url/routes.py | 72 ++++++-- src/api/main.py | 6 +- src/api/shared/agency/README.md | 1 + src/api/shared/agency/__init__.py | 0 src/api/shared/agency/delete/__init__.py | 0 src/api/shared/agency/delete/query.py | 29 +++ src/api/shared/agency/get/__init__.py | 0 src/api/shared/agency/get/query.py | 62 +++++++ src/api/shared/agency/get/response.py | 0 src/api/shared/agency/post/__init__.py | 0 src/api/shared/agency/post/query.py | 32 ++++ src/api/shared/batch/__init__.py | 0 src/api/shared/batch/url/__init__.py | 0 src/api/shared/batch/url/link.py | 36 ++++ src/api/shared/check/__init__.py | 0 src/api/shared/check/url_type/__init__.py | 0 src/api/shared/check/url_type/query.py | 58 ++++++ src/api/shared/record_type/__init__.py | 0 src/api/shared/record_type/put/__init__.py | 0 src/api/shared/record_type/put/query.py | 32 ++++ src/api/shared/url/__init__.py | 0 src/api/shared/url/put/__init__.py | 0 src/api/shared/url/put/query.py | 50 +++++ src/db/models/impl/url/core/sqlalchemy.py | 2 +- .../integration/api/data_sources/__init__.py | 0 .../api/data_sources/agencies/__init__.py | 0 .../data_sources/agencies/test_add_remove.py | 26 +++ .../agencies/test_invalid_type.py | 18 ++ .../api/data_sources/test_invalid_type.py | 20 ++ .../integration/api/data_sources/test_put.py | 89 +++++++++ .../integration/api/meta_urls/__init__.py | 0 .../api/meta_urls/agencies/__init__.py | 0 .../api/meta_urls/agencies/test_add_remove.py | 30 +++ .../meta_urls/agencies/test_invalid_type.py | 18 ++ .../api/meta_urls/test_invalid_type.py | 20 ++ .../integration/api/meta_urls/test_put.py | 39 ++++ tests/automated/integration/conftest.py | 27 ++- .../api/agencies/get/test_locations.py | 2 +- .../readonly/api/agencies/get/test_root.py | 2 +- .../readonly/api/data_sources/__init__.py | 0 .../api/data_sources/agencies/__init__.py | 0 .../api/data_sources/agencies/test_forbid.py | 13 ++ .../readonly/api/data_sources/test_get.py | 57 ++++++ 
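The request models moved onto RequestBase in the earlier "forbid extra parameters" commit now reject unknown fields and are immutable after validation. A minimal sketch of what that buys, assuming Pydantic v2 (the example model is illustrative, not one of the project's):

from pydantic import ValidationError

from src.api.shared.models.request_base import RequestBase


class ExamplePutRequest(RequestBase):
    # Illustrative model only; real request models live under src/api/endpoints.
    name: str | None = None


ExamplePutRequest(name="ok")  # accepted as before

try:
    # A typo'd or unexpected field is now rejected instead of silently ignored.
    ExamplePutRequest(name="ok", nmae="typo")
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # "extra_forbidden"

# frozen=True additionally prevents mutating fields after validation.
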
.../readonly/api/meta_urls/__init__.py | 0 .../api/meta_urls/agencies/__init__.py | 0 .../api/meta_urls/agencies/test_forbid.py | 15 ++ .../readonly/api/meta_urls/test_get.py | 30 +++ .../integration/readonly/conftest.py | 61 +------ .../automated/integration/readonly/helper.py | 18 ++ tests/automated/integration/readonly/setup.py | 171 ++++++++++++++++++ .../test_submit_approved_url_task.py | 1 - tests/helpers/awaitable_barrier.py | 13 -- tests/helpers/check.py | 20 ++ tests/helpers/patch_functions.py | 10 - 86 files changed, 1709 insertions(+), 110 deletions(-) rename src/api/endpoints/data_source/{put => by_id}/__init__.py (100%) rename src/api/endpoints/{meta_url/put => data_source/by_id/agency}/__init__.py (100%) rename src/api/endpoints/data_source/{put/query.py => by_id/agency/delete/__init__.py} (100%) create mode 100644 src/api/endpoints/data_source/by_id/agency/delete/wrapper.py rename src/api/endpoints/data_source/{put/request.py => by_id/agency/get/__init__.py} (100%) create mode 100644 src/api/endpoints/data_source/by_id/agency/get/wrapper.py rename src/api/endpoints/{meta_url/put/query.py => data_source/by_id/agency/post/__init__.py} (100%) create mode 100644 src/api/endpoints/data_source/by_id/agency/post/wrapper.py rename src/api/endpoints/{meta_url/put/request.py => data_source/by_id/agency/shared/__init__.py} (100%) create mode 100644 src/api/endpoints/data_source/by_id/agency/shared/check.py create mode 100644 src/api/endpoints/data_source/by_id/put/__init__.py create mode 100644 src/api/endpoints/data_source/by_id/put/query.py create mode 100644 src/api/endpoints/data_source/by_id/put/request.py create mode 100644 src/api/endpoints/meta_url/by_id/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/delete/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/get/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/put/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/put/query.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/put/request.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/shared/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/agencies/shared/check.py create mode 100644 src/api/endpoints/meta_url/by_id/post/__init__.py create mode 100644 src/api/endpoints/meta_url/by_id/post/wrapper.py create mode 100644 src/api/shared/agency/README.md create mode 100644 src/api/shared/agency/__init__.py create mode 100644 src/api/shared/agency/delete/__init__.py create mode 100644 src/api/shared/agency/delete/query.py create mode 100644 src/api/shared/agency/get/__init__.py create mode 100644 src/api/shared/agency/get/query.py create mode 100644 src/api/shared/agency/get/response.py create mode 100644 src/api/shared/agency/post/__init__.py create mode 100644 src/api/shared/agency/post/query.py create mode 100644 src/api/shared/batch/__init__.py create mode 100644 src/api/shared/batch/url/__init__.py create mode 100644 src/api/shared/batch/url/link.py create mode 100644 src/api/shared/check/__init__.py create mode 100644 src/api/shared/check/url_type/__init__.py create mode 100644 src/api/shared/check/url_type/query.py create mode 100644 src/api/shared/record_type/__init__.py create mode 100644 
src/api/shared/record_type/put/__init__.py create mode 100644 src/api/shared/record_type/put/query.py create mode 100644 src/api/shared/url/__init__.py create mode 100644 src/api/shared/url/put/__init__.py create mode 100644 src/api/shared/url/put/query.py create mode 100644 tests/automated/integration/api/data_sources/__init__.py create mode 100644 tests/automated/integration/api/data_sources/agencies/__init__.py create mode 100644 tests/automated/integration/api/data_sources/agencies/test_add_remove.py create mode 100644 tests/automated/integration/api/data_sources/agencies/test_invalid_type.py create mode 100644 tests/automated/integration/api/data_sources/test_invalid_type.py create mode 100644 tests/automated/integration/api/data_sources/test_put.py create mode 100644 tests/automated/integration/api/meta_urls/__init__.py create mode 100644 tests/automated/integration/api/meta_urls/agencies/__init__.py create mode 100644 tests/automated/integration/api/meta_urls/agencies/test_add_remove.py create mode 100644 tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py create mode 100644 tests/automated/integration/api/meta_urls/test_invalid_type.py create mode 100644 tests/automated/integration/api/meta_urls/test_put.py create mode 100644 tests/automated/integration/readonly/api/data_sources/__init__.py create mode 100644 tests/automated/integration/readonly/api/data_sources/agencies/__init__.py create mode 100644 tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py create mode 100644 tests/automated/integration/readonly/api/data_sources/test_get.py create mode 100644 tests/automated/integration/readonly/api/meta_urls/__init__.py create mode 100644 tests/automated/integration/readonly/api/meta_urls/agencies/__init__.py create mode 100644 tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py create mode 100644 tests/automated/integration/readonly/api/meta_urls/test_get.py create mode 100644 tests/automated/integration/readonly/helper.py create mode 100644 tests/automated/integration/readonly/setup.py delete mode 100644 tests/helpers/awaitable_barrier.py create mode 100644 tests/helpers/check.py delete mode 100644 tests/helpers/patch_functions.py diff --git a/src/api/endpoints/agencies/root/get/response.py b/src/api/endpoints/agencies/root/get/response.py index b9d374eb..23590958 100644 --- a/src/api/endpoints/agencies/root/get/response.py +++ b/src/api/endpoints/agencies/root/get/response.py @@ -8,5 +8,8 @@ class AgencyGetResponse(BaseModel): id: int name: str type: AgencyType - jurisdiction_type: JurisdictionType - locations: list[AgencyGetLocationsResponse] \ No newline at end of file + jurisdiction_type: JurisdictionType | None + locations: list[AgencyGetLocationsResponse] + +class AgencyGetOuterResponse(BaseModel): + results: list[AgencyGetResponse] \ No newline at end of file diff --git a/src/api/endpoints/data_source/put/__init__.py b/src/api/endpoints/data_source/by_id/__init__.py similarity index 100% rename from src/api/endpoints/data_source/put/__init__.py rename to src/api/endpoints/data_source/by_id/__init__.py diff --git a/src/api/endpoints/meta_url/put/__init__.py b/src/api/endpoints/data_source/by_id/agency/__init__.py similarity index 100% rename from src/api/endpoints/meta_url/put/__init__.py rename to src/api/endpoints/data_source/by_id/agency/__init__.py diff --git a/src/api/endpoints/data_source/put/query.py b/src/api/endpoints/data_source/by_id/agency/delete/__init__.py similarity index 100% rename from 
src/api/endpoints/data_source/put/query.py rename to src/api/endpoints/data_source/by_id/agency/delete/__init__.py diff --git a/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py b/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py new file mode 100644 index 00000000..f04885af --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.delete.query import RemoveURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def delete_data_source_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + RemoveURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/put/request.py b/src/api/endpoints/data_source/by_id/agency/get/__init__.py similarity index 100% rename from src/api/endpoints/data_source/put/request.py rename to src/api/endpoints/data_source/by_id/agency/get/__init__.py diff --git a/src/api/endpoints/data_source/by_id/agency/get/wrapper.py b/src/api/endpoints/data_source/by_id/agency/get/wrapper.py new file mode 100644 index 00000000..f58d4936 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/get/wrapper.py @@ -0,0 +1,14 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.get.query import GetRelatedAgenciesQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def get_data_source_agencies_wrapper( + url_id: int, + adb_client: AsyncDatabaseClient +) -> AgencyGetOuterResponse: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + return await adb_client.run_query_builder( + GetRelatedAgenciesQueryBuilder(url_id=url_id) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/put/query.py b/src/api/endpoints/data_source/by_id/agency/post/__init__.py similarity index 100% rename from src/api/endpoints/meta_url/put/query.py rename to src/api/endpoints/data_source/by_id/agency/post/__init__.py diff --git a/src/api/endpoints/data_source/by_id/agency/post/wrapper.py b/src/api/endpoints/data_source/by_id/agency/post/wrapper.py new file mode 100644 index 00000000..97197103 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/post/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.post.query import AddURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def add_data_source_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + AddURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/put/request.py b/src/api/endpoints/data_source/by_id/agency/shared/__init__.py similarity index 100% rename from src/api/endpoints/meta_url/put/request.py rename to src/api/endpoints/data_source/by_id/agency/shared/__init__.py diff --git 
a/src/api/endpoints/data_source/by_id/agency/shared/check.py b/src/api/endpoints/data_source/by_id/agency/shared/check.py new file mode 100644 index 00000000..2ef9640c --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/shared/check.py @@ -0,0 +1,17 @@ +from src.api.shared.check.url_type.query import CheckURLTypeQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType + + +async def check_is_data_source_url( + url_id: int, + adb_client: AsyncDatabaseClient +) -> None: + """ + Raises: + Bad Request if url_type is not valid or does not exist + """ + + await adb_client.run_query_builder( + CheckURLTypeQueryBuilder(url_id=url_id, url_type=URLType.DATA_SOURCE) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/put/__init__.py b/src/api/endpoints/data_source/by_id/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/put/query.py b/src/api/endpoints/data_source/by_id/put/query.py new file mode 100644 index 00000000..96106395 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/put/query.py @@ -0,0 +1,123 @@ +from sqlalchemy import update, select, literal, insert +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from src.api.shared.batch.url.link import UpdateBatchURLLinkQueryBuilder +from src.api.shared.record_type.put.query import UpdateRecordTypeQueryBuilder +from src.api.shared.url.put.query import UpdateURLQueryBuilder +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateDataSourceQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + request: DataSourcePutRequest, + ): + super().__init__() + self.url_id = url_id + self.request = request + + async def run(self, session: AsyncSession) -> None: + + if self.request.record_type is not None: + await UpdateRecordTypeQueryBuilder( + url_id=self.url_id, + record_type=self.request.record_type + ).run(session) + + # Update URL if any of the URL fields are not None + if ( + self.request.description is None and + self.request.name is None and + self.request.description is None + ): + return + + # Update Batch if Batch link is none + if self.request.batch_id is not None: + await UpdateBatchURLLinkQueryBuilder( + batch_id=self.request.batch_id, + url_id=self.url_id + ).run(session) + + await UpdateURLQueryBuilder( + url_id=self.url_id, + url=self.request.url, + name=self.request.name, + description=self.request.description, + ).run( + session, + ) + if not self.request.optional_data_source_metadata_not_none(): + return + value_dict = {} + if self.request.record_formats is not None: + value_dict["record_formats"] = self.request.record_formats + if self.request.data_portal_type is not None: + value_dict["data_portal_type"] = self.request.data_portal_type + if self.request.supplying_entity is not None: + value_dict["supplying_entity"] = self.request.supplying_entity + if self.request.coverage_start is not None: + value_dict["coverage_start"] = self.request.coverage_start + if self.request.coverage_end is not None: + value_dict["coverage_end"] = self.request.coverage_end + if self.request.agency_supplied is not None: + value_dict["agency_supplied"] = self.request.agency_supplied + if self.request.agency_originated is not None: + value_dict["agency_originated"] = 
self.request.agency_originated + if self.request.agency_aggregation is not None: + value_dict["agency_aggregation"] = self.request.agency_aggregation + if self.request.agency_described_not_in_database is not None: + value_dict["agency_described_not_in_database"] = self.request.agency_described_not_in_database + if self.request.update_method is not None: + value_dict["update_method"] = self.request.update_method + if self.request.readme_url is not None: + value_dict["readme_url"] = self.request.readme_url + if self.request.originating_entity is not None: + value_dict["originating_entity"] = self.request.originating_entity + if self.request.retention_schedule is not None: + value_dict["retention_schedule"] = self.request.retention_schedule + if self.request.scraper_url is not None: + value_dict["scraper_url"] = self.request.scraper_url + if self.request.submission_notes is not None: + value_dict["submission_notes"] = self.request.submission_notes + if self.request.access_notes is not None: + value_dict["access_notes"] = self.request.access_notes + if self.request.access_types is not None: + value_dict["access_types"] = self.request.access_types + + # Check for existing metadata object + query = ( + select( + literal(True) + ) + .where( + URLOptionalDataSourceMetadata.url_id == self.url_id + ) + ) + exists = await self.sh.one_or_none(session=session, query=query) + if not exists: + insert_obj = URLOptionalDataSourceMetadata( + url_id=self.url_id, + **value_dict + ) + session.add(insert_obj) + else: + statement = ( + update( + URLOptionalDataSourceMetadata + ) + .where( + URLOptionalDataSourceMetadata.url_id == self.url_id + ) + .values( + value_dict + ) + ) + + await session.execute(statement) + + diff --git a/src/api/endpoints/data_source/by_id/put/request.py b/src/api/endpoints/data_source/by_id/put/request.py new file mode 100644 index 00000000..28549c28 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/put/request.py @@ -0,0 +1,59 @@ +from datetime import date + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum + + +class DataSourcePutRequest(BaseModel): + + # Required Attributes + url: str | None = None + name: str | None = None + record_type: RecordType | None = None + + # Optional Attributes + batch_id: int | None = None + description: str | None = None + + # Optional data source metadata + record_formats: list[str] | None = None + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + submission_notes: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] | None = None + + def optional_data_source_metadata_not_none(self) -> bool: + return ( + self.record_formats is not None or + self.data_portal_type is not None or + self.supplying_entity is not None or + self.coverage_start is not None or + self.coverage_end is not None or + self.agency_supplied is not None or + self.agency_originated is not 
None or + self.agency_aggregation is not None or + self.agency_described_not_in_database is not None or + self.update_method is not None or + self.readme_url is not None or + self.originating_entity is not None or + self.retention_schedule is not None or + self.scraper_url is not None or + self.submission_notes is not None or + self.access_notes is not None or + self.access_types is not None + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py index e69de29b..e9d0598b 100644 --- a/src/api/endpoints/data_source/get/query.py +++ b/src/api/endpoints/data_source/get/query.py @@ -0,0 +1,155 @@ +from datetime import date +from typing import Any, Sequence + +from sqlalchemy import select, RowMapping, and_ +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourceQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: + query = ( + select( + URL, + URL.id, + URL.url, + + # Required Attributes + URL.name, + URLRecordType.record_type, + URL.confirmed_agencies, + + # Optional Attributes + URL.description, + LinkBatchURL.batch_id, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_aggregation, + URLOptionalDataSourceMetadata.agency_described_not_in_database, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.submission_notes, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id + ) + .join( + FlagURLValidated, + and_( + FlagURLValidated.url_id == URL.id, + FlagURLValidated.type == URLType.DATA_SOURCE + ) + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.url_id == URL.id + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URLOptionalDataSourceMetadata.url_id == URL.id + ) + .options( + selectinload(URL.confirmed_agencies), + ) + .limit(100) + .offset((self.page - 1) * 100) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + responses: list[DataSourceGetResponse] = [] + + for 
mapping in mappings: + url: URL = mapping[URL] + url_id: int = mapping[URL.id] + url_url: str = mapping[URL.url] + url_name: str = mapping[URL.name] + url_record_type: RecordType = mapping[URLRecordType.record_type] + + url_agency_ids: list[int] = [] + for agency in url.confirmed_agencies: + url_agency_ids.append(agency.agency_id) + + url_description: str | None = mapping[URL.description] + link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] + url_record_formats: list[str] | None = mapping[URLOptionalDataSourceMetadata.record_formats] + url_data_portal_type: str | None = mapping[URLOptionalDataSourceMetadata.data_portal_type] + url_supplying_entity: str | None = mapping[URLOptionalDataSourceMetadata.supplying_entity] + url_coverage_start: date | None = mapping[URLOptionalDataSourceMetadata.coverage_start] + url_coverage_end: date | None = mapping[URLOptionalDataSourceMetadata.coverage_end] + url_agency_supplied: bool | None = mapping[URLOptionalDataSourceMetadata.agency_supplied] + url_agency_aggregation: AgencyAggregationEnum | None = mapping[URLOptionalDataSourceMetadata.agency_aggregation] + url_agency_originated: bool | None = mapping[URLOptionalDataSourceMetadata.agency_originated] + url_agency_described_not_in_database: bool | None = mapping[URLOptionalDataSourceMetadata.agency_described_not_in_database] + url_update_method: UpdateMethodEnum | None = mapping[URLOptionalDataSourceMetadata.update_method] + url_readme_url: str | None = mapping[URLOptionalDataSourceMetadata.readme_url] + url_originating_entity: str | None = mapping[URLOptionalDataSourceMetadata.originating_entity] + url_retention_schedule: RetentionScheduleEnum | None = mapping[URLOptionalDataSourceMetadata.retention_schedule] + url_scraper_url: str | None = mapping[URLOptionalDataSourceMetadata.scraper_url] + url_submission_notes: str | None = mapping[URLOptionalDataSourceMetadata.submission_notes] + url_access_notes: str | None = mapping[URLOptionalDataSourceMetadata.access_notes] + url_access_types: list[AccessTypeEnum] | None = mapping[URLOptionalDataSourceMetadata.access_types] + + responses.append( + DataSourceGetResponse( + url_id=url_id, + url=url_url, + name=url_name, + record_type=url_record_type, + agency_ids=url_agency_ids, + description=url_description, + batch_id=link_batch_url_batch_id, + record_formats=url_record_formats, + data_portal_type=url_data_portal_type, + supplying_entity=url_supplying_entity, + coverage_start=url_coverage_start, + coverage_end=url_coverage_end, + agency_supplied=url_agency_supplied, + agency_aggregation=url_agency_aggregation, + agency_originated=url_agency_originated, + agency_described_not_in_database=url_agency_described_not_in_database, + update_method=url_update_method, + readme_url=url_readme_url, + originating_entity=url_originating_entity, + retention_schedule=url_retention_schedule, + scraper_url=url_scraper_url, + submission_notes=url_submission_notes, + access_notes=url_access_notes, + access_types=url_access_types + ) + ) + + return DataSourceGetOuterResponse( + results=responses, + ) + diff --git a/src/api/endpoints/data_source/get/response.py b/src/api/endpoints/data_source/get/response.py index 51134ffc..b80ee9e1 100644 --- a/src/api/endpoints/data_source/get/response.py +++ b/src/api/endpoints/data_source/get/response.py @@ -1,4 +1,43 @@ +from datetime import date + from pydantic import BaseModel +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + 
RetentionScheduleEnum, AccessTypeEnum + class DataSourceGetResponse(BaseModel): + url_id: int + url: str + + # Required Attributes + name: str + record_type: RecordType + agency_ids: list[int] + + # Optional Attributes + batch_id: int | None + description: str | None + + # Optional data source metadata + record_formats: list[str] + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + submission_notes: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] + +class DataSourceGetOuterResponse(BaseModel): + results: list[DataSourceGetResponse] diff --git a/src/api/endpoints/data_source/routes.py b/src/api/endpoints/data_source/routes.py index 770013f6..2464ceea 100644 --- a/src/api/endpoints/data_source/routes.py +++ b/src/api/endpoints/data_source/routes.py @@ -1,34 +1,83 @@ from fastapi import APIRouter, Depends, Query from src.api.dependencies import get_async_core -from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.api.endpoints.data_source.by_id.agency.delete.wrapper import delete_data_source_agency_link +from src.api.endpoints.data_source.by_id.agency.get.wrapper import get_data_source_agencies_wrapper +from src.api.endpoints.data_source.by_id.agency.post.wrapper import add_data_source_agency_link +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.endpoints.data_source.get.query import GetDataSourceQueryBuilder +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse +from src.api.endpoints.data_source.by_id.put.query import UpdateDataSourceQueryBuilder +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore -data_source_router = APIRouter( - prefix="/data-source", +data_sources_router = APIRouter( + prefix="/data-sources", tags=["data-source"] ) -@data_source_router.get("") +@data_sources_router.get("") async def get_data_sources( async_core: AsyncCore = Depends(get_async_core), page: int = Query( description="Page number", default=1 ), -) -> list[DataSourceGetResponse]: +) -> DataSourceGetOuterResponse: return await async_core.adb_client.run_query_builder( GetDataSourceQueryBuilder(page=page) ) -@data_source_router.put("/{data_source_id}") +@data_sources_router.put("/{url_id}") async def update_data_source( - data_source_id: int, + url_id: int , + request: DataSourcePutRequest, async_core: AsyncCore = Depends(get_async_core), - request: DataSourceUpdateRequest, ) -> MessageResponse: - return await async_core.adb_client.run_query_builder( - UpdateDataSourceQueryBuilder(data_source_id=data_source_id, data_source_update=data_source_update) + await check_is_data_source_url(url_id=url_id, adb_client=async_core.adb_client) + await async_core.adb_client.run_query_builder( + UpdateDataSourceQueryBuilder( + url_id=url_id, + 
request=request + ) + ) + return MessageResponse(message="Data source updated.") + +@data_sources_router.get("/{url_id}/agencies") +async def get_data_source_agencies( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyGetOuterResponse: + return await get_data_source_agencies_wrapper( + url_id=url_id, + adb_client=async_core.adb_client + ) + +@data_sources_router.post("/{url_id}/agencies/{agency_id}") +async def add_agency_to_data_source( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await add_data_source_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency added to data source.") + +@data_sources_router.delete("/{url_id}/agencies/{agency_id}") +async def remove_agency_from_data_source( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await delete_data_source_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client ) + return MessageResponse(message="Agency removed from data source.") diff --git a/src/api/endpoints/meta_url/by_id/__init__.py b/src/api/endpoints/meta_url/by_id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/delete/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py b/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py new file mode 100644 index 00000000..7adf695a --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.shared.agency.delete.query import RemoveURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def delete_meta_url_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + RemoveURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/by_id/agencies/get/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py b/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py new file mode 100644 index 00000000..17362a88 --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py @@ -0,0 +1,14 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.shared.agency.get.query import GetRelatedAgenciesQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def get_meta_url_agencies_wrapper( + url_id: int, + adb_client: AsyncDatabaseClient +) -> AgencyGetOuterResponse: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + return await adb_client.run_query_builder( + GetRelatedAgenciesQueryBuilder(url_id=url_id) + ) \ No newline at end of file 
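With the routes in data_source/routes.py above, agencies can be linked to and unlinked from a validated data-source URL over HTTP; each handler first runs the shared URLType check and, per its docstring, answers with a bad request if the URL is not a validated DATA_SOURCE. A short client-side sketch, assuming httpx, a locally running API, and ignoring any auth the deployment may require (the IDs are placeholders):

import httpx

BASE_URL = "http://localhost:8000"  # assumed local dev server
url_id, agency_id = 1, 2            # placeholder IDs for illustration

with httpx.Client(base_url=BASE_URL) as client:
    # Link an agency to the data source; rejected if url_id is not a
    # validated data-source URL.
    print(client.post(f"/data-sources/{url_id}/agencies/{agency_id}").json())

    # List agencies currently linked to the data source.
    print(client.get(f"/data-sources/{url_id}/agencies").json())

    # Remove the link again.
    print(client.delete(f"/data-sources/{url_id}/agencies/{agency_id}").json())
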
diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/put/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/query.py b/src/api/endpoints/meta_url/by_id/agencies/put/query.py
new file mode 100644
index 00000000..a3be8cf8
--- /dev/null
+++ b/src/api/endpoints/meta_url/by_id/agencies/put/query.py
@@ -0,0 +1,47 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest
+from src.api.shared.batch.url.link import UpdateBatchURLLinkQueryBuilder
+from src.api.shared.record_type.put.query import UpdateRecordTypeQueryBuilder
+from src.api.shared.url.put.query import UpdateURLQueryBuilder
+from src.db.queries.base.builder import QueryBuilderBase
+
+
+class UpdateMetaURLQueryBuilder(QueryBuilderBase):
+
+    def __init__(
+        self,
+        url_id: int,
+        request: UpdateMetaURLRequest
+    ):
+        super().__init__()
+        self.url_id = url_id
+        self.request = request
+
+    async def run(self, session: AsyncSession) -> None:
+
+        # Update Batch ID if not none
+        if self.request.batch_id is not None:
+            await UpdateBatchURLLinkQueryBuilder(
+                batch_id=self.request.batch_id,
+                url_id=self.url_id
+            ).run(session)
+
+
+        # Update URL only if at least one of the URL fields is not None
+        if (
+            self.request.url is None and
+            self.request.name is None and
+            self.request.description is None
+        ):
+            return
+
+        await UpdateURLQueryBuilder(
+            url_id=self.url_id,
+            url=self.request.url,
+            name=self.request.name,
+            description=self.request.description,
+        ).run(
+            session,
+        )
+
diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/request.py b/src/api/endpoints/meta_url/by_id/agencies/put/request.py
new file mode 100644
index 00000000..456f2b99
--- /dev/null
+++ b/src/api/endpoints/meta_url/by_id/agencies/put/request.py
@@ -0,0 +1,10 @@
+from pydantic import BaseModel
+
+
+class UpdateMetaURLRequest(BaseModel):
+    url: str | None = None
+    name: str | None = None
+    description: str | None = None
+
+    batch_id: int | None = None
+
diff --git a/src/api/endpoints/meta_url/by_id/agencies/shared/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/shared/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/meta_url/by_id/agencies/shared/check.py b/src/api/endpoints/meta_url/by_id/agencies/shared/check.py
new file mode 100644
index 00000000..72c79601
--- /dev/null
+++ b/src/api/endpoints/meta_url/by_id/agencies/shared/check.py
@@ -0,0 +1,17 @@
+from src.api.shared.check.url_type.query import CheckURLTypeQueryBuilder
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.models.impl.flag.url_validated.enums import URLType
+
+
+async def check_is_meta_url(
+    url_id: int,
+    adb_client: AsyncDatabaseClient
+) -> None:
+    """
+    Raises:
+        Bad Request if url_type is not valid or does not exist
+    """
+
+    await adb_client.run_query_builder(
+        CheckURLTypeQueryBuilder(url_id=url_id, url_type=URLType.META_URL)
+    )
\ No newline at end of file
diff --git a/src/api/endpoints/meta_url/by_id/post/__init__.py b/src/api/endpoints/meta_url/by_id/post/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/meta_url/by_id/post/wrapper.py b/src/api/endpoints/meta_url/by_id/post/wrapper.py
new file mode 100644
index 00000000..4153e144
--- /dev/null
+++ b/src/api/endpoints/meta_url/by_id/post/wrapper.py
@@ -0,0 +1,17 @@
+from src.api.endpoints.meta_url.by_id.agencies.shared.check import
check_is_meta_url +from src.api.shared.agency.post.query import AddURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def add_meta_url_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + AddURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/get/query.py b/src/api/endpoints/meta_url/get/query.py index e69de29b..202626d8 100644 --- a/src/api/endpoints/meta_url/get/query.py +++ b/src/api/endpoints/meta_url/get/query.py @@ -0,0 +1,83 @@ +from typing import Sequence + +from sqlalchemy import select, and_, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.meta_url.get.response import MetaURLGetOuterResponse, MetaURLGetResponse +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetMetaURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> MetaURLGetOuterResponse: + query = ( + select( + URL, + URL.id, + URL.url, + + # Required Attributes + URL.name, + URL.confirmed_agencies, + + # Optional Attributes + URL.description, + LinkBatchURL.batch_id, + ) + .join( + FlagURLValidated, + and_( + FlagURLValidated.url_id == URL.id, + FlagURLValidated.type == URLType.META_URL + ) + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.url_id == URL.id + ) + .options( + selectinload(URL.confirmed_agencies), + ) + .limit(100) + .offset((self.page - 1) * 100) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + responses: list[MetaURLGetResponse] = [] + + for mapping in mappings: + url: URL = mapping[URL] + url_id: int = mapping[URL.id] + url_url: str = mapping[URL.url] + url_name: str = mapping[URL.name] + url_agency_ids: list[int] = [] + for agency in url.confirmed_agencies: + url_agency_ids.append(agency.agency_id) + url_description: str | None = mapping[URL.description] + link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] + responses.append( + MetaURLGetResponse( + url_id=url_id, + url=url_url, + name=url_name, + agency_ids=url_agency_ids, + description=url_description, + batch_id=link_batch_url_batch_id, + ) + ) + + return MetaURLGetOuterResponse( + results=responses, + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/get/response.py b/src/api/endpoints/meta_url/get/response.py index e69de29b..1f683a65 100644 --- a/src/api/endpoints/meta_url/get/response.py +++ b/src/api/endpoints/meta_url/get/response.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + + +class MetaURLGetResponse(BaseModel): + url_id: int + url: str + + # Required Attributes + name: str + agency_ids: list[int] + + # Optional Attributes + batch_id: int| None + description: str | None + +class MetaURLGetOuterResponse(BaseModel): + results: list[MetaURLGetResponse] \ No newline at end of file diff --git a/src/api/endpoints/meta_url/routes.py b/src/api/endpoints/meta_url/routes.py index 5d3f2d76..0f14805c 100644 --- 
a/src/api/endpoints/meta_url/routes.py +++ b/src/api/endpoints/meta_url/routes.py @@ -1,34 +1,84 @@ from fastapi import APIRouter, Depends, Query from src.api.dependencies import get_async_core +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.api.endpoints.meta_url.by_id.agencies.delete.wrapper import delete_meta_url_agency_link +from src.api.endpoints.meta_url.by_id.agencies.get.wrapper import get_meta_url_agencies_wrapper +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.endpoints.meta_url.by_id.post.wrapper import add_meta_url_agency_link +from src.api.endpoints.meta_url.get.query import GetMetaURLQueryBuilder +from src.api.endpoints.meta_url.get.response import MetaURLGetResponse, MetaURLGetOuterResponse +from src.api.endpoints.meta_url.by_id.agencies.put.query import UpdateMetaURLQueryBuilder +from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore -meta_url_router = APIRouter( - prefix="/meta-url", +meta_urls_router = APIRouter( + prefix="/meta-urls", tags=["meta-url"] ) - -@meta_url_router.get("") +@meta_urls_router.get("") async def get_meta_urls( async_core: AsyncCore = Depends(get_async_core), page: int = Query( description="Page number", default=1 ), -) -> MetaURLGetResponse: - return await async_core.adb_client.run_query_builder(GetMetaURLQueryBuilder()) +) -> MetaURLGetOuterResponse: + return await async_core.adb_client.run_query_builder( + GetMetaURLQueryBuilder(page=page) + ) -@meta_url_router.put("/{meta_url_id}") +@meta_urls_router.put("/{url_id}") async def update_meta_url( - meta_url_id: int, + url_id: int, + request: UpdateMetaURLRequest, async_core: AsyncCore = Depends(get_async_core), - request: MetaURLUpdateRequest, ) -> MessageResponse: - return await async_core.adb_client.run_query_builder( - UpdateMetaURLQueryBuilder(meta_url_id=meta_url_id, meta_url_update=meta_url_update) + await check_is_meta_url(url_id=url_id, adb_client=async_core.adb_client) + await async_core.adb_client.run_query_builder( + UpdateMetaURLQueryBuilder( + url_id=url_id, + request=request + ) + ) + return MessageResponse(message="Meta URL updated.") + + +@meta_urls_router.get("/{url_id}/agencies") +async def get_meta_url_agencies( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyGetOuterResponse: + return await get_meta_url_agencies_wrapper( + url_id=url_id, + adb_client=async_core.adb_client ) +@meta_urls_router.post("/{url_id}/agencies/{agency_id}") +async def add_agency_to_meta_url( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await add_meta_url_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency added to meta URL.") +@meta_urls_router.delete("/{url_id}/agencies/{agency_id}") +async def remove_agency_from_meta_url( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await delete_meta_url_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency removed from meta URL.") diff --git a/src/api/main.py b/src/api/main.py index 076b8108..2dd7fa24 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -12,7 +12,8 @@ from src.api.endpoints.batch.routes import 
batch_router from src.api.endpoints.collector.routes import collector_router from src.api.endpoints.contributions.routes import contributions_router -from src.api.endpoints.data_source.routes import data_source_router +from src.api.endpoints.data_source.routes import data_sources_router +from src.api.endpoints.meta_url.routes import meta_urls_router from src.api.endpoints.metrics.routes import metrics_router from src.api.endpoints.root import root_router from src.api.endpoints.search.routes import search_router @@ -181,7 +182,8 @@ async def redirect_docs(): submit_router, contributions_router, agencies_router, - data_source_router + data_sources_router, + meta_urls_router ] for router in routers: diff --git a/src/api/shared/agency/README.md b/src/api/shared/agency/README.md new file mode 100644 index 00000000..6afa1917 --- /dev/null +++ b/src/api/shared/agency/README.md @@ -0,0 +1 @@ +Logic for adding, removing and getting agencies by URL id \ No newline at end of file diff --git a/src/api/shared/agency/__init__.py b/src/api/shared/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/delete/__init__.py b/src/api/shared/agency/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/delete/query.py b/src/api/shared/agency/delete/query.py new file mode 100644 index 00000000..ca291a6f --- /dev/null +++ b/src/api/shared/agency/delete/query.py @@ -0,0 +1,29 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class RemoveURLAgencyLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + agency_id: int + ): + super().__init__() + self.url_id = url_id + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == self.url_id, + LinkURLAgency.agency_id == self.agency_id + ) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/api/shared/agency/get/__init__.py b/src/api/shared/agency/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/get/query.py b/src/api/shared/agency/get/query.py new file mode 100644 index 00000000..b49e47ee --- /dev/null +++ b/src/api/shared/agency/get/query.py @@ -0,0 +1,62 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class GetRelatedAgenciesQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> AgencyGetOuterResponse: + query = ( + select( + Agency, + ) + .options( + selectinload(Agency.locations) + ) + .join( + LinkURLAgency, + LinkURLAgency.agency_id == Agency.agency_id + ) + .where( + LinkURLAgency.url_id == self.url_id + ) + ) + + results: Sequence[RowMapping] = await self.sh.mappings( + session, + 
query=query + ) + responses: list[AgencyGetResponse] = [] + for result in results: + agency: Agency = result[Agency] + locations: list[AgencyGetLocationsResponse] = [ + AgencyGetLocationsResponse( + location_id=location.id, + full_display_name=location.full_display_name, + ) + for location in agency.locations + ] + responses.append(AgencyGetResponse( + id=agency.agency_id, + name=agency.name, + type=agency.agency_type, + jurisdiction_type=agency.jurisdiction_type, + locations=locations, + )) + + return AgencyGetOuterResponse(results=responses) diff --git a/src/api/shared/agency/get/response.py b/src/api/shared/agency/get/response.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/post/__init__.py b/src/api/shared/agency/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/post/query.py b/src/api/shared/agency/post/query.py new file mode 100644 index 00000000..045d1c84 --- /dev/null +++ b/src/api/shared/agency/post/query.py @@ -0,0 +1,32 @@ +from fastapi import HTTPException +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class AddURLAgencyLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + agency_id: int + ): + super().__init__() + self.url_id = url_id + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + link = LinkURLAgency( + url_id=self.url_id, + agency_id=self.agency_id + ) + session.add(link) + try: + await session.commit() + except Exception as e: + await session.rollback() + raise HTTPException( + status_code=500, + detail=f"Failed to add URL agency link: {e}" + ) \ No newline at end of file diff --git a/src/api/shared/batch/__init__.py b/src/api/shared/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/batch/url/__init__.py b/src/api/shared/batch/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/batch/url/link.py b/src/api/shared/batch/url/link.py new file mode 100644 index 00000000..2ea22525 --- /dev/null +++ b/src/api/shared/batch/url/link.py @@ -0,0 +1,36 @@ +from fastapi import HTTPException +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateBatchURLLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + batch_id: int, + url_id: int + ): + super().__init__() + self.batch_id = batch_id + self.url_id = url_id + + async def run(self, session: AsyncSession) -> None: + + # Delete existing link if it exists + statement = ( + delete(LinkBatchURL) + .where( + LinkBatchURL.url_id==self.url_id + ) + ) + await session.execute(statement) + + # Add new link + link = LinkBatchURL( + batch_id=self.batch_id, + url_id=self.url_id + ) + session.add(link) diff --git a/src/api/shared/check/__init__.py b/src/api/shared/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/check/url_type/__init__.py b/src/api/shared/check/url_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/check/url_type/query.py b/src/api/shared/check/url_type/query.py new file mode 100644 index 00000000..be6287c2 --- /dev/null +++ b/src/api/shared/check/url_type/query.py @@ -0,0 +1,58 @@ +from fastapi import 
HTTPException +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class CheckURLTypeQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + url_type: URLType + ): + super().__init__() + self.url_id = url_id + self.url_type = url_type + + async def run(self, session: AsyncSession) -> None: + """ + Raises: + Bad Request if url_type is not valid or does not exist + """ + + query = ( + select( + URL.id, + FlagURLValidated.type + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + URL.id == self.url_id, + ) + ) + + result: RowMapping | None = await self.sh.mapping(session, query=query) + if result is None: + raise HTTPException( + status_code=404, + detail="URL not found" + ) + url_type: URLType | None = result.get("type") + if url_type is None: + raise HTTPException( + status_code=400, + detail="URL is not validated" + ) + if url_type != self.url_type: + raise HTTPException( + status_code=400, + detail="URL type does not match expected URL type" + ) \ No newline at end of file diff --git a/src/api/shared/record_type/__init__.py b/src/api/shared/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/record_type/put/__init__.py b/src/api/shared/record_type/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/record_type/put/query.py b/src/api/shared/record_type/put/query.py new file mode 100644 index 00000000..f4cbae5c --- /dev/null +++ b/src/api/shared/record_type/put/query.py @@ -0,0 +1,32 @@ +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateRecordTypeQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + record_type: RecordType + ): + super().__init__() + self.url_id = url_id + self.record_type = record_type + + async def run(self, session: AsyncSession) -> None: + statement = ( + update( + URLRecordType + ) + .where( + URLRecordType.url_id == self.url_id + ) + .values( + record_type=self.record_type + ) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/api/shared/url/__init__.py b/src/api/shared/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/url/put/__init__.py b/src/api/shared/url/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/url/put/query.py b/src/api/shared/url/put/query.py new file mode 100644 index 00000000..a47a382c --- /dev/null +++ b/src/api/shared/url/put/query.py @@ -0,0 +1,50 @@ +from typing import Any + +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL + + +class UpdateURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + url: str | None, + name: str | None, + description: str | None + ): + super().__init__() + self.url_id = url_id + self.url = url + self.name = name + 
self.description = description + + async def run(self, session: AsyncSession) -> Any: + values_dict = {} + if self.url is not None: + full_url = FullURL(self.url) + values_dict["url"] = full_url.id_form + values_dict["scheme"] = full_url.scheme + values_dict["trailing_slash"] = full_url.has_trailing_slash + if self.name is not None: + values_dict["name"] = self.name + if self.description is not None: + values_dict["description"] = self.description + + query = ( + update( + URL + ) + .where( + URL.id == self.url_id + ) + .values( + values_dict + ) + ) + + await session.execute(query) \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 50fa1676..02d4fbf2 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -21,7 +21,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The batch this URL is associated with url = Column(Text, unique=True) - scheme = Column(String) + scheme: Mapped[str | None] = Column(String, nullable=True) name = Column(String) description = Column(Text) # The metadata from the collector diff --git a/tests/automated/integration/api/data_sources/__init__.py b/tests/automated/integration/api/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/data_sources/agencies/__init__.py b/tests/automated/integration/api/data_sources/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/data_sources/agencies/test_add_remove.py b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py new file mode 100644 index 00000000..7223c8ce --- /dev/null +++ b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py @@ -0,0 +1,26 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from tests.helpers.api_test_helper import APITestHelper + + +async def test_agencies_add_remove( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_agency_id: int +): + api_test_helper.request_validator.post_v3( + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id}", + ) + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) + assert len(links) == 1 + assert links[0].agency_id == test_agency_id + assert links[0].url_id == test_url_data_source_id + + api_test_helper.request_validator.delete_v3( + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id}", + ) + + links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) + assert len(links) == 0 \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py b/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py new file mode 100644 index 00000000..54be1750 --- /dev/null +++ b/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py @@ -0,0 +1,18 @@ +import pytest + +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_agency_id: int +): + for method in ['POST', 'DELETE']: + check_forbidden_url_type( + method=method, + 
route=f"/data-sources/{test_url_meta_url_id}/agencies/{test_agency_id}", + api_test_helper=api_test_helper, + ) \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/test_invalid_type.py b/tests/automated/integration/api/data_sources/test_invalid_type.py new file mode 100644 index 00000000..f415ee2b --- /dev/null +++ b/tests/automated/integration/api/data_sources/test_invalid_type.py @@ -0,0 +1,20 @@ +import pytest + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_meta_url_id: int +): + check_forbidden_url_type( + method="PUT", + route=f"/data-sources/{test_url_meta_url_id}", + api_test_helper=api_test_helper, + json=DataSourcePutRequest( + name="test" + ).model_dump(mode='json') + ) \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/test_put.py b/tests/automated/integration/api/data_sources/test_put.py new file mode 100644 index 00000000..c954b59c --- /dev/null +++ b/tests/automated/integration/api/data_sources/test_put.py @@ -0,0 +1,89 @@ +from datetime import date + +import pytest + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_put( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_batch_id: int +): + + api_test_helper.request_validator.put_v3( + url=f"/data-sources/{test_url_data_source_id}", + json=DataSourcePutRequest( + url="http://modified_url.com/", + name="Modified URL", + record_type=RecordType.OTHER, + + batch_id=test_batch_id, + description="Modified Description", + + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="Modified Supplying Entity", + coverage_start=date(year=2025, month=4, day=1), + coverage_end=date(year=2025, month=8, day=29), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="Modified Agency Not In DB", + update_method=UpdateMethodEnum.OVERWRITE, + readme_url="https://modified-readme.com", + originating_entity="Modified Originating Entity", + retention_schedule=RetentionScheduleEnum.FUTURE_ONLY, + scraper_url="https://modified-scraper.com", + submission_notes="Modified Submission Notes", + access_notes="Modified Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ).model_dump(mode='json') + + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + url: URL = (await adb_client.get_all(URL))[0] + assert url.url == "modified_url.com" + assert url.scheme == "http" + assert url.trailing_slash == True + assert url.description == "Modified Description" + + # Check Record Type + record_type: URLRecordType = (await 
adb_client.get_all(URLRecordType))[0] + assert record_type.record_type == RecordType.OTHER + + # Check Batch Link + link: LinkBatchURL = (await adb_client.get_all(LinkBatchURL))[0] + assert link.batch_id == test_batch_id + + # Check Optional Metadata + optional_metadata: URLOptionalDataSourceMetadata = (await adb_client.get_all(URLOptionalDataSourceMetadata))[0] + assert optional_metadata.record_formats == ["csv", "pdf"] + assert optional_metadata.data_portal_type == "CKAN" + assert optional_metadata.supplying_entity == "Modified Supplying Entity" + assert optional_metadata.coverage_start == date(year=2025, month=4, day=1) + assert optional_metadata.coverage_end == date(year=2025, month=8, day=29) + assert optional_metadata.agency_supplied == False + assert optional_metadata.agency_originated == True + assert optional_metadata.agency_aggregation == AgencyAggregationEnum.LOCALITY + assert optional_metadata.agency_described_not_in_database == "Modified Agency Not In DB" + assert optional_metadata.update_method == UpdateMethodEnum.OVERWRITE + assert optional_metadata.readme_url == "https://modified-readme.com" + assert optional_metadata.originating_entity == "Modified Originating Entity" + assert optional_metadata.retention_schedule == RetentionScheduleEnum.FUTURE_ONLY + assert optional_metadata.scraper_url == "https://modified-scraper.com" + assert optional_metadata.submission_notes == "Modified Submission Notes" + assert optional_metadata.access_notes == "Modified Access Notes" + assert optional_metadata.access_types == [AccessTypeEnum.WEBPAGE, AccessTypeEnum.API] diff --git a/tests/automated/integration/api/meta_urls/__init__.py b/tests/automated/integration/api/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/meta_urls/agencies/__init__.py b/tests/automated/integration/api/meta_urls/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py new file mode 100644 index 00000000..4f48ac5c --- /dev/null +++ b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py @@ -0,0 +1,30 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from tests.helpers.api_test_helper import APITestHelper + + +async def test_agencies_add_remove( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_agency_id: int +): + api_test_helper.request_validator.post_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id}", + ) + + raw_response: dict = api_test_helper.request_validator.get_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies", + ) + response = AgencyGetOuterResponse(**raw_response) + assert len(response.results) == 1 + assert response.results[0].id == test_agency_id + + + api_test_helper.request_validator.delete_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id}", + ) + + raw_response: dict = api_test_helper.request_validator.get_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies", + ) + response = AgencyGetOuterResponse(**raw_response) + assert len(response.results) == 0 diff --git a/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py b/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py new file mode 100644 index 00000000..4f3c6f4a --- /dev/null +++ b/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py @@ -0,0 +1,18 @@ 
+import pytest + +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_agency_id: int +): + for method in ['POST', 'DELETE']: + check_forbidden_url_type( + method=method, + route=f"/meta-urls/{test_url_data_source_id}/agencies/{test_agency_id}", + api_test_helper=api_test_helper, + ) \ No newline at end of file diff --git a/tests/automated/integration/api/meta_urls/test_invalid_type.py b/tests/automated/integration/api/meta_urls/test_invalid_type.py new file mode 100644 index 00000000..12073191 --- /dev/null +++ b/tests/automated/integration/api/meta_urls/test_invalid_type.py @@ -0,0 +1,20 @@ +import pytest + +from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_data_source_id: int +): + check_forbidden_url_type( + method="PUT", + route=f"/meta-urls/{test_url_data_source_id}", + api_test_helper=api_test_helper, + json=UpdateMetaURLRequest( + name="test" + ).model_dump(mode='json') + ) \ No newline at end of file diff --git a/tests/automated/integration/api/meta_urls/test_put.py b/tests/automated/integration/api/meta_urls/test_put.py new file mode 100644 index 00000000..28689a8b --- /dev/null +++ b/tests/automated/integration/api/meta_urls/test_put.py @@ -0,0 +1,39 @@ +import pytest + +from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_put( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_batch_id: int +): + api_test_helper.request_validator.put_v3( + url=f"/meta-urls/{test_url_meta_url_id}", + json=UpdateMetaURLRequest( + url="new-meta-url.com", + name="Modified name", + description="Modified description", + batch_id=test_batch_id + ).model_dump(mode='json') + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + # Check URL updated (including schema and trailing slash) + url: URL = (await adb_client.get_all(URL))[0] + assert url.url == "new-meta-url.com" + assert url.name == "Modified name" + assert url.scheme == "" + assert url.trailing_slash == False + assert url.description == "Modified description" + + # Check Batch ID + link: LinkBatchURL = (await adb_client.get_all(LinkBatchURL))[0] + assert link.batch_id == test_batch_id + diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index b4466424..42ab2214 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -8,9 +8,11 @@ from src.api.main import app from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore +from src.core.enums import RecordType from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType from src.security.dtos.access_info import AccessInfo from src.security.enums import 
Permissions from src.security.manager import get_access_info @@ -160,10 +162,33 @@ async def api_test_helper( ) await client.app.state.async_core.collector_manager.logger.clear_log_queue() +@pytest.fixture +def test_batch_id( + db_data_creator: DBDataCreator +) -> int: + return db_data_creator.batch() + @pytest_asyncio.fixture async def test_agency_id( db_data_creator: DBDataCreator ) -> int: return await db_data_creator.agency( name="Test Agency" - ) \ No newline at end of file + ) + +@pytest_asyncio.fixture +async def test_url_data_source_id( + db_data_creator: DBDataCreator +) -> int: + return (await db_data_creator.create_validated_urls( + record_type=RecordType.CRIME_STATISTICS, + validation_type=URLType.DATA_SOURCE, + ))[0].url_id + +@pytest_asyncio.fixture +async def test_url_meta_url_id( + db_data_creator: DBDataCreator +) -> int: + return (await db_data_creator.create_validated_urls( + validation_type=URLType.META_URL, + ))[0].url_id diff --git a/tests/automated/integration/readonly/api/agencies/get/test_locations.py b/tests/automated/integration/readonly/api/agencies/get/test_locations.py index 13481c58..34904057 100644 --- a/tests/automated/integration/readonly/api/agencies/get/test_locations.py +++ b/tests/automated/integration/readonly/api/agencies/get/test_locations.py @@ -1,6 +1,6 @@ import pytest -from tests.automated.integration.readonly.conftest import ReadOnlyTestHelper +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper @pytest.mark.asyncio diff --git a/tests/automated/integration/readonly/api/agencies/get/test_root.py b/tests/automated/integration/readonly/api/agencies/get/test_root.py index fa390abd..a74e49da 100644 --- a/tests/automated/integration/readonly/api/agencies/get/test_root.py +++ b/tests/automated/integration/readonly/api/agencies/get/test_root.py @@ -1,7 +1,7 @@ import pytest from src.db.models.impl.agency.enums import JurisdictionType, AgencyType -from tests.automated.integration.readonly.conftest import ReadOnlyTestHelper +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper @pytest.mark.asyncio diff --git a/tests/automated/integration/readonly/api/data_sources/__init__.py b/tests/automated/integration/readonly/api/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/agencies/__init__.py b/tests/automated/integration/readonly/api/data_sources/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py b/tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py new file mode 100644 index 00000000..85a54705 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py @@ -0,0 +1,13 @@ +import pytest + +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_forbid(readonly_helper: ReadOnlyTestHelper): + check_forbidden_url_type( + route=f"/data-sources/{readonly_helper.url_meta_url_id}/agencies", + api_test_helper=readonly_helper.api_test_helper, + method="GET" + ) diff --git a/tests/automated/integration/readonly/api/data_sources/test_get.py b/tests/automated/integration/readonly/api/data_sources/test_get.py new file mode 100644 index 00000000..e7bbe861 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/test_get.py @@ -0,0 +1,57 @@ 
+from datetime import date + +import pytest +from deepdiff import DeepDiff + +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_get(readonly_helper: ReadOnlyTestHelper): + + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/data-sources", + ) + outer_response = DataSourceGetOuterResponse(**raw_json) + + assert len(outer_response.results) == 1 + response: DataSourceGetResponse = outer_response.results[0] + + diff = DeepDiff( + response.model_dump(mode='json'), + DataSourceGetResponse( + url_id=readonly_helper.url_data_source_id, + url="read-only-ds.com", + + name="Read only URL name", + record_type=RecordType.CRIME_STATISTICS, + agency_ids=[readonly_helper.agency_1_id], + + batch_id=None, + description="Read only URL", + + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ).model_dump(mode='json'), + ) + + assert diff == {}, f"Differences found: {diff}" diff --git a/tests/automated/integration/readonly/api/meta_urls/__init__.py b/tests/automated/integration/readonly/api/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/__init__.py b/tests/automated/integration/readonly/api/meta_urls/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py new file mode 100644 index 00000000..d62fa524 --- /dev/null +++ b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py @@ -0,0 +1,15 @@ + +import pytest + +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_forbid(readonly_helper: ReadOnlyTestHelper): + check_forbidden_url_type( + route=f"/meta-urls/{readonly_helper.url_data_source_id}/agencies", + api_test_helper=readonly_helper.api_test_helper, + method="GET" + ) + diff --git a/tests/automated/integration/readonly/api/meta_urls/test_get.py b/tests/automated/integration/readonly/api/meta_urls/test_get.py new file mode 100644 index 00000000..8779a3fc --- /dev/null +++ b/tests/automated/integration/readonly/api/meta_urls/test_get.py @@ -0,0 +1,30 @@ +import pytest +from deepdiff import DeepDiff + +from src.api.endpoints.meta_url.get.response import MetaURLGetOuterResponse, MetaURLGetResponse +from 
tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_get(readonly_helper: ReadOnlyTestHelper): + + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/meta-urls", + ) + outer_response = MetaURLGetOuterResponse(**raw_json) + + assert len(outer_response.results) == 1 + response: MetaURLGetResponse = outer_response.results[0] + + diff = DeepDiff( + response.model_dump(mode='json'), + MetaURLGetResponse( + url_id=readonly_helper.url_meta_url_id, + url="read-only-meta-url.com", + name="Read only URL Name", + description="Read only URL", + batch_id=None, + agency_ids=[] + ).model_dump(mode='json'), + ) + assert diff == {}, f"Differences found: {diff}" \ No newline at end of file diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py index 1085c184..a5bcd249 100644 --- a/tests/automated/integration/readonly/conftest.py +++ b/tests/automated/integration/readonly/conftest.py @@ -3,33 +3,19 @@ import pytest import pytest_asyncio -from pydantic import BaseModel from starlette.testclient import TestClient from src.db.client.async_ import AsyncDatabaseClient from src.db.helpers.connect import get_postgres_connection_string -from src.db.models.impl.agency.enums import AgencyType, JurisdictionType -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from tests.automated.integration.api._helpers.RequestValidator import RequestValidator +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.automated.integration.readonly.setup import setup_readonly_data from tests.helpers.api_test_helper import APITestHelper -from tests.helpers.counter import next_int from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.setup.wipe import wipe_database -class ReadOnlyTestHelper(BaseModel): - class Config: - arbitrary_types_allowed = True - - adb_client: AsyncDatabaseClient - api_test_helper: APITestHelper - - agency_1_id: int - agency_1_location_id: int - - @pytest.fixture(scope="module") def event_loop(): loop = asyncio.new_event_loop() @@ -50,8 +36,6 @@ async def readonly_helper( client: TestClient, ) -> AsyncGenerator[ReadOnlyTestHelper, Any]: wipe_database(get_postgres_connection_string()) - conn = get_postgres_connection_string(is_async=True) - adb_client = AsyncDatabaseClient(db_url=conn) db_data_creator = DBDataCreator() api_test_helper = APITestHelper( request_validator=RequestValidator(client=client), @@ -59,43 +43,6 @@ async def readonly_helper( db_data_creator=db_data_creator, ) - # Pennsylvania - pennsylvania = await DBDataCreator().create_us_state( - name="Pennsylvania", - iso="PA" - ) - - allegheny_county = await DBDataCreator().create_county( - state_id=pennsylvania.us_state_id, - name="Allegheny" - ) - pittsburgh = await DBDataCreator().create_locality( - state_id=pennsylvania.us_state_id, - county_id=allegheny_county.county_id, - name="Pittsburgh" - ) - - - # Add Agencies - agency_1 = Agency( - agency_id=next_int(), - name="Agency 1", - agency_type=AgencyType.LAW_ENFORCEMENT, - jurisdiction_type=JurisdictionType.STATE, - ) - await adb_client.add(agency_1) - - # Add Agency location - agency_1_location = LinkAgencyLocation( - agency_id=agency_1.agency_id, - location_id=pittsburgh.location_id, - ) - await 
adb_client.add(agency_1_location) - - yield ReadOnlyTestHelper( - adb_client=adb_client, - api_test_helper=api_test_helper, + helper: ReadOnlyTestHelper = await setup_readonly_data(api_test_helper=api_test_helper) - agency_1_id=agency_1.agency_id, - agency_1_location_id=pittsburgh.location_id, - ) \ No newline at end of file + yield helper \ No newline at end of file diff --git a/tests/automated/integration/readonly/helper.py b/tests/automated/integration/readonly/helper.py new file mode 100644 index 00000000..68474256 --- /dev/null +++ b/tests/automated/integration/readonly/helper.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.api_test_helper import APITestHelper + + +class ReadOnlyTestHelper(BaseModel): + class Config: + arbitrary_types_allowed = True + + adb_client: AsyncDatabaseClient + api_test_helper: APITestHelper + + agency_1_id: int + agency_1_location_id: int + + url_data_source_id: int + url_meta_url_id: int diff --git a/tests/automated/integration/readonly/setup.py b/tests/automated/integration/readonly/setup.py new file mode 100644 index 00000000..20c6d537 --- /dev/null +++ b/tests/automated/integration/readonly/setup.py @@ -0,0 +1,171 @@ +from datetime import date + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.counter import next_int +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo + + +async def setup_readonly_data( + api_test_helper: APITestHelper +) -> ReadOnlyTestHelper: + db_data_creator = api_test_helper.db_data_creator + adb_client = db_data_creator.adb_client + + # Pennsylvania + pennsylvania: USStateCreationInfo = await db_data_creator.create_us_state( + name="Pennsylvania", + iso="PA" + ) + + allegheny_county: CountyCreationInfo = await db_data_creator.create_county( + state_id=pennsylvania.us_state_id, + name="Allegheny" + ) + pittsburgh: LocalityCreationInfo = await db_data_creator.create_locality( + state_id=pennsylvania.us_state_id, + county_id=allegheny_county.county_id, + name="Pittsburgh" + ) + + + # Add Agencies + agency_1_id: int = await add_agency(adb_client, pittsburgh) + + # Add Data Source With Linked Agency + url_data_source_id: int = await add_data_source(agency_1_id, db_data_creator) + + # Add Meta URL with Linked Agency + url_meta_url_id: int = await 
add_meta_url(agency_1_id, db_data_creator) + + return ReadOnlyTestHelper( + adb_client=adb_client, + api_test_helper=api_test_helper, + + agency_1_id=agency_1_id, + agency_1_location_id=pittsburgh.location_id, + + url_data_source_id=url_data_source_id, + url_meta_url_id=url_meta_url_id, + ) + + +async def add_meta_url( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme=None, + url="read-only-meta-url.com", + name="Read only URL Name", + trailing_slash=False, + description="Read only URL", + collector_metadata={ + "url": "https://read-only-meta-url.com/" + }, + status=URLStatus.OK, + source=URLSource.REDIRECT, + ) + url_id: int = await adb_client.add(url, return_id=True) + + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.META_URL + ) + + return url_id + + +async def add_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="read-only-ds.com", + name="Read only URL name", + trailing_slash=True, + description="Read only URL", + collector_metadata={ + "url": "https://read-only.com/" + }, + status=URLStatus.OK, + source=URLSource.COLLECTOR, + ) + url_id: int = await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.CRIME_STATISTICS + ) + await adb_client.add(record_type) + + optional_ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ) + + await adb_client.add(optional_ds_metadata) + + await db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id + + +async def add_agency( + adb_client: AsyncDatabaseClient, + pittsburgh: LocalityCreationInfo +) -> int: + agency_1 = Agency( + agency_id=next_int(), + name="Agency 1", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + await adb_client.add(agency_1) + # Add Agency location + agency_1_location = LinkAgencyLocation( + agency_id=agency_1.agency_id, + location_id=pittsburgh.location_id, + ) + await adb_client.add(agency_1_location) + return agency_1.agency_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py index 3d1aec23..22ae8129 100644 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ 
b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py @@ -2,7 +2,6 @@ from deepdiff import DeepDiff from pdap_access_manager import RequestInfo, RequestType, DataSourcesNamespaces -from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/tests/helpers/awaitable_barrier.py b/tests/helpers/awaitable_barrier.py deleted file mode 100644 index 8bf65a11..00000000 --- a/tests/helpers/awaitable_barrier.py +++ /dev/null @@ -1,13 +0,0 @@ -import asyncio - - -class AwaitableBarrier: - def __init__(self): - self._event = asyncio.Event() - - async def __call__(self, *args, **kwargs): - await self._event.wait() - - def release(self): - self._event.set() - diff --git a/tests/helpers/check.py b/tests/helpers/check.py new file mode 100644 index 00000000..b9172151 --- /dev/null +++ b/tests/helpers/check.py @@ -0,0 +1,20 @@ +import pytest +from fastapi import HTTPException + +from tests.helpers.api_test_helper import APITestHelper + + +def check_forbidden_url_type( + route: str, + method: str, + api_test_helper: APITestHelper, + **kwargs +) -> None: + with pytest.raises(HTTPException) as e: + api_test_helper.request_validator.open_v3( + url=route, + method=method, + **kwargs + ) + assert e.value.status_code == 400, f"Expected status code 400, got {e.value.status_code}" + assert e.value.detail['detail'] == 'URL type does not match expected URL type' \ No newline at end of file diff --git a/tests/helpers/patch_functions.py b/tests/helpers/patch_functions.py deleted file mode 100644 index 170a2062..00000000 --- a/tests/helpers/patch_functions.py +++ /dev/null @@ -1,10 +0,0 @@ -from tests.helpers.awaitable_barrier import AwaitableBarrier - - -async def block_sleep(monkeypatch) -> AwaitableBarrier: - barrier = AwaitableBarrier() - monkeypatch.setattr( - "src.collectors.impl.example.core.ExampleCollector.sleep", - barrier - ) - return barrier From 261110914807728274b8c30be2424f8e11ac5482 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 9 Nov 2025 08:50:14 -0500 Subject: [PATCH 17/84] Begin draft --- ...28_1539-a57c3b5b6e93_add_sync_log_table.py | 369 +++++++++++++++ .../endpoints/agencies/by_id/delete/query.py | 23 +- src/api/endpoints/agencies/by_id/put/query.py | 2 +- src/api/endpoints/agencies/root/get/query.py | 2 +- src/api/endpoints/agencies/root/post/query.py | 2 +- src/api/endpoints/annotate/_shared/extract.py | 4 +- .../all/get/queries/agency/requester.py | 18 +- .../annotate/all/get/queries/core.py | 8 +- .../endpoints/annotate/all/post/requester.py | 4 +- .../user/queries/agreement/agency.py | 18 +- src/api/endpoints/data_source/get/query.py | 2 +- .../by_id/{agencies => }/put/__init__.py | 0 .../by_id/{agencies => }/put/query.py | 3 +- .../by_id/{agencies => }/put/request.py | 0 src/api/endpoints/meta_url/get/query.py | 2 +- src/api/endpoints/meta_url/routes.py | 8 +- .../metrics/batches/aggregated/query/core.py | 2 +- .../aggregated/query/submitted_/query.py | 8 +- .../batches/breakdown/submitted/cte_.py | 8 +- .../metrics/urls/breakdown/query/core.py | 6 +- .../endpoints/review/approve/query_/core.py | 4 +- src/api/endpoints/search/agency/query.py | 6 +- src/api/endpoints/submit/url/queries/core.py | 4 +- .../endpoints/url/by_id/delete/__init__.py | 0 src/api/endpoints/url/by_id/delete/query.py | 79 +++ src/api/endpoints/url/routes.py | 12 + 
src/api/shared/agency/get/query.py | 4 +- src/core/tasks/base/operator.py | 14 +- .../scheduled/impl/sync_to_ds/__init__.py | 0 .../impl/sync_to_ds/impl/__init__.py | 0 .../impl/sync_to_ds/impl/agencies/__init__.py | 0 .../sync_to_ds/impl/agencies/add/__init__.py | 0 .../impl/sync_to_ds/impl/agencies/add/core.py | 12 + .../impl/agencies/add/queries/__init__.py | 0 .../impl/agencies/add/queries/cte.py | 4 + .../impl/agencies/delete/__init__.py | 0 .../sync_to_ds/impl/agencies/delete/core.py | 12 + .../impl/agencies/delete/queries/__init__.py | 0 .../impl/agencies/delete/queries/cte.py | 3 + .../impl/agencies/update/__init__.py | 0 .../sync_to_ds/impl/agencies/update/core.py | 12 + .../impl/agencies/update/queries/__init__.py | 0 .../impl/agencies/update/queries/cte.py | 0 .../sync_to_ds/impl/data_sources/__init__.py | 0 .../impl/data_sources/add/__init__.py | 0 .../sync_to_ds/impl/data_sources/add/core.py | 13 + .../impl/data_sources/add/queries/__init__.py | 0 .../impl/data_sources/add/queries/cte.py | 4 + .../impl/data_sources/delete/__init__.py | 0 .../impl/data_sources/delete/core.py | 12 + .../data_sources/delete/queries/__init__.py | 0 .../impl/data_sources/delete/queries/cte.py | 3 + .../impl/data_sources/update/__init__.py | 0 .../impl/data_sources/update/core.py | 12 + .../data_sources/update/queries/__init__.py | 0 .../impl/data_sources/update/queries/cte.py | 0 .../sync_to_ds/impl/meta_urls/__init__.py | 0 .../sync_to_ds/impl/meta_urls/add/__init__.py | 0 .../sync_to_ds/impl/meta_urls/add/core.py | 12 + .../impl/meta_urls/add/queries/__init__.py | 0 .../impl/meta_urls/add/queries/cte.py | 4 + .../impl/meta_urls/delete/__init__.py | 0 .../sync_to_ds/impl/meta_urls/delete/core.py | 12 + .../impl/meta_urls/delete/queries/__init__.py | 0 .../impl/meta_urls/delete/queries/cte.py | 0 .../impl/meta_urls/update/__init__.py | 0 .../sync_to_ds/impl/meta_urls/update/core.py | 12 + .../impl/meta_urls/update/queries/__init__.py | 0 .../impl/meta_urls/update/queries/cte.py | 0 .../impl/sync_to_ds/templates/__init__.py | 0 .../impl/sync_to_ds/templates/operator.py | 28 ++ src/core/tasks/url/operators/base.py | 12 - .../operators/submit_approved/queries/cte.py | 4 +- .../operators/submit_approved/queries/get.py | 2 +- .../submit_approved/queries/mark_submitted.py | 4 +- .../operators/submit_meta_urls/queries/cte.py | 8 +- .../queries/ctes/counts/impl/agency.py | 12 +- src/db/client/async_.py | 58 +-- src/db/client/sync.py | 4 +- src/db/client/types.py | 4 +- src/db/constants.py | 4 +- src/db/dto_converter.py | 2 +- src/db/models/helpers.py | 2 +- src/db/models/impl/agency/ds_link/__init__.py | 0 .../models/impl/agency/ds_link/sqlalchemy.py | 19 + src/db/models/impl/agency/sqlalchemy.py | 10 +- src/db/models/impl/flag/ds_delete/__init__.py | 0 src/db/models/impl/flag/ds_delete/agency.py | 20 + .../models/impl/flag/ds_delete/data_source.py | 20 + src/db/models/impl/flag/ds_delete/meta_url.py | 20 + .../models/impl/link/url_agency/sqlalchemy.py | 4 +- .../impl/link/url_redirect_url/sqlalchemy.py | 6 +- .../impl/link/urls_root_url/sqlalchemy.py | 4 +- src/db/models/impl/sync_log/__init__.py | 0 src/db/models/impl/sync_log/enums.py | 12 + src/db/models/impl/sync_log/sqlalchemy.py | 17 + src/db/models/impl/url/core/sqlalchemy.py | 8 +- .../models/impl/url/data_source/pydantic.py | 6 +- .../models/impl/url/data_source/sqlalchemy.py | 23 +- .../models/impl/url/ds_meta_url/pydantic.py | 6 +- .../models/impl/url/ds_meta_url/sqlalchemy.py | 26 +- .../suggestion/agency/subtask/sqlalchemy.py | 6 +- 
.../models/impl/url/suggestion/agency/user.py | 6 +- .../suggestion/relevant/auto/sqlalchemy.py | 7 +- src/db/models/mixins.py | 8 +- .../common/annotation_exists_/constants.py | 4 +- .../url_counts/builder.py | 2 +- .../url_counts/cte/submitted.py | 6 +- .../core/metrics/urls/aggregated/pending.py | 4 +- src/db/types.py | 4 +- src/external/pdap/impl/sync/__init__.py | 0 .../pdap/impl/sync/agencies/__init__.py | 0 .../impl/sync/agencies/_shared/__init__.py | 0 .../sync/agencies/_shared/models/__init__.py | 0 .../sync/agencies/_shared/models/content.py | 10 + .../pdap/impl/sync/agencies/add/__init__.py | 0 .../pdap/impl/sync/agencies/add/core.py | 20 + .../pdap/impl/sync/agencies/add/request.py | 20 + .../impl/sync/agencies/delete/__init__.py | 0 .../pdap/impl/sync/agencies/delete/core.py | 0 .../pdap/impl/sync/agencies/request.py | 0 .../impl/sync/agencies/update/__init__.py | 0 .../pdap/impl/sync/agencies/update/core.py | 0 .../pdap/impl/sync/agencies/update/request.py | 12 + .../pdap/impl/sync/data_sources/__init__.py | 0 .../sync/data_sources/_shared/__init__.py | 0 .../impl/sync/data_sources/_shared/content.py | 31 ++ .../impl/sync/data_sources/add/__init__.py | 0 .../pdap/impl/sync/data_sources/add/core.py | 0 .../impl/sync/data_sources/add/request.py | 20 + .../impl/sync/data_sources/delete/__init__.py | 0 .../impl/sync/data_sources/delete/core.py | 0 .../pdap/impl/sync/data_sources/request.py | 0 .../impl/sync/data_sources/update/__init__.py | 0 .../impl/sync/data_sources/update/core.py | 0 .../impl/sync/data_sources/update/request.py | 15 + .../pdap/impl/sync/meta_urls/__init__.py | 0 .../impl/sync/meta_urls/_shared/__init__.py | 0 .../impl/sync/meta_urls/_shared/content.py | 6 + .../pdap/impl/sync/meta_urls/add/__init__.py | 0 .../pdap/impl/sync/meta_urls/add/core.py | 0 .../impl/sync/meta_urls/delete/__init__.py | 0 .../pdap/impl/sync/meta_urls/delete/core.py | 0 .../pdap/impl/sync/meta_urls/request.py | 0 .../impl/sync/meta_urls/update/__init__.py | 0 .../pdap/impl/sync/meta_urls/update/core.py | 0 .../impl/sync/meta_urls/update/request.py | 12 + .../pdap/impl/sync/shared/__init__.py | 0 .../pdap/impl/sync/shared/models/__init__.py | 0 .../impl/sync/shared/models/add/__init__.py | 0 .../impl/sync/shared/models/add/response.py | 8 + .../sync/shared/models/delete/__init__.py | 0 .../impl/sync/shared/models/delete/request.py | 5 + .../api/agencies/delete/__init__.py | 0 .../api/agencies/{ => delete}/test_core.py | 12 +- .../api/agencies/delete/test_ds_linked.py | 44 ++ .../api/annotate/all/test_happy_path.py | 4 +- .../api/meta_urls/test_invalid_type.py | 2 +- .../integration/api/meta_urls/test_put.py | 2 +- .../api/submit/test_url_maximal.py | 6 +- .../api/url/by_id/delete/__init__.py | 0 .../integration/api/url/by_id/delete/setup.py | 0 .../api/url/by_id/delete/test_any_url.py | 448 ++++++++++++++++++ .../url/by_id/delete/test_data_source_url.py | 115 +++++ .../api/url/by_id/delete/test_meta_url.py | 78 +++ .../delete/test_validated_not_relevant.py | 71 +++ tests/automated/integration/conftest.py | 52 +- .../db/structure/test_upsert_new_agencies.py | 4 +- .../db/structure/updated_at/__init__.py | 0 .../updated_at/test_ds_optional_metadata.py | 6 + .../db/structure/updated_at/test_urls.py | 6 + .../integration/readonly/conftest.py | 1 - tests/automated/integration/readonly/setup.py | 7 +- .../scheduled/impl/sync_to_ds/__init__.py | 0 .../impl/sync_to_ds/agency/__init__.py | 0 .../impl/sync_to_ds/agency/conftest.py | 10 + .../impl/sync_to_ds/agency/test_add.py | 33 ++ 
.../impl/sync_to_ds/agency/test_delete.py | 55 +++ .../impl/sync_to_ds/agency/update/__init__.py | 0 .../impl/sync_to_ds/agency/update/conftest.py | 16 + .../impl/sync_to_ds/agency/update/helpers.py | 7 + .../agency/update/test_add_location_link.py | 29 ++ .../update/test_delete_location_link.py | 28 ++ .../agency/update/test_update_agency.py | 26 + .../impl/sync_to_ds/data_source/__init__.py | 0 .../impl/sync_to_ds/data_source/conftest.py | 10 + .../impl/sync_to_ds/data_source/test_add.py | 36 ++ .../sync_to_ds/data_source/test_delete.py | 48 ++ .../sync_to_ds/data_source/update/__init__.py | 0 .../sync_to_ds/data_source/update/conftest.py | 16 + .../sync_to_ds/data_source/update/helpers.py | 7 + .../update/test_add_agency_link.py | 27 ++ .../update/test_delete_agency_link.py | 27 ++ .../test_update_optional_ds_metadata.py | 26 + .../data_source/update/test_update_url.py | 26 + .../impl/sync_to_ds/meta_url/__init__.py | 0 .../impl/sync_to_ds/meta_url/conftest.py | 20 + .../impl/sync_to_ds/meta_url/test_add.py | 34 ++ .../impl/sync_to_ds/meta_url/test_delete.py | 49 ++ .../sync_to_ds/meta_url/update/__init__.py | 0 .../sync_to_ds/meta_url/update/conftest.py | 16 + .../sync_to_ds/meta_url/update/helpers.py | 7 + .../meta_url/update/test_add_agency_link.py | 27 ++ .../update/test_delete_agency_link.py | 27 ++ .../meta_url/update/test_update_url.py | 27 ++ .../impl/sync_to_ds/models/__init__.py | 0 .../sync_to_ds/models/ds_app_link_info.py | 8 + .../test_submit_approved_url_task.py | 10 +- .../test_validated_meta_url.py | 4 +- .../url/impl/submit_meta_urls/test_core.py | 6 +- tests/helpers/data_creator/core.py | 4 +- 211 files changed, 2583 insertions(+), 231 deletions(-) create mode 100644 alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py rename src/api/endpoints/meta_url/by_id/{agencies => }/put/__init__.py (100%) rename src/api/endpoints/meta_url/by_id/{agencies => }/put/query.py (88%) rename src/api/endpoints/meta_url/by_id/{agencies => }/put/request.py (100%) create mode 100644 src/api/endpoints/url/by_id/delete/__init__.py create mode 100644 src/api/endpoints/url/by_id/delete/query.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py create mode 100644 
src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/templates/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py create mode 100644 src/db/models/impl/agency/ds_link/__init__.py create mode 100644 src/db/models/impl/agency/ds_link/sqlalchemy.py create mode 100644 src/db/models/impl/flag/ds_delete/__init__.py create mode 100644 src/db/models/impl/flag/ds_delete/agency.py create mode 100644 src/db/models/impl/flag/ds_delete/data_source.py create mode 100644 src/db/models/impl/flag/ds_delete/meta_url.py create mode 100644 src/db/models/impl/sync_log/__init__.py create mode 100644 src/db/models/impl/sync_log/enums.py create mode 100644 src/db/models/impl/sync_log/sqlalchemy.py create mode 100644 src/external/pdap/impl/sync/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/_shared/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/_shared/models/__init__.py create mode 100644 
src/external/pdap/impl/sync/agencies/_shared/models/content.py create mode 100644 src/external/pdap/impl/sync/agencies/add/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/add/core.py create mode 100644 src/external/pdap/impl/sync/agencies/add/request.py create mode 100644 src/external/pdap/impl/sync/agencies/delete/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/delete/core.py create mode 100644 src/external/pdap/impl/sync/agencies/request.py create mode 100644 src/external/pdap/impl/sync/agencies/update/__init__.py create mode 100644 src/external/pdap/impl/sync/agencies/update/core.py create mode 100644 src/external/pdap/impl/sync/agencies/update/request.py create mode 100644 src/external/pdap/impl/sync/data_sources/__init__.py create mode 100644 src/external/pdap/impl/sync/data_sources/_shared/__init__.py create mode 100644 src/external/pdap/impl/sync/data_sources/_shared/content.py create mode 100644 src/external/pdap/impl/sync/data_sources/add/__init__.py create mode 100644 src/external/pdap/impl/sync/data_sources/add/core.py create mode 100644 src/external/pdap/impl/sync/data_sources/add/request.py create mode 100644 src/external/pdap/impl/sync/data_sources/delete/__init__.py create mode 100644 src/external/pdap/impl/sync/data_sources/delete/core.py create mode 100644 src/external/pdap/impl/sync/data_sources/request.py create mode 100644 src/external/pdap/impl/sync/data_sources/update/__init__.py create mode 100644 src/external/pdap/impl/sync/data_sources/update/core.py create mode 100644 src/external/pdap/impl/sync/data_sources/update/request.py create mode 100644 src/external/pdap/impl/sync/meta_urls/__init__.py create mode 100644 src/external/pdap/impl/sync/meta_urls/_shared/__init__.py create mode 100644 src/external/pdap/impl/sync/meta_urls/_shared/content.py create mode 100644 src/external/pdap/impl/sync/meta_urls/add/__init__.py create mode 100644 src/external/pdap/impl/sync/meta_urls/add/core.py create mode 100644 src/external/pdap/impl/sync/meta_urls/delete/__init__.py create mode 100644 src/external/pdap/impl/sync/meta_urls/delete/core.py create mode 100644 src/external/pdap/impl/sync/meta_urls/request.py create mode 100644 src/external/pdap/impl/sync/meta_urls/update/__init__.py create mode 100644 src/external/pdap/impl/sync/meta_urls/update/core.py create mode 100644 src/external/pdap/impl/sync/meta_urls/update/request.py create mode 100644 src/external/pdap/impl/sync/shared/__init__.py create mode 100644 src/external/pdap/impl/sync/shared/models/__init__.py create mode 100644 src/external/pdap/impl/sync/shared/models/add/__init__.py create mode 100644 src/external/pdap/impl/sync/shared/models/add/response.py create mode 100644 src/external/pdap/impl/sync/shared/models/delete/__init__.py create mode 100644 src/external/pdap/impl/sync/shared/models/delete/request.py create mode 100644 tests/automated/integration/api/agencies/delete/__init__.py rename tests/automated/integration/api/agencies/{ => delete}/test_core.py (87%) create mode 100644 tests/automated/integration/api/agencies/delete/test_ds_linked.py create mode 100644 tests/automated/integration/api/url/by_id/delete/__init__.py create mode 100644 tests/automated/integration/api/url/by_id/delete/setup.py create mode 100644 tests/automated/integration/api/url/by_id/delete/test_any_url.py create mode 100644 tests/automated/integration/api/url/by_id/delete/test_data_source_url.py create mode 100644 tests/automated/integration/api/url/by_id/delete/test_meta_url.py create mode 100644 
tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py create mode 100644 tests/automated/integration/db/structure/updated_at/__init__.py create mode 100644 tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py create mode 100644 tests/automated/integration/db/structure/updated_at/test_urls.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py create mode 100644 
tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py diff --git a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py new file mode 100644 index 00000000..153b4fe6 --- /dev/null +++ b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py @@ -0,0 +1,369 @@ +"""Add sync_log table + +Revision ID: a57c3b5b6e93 +Revises: f32ba7664e9f +Create Date: 2025-10-28 15:39:50.494489 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import created_at_column + +# revision identifiers, used by Alembic. +revision: str = 'a57c3b5b6e93' +down_revision: Union[str, None] = 'f32ba7664e9f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def last_synced_at_column(): + return sa.Column( + 'last_synced_at', + sa.DateTime(), + nullable=False, + server_default=sa.func.now() + ) + + +def _add_link_table_modification_triggers(): + op.execute(""" + -- trigger func that "touches" parent rows hit by changes to the link table + CREATE OR REPLACE FUNCTION touch_url_from_agency_link() + RETURNS trigger + LANGUAGE plpgsql AS $$ + BEGIN + -- UNION to cover INSERT/UPDATE (NEW TABLE) and DELETE (OLD TABLE) + UPDATE urls u + SET updated_at = clock_timestamp() -- better than now() for long txns + FROM ( + SELECT DISTINCT url_id FROM newtab + UNION + SELECT DISTINCT url_id FROM oldtab + ) AS hit + WHERE u.id = hit.url_id; + + RETURN NULL; -- statement-level trigger + END $$; + + -- statement-level trigger with transition tables + CREATE TRIGGER trg_link_touch_parent + AFTER INSERT OR UPDATE OR DELETE ON link_parent_child + REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_parent_from_link(); + + """) + + op.execute( + """ + -- trigger func that "touches" agency rows hit by changes to the link_agencies_locations table + CREATE OR REPLACE FUNCTION touch_agency_from_location_link() + RETURNS trigger + LANGUAGE plpgsql AS + $$ + BEGIN + -- UNION to cover INSERT/UPDATE (NEW TABLE) and DELETE (OLD TABLE) + UPDATE agencies a + SET updated_at = clock_timestamp() -- better than now() for long txns + FROM (SELECT DISTINCT agency_id + FROM newtab + UNION + SELECT DISTINCT agency_id + FROM oldtab) AS hit + WHERE a.id = hit.agency_id; + + RETURN NULL; -- statement-level trigger + END + $$; + + -- statement-level trigger with transition tables + CREATE TRIGGER trg_link_touch_parent + AFTER INSERT OR UPDATE OR DELETE + ON link_agencies_locations + REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_agency_from_location_link(); + """ + ) + + +def upgrade() -> None: + _create_sync_log() + _create_ds_agency_link() + _migrate_agency_ids_to_ds_agency_link() + remove_id_column_from_agencies() + rename_agency_id_to_id() + _rename_existing_tables_to_ds_app_format() + _alter_ds_app_link_data_source_table() + _alter_ds_app_link_meta_url_table() + 
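For reference, the statement-level "touch" pattern used in _add_link_table_modification_triggers above reduces to the following self-contained sketch, in which the CREATE TRIGGER statement executes the same function it defines. The link_example table, its url_id column, and the single-event AFTER UPDATE trigger are illustrative assumptions only, not objects created by this migration.

from alembic import op  # same Alembic op proxy the migration above uses


def _add_example_touch_trigger() -> None:
    # Sketch only: 'link_example' and its 'url_id' column are assumed placeholders.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION touch_url_from_example_link()
        RETURNS trigger
        LANGUAGE plpgsql AS $$
        BEGIN
            -- Bump updated_at on every URL referenced by the rows just changed.
            UPDATE urls u
            SET updated_at = clock_timestamp()
            FROM (SELECT DISTINCT url_id FROM newtab) AS hit
            WHERE u.id = hit.url_id;
            RETURN NULL;  -- return value is ignored for statement-level triggers
        END $$;

        -- Single-event trigger, so the NEW TABLE transition relation is always populated.
        CREATE TRIGGER trg_example_touch_url
        AFTER UPDATE ON link_example
        REFERENCING NEW TABLE AS newtab
        FOR EACH STATEMENT
        EXECUTE FUNCTION touch_url_from_example_link();  -- name matches the function defined above
        """
    )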
_add_flag_deletion_tables() + _add_last_synced_at_columns() + _add_link_table_modification_triggers() + +def _add_last_synced_at_columns(): + op.add_column( + 'ds_app_link_data_source', + last_synced_at_column() + ) + op.add_column( + 'ds_app_link_meta_url', + last_synced_at_column() + ) + + +def _alter_ds_app_link_data_source_table(): + # Drop unique constraint for data source id + op.drop_constraint( + 'uq_url_data_sources_data_source_id', + 'ds_app_link_data_source', + type_='unique' + ) + # Drop primary keys + op.drop_constraint( + 'url_data_sources_pkey', + 'ds_app_link_data_source', + type_='primary' + ) + # Rename `data_source_id` to `ds_data_source_id` + op.alter_column( + 'ds_app_link_data_source', + 'data_source_id', + new_column_name='ds_data_source_id', + ) + # Add new primary key + op.create_primary_key( + 'ds_app_link_data_source_pkey', + 'ds_app_link_data_source', + ['ds_data_source_id'] + ) + + # Drop url_id foreign key + op.drop_constraint( + 'url_data_sources_url_id_fkey', + 'ds_app_link_data_source', + type_='foreignkey' + ) + # Recreate foreign key with ON DELETE SET NULL + op.create_foreign_key( + 'ds_app_link_data_source_url_id_fkey', + 'ds_app_link_data_source', + 'urls', + ['url_id'], + ['id'], + ondelete='SET NULL' + ) + # Alter url_id column to be nullable + op.alter_column( + 'ds_app_link_data_source', + 'url_id', + nullable=True + ) + + + +def _alter_ds_app_link_meta_url_table(): + # Drop joint primary key for url_id and agency_id + op.drop_constraint( + 'url_ds_meta_url_pkey', + 'ds_app_link_meta_url', + type_='primary' + ) + # Drop unique constraint for ds_meta_url_id + op.drop_constraint( + 'url_ds_meta_url_ds_meta_url_id_key', + 'ds_app_link_meta_url', + type_='unique' + ) + # Drop agency_id column + op.drop_column( + 'ds_app_link_meta_url', + 'agency_id' + ) + # Make ds_meta_url_id primary key + op.create_primary_key( + 'ds_app_link_meta_url_pkey', + 'ds_app_link_meta_url', + ['ds_meta_url_id'] + ) + # Add unique constraint for url_id + op.create_unique_constraint( + 'uq_ds_app_link_meta_url_url_id', + 'ds_app_link_meta_url', + ['url_id'] + ) + # URL ID + ## Drop foreign key + op.drop_constraint( + 'url_ds_meta_url_url_id_fkey', + 'ds_app_link_meta_url', + type_='foreignkey' + ) + ## Recreate foreign key with ON DELETE SET NULL + op.create_foreign_key( + 'ds_app_link_meta_url_url_id_fkey', + 'ds_app_link_meta_url', + 'urls', + ['url_id'], + ['id'], + ondelete='SET NULL' + ) + ## Alter url_id column to be nullable + op.alter_column( + 'ds_app_link_meta_url', + 'url_id', + nullable=True + ) + + +def _add_flag_deletion_tables(): + op.create_table( + 'flag_ds_delete_agency', + sa.Column( + 'ds_agency_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_agency.ds_agency_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column() + ) + + op.create_table( + 'flag_ds_delete_data_source', + sa.Column( + 'ds_data_source_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_data_source.ds_data_source_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column(), + ) + + op.create_table( + 'flag_ds_delete_meta_url', + sa.Column( + 'ds_meta_url_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_meta_url.ds_meta_url_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column(), + ) + + +def _rename_existing_tables_to_ds_app_format(): + op.rename_table( + 'url_data_source', + 'ds_app_link_data_source' + ) + op.rename_table( + 'url_ds_meta_url', + 'ds_app_link_meta_url' + 
) + +def _migrate_agency_ids_to_ds_agency_link(): + """ + While this migration uses the existing DS agency IDs for both sm and ds agency ids + From this point onward the sm ID is internal to the SM application, + and the same is true for DS ID. + """ + + op.execute(""" + INSERT INTO ds_app_link_agency(agency_id, ds_agency_id) + SELECT agency_id, agency_id + FROM agencies + """) + + +def remove_id_column_from_agencies(): + op.drop_column( + 'agencies', + 'id' + ) + +def rename_agency_id_to_id(): + op.alter_column( + 'agencies', + 'agency_id', + new_column_name='id' + ) + +def _create_ds_agency_link(): + op.create_table( + 'ds_app_link_agency', + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey( + 'agencies.agency_id', + ondelete='SET NULL' + ), + nullable=True + ), + sa.Column( + 'ds_agency_id', + sa.Integer(), + nullable=False, + primary_key=True + ), + created_at_column(), + last_synced_at_column(), + sa.UniqueConstraint( + "agency_id", name="uq_ds_app_link_agency_agency_id" + ) + ) + + +def _create_sync_log(): + op.create_table( + 'sync_log', + sa.Column( + 'resource_type', + sa.Enum( + 'agency', + 'data_source', + 'meta_url', + name='resource_type_enum' + ), + nullable=False, + ), + sa.Column( + 'sync_type', + sa.Enum( + 'add', + 'update', + 'delete', + name='sync_type_enum' + ), + nullable=False, + ), + sa.Column( + 'count', + sa.Integer(), + nullable=False, + ), + created_at_column(), + sa.PrimaryKeyConstraint( + 'resource_type', + 'sync_type', + 'created_at' + ) + ) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/agencies/by_id/delete/query.py b/src/api/endpoints/agencies/by_id/delete/query.py index 61ce2653..627fc932 100644 --- a/src/api/endpoints/agencies/by_id/delete/query.py +++ b/src/api/endpoints/agencies/by_id/delete/query.py @@ -1,7 +1,9 @@ -from sqlalchemy import delete +from sqlalchemy import delete, select from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency from src.db.queries.base.builder import QueryBuilderBase @@ -15,8 +17,25 @@ def __init__( self.agency_id = agency_id async def run(self, session: AsyncSession) -> None: + # Check for existence of DS App Link. 
If so, add deletion flag + query = ( + select( + DSAppLinkAgency + ) + .where( + DSAppLinkAgency.agency_id == self.agency_id + ) + ) + ds_app_link_agency: DSAppLinkAgency | None = await self.sh.one_or_none(session, query=query) + if ds_app_link_agency is not None: + flag = FlagDSDeleteAgency( + ds_agency_id=ds_app_link_agency.ds_agency_id, + ) + session.add(flag) + + # Delete Agency statement = ( delete(Agency) - .where(Agency.agency_id == self.agency_id) + .where(Agency.id == self.agency_id) ) await session.execute(statement) \ No newline at end of file diff --git a/src/api/endpoints/agencies/by_id/put/query.py b/src/api/endpoints/agencies/by_id/put/query.py index 0f58a7db..942203fc 100644 --- a/src/api/endpoints/agencies/by_id/put/query.py +++ b/src/api/endpoints/agencies/by_id/put/query.py @@ -25,7 +25,7 @@ async def run(self, session: AsyncSession) -> None: Agency ) .where( - Agency.agency_id == self.agency_id + Agency.id == self.agency_id ) ) diff --git a/src/api/endpoints/agencies/root/get/query.py b/src/api/endpoints/agencies/root/get/query.py index 9452f12e..ae3b943d 100644 --- a/src/api/endpoints/agencies/root/get/query.py +++ b/src/api/endpoints/agencies/root/get/query.py @@ -42,7 +42,7 @@ async def run(self, session: AsyncSession) -> list[AgencyGetResponse]: for location in agency.locations ] responses.append(AgencyGetResponse( - id=agency.agency_id, + id=agency.id, name=agency.name, type=agency.agency_type, jurisdiction_type=agency.jurisdiction_type, diff --git a/src/api/endpoints/agencies/root/post/query.py b/src/api/endpoints/agencies/root/post/query.py index 29ff9823..43064f85 100644 --- a/src/api/endpoints/agencies/root/post/query.py +++ b/src/api/endpoints/agencies/root/post/query.py @@ -26,7 +26,7 @@ async def run(self, session: AsyncSession) -> AgencyPostResponse: session.add(agency) await session.flush() await session.refresh(agency) - agency_id: int = agency.agency_id + agency_id: int = agency.id try: diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 61e92c35..1a0932d3 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -17,7 +17,7 @@ from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion async def extract_and_format_get_annotation_result( @@ -55,7 +55,7 @@ async def extract_and_format_get_annotation_result( batch_info=await GetAnnotationBatchInfoQueryBuilder( batch_id=batch_id, models=[ - UserUrlAgencySuggestion, + UserURLAgencySuggestion, ] ).run(session), location_suggestions=location_suggestions, diff --git a/src/api/endpoints/annotate/all/get/queries/agency/requester.py b/src/api/endpoints/annotate/all/get/queries/agency/requester.py index fc309e50..e6ffb817 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/requester.py @@ -11,7 +11,7 @@ from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from 
src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.templates.requester import RequesterBase @@ -30,13 +30,13 @@ def __init__( async def get_user_agency_suggestions(self) -> list[AgencyAnnotationUserSuggestion]: query = ( select( - UserUrlAgencySuggestion.agency_id, - func.count(UserUrlAgencySuggestion.user_id).label("count"), + UserURLAgencySuggestion.agency_id, + func.count(UserURLAgencySuggestion.user_id).label("count"), Agency.name.label("agency_name"), ) .join( Agency, - Agency.agency_id == UserUrlAgencySuggestion.agency_id + Agency.id == UserURLAgencySuggestion.agency_id ) ) @@ -45,7 +45,7 @@ async def get_user_agency_suggestions(self) -> list[AgencyAnnotationUserSuggesti query = ( query.join( LinkAgencyLocation, - LinkAgencyLocation.agency_id == UserUrlAgencySuggestion.agency_id + LinkAgencyLocation.agency_id == UserURLAgencySuggestion.agency_id ) .where( LinkAgencyLocation.location_id == self.location_id @@ -54,14 +54,14 @@ async def get_user_agency_suggestions(self) -> list[AgencyAnnotationUserSuggesti query = ( query.where( - UserUrlAgencySuggestion.url_id == self.url_id + UserURLAgencySuggestion.url_id == self.url_id ) .group_by( - UserUrlAgencySuggestion.agency_id, + UserURLAgencySuggestion.agency_id, Agency.name ) .order_by( - func.count(UserUrlAgencySuggestion.user_id).desc() + func.count(UserURLAgencySuggestion.user_id).desc() ) .limit(3) ) @@ -88,7 +88,7 @@ async def get_auto_agency_suggestions(self) -> list[AgencyAnnotationAutoSuggesti ) .join( Agency, - Agency.agency_id == cte.agency_id + Agency.id == cte.agency_id ) ) diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index e37f2396..9b905870 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -8,7 +8,7 @@ from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion @@ -68,10 +68,10 @@ async def run( ) ), ~exists( - select(UserUrlAgencySuggestion.id) + select(UserURLAgencySuggestion.id) .where( - UserUrlAgencySuggestion.url_id == URL.id, - UserUrlAgencySuggestion.user_id == self.user_id, + UserURLAgencySuggestion.url_id == URL.id, + UserURLAgencySuggestion.user_id == self.user_id, ) ), ~exists( diff --git a/src/api/endpoints/annotate/all/post/requester.py b/src/api/endpoints/annotate/all/post/requester.py index 14064e8a..2d9cfeca 100644 --- a/src/api/endpoints/annotate/all/post/requester.py +++ b/src/api/endpoints/annotate/all/post/requester.py @@ -6,7 +6,7 @@ from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from 
src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion @@ -53,7 +53,7 @@ def add_relevant_annotation( def add_agency_ids(self, agency_ids: list[int]) -> None: for agency_id in agency_ids: - agency_suggestion = UserUrlAgencySuggestion( + agency_suggestion = UserURLAgencySuggestion( url_id=self.url_id, user_id=self.user_id, agency_id=agency_id, diff --git a/src/api/endpoints/contributions/user/queries/agreement/agency.py b/src/api/endpoints/contributions/user/queries/agreement/agency.py index 96011e06..488e5c19 100644 --- a/src/api/endpoints/contributions/user/queries/agreement/agency.py +++ b/src/api/endpoints/contributions/user/queries/agreement/agency.py @@ -3,7 +3,7 @@ from src.api.endpoints.contributions.user.queries.annotated_and_validated import AnnotatedAndValidatedCTEContainer from src.api.endpoints.contributions.user.queries.templates.agreement import AgreementCTEContainer from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion def get_agency_agreement_cte_container( @@ -16,10 +16,10 @@ def get_agency_agreement_cte_container( func.count() ) .join( - UserUrlAgencySuggestion, + UserURLAgencySuggestion, and_( - inner_cte.user_id == UserUrlAgencySuggestion.user_id, - inner_cte.url_id == UserUrlAgencySuggestion.url_id + inner_cte.user_id == UserURLAgencySuggestion.user_id, + inner_cte.url_id == UserURLAgencySuggestion.url_id ) ) .group_by( @@ -34,17 +34,17 @@ def get_agency_agreement_cte_container( func.count() ) .join( - UserUrlAgencySuggestion, + UserURLAgencySuggestion, and_( - inner_cte.user_id == UserUrlAgencySuggestion.user_id, - inner_cte.url_id == UserUrlAgencySuggestion.url_id + inner_cte.user_id == UserURLAgencySuggestion.user_id, + inner_cte.url_id == UserURLAgencySuggestion.url_id ) ) .where( exists() .where( - LinkURLAgency.url_id == UserUrlAgencySuggestion.url_id, - LinkURLAgency.agency_id == UserUrlAgencySuggestion.agency_id + LinkURLAgency.url_id == UserURLAgencySuggestion.url_id, + LinkURLAgency.agency_id == UserURLAgencySuggestion.agency_id ) ) .group_by( diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py index e9d0598b..e15ce6b1 100644 --- a/src/api/endpoints/data_source/get/query.py +++ b/src/api/endpoints/data_source/get/query.py @@ -98,7 +98,7 @@ async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: url_agency_ids: list[int] = [] for agency in url.confirmed_agencies: - url_agency_ids.append(agency.agency_id) + url_agency_ids.append(agency.id) url_description: str | None = mapping[URL.description] link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/__init__.py b/src/api/endpoints/meta_url/by_id/put/__init__.py similarity index 100% rename from src/api/endpoints/meta_url/by_id/agencies/put/__init__.py rename to src/api/endpoints/meta_url/by_id/put/__init__.py diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/query.py b/src/api/endpoints/meta_url/by_id/put/query.py similarity index 88% rename from src/api/endpoints/meta_url/by_id/agencies/put/query.py rename to 
src/api/endpoints/meta_url/by_id/put/query.py index a3be8cf8..7392375c 100644 --- a/src/api/endpoints/meta_url/by_id/agencies/put/query.py +++ b/src/api/endpoints/meta_url/by_id/put/query.py @@ -1,8 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest from src.api.shared.batch.url.link import UpdateBatchURLLinkQueryBuilder -from src.api.shared.record_type.put.query import UpdateRecordTypeQueryBuilder from src.api.shared.url.put.query import UpdateURLQueryBuilder from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/meta_url/by_id/agencies/put/request.py b/src/api/endpoints/meta_url/by_id/put/request.py similarity index 100% rename from src/api/endpoints/meta_url/by_id/agencies/put/request.py rename to src/api/endpoints/meta_url/by_id/put/request.py diff --git a/src/api/endpoints/meta_url/get/query.py b/src/api/endpoints/meta_url/get/query.py index 202626d8..740dfd69 100644 --- a/src/api/endpoints/meta_url/get/query.py +++ b/src/api/endpoints/meta_url/get/query.py @@ -64,7 +64,7 @@ async def run(self, session: AsyncSession) -> MetaURLGetOuterResponse: url_name: str = mapping[URL.name] url_agency_ids: list[int] = [] for agency in url.confirmed_agencies: - url_agency_ids.append(agency.agency_id) + url_agency_ids.append(agency.id) url_description: str | None = mapping[URL.description] link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] responses.append( diff --git a/src/api/endpoints/meta_url/routes.py b/src/api/endpoints/meta_url/routes.py index 0f14805c..79a5ab03 100644 --- a/src/api/endpoints/meta_url/routes.py +++ b/src/api/endpoints/meta_url/routes.py @@ -1,15 +1,15 @@ from fastapi import APIRouter, Depends, Query from src.api.dependencies import get_async_core -from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse from src.api.endpoints.meta_url.by_id.agencies.delete.wrapper import delete_meta_url_agency_link from src.api.endpoints.meta_url.by_id.agencies.get.wrapper import get_meta_url_agencies_wrapper from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url from src.api.endpoints.meta_url.by_id.post.wrapper import add_meta_url_agency_link from src.api.endpoints.meta_url.get.query import GetMetaURLQueryBuilder -from src.api.endpoints.meta_url.get.response import MetaURLGetResponse, MetaURLGetOuterResponse -from src.api.endpoints.meta_url.by_id.agencies.put.query import UpdateMetaURLQueryBuilder -from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from src.api.endpoints.meta_url.get.response import MetaURLGetOuterResponse +from src.api.endpoints.meta_url.by_id.put.query import UpdateMetaURLQueryBuilder +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py index c17f0f6d..cc6259de 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/core.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -21,7 +21,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from 
src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py index ee8f8065..e3fa9d14 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -8,7 +8,7 @@ from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -20,15 +20,15 @@ async def run(self, session: AsyncSession) -> list[ query = ( select( Batch.strategy, - func.count(URLDataSource.id).label("count") + func.count(DSAppLinkDataSource.id).label("count") ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id ) .join( - URLDataSource, - URLDataSource.url_id == LinkBatchURL.url_id + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == LinkBatchURL.url_id ) .group_by(Batch.strategy) ) diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py index face1891..1fd616a6 100644 --- a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py @@ -3,20 +3,20 @@ from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource SUBMITTED_CTE = BatchesBreakdownURLCTE( select( Batch.id, - func.count(URLDataSource.id).label("count_submitted") + func.count(DSAppLinkDataSource.id).label("count_submitted") ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id ) .join( - URLDataSource, - URLDataSource.url_id == LinkBatchURL.url_id + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == LinkBatchURL.url_id ) .group_by(Batch.id) .cte("submitted") diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py index e585554c..2606a079 100644 --- a/src/api/endpoints/metrics/urls/breakdown/query/core.py +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -8,7 +8,7 @@ from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ 
-27,13 +27,13 @@ async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResp case((UserURLTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( "has_user_relevant_annotation" ), - case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + case((UserURLAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( "has_user_agency_annotation" ), ) .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) .outerjoin(UserURLTypeSuggestion, URL.id == UserURLTypeSuggestion.url_id) - .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + .outerjoin(UserURLAgencySuggestion, URL.id == UserURLAgencySuggestion.url_id) ).cte("flags") month = func.date_trunc('month', URL.created_at) diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index b7abec5a..ff7a1c1f 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -37,7 +37,7 @@ async def run(self, session: AsyncSession) -> None: # Get existing agency ids existing_agencies = url.confirmed_agencies or [] - existing_agency_ids = [agency.agency_id for agency in existing_agencies] + existing_agency_ids = [agency.id for agency in existing_agencies] new_agency_ids = self.approval_info.agency_ids or [] await self._check_for_unspecified_agency_ids(existing_agency_ids, new_agency_ids) @@ -141,7 +141,7 @@ async def _add_new_agencies(self, existing_agency_ids, new_agency_ids, session): # Check if the new agency exists in the database query = ( select(Agency) - .where(Agency.agency_id == new_agency_id) + .where(Agency.id == new_agency_id) ) existing_agency = await session.execute(query) existing_agency = existing_agency.scalars().first() diff --git a/src/api/endpoints/search/agency/query.py b/src/api/endpoints/search/agency/query.py index 9476e039..254d90f5 100644 --- a/src/api/endpoints/search/agency/query.py +++ b/src/api/endpoints/search/agency/query.py @@ -30,7 +30,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: query = ( select( - Agency.agency_id, + Agency.id.label("agency_id"), Agency.name.label("agency_name"), Agency.jurisdiction_type, Agency.agency_type, @@ -40,7 +40,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: if self.location_id is None: query = query.join( LinkAgencyLocation, - LinkAgencyLocation.agency_id == Agency.agency_id + LinkAgencyLocation.agency_id == Agency.id ).join( LocationExpandedView, LocationExpandedView.id == LinkAgencyLocation.location_id @@ -49,7 +49,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: with_location_id_cte_container = WithLocationIdCTEContainer(self.location_id) query = query.join( with_location_id_cte_container.cte, - with_location_id_cte_container.agency_id == Agency.agency_id + with_location_id_cte_container.agency_id == Agency.id ).join( LocationExpandedView, LocationExpandedView.id == with_location_id_cte_container.location_id diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index f65f81d0..9f3e7117 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -12,7 +12,7 @@ from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.enums import URLSource from 
src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion @@ -112,7 +112,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: # Add agency ID as suggestion if exists if self.request.agency_id is not None: - agen_sugg = UserUrlAgencySuggestion( + agen_sugg = UserURLAgencySuggestion( user_id=self.user_id, url_id=url_insert.id, agency_id=self.request.agency_id diff --git a/src/api/endpoints/url/by_id/delete/__init__.py b/src/api/endpoints/url/by_id/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/url/by_id/delete/query.py b/src/api/endpoints/url/by_id/delete/query.py new file mode 100644 index 00000000..f8eba43d --- /dev/null +++ b/src/api/endpoints/url/by_id/delete/query.py @@ -0,0 +1,79 @@ +from typing import Any + +from sqlalchemy import select, delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.queries.base.builder import QueryBuilderBase + + +class DeleteURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> Any: + + await self._check_for_ds_app_link_data_source(session) + await self._check_for_ds_app_link_meta_url(session) + statement = ( + delete( + URL + ).where( + URL.id == self.url_id + ) + ) + await session.execute(statement) + + async def _check_for_ds_app_link_data_source( + self, + session: AsyncSession + ) -> Any: + """ + Check if a DS App Link Data Source exists for the URL + If so, add a deletion flag + """ + query = ( + select(DSAppLinkDataSource) + .where(DSAppLinkDataSource.url_id == self.url_id) + ) + ds_app_link_data_source: DSAppLinkDataSource | None = await self.sh.one_or_none( + session=session, + query=query + ) + if ds_app_link_data_source is not None: + delete_flag = FlagDSDeleteDataSource( + ds_data_source_id=ds_app_link_data_source.ds_data_source_id + ) + session.add(delete_flag) + + async def _check_for_ds_app_link_meta_url( + self, + session: AsyncSession + ) -> Any: + """ + Check if a DS App Link Meta URL exists for the URL + If so, add a deletion flag + """ + query = ( + select(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.url_id == self.url_id) + ) + ds_app_link_meta_url: DSAppLinkMetaURL | None = await self.sh.one_or_none( + session=session, + query=query + ) + if ds_app_link_meta_url is not None: + delete_flag = FlagDSDeleteMetaURL( + ds_meta_url_id=ds_app_link_meta_url.ds_meta_url_id + ) + session.add(delete_flag) + diff --git a/src/api/endpoints/url/routes.py b/src/api/endpoints/url/routes.py index c7bb59b0..7d184e6e 100644 --- a/src/api/endpoints/url/routes.py +++ b/src/api/endpoints/url/routes.py @@ -1,8 +1,10 @@ from fastapi import APIRouter, Query, Depends, Response from 
src.api.dependencies import get_async_core +from src.api.endpoints.url.by_id.delete.query import DeleteURLQueryBuilder from src.api.endpoints.url.by_id.screenshot.wrapper import get_url_screenshot_wrapper from src.api.endpoints.url.get.dto import GetURLsResponseInfo +from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo @@ -43,3 +45,13 @@ async def get_url_screenshot( content=raw_result, media_type="image/webp" ) + +@url_router.delete("/{url_id}") +async def delete_url( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + DeleteURLQueryBuilder(url_id=url_id) + ) + return MessageResponse(message="URL deleted.") diff --git a/src/api/shared/agency/get/query.py b/src/api/shared/agency/get/query.py index b49e47ee..eccb3581 100644 --- a/src/api/shared/agency/get/query.py +++ b/src/api/shared/agency/get/query.py @@ -30,7 +30,7 @@ async def run(self, session: AsyncSession) -> AgencyGetOuterResponse: ) .join( LinkURLAgency, - LinkURLAgency.agency_id == Agency.agency_id + LinkURLAgency.agency_id == Agency.id ) .where( LinkURLAgency.url_id == self.url_id @@ -52,7 +52,7 @@ async def run(self, session: AsyncSession) -> AgencyGetOuterResponse: for location in agency.locations ] responses.append(AgencyGetResponse( - id=agency.agency_id, + id=agency.id, name=agency.name, type=agency.agency_type, jurisdiction_type=agency.jurisdiction_type, diff --git a/src/core/tasks/base/operator.py b/src/core/tasks/base/operator.py index 51f07a47..55d8033b 100644 --- a/src/core/tasks/base/operator.py +++ b/src/core/tasks/base/operator.py @@ -53,9 +53,17 @@ async def run_task(self) -> TaskOperatorRunInfo: message=str(e) + "\n" + stack_trace ) - @abstractmethod - async def run_info(self, outcome: TaskOperatorOutcome, message: str) -> TaskOperatorRunInfo: - raise NotImplementedError + async def run_info( + self, + outcome: TaskOperatorOutcome, + message: str + ) -> TaskOperatorRunInfo: + return TaskOperatorRunInfo( + task_id=self.task_id, + task_type=self.task_type, + outcome=outcome, + message=message + ) @abstractmethod diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py new file mode 100644 index 00000000..2d43202d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncAgenciesAddTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError 
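As a quick orientation for the DELETE /url/{url_id} route added to url_router above, the sketch below exercises it over HTTP. The base URL, the assumption that the router is mounted under a /url prefix, and the bearer-token header are illustrative placeholders rather than values taken from this patch; the expected message matches the MessageResponse the handler returns.

import httpx


def delete_url_example(url_id: int) -> None:
    # Hypothetical deployment details: host, /url prefix, and auth header are assumptions.
    response = httpx.delete(
        f"http://localhost:8000/url/{url_id}",
        headers={"Authorization": "Bearer <token>"},
    )
    response.raise_for_status()
    # The handler responds with MessageResponse(message="URL deleted.")
    assert response.json()["message"] == "URL deleted."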
+ + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py new file mode 100644 index 00000000..7ea9742b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py @@ -0,0 +1,4 @@ +""" +Agencies to be added to the DS database must not have a +ds app link entry +""" \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py new file mode 100644 index 00000000..73cbf343 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncAgenciesDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py new file mode 100644 index 00000000..69d6150a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py @@ -0,0 +1,3 @@ +""" +Agencies to be deleted from the DS database must be flagged for deletion +""" \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py new file mode 100644 index 00000000..55eb8e3a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncAgenciesUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py new file mode 100644 index 00000000..ea307cb5 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py @@ -0,0 +1,13 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncDataSourcesAddTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py new file mode 100644 index 00000000..a11d3d1d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py @@ -0,0 +1,4 @@ +""" +Data sources to be added to the DS database must not have a +ds app link entry +""" \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py new file mode 100644 index 00000000..b49b73c9 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncDataSourcesDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py new file mode 100644 index 00000000..1e555125 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -0,0 +1,3 @@ +""" +Data sources to be deleted from the DS database must be flagged for deletion +""" \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py new file mode 100644 index 00000000..1947c202 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncDataSourcesUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py new file mode 100644 index 00000000..54fe1c90 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncMetaURLsAddTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py new file mode 100644 index 00000000..3776b2ed --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py @@ -0,0 +1,4 @@ +""" +Meta URLs to be added to the DS database must not have a +ds app link entry +""" \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py new file mode 100644 index 00000000..00d2c225 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncMetaURLsDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py new file mode 100644 index 00000000..387d52d2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase + + +class DSAppSyncMetaURLsUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + async def meets_task_prerequisites(self) -> bool: + raise NotImplementedError + + async def inner_task_logic(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/templates/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py new file mode 100644 index 00000000..62794711 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py @@ -0,0 +1,28 @@ +from abc import ABC + +from src.core.tasks.base.operator import TaskOperatorBase +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.url.enums import TaskOperatorOutcome +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +class DSSyncTaskOperatorBase( + TaskOperatorBase, + HasPrerequisitesMixin, + ABC +): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client + + async def conclude_task(self): + return await self.run_info( + outcome=TaskOperatorOutcome.SUCCESS, + message="Task completed successfully" + ) diff --git a/src/core/tasks/url/operators/base.py b/src/core/tasks/url/operators/base.py index e1d70d5e..8fc0b422 100644 --- a/src/core/tasks/url/operators/base.py 
+++ b/src/core/tasks/url/operators/base.py @@ -22,15 +22,3 @@ async def conclude_task(self): outcome=TaskOperatorOutcome.SUCCESS, message="Task completed successfully" ) - - async def run_info( - self, - outcome: TaskOperatorOutcome, - message: str - ) -> TaskOperatorRunInfo: - return TaskOperatorRunInfo( - task_id=self.task_id, - task_type=self.task_type, - outcome=outcome, - message=message - ) diff --git a/src/core/tasks/url/operators/submit_approved/queries/cte.py b/src/core/tasks/url/operators/submit_approved/queries/cte.py index cf7ccb71..47aad8e3 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/cte.py +++ b/src/core/tasks/url/operators/submit_approved/queries/cte.py @@ -7,7 +7,7 @@ from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource VALIDATED_URLS_WITHOUT_DS_SQ =( select(URL) @@ -19,7 +19,7 @@ URL.status == URLStatus.OK, URL.name.isnot(None), FlagURLValidated.type == URLType.DATA_SOURCE, - not_exists_url(URLDataSource), + not_exists_url(DSAppLinkDataSource), no_url_task_error(TaskType.SUBMIT_APPROVED) ) .subquery() diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index fb43dd34..96621cb8 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -43,7 +43,7 @@ async def _build_query(): async def _process_result(url: URL) -> SubmitApprovedURLTDO: agency_ids = [] for agency in url.confirmed_agencies: - agency_ids.append(agency.agency_id) + agency_ids.append(agency.id) optional_metadata = url.optional_data_source_metadata if optional_metadata is None: record_formats = None diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index 4ebfef56..3ad1a228 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -4,7 +4,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession): url_id = info.url_id data_source_id = info.data_source_id - url_data_source_object = URLDataSource( + url_data_source_object = DSAppLinkDataSource( url_id=url_id, data_source_id=data_source_id ) diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py index 54b1edf8..d3dd7019 100644 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py +++ b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py @@ -5,7 +5,7 @@ from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import 
URLDSMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.db.models.views.meta_url import MetaURL @@ -32,11 +32,11 @@ def __init__(self): .where( ~exists( select( - URLDSMetaURL.ds_meta_url_id + DSAppLinkMetaURL.ds_meta_url_id ) .where( - URLDSMetaURL.url_id == URL.id, - URLDSMetaURL.agency_id == LinkURLAgency.agency_id + DSAppLinkMetaURL.url_id == URL.id, + DSAppLinkMetaURL.agency_id == LinkURLAgency.agency_id ) ), no_url_task_error(TaskType.SUBMIT_META_URLS) diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py index e9df9db4..141393bd 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py @@ -1,23 +1,23 @@ from sqlalchemy import select, func from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL AGENCY_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( select( - UserUrlAgencySuggestion.url_id, - UserUrlAgencySuggestion.agency_id.label("entity"), + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id.label("entity"), func.count().label("votes") ) .join( UnvalidatedURL, - UserUrlAgencySuggestion.url_id == UnvalidatedURL.url_id + UserURLAgencySuggestion.url_id == UnvalidatedURL.url_id ) .group_by( - UserUrlAgencySuggestion.url_id, - UserUrlAgencySuggestion.agency_id + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id ) .cte("counts_agency") ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93af63f9..2d483890 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -1,8 +1,8 @@ -from datetime import datetime, timedelta +from datetime import datetime from functools import wraps from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, exists, func, Select, and_, update, delete, Row, text +from sqlalchemy import select, func, Select, and_, update, Row, text from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload @@ -74,14 +74,13 @@ from src.db.models.impl.task.core import Task from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.task.error import TaskError -from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import 
UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput @@ -92,7 +91,6 @@ from src.db.models.templates_.base import Base from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder @@ -144,8 +142,8 @@ async def wrapper(self, *args, **kwargs): return wrapper @session_manager - async def execute(self, session: AsyncSession, statement): - await session.execute(statement) + async def execute(self, session: AsyncSession, statement) -> Any: + return await session.execute(statement) @session_manager async def add( @@ -455,6 +453,12 @@ async def get_all( """Get all records of a model. Used primarily in testing.""" return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) + + @session_manager + async def has_no_rows(self, session: AsyncSession, model: Base) -> bool: + results: list[Base] = await sh.get_all(session=session, model=model) + return len(results) == 0 + async def get_urls( self, page: int, @@ -507,9 +511,6 @@ async def get_task_info( ) -> TaskInfo: return await self.run_query_builder(GetTaskInfoQueryBuilder(task_id)) - async def get_html_content_info(self, url_id: int) -> list[URLHTMLContentInfo]: - return await self.run_query_builder(GetHTMLContentInfoQueryBuilder(url_id)) - @session_manager async def link_urls_to_task( self, @@ -589,11 +590,11 @@ async def upsert_new_agencies( Add or update agencies in the database """ for suggestion in suggestions: - query = select(Agency).where(Agency.agency_id == suggestion.pdap_agency_id) + query = select(Agency).where(Agency.id == suggestion.pdap_agency_id) result = await session.execute(query) agency = result.scalars().one_or_none() if agency is None: - agency = Agency(agency_id=suggestion.pdap_agency_id) + agency = Agency(id=suggestion.pdap_agency_id) agency.name = suggestion.agency_name agency.agency_type = AgencyType.UNKNOWN session.add(agency) @@ -625,17 +626,17 @@ async def add_agency_manual_suggestion( # Check if agency exists in database -- if not, add with placeholder if agency_id is not None: - statement = select(Agency).where(Agency.agency_id == agency_id) + statement = select(Agency).where(Agency.id == agency_id) result = await session.execute(statement) if len(result.all()) == 0: agency = Agency( - agency_id=agency_id, + id=agency_id, name=PLACEHOLDER_AGENCY_NAME, agency_type=AgencyType.UNKNOWN, ) await session.merge(agency) - url_agency_suggestion = UserUrlAgencySuggestion( + url_agency_suggestion = UserURLAgencySuggestion( url_id=url_id, agency_id=agency_id, user_id=user_id, @@ -643,12 +644,6 @@ async def add_agency_manual_suggestion( ) session.add(url_agency_suggestion) - @session_manager - async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]: - statement = select(URL).where(exists().where(LinkURLAgency.url_id == URL.id)) - results = await session.execute(statement) - return list(results.scalars().all()) - async def approve_url( self, approval_info: FinalReviewApprovalInfo, @@ -798,15 +793,6 @@ async def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputIn logs = 
raw_results.scalars().all() return ([LogOutputInfo(**log.__dict__) for log in logs]) - async def delete_old_logs(self): - """ - Delete logs older than a day - """ - statement = delete(Log).where( - Log.created_at < datetime.now() - timedelta(days=7) - ) - await self.execute(statement) - async def get_next_url_for_all_annotations( self, user_id: int, @@ -869,11 +855,11 @@ async def get_urls_breakdown_submitted_metrics( ) -> GetMetricsURLsBreakdownSubmittedResponseDTO: # Build the query - month = func.date_trunc('month', URLDataSource.created_at) + month = func.date_trunc('month', DSAppLinkDataSource.created_at) query = ( select( month.label('month'), - func.count(URLDataSource.id).label('count_submitted'), + func.count(DSAppLinkDataSource.id).label('count_submitted'), ) .group_by(month) .order_by(month.asc()) @@ -939,12 +925,6 @@ async def mark_all_as_404(self, url_ids: List[int]): query = update(URLWebMetadata).where(URLWebMetadata.url_id.in_(url_ids)).values(status_code=404) await self.execute(query) - @session_manager - async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]): - for url_id in url_ids: - url_checked_for_duplicate = URLCheckedForDuplicate(url_id=url_id) - session.add(url_checked_for_duplicate) - async def get_urls_aggregated_pending_metrics(self): return await self.run_query_builder(GetMetricsURLSAggregatedPendingQueryBuilder()) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 966d4bbd..90dba719 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -16,7 +16,7 @@ from src.db.models.templates_.base import Base from src.db.models.impl.duplicate.sqlalchemy import Duplicate from src.db.models.impl.log.sqlalchemy import Log -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo @@ -225,7 +225,7 @@ def mark_urls_as_submitted( url_id = info.url_id data_source_id = info.data_source_id - url_data_source_object = URLDataSource( + url_data_source_object = DSAppLinkDataSource( url_id=url_id, data_source_id=data_source_id ) diff --git a/src/db/client/types.py b/src/db/client/types.py index ffce5621..18b32b88 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,5 +1,5 @@ -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion -UserSuggestionModel = UserURLTypeSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion +UserSuggestionModel = UserURLTypeSuggestion or UserRecordTypeSuggestion or UserURLAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index a3574a96..67ff66a5 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,4 +1,4 @@ -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion @@ -9,5 +9,5 @@ USER_ANNOTATION_MODELS = [ 
UserURLTypeSuggestion, UserRecordTypeSuggestion, - UserUrlAgencySuggestion + UserURLAgencySuggestion ] \ No newline at end of file diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index f0c9b097..dab6b496 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -13,7 +13,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index f547e8d4..592973a6 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -11,7 +11,7 @@ def get_agency_id_foreign_column( return Column( 'agency_id', Integer(), - ForeignKey('agencies.agency_id', ondelete='CASCADE'), + ForeignKey('agencies.id', ondelete='CASCADE'), nullable=nullable ) diff --git a/src/db/models/impl/agency/ds_link/__init__.py b/src/db/models/impl/agency/ds_link/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/agency/ds_link/sqlalchemy.py b/src/db/models/impl/agency/ds_link/sqlalchemy.py new file mode 100644 index 00000000..32911882 --- /dev/null +++ b/src/db/models/impl/agency/ds_link/sqlalchemy.py @@ -0,0 +1,19 @@ +from sqlalchemy import Integer, Column + +from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin, LastSyncedAtMixin +from src.db.models.templates_.base import Base + + +class DSAppLinkAgency( + Base, + CreatedAtMixin, + AgencyDependentMixin, + LastSyncedAtMixin +): + __tablename__ = "ds_app_link_agency" + + ds_agency_id = Column( + Integer, + primary_key=True, + nullable=False + ) \ No newline at end of file diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index 28717bfd..d0233967 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -18,12 +18,6 @@ class Agency( ): __tablename__ = "agencies" - # TODO: Rename agency_id to ds_agency_id - - agency_id = Column( - Integer, - Sequence("agencies_agency_id"), - primary_key=True) name = Column(String, nullable=False) agency_type: Mapped[AgencyType] = enum_column(AgencyType, name="agency_type_enum") jurisdiction_type: Mapped[JurisdictionType] = enum_column( @@ -34,12 +28,12 @@ class Agency( # Relationships automated_suggestions = relationship("AgencyIDSubtaskSuggestion") - user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") + user_suggestions = relationship("UserURLAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") locations = relationship( "LocationExpandedView", - primaryjoin="Agency.agency_id == LinkAgencyLocation.agency_id", + primaryjoin="Agency.id == LinkAgencyLocation.agency_id", secondaryjoin="LocationExpandedView.id == LinkAgencyLocation.location_id", secondary="link_agencies_locations", ) diff --git a/src/db/models/impl/flag/ds_delete/__init__.py b/src/db/models/impl/flag/ds_delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/db/models/impl/flag/ds_delete/agency.py b/src/db/models/impl/flag/ds_delete/agency.py new file mode 100644 index 00000000..2559376d --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/agency.py @@ -0,0 +1,20 @@ +from sqlalchemy import ForeignKey, Integer, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteAgency( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_agency" + + ds_agency_id = Column( + Integer, + ForeignKey( + "ds_app_link_agency.ds_agency_id", + ondelete="CASCADE" + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/flag/ds_delete/data_source.py b/src/db/models/impl/flag/ds_delete/data_source.py new file mode 100644 index 00000000..38d3cba8 --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/data_source.py @@ -0,0 +1,20 @@ +from sqlalchemy import ForeignKey, Integer, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteDataSource( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_data_source" + + ds_data_source_id = Column( + Integer, + ForeignKey( + "ds_app_link_data_source.ds_data_source_id", + ondelete="CASCADE" + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/flag/ds_delete/meta_url.py b/src/db/models/impl/flag/ds_delete/meta_url.py new file mode 100644 index 00000000..1fc90d06 --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/meta_url.py @@ -0,0 +1,20 @@ +from sqlalchemy import Column, Integer, ForeignKey + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteMetaURL( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_meta_url" + + ds_meta_url_id = Column( + Integer, + ForeignKey( + 'ds_app_link_meta_url.ds_meta_url_id', + ondelete='CASCADE' + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index 875fa25f..92d1c37b 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -11,8 +11,8 @@ class LinkURLAgency(URLDependentMixin, WithIDBase): agency_id: Mapped[int] = get_agency_id_foreign_column() - url = relationship("URL", back_populates="confirmed_agencies") - agency = relationship("Agency", back_populates="confirmed_urls") + url = relationship("URL") + agency = relationship("Agency") __table_args__ = ( UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), diff --git a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py index 312cbb57..534c7213 100644 --- a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py +++ b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py @@ -1,3 +1,5 @@ +from sqlalchemy.orm import Mapped + from src.db.models.helpers import url_id_column from src.db.models.templates_.standard import StandardBase @@ -5,6 +7,6 @@ class LinkURLRedirectURL(StandardBase): __tablename__ = "link_urls_redirect_url" - source_url_id = url_id_column() - destination_url_id = url_id_column() + source_url_id: Mapped[int] = url_id_column() + destination_url_id: Mapped[int] = url_id_column() diff --git a/src/db/models/impl/link/urls_root_url/sqlalchemy.py b/src/db/models/impl/link/urls_root_url/sqlalchemy.py index a856dd31..8dcd7085 100644 --- 
a/src/db/models/impl/link/urls_root_url/sqlalchemy.py +++ b/src/db/models/impl/link/urls_root_url/sqlalchemy.py @@ -1,3 +1,5 @@ +from sqlalchemy.orm import Mapped + from src.db.models.helpers import url_id_column from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin from src.db.models.templates_.with_id import WithIDBase @@ -11,4 +13,4 @@ class LinkURLRootURL( ): __tablename__ = "link_urls_root_url" - root_url_id = url_id_column() \ No newline at end of file + root_url_id: Mapped[int] = url_id_column() \ No newline at end of file diff --git a/src/db/models/impl/sync_log/__init__.py b/src/db/models/impl/sync_log/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/sync_log/enums.py b/src/db/models/impl/sync_log/enums.py new file mode 100644 index 00000000..e1fe483a --- /dev/null +++ b/src/db/models/impl/sync_log/enums.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class ResourceType(Enum): + AGENCY = 'agency' + DATA_SOURCE = 'data_source' + META_URL = 'meta_url' + +class SyncType(Enum): + ADD = 'add' + UPDATE = 'update' + DELETE = 'delete' \ No newline at end of file diff --git a/src/db/models/impl/sync_log/sqlalchemy.py b/src/db/models/impl/sync_log/sqlalchemy.py new file mode 100644 index 00000000..b545940f --- /dev/null +++ b/src/db/models/impl/sync_log/sqlalchemy.py @@ -0,0 +1,17 @@ +from sqlalchemy import PrimaryKeyConstraint, Column, Integer, DateTime + +from src.db.models.helpers import enum_column +from src.db.models.impl.sync_log.enums import ResourceType, SyncType +from src.db.models.templates_.base import Base + + +class SyncLog(Base): + __tablename__ = 'sync_log' + __table_args__ = ( + PrimaryKeyConstraint('resource_type', 'sync_type', 'created_at'), + ) + + resource_type = enum_column(ResourceType, name='resource_type_enum') + sync_type = enum_column(SyncType, name='sync_type_enum') + count = Column(Integer, nullable=False) + created_at = Column(DateTime, nullable=False) \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 02d4fbf2..5bdcdadb 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -95,7 +95,7 @@ def full_url(cls): URLNameSuggestion ) user_agency_suggestions = relationship( - "UserUrlAgencySuggestion", back_populates="url") + "UserURLAgencySuggestion", back_populates="url") auto_record_type_suggestion = relationship( "AutoRecordTypeSuggestion", uselist=False, back_populates="url") user_record_type_suggestions = relationship( @@ -109,10 +109,12 @@ def full_url(cls): optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( - "LinkURLAgency", + "Agency", + secondary="link_urls_agency" + ) data_source = relationship( - "URLDataSource", + "DSAppLinkDataSource", back_populates="url", uselist=False ) diff --git a/src/db/models/impl/url/data_source/pydantic.py b/src/db/models/impl/url/data_source/pydantic.py index 7d02c5df..72dec9c6 100644 --- a/src/db/models/impl/url/data_source/pydantic.py +++ b/src/db/models/impl/url/data_source/pydantic.py @@ -1,4 +1,4 @@ -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -7,5 +7,5 @@ class URLDataSourcePydantic(BulkInsertableModel): url_id: int @classmethod - def sa_model(cls) 
-> type[URLDataSource]: - return URLDataSource \ No newline at end of file + def sa_model(cls) -> type[DSAppLinkDataSource]: + return DSAppLinkDataSource \ No newline at end of file diff --git a/src/db/models/impl/url/data_source/sqlalchemy.py b/src/db/models/impl/url/data_source/sqlalchemy.py index be7bf047..74c9bdf0 100644 --- a/src/db/models/impl/url/data_source/sqlalchemy.py +++ b/src/db/models/impl/url/data_source/sqlalchemy.py @@ -1,14 +1,27 @@ -from sqlalchemy import Column, Integer +from sqlalchemy import Column, Integer, ForeignKey from sqlalchemy.orm import relationship -from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LastSyncedAtMixin from src.db.models.templates_.with_id import WithIDBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, WithIDBase): - __tablename__ = "url_data_source" +class DSAppLinkDataSource( + CreatedAtMixin, + URLDependentMixin, + WithIDBase, + LastSyncedAtMixin +): + __tablename__ = "ds_app_link_data_source" - data_source_id = Column(Integer, nullable=False) + url_id = Column( + Integer, + ForeignKey( + 'urls.id', + ondelete="SET NULL", + ), + nullable=True + ) + ds_data_source_id = Column(Integer, nullable=False, primary_key=True) # Relationships url = relationship( diff --git a/src/db/models/impl/url/ds_meta_url/pydantic.py b/src/db/models/impl/url/ds_meta_url/pydantic.py index 8f7674e9..60a83e3b 100644 --- a/src/db/models/impl/url/ds_meta_url/pydantic.py +++ b/src/db/models/impl/url/ds_meta_url/pydantic.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL class URLDSMetaURLPydantic(BaseModel): @@ -10,5 +10,5 @@ class URLDSMetaURLPydantic(BaseModel): agency_id: int @classmethod - def sa_model(cls) -> type[URLDSMetaURL]: - return URLDSMetaURL \ No newline at end of file + def sa_model(cls) -> type[DSAppLinkMetaURL]: + return DSAppLinkMetaURL \ No newline at end of file diff --git a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py index e642a694..9f8092a9 100644 --- a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py +++ b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py @@ -1,20 +1,26 @@ -from sqlalchemy import Column, Integer, PrimaryKeyConstraint, UniqueConstraint +from sqlalchemy import Column, Integer, PrimaryKeyConstraint, UniqueConstraint, ForeignKey -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AgencyDependentMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AgencyDependentMixin, LastSyncedAtMixin from src.db.models.templates_.base import Base -class URLDSMetaURL( +class DSAppLinkMetaURL( Base, - URLDependentMixin, - AgencyDependentMixin, - CreatedAtMixin + CreatedAtMixin, + LastSyncedAtMixin ): - __tablename__ = "url_ds_meta_url" + __tablename__ = "ds_app_link_meta_url" - ds_meta_url_id = Column(Integer) + url_id = Column( + Integer, + ForeignKey( + 'urls.id', + ondelete="SET NULL", + ), + nullable=True + ) + ds_meta_url_id = Column(Integer, primary_key=True) __table_args__ = ( - PrimaryKeyConstraint("url_id", "agency_id"), - UniqueConstraint("ds_meta_url_id"), + UniqueConstraint("url_id", "agency_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py index 89371498..7a297ef1 100644 --- 
a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode @@ -16,7 +16,7 @@ class URLAutoAgencyIDSubtask( __tablename__ = "url_auto_agency_id_subtasks" - type = enum_column( + type: Mapped[AutoAgencyIDSubtaskType] = enum_column( AutoAgencyIDSubtaskType, name="agency_auto_suggestion_method" ) @@ -24,7 +24,7 @@ class URLAutoAgencyIDSubtask( sa.Boolean(), nullable=False ) - detail = enum_column( + detail: Mapped[SubtaskDetailCode] = enum_column( SubtaskDetailCode, name="agency_id_subtask_detail_code", ) diff --git a/src/db/models/impl/url/suggestion/agency/user.py b/src/db/models/impl/url/suggestion/agency/user.py index f7c43aad..2cd18851 100644 --- a/src/db/models/impl/url/suggestion/agency/user.py +++ b/src/db/models/impl/url/suggestion/agency/user.py @@ -6,15 +6,15 @@ from src.db.models.templates_.with_id import WithIDBase -class UserUrlAgencySuggestion(URLDependentMixin, WithIDBase): +class UserURLAgencySuggestion(URLDependentMixin, WithIDBase): __tablename__ = "user_url_agency_suggestions" agency_id: Mapped[int] = get_agency_id_foreign_column(nullable=True) user_id = Column(Integer, nullable=False) is_new = Column(Boolean, nullable=True) - agency = relationship("Agency", back_populates="user_suggestions") - url = relationship("URL", back_populates="user_agency_suggestions") + agency = relationship("Agency") + url = relationship("URL") __table_args__ = ( UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), diff --git a/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py index 49dc7457..dd109269 100644 --- a/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py @@ -5,7 +5,12 @@ from src.db.models.templates_.with_id import WithIDBase -class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): +class AutoRelevantSuggestion( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + WithIDBase +): __tablename__ = "auto_relevant_suggestions" relevant = Column(Boolean, nullable=True) diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 12a0b2a1..417eae40 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -58,10 +58,16 @@ class AgencyDependentMixin: nullable=False ) - class CreatedAtMixin: created_at = get_created_at_column() +class LastSyncedAtMixin: + last_synced_at = Column( + TIMESTAMP, + nullable=False, + server_default=CURRENT_TIME_SERVER_DEFAULT + ) + class UpdatedAtMixin: updated_at = Column( diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py index 1237634e..b5adfad9 100644 --- a/src/db/queries/implementations/core/common/annotation_exists_/constants.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -1,5 +1,5 @@ from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import 
UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion @@ -11,5 +11,5 @@ URLAutoAgencyIDSubtask, UserURLTypeSuggestion, UserRecordTypeSuggestion, - UserUrlAgencySuggestion + UserURLAgencySuggestion ] diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 4921337f..27240b7d 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -8,7 +8,7 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.views.batch_url_status.core import BatchURLStatusMatView from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py index 5ab305cc..3b9e0c55 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py @@ -5,7 +5,7 @@ from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ URLCountsCTEContainer @@ -23,8 +23,8 @@ URL.id == LinkBatchURL.url_id, ) .join( - URLDataSource, - URLDataSource.url_id == URL.id, + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == URL.id, ) .group_by( Batch.id diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 17136cce..395fe3f9 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -6,7 +6,7 @@ from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.collectors.enums import URLStatus from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion from src.db.models.mixins import URLDependentMixin @@ -25,7 +25,7 @@ def has_user_record_type_annotation(self): @property def has_user_agency_annotation(self): - 
return self.get_exists_for_model(UserUrlAgencySuggestion)
+        return self.get_exists_for_model(UserURLAgencySuggestion)
 
     def get_exists_for_model(self, model: Type[URLDependentMixin]):
         return self.query.c[
diff --git a/src/db/types.py b/src/db/types.py
index dcee196f..073fec7c 100644
--- a/src/db/types.py
+++ b/src/db/types.py
@@ -1,10 +1,10 @@
 from typing import TypeVar
 
-from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
+from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion
 from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion
 from src.db.queries.base.labels import LabelsBase
 
-UserSuggestionType = UserUrlAgencySuggestion | UserURLTypeSuggestion | UserRecordTypeSuggestion
+UserSuggestionType = UserURLAgencySuggestion | UserURLTypeSuggestion | UserRecordTypeSuggestion
 
 LabelsType = TypeVar("LabelsType", bound=LabelsBase)
\ No newline at end of file
diff --git a/src/external/pdap/impl/sync/__init__.py b/src/external/pdap/impl/sync/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/pdap/impl/sync/agencies/__init__.py b/src/external/pdap/impl/sync/agencies/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/pdap/impl/sync/agencies/_shared/__init__.py b/src/external/pdap/impl/sync/agencies/_shared/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/pdap/impl/sync/agencies/_shared/models/__init__.py b/src/external/pdap/impl/sync/agencies/_shared/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/pdap/impl/sync/agencies/_shared/models/content.py b/src/external/pdap/impl/sync/agencies/_shared/models/content.py
new file mode 100644
index 00000000..e815b753
--- /dev/null
+++ b/src/external/pdap/impl/sync/agencies/_shared/models/content.py
@@ -0,0 +1,15 @@
+from pydantic import BaseModel, Field
+
+from src.db.models.impl.agency.enums import AgencyType, JurisdictionType
+
+
+class AgencySyncContentModel(BaseModel):
+    # Required
+    name: str
+    jurisdiction_type: JurisdictionType
+    agency_type: AgencyType
+    location_ids: list[int] = Field(min_length=1)
+
+    # Optional
+    no_web_presence: bool = False
+    defunct_year: int | None = None
diff --git a/src/external/pdap/impl/sync/agencies/add/__init__.py b/src/external/pdap/impl/sync/agencies/add/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/external/pdap/impl/sync/agencies/add/core.py b/src/external/pdap/impl/sync/agencies/add/core.py
new file mode 100644
index 00000000..109560a2
--- /dev/null
+++ b/src/external/pdap/impl/sync/agencies/add/core.py
@@ -0,0 +1,20 @@
+from pydantic import BaseModel, Field, model_validator
+
+from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel
+
+
+class AddMetaURLsInnerRequest(BaseModel):
+    request_id: int
+    content: MetaURLSyncContentModel
+
+
+class AddMetaURLsOuterRequest(BaseModel):
+    meta_urls: list[AddMetaURLsInnerRequest] = Field(max_length=1000)
+
+    @model_validator(mode="after")
+    def all_request_ids_unique(self):
+        if len(self.meta_urls) != len(
+            set([meta_url.request_id for meta_url in self.meta_urls])
+        ):
+            raise ValueError("All request_ids must be unique")
+        return self
diff --git a/src/external/pdap/impl/sync/agencies/add/request.py b/src/external/pdap/impl/sync/agencies/add/request.py
new file mode 100644
index 00000000..575b4c42
--- /dev/null
+++ b/src/external/pdap/impl/sync/agencies/add/request.py
@@ -0,0 +1,20 @@
+from pydantic import BaseModel,
model_validator, Field + +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel + + +class AddAgenciesInnerRequest(BaseModel): + request_id: int + content: AgencySyncContentModel + + +class AddAgenciesOuterRequest(BaseModel): + agencies: list[AddAgenciesInnerRequest] = Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.agencies) != len( + set([agency.request_id for agency in self.agencies]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/agencies/delete/__init__.py b/src/external/pdap/impl/sync/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/delete/core.py b/src/external/pdap/impl/sync/agencies/delete/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/request.py b/src/external/pdap/impl/sync/agencies/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/update/__init__.py b/src/external/pdap/impl/sync/agencies/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/update/core.py b/src/external/pdap/impl/sync/agencies/update/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/update/request.py b/src/external/pdap/impl/sync/agencies/update/request.py new file mode 100644 index 00000000..df43578e --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/update/request.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel + + +class UpdateAgenciesInnerRequest(BaseModel): + app_id: int + content: AgencySyncContentModel + + +class UpdateAgenciesOuterRequest(BaseModel): + agencies: list[UpdateAgenciesInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/data_sources/__init__.py b/src/external/pdap/impl/sync/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/_shared/__init__.py b/src/external/pdap/impl/sync/data_sources/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py new file mode 100644 index 00000000..58f9abf1 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -0,0 +1,31 @@ +class DataSourceSyncContentModel(BaseModel): + # Required + source_url: str + name: str + record_type: RecordTypesEnum + + # Optional + description: str | None = None + + # Optional data source metadata + record_formats: list[str] | None = None + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + detail_level: DetailLevel | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregation | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethod | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionSchedule | None = None + scraper_url: str | None = None + access_notes: str | None = None + access_types: list[AccessType] | None = None + 
data_portal_type_other: str | None = None + url_status: URLStatus | None = None + + agency_ids: list[int] = Field(min_length=1) diff --git a/src/external/pdap/impl/sync/data_sources/add/__init__.py b/src/external/pdap/impl/sync/data_sources/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/add/core.py b/src/external/pdap/impl/sync/data_sources/add/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/add/request.py b/src/external/pdap/impl/sync/data_sources/add/request.py new file mode 100644 index 00000000..dfa7188f --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/add/request.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, Field, model_validator + +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel + + +class AddDataSourcesInnerRequest(BaseModel): + request_id: int + content: DataSourceSyncContentModel + + +class AddDataSourcesOuterRequest(BaseModel): + data_sources: list[AddDataSourcesInnerRequest] = Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.data_sources) != len( + set([data_source.request_id for data_source in self.data_sources]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/data_sources/delete/__init__.py b/src/external/pdap/impl/sync/data_sources/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/delete/core.py b/src/external/pdap/impl/sync/data_sources/delete/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/request.py b/src/external/pdap/impl/sync/data_sources/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/update/__init__.py b/src/external/pdap/impl/sync/data_sources/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/update/core.py b/src/external/pdap/impl/sync/data_sources/update/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/update/request.py b/src/external/pdap/impl/sync/data_sources/update/request.py new file mode 100644 index 00000000..97d95818 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/update/request.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel, Field + +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel + + +class UpdateDataSourcesInnerRequest(BaseModel): + class Config: + arbitrary_types_allowed = True + + app_id: int + content: DataSourceSyncContentModel + + +class UpdateDataSourcesOuterRequest(BaseModel): + data_sources: list[UpdateDataSourcesInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/meta_urls/__init__.py b/src/external/pdap/impl/sync/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/__init__.py b/src/external/pdap/impl/sync/meta_urls/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/content.py b/src/external/pdap/impl/sync/meta_urls/_shared/content.py new file mode 100644 index 00000000..2145225e --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/_shared/content.py @@ -0,0 +1,6 @@ +from 
pydantic import BaseModel + + +class MetaURLSyncContentModel(BaseModel): + url: str + agency_ids: list[int] diff --git a/src/external/pdap/impl/sync/meta_urls/add/__init__.py b/src/external/pdap/impl/sync/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/add/core.py b/src/external/pdap/impl/sync/meta_urls/add/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/delete/__init__.py b/src/external/pdap/impl/sync/meta_urls/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/delete/core.py b/src/external/pdap/impl/sync/meta_urls/delete/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/request.py b/src/external/pdap/impl/sync/meta_urls/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/update/__init__.py b/src/external/pdap/impl/sync/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/update/core.py b/src/external/pdap/impl/sync/meta_urls/update/core.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/update/request.py b/src/external/pdap/impl/sync/meta_urls/update/request.py new file mode 100644 index 00000000..c38ae09e --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/update/request.py @@ -0,0 +1,12 @@ +from pydantic import Field, BaseModel + +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel + + +class UpdateMetaURLsInnerRequest(BaseModel): + app_id: int + content: MetaURLSyncContentModel + + +class UpdateMetaURLsOuterRequest(BaseModel): + meta_urls: list[UpdateMetaURLsInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/shared/__init__.py b/src/external/pdap/impl/sync/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/__init__.py b/src/external/pdap/impl/sync/shared/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/add/__init__.py b/src/external/pdap/impl/sync/shared/models/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/add/response.py b/src/external/pdap/impl/sync/shared/models/add/response.py new file mode 100644 index 00000000..209139cf --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/add/response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +class DSAppSyncAddResponseInnerModel(BaseModel): + request_id: int + app_id: int + +class DSAppSyncAddResponseModel(BaseModel): + entities: list[DSAppSyncAddResponseInnerModel] \ No newline at end of file diff --git a/src/external/pdap/impl/sync/shared/models/delete/__init__.py b/src/external/pdap/impl/sync/shared/models/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/delete/request.py b/src/external/pdap/impl/sync/shared/models/delete/request.py new file mode 100644 index 00000000..c4e3bb8d --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/delete/request.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class DSAppSyncDeleteRequestModel(BaseModel): + ids: list[int] \ No newline at end of file diff --git 
a/tests/automated/integration/api/agencies/delete/__init__.py b/tests/automated/integration/api/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/agencies/test_core.py b/tests/automated/integration/api/agencies/delete/test_core.py similarity index 87% rename from tests/automated/integration/api/agencies/test_core.py rename to tests/automated/integration/api/agencies/delete/test_core.py index a986cacc..be8fb9fa 100644 --- a/tests/automated/integration/api/agencies/test_core.py +++ b/tests/automated/integration/api/agencies/delete/test_core.py @@ -35,27 +35,27 @@ async def test_agencies( link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is not None - assert link.agency_id == agency.agency_id + assert link.agency_id == agency.id assert link.location_id == california.location_id rv.delete_v3( - url=f"/agencies/{agency.agency_id}/locations/{california.location_id}", + url=f"/agencies/{agency.id}/locations/{california.location_id}", ) link: LinkAgencyLocation | None = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is None rv.post_v3( - url=f"/agencies/{agency.agency_id}/locations/{pennsylvania.location_id}", + url=f"/agencies/{agency.id}/locations/{pennsylvania.location_id}", ) link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) assert link is not None - assert link.agency_id == agency.agency_id + assert link.agency_id == agency.id assert link.location_id == pennsylvania.location_id rv.put_v3( - url=f"/agencies/{agency.agency_id}", + url=f"/agencies/{agency.id}", json=AgencyPutRequest( name="Test Agency Updated", ).model_dump(mode="json") @@ -68,7 +68,7 @@ async def test_agencies( rv.delete_v3( - url=f"/agencies/{agency.agency_id}", + url=f"/agencies/{agency.id}", ) agency: Agency | None = await ath.adb_client().one_or_none_model(model=Agency) diff --git a/tests/automated/integration/api/agencies/delete/test_ds_linked.py b/tests/automated/integration/api/agencies/delete/test_ds_linked.py new file mode 100644 index 00000000..0470c75e --- /dev/null +++ b/tests/automated/integration/api/agencies/delete/test_ds_linked.py @@ -0,0 +1,44 @@ +import pytest + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.counter import next_int + + +@pytest.mark.asyncio +async def test_ds_linked( + api_test_helper: APITestHelper +): + """If an agency has been linked to the Data Sources App, + the deletion operation should include an agency flag for deletion. 
+ """ + + agency = Agency( + name="Test Agency", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + agency_id: int = await api_test_helper.adb_client().add(agency, return_id=True) + + ds_agency_id: int = next_int() + # Add DS link + ds_link = DSAppLinkAgency( + agency_id=agency_id, + ds_agency_id=ds_agency_id, + ) + await api_test_helper.adb_client().add(ds_link) + + api_test_helper.request_validator.delete_v3( + url=f"/agencies/{agency.id}", + ) + + agency: Agency | None = await api_test_helper.adb_client().one_or_none_model(model=Agency) + assert agency is None + + flag: FlagDSDeleteAgency | None = await api_test_helper.adb_client().one_or_none_model(model=FlagDSDeleteAgency) + assert flag is not None + assert flag.ds_agency_id == ds_agency_id + diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 48b60b8b..e9fae81e 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -10,7 +10,7 @@ from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -110,7 +110,7 @@ async def test_annotate_all( assert suggested_types == {URLType.DATA_SOURCE, URLType.NOT_RELEVANT} # Should be one agency - all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + all_agency_suggestions = await adb_client.get_all(UserURLAgencySuggestion) assert len(all_agency_suggestions) == 3 suggested_agency_ids: set[int] = {sugg.agency_id for sugg in all_agency_suggestions} assert agency_id in suggested_agency_ids diff --git a/tests/automated/integration/api/meta_urls/test_invalid_type.py b/tests/automated/integration/api/meta_urls/test_invalid_type.py index 12073191..b3e98a3d 100644 --- a/tests/automated/integration/api/meta_urls/test_invalid_type.py +++ b/tests/automated/integration/api/meta_urls/test_invalid_type.py @@ -1,6 +1,6 @@ import pytest -from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest from tests.helpers.api_test_helper import APITestHelper from tests.helpers.check import check_forbidden_url_type diff --git a/tests/automated/integration/api/meta_urls/test_put.py b/tests/automated/integration/api/meta_urls/test_put.py index 28689a8b..1c493009 100644 --- a/tests/automated/integration/api/meta_urls/test_put.py +++ b/tests/automated/integration/api/meta_urls/test_put.py @@ -1,6 +1,6 @@ import pytest -from src.api.endpoints.meta_url.by_id.agencies.put.request import UpdateMetaURLRequest +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL diff --git 
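# [Editor's note: illustrative sketch, not part of the patch.] test_ds_linked above pins
# down the contract the agency DELETE endpoint is expected to honor: if the agency has
# already been pushed to the Data Sources app (a DSAppLinkAgency row exists), deleting it
# locally must also record a FlagDSDeleteAgency so the scheduled sync task can propagate
# the deletion later. A minimal sketch of that logic, assuming an AsyncSession-style
# interface; cascade handling of the link row itself is out of scope here.
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency
from src.db.models.impl.agency.sqlalchemy import Agency
from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency


async def delete_agency_with_ds_flag(session: AsyncSession, agency_id: int) -> None:
    link = (await session.execute(
        select(DSAppLinkAgency).where(DSAppLinkAgency.agency_id == agency_id)
    )).scalar_one_or_none()
    if link is not None:
        # Remember the remote ID before the local rows disappear.
        session.add(FlagDSDeleteAgency(ds_agency_id=link.ds_agency_id))
    agency = await session.get(Agency, agency_id)
    if agency is not None:
        await session.delete(agency)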
a/tests/automated/integration/api/submit/test_url_maximal.py b/tests/automated/integration/api/submit/test_url_maximal.py index 8d1930f5..150b5409 100644 --- a/tests/automated/integration/api/submit/test_url_maximal.py +++ b/tests/automated/integration/api/submit/test_url_maximal.py @@ -8,7 +8,7 @@ from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion @@ -54,9 +54,9 @@ async def test_maximal( link: LinkUserSubmittedURL = links[0] assert link.url_id == url_id - agen_suggs: list[UserUrlAgencySuggestion] = await adb_client.get_all(UserUrlAgencySuggestion) + agen_suggs: list[UserURLAgencySuggestion] = await adb_client.get_all(UserURLAgencySuggestion) assert len(agen_suggs) == 1 - agen_sugg: UserUrlAgencySuggestion = agen_suggs[0] + agen_sugg: UserURLAgencySuggestion = agen_suggs[0] assert agen_sugg.url_id == url_id assert agen_sugg.agency_id == agency_id diff --git a/tests/automated/integration/api/url/by_id/delete/__init__.py b/tests/automated/integration/api/url/by_id/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/url/by_id/delete/setup.py b/tests/automated/integration/api/url/by_id/delete/setup.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py new file mode 100644 index 00000000..9a91f3d4 --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -0,0 +1,448 @@ +import pytest +from sqlalchemy import select + +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping_.simple import SimpleURLMapping +from src.db.enums import ChangeLogOperationType +from src.db.models.impl.change_log import ChangeLog +from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound +from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL +from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate +from src.db.models.impl.url.core.sqlalchemy 
import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.internet_archives.save.sqlalchemy import URLInternetArchivesSaveMetadata +from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode, AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_any_url( + pittsburgh_locality: LocalityCreationInfo, + db_data_creator: DBDataCreator, + test_agency_id: int, + api_test_helper: APITestHelper +): + """ + Test that deletion works properly for a URL that has all possible attributes + that any URL could have + """ + + url_id: int = await _setup( + ddc=db_data_creator, + pittsburgh_id=pittsburgh_locality.location_id, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results(url_id, dbc=db_data_creator.adb_client) + + + +async def _check_results( + url_id: int, + dbc: AsyncDatabaseClient +) -> None: + # There should be only two urls present in the database, neither matching URL id + urls: list[URL] = await dbc.get_all(URL) + assert len(urls) == 2 + assert url_id not in (url.id for url in urls) + + # For the following models, there should no longer be any entries in the database. 
+ models = [ + # Batch Link + LinkBatchURL, + # MISCELLANEOUS + ## Flag Root URL + FlagRootURL, + ## URL Task Error + URLTaskError, + ## URL Checked for Duplicate + URLCheckedForDuplicate, + ## Flag URL Suspended + FlagURLSuspended, + # LINKS + ## Link URLs Redirect URL + LinkURLRedirectURL, + ## Link URLs Root URL + LinkURLRootURL, + ## Link User Submitted URLs + LinkUserSubmittedURL, + ## Link User Suggestion Agency Not Found + LinkUserSuggestionAgencyNotFound, + ## Link User Suggestion Location Not Found + LinkUserSuggestionLocationNotFound, + # WEB DATA + ## URL Compressed HTML + URLCompressedHTML, + ## URL HTML Content + URLHTMLContent, + ## URL Screenshot + URLScreenshot, + ## URL Web Metadata + URLWebMetadata, + # INTERNET ARCHIVES + ## Flag URL Checked for Internet Archives + FlagURLCheckedForInternetArchives, + ## URL Internet Archives Probe Metadata + URLInternetArchivesProbeMetadata, + ## URL Internet Archives Save Metadata + URLInternetArchivesSaveMetadata, + # ANNOTATIONS + ## AUTO + ### Agency + URLAutoAgencyIDSubtask, + AgencyIDSubtaskSuggestion, + ### Record Type + AutoRecordTypeSuggestion, + ### URL Type + AutoRelevantSuggestion, + ### Location + AutoLocationIDSubtask, + LocationIDSubtaskSuggestion, + ## USER + ### Agency + UserURLAgencySuggestion, + ### Record Type + UserRecordTypeSuggestion, + ### URL Type + UserURLTypeSuggestion, + ### Location + UserLocationSuggestion, + URLNameSuggestion, + ## ANONYMOUS + ### Agency + AnonymousAnnotationAgency, + ### Location + AnonymousAnnotationLocation, + ### Record Type + AnonymousAnnotationRecordType, + ### URL Type + AnonymousAnnotationURLType, + ] + for model in models: + assert await dbc.get_all(model) == [] + + # The Change Log should show, at minimum, the deletion of the URL + query = ( + select( + ChangeLog + ) + .where( + ChangeLog.table_name == "urls", + ChangeLog.operation_type == ChangeLogOperationType.DELETE + ) + ) + result = await dbc.one_or_none(query) + assert result is not None + + +async def _setup( + ddc: DBDataCreator, + pittsburgh_id: int, + agency_id: int +) -> int: + dbc: AsyncDatabaseClient = ddc.adb_client + # URL & Batch Link + url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + + # MISCELLANEOUS + ## Flag Root URL + await ddc.flag_as_root(url_ids=[url.url_id]) + ## URL Task Error + ### Task + task_id: int = await ddc.task(url_ids=[url.url_id]) + ### Error + await ddc.task_errors(url_ids=[url.url_id], task_id=task_id) + ## URL Checked for Duplicate + await dbc.add( + URLCheckedForDuplicate( + url_id=url.url_id + ) + ) + ## Flag URL Suspended + await dbc.add( + FlagURLSuspended( + url_id=url.url_id + ) + ) + # LINKS + ## Link URLs Redirect URL + ### Additional url + additional_url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + ### Redirect url + await dbc.add( + LinkURLRedirectURL( + source_url_id=url.url_id, + destination_url_id=additional_url.url_id + ) + ) + ### (We will go in both directions even though this should technically not be legal) + await dbc.add( + LinkURLRedirectURL( + source_url_id=additional_url.url_id, + destination_url_id=url.url_id + ) + ) + ## Link URLs Root URL + ### (Again, will go in both directions despite this not being legal) + root_url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + await dbc.add( + LinkURLRootURL( + url_id=url.url_id, + root_url_id=root_url.url_id + ) + ) + await dbc.add( + LinkURLRootURL( + url_id=root_url.url_id, + root_url_id=url.url_id + ) + ) + ## Link User Submitted URL + await
dbc.add( + LinkUserSubmittedURL( + url_id=url.url_id, + user_id=1 + ) + ) + ## Link User Suggestion Agency Not Found + await dbc.add( + LinkUserSuggestionAgencyNotFound( + url_id=url.url_id, + user_id=1 + ) + ) + ## Link User Suggestion Location Not Found + await dbc.add( + LinkUserSuggestionLocationNotFound( + url_id=url.url_id, + user_id=1 + ) + ) + # WEB DATA + ## URL Compressed HTML + await ddc.add_compressed_html( + url_ids=[url.url_id] + ) + ## URL HTML Content + await dbc.add( + URLHTMLContent( + url_id=url.url_id, + content_type="Title", + content="Test Title" + ) + ) + ## URL Screenshot + await dbc.add( + URLScreenshot( + url_id=url.url_id, + content=b"Test Screenshot", + file_size=1024 + ) + ) + ## URL Web Metadata + await ddc.create_web_metadata( + url_ids=[url.url_id] + ) + # INTERNET ARCHIVES + ## Flag URL Checked for Internet Archives + await dbc.add( + FlagURLCheckedForInternetArchives( + url_id=url.url_id, + success=True + ) + ) + ## URL Internet Archives Probe Metadata + await dbc.add( + URLInternetArchivesProbeMetadata( + url_id=url.url_id, + archive_url="https://example.com", + digest="test_digest", + length=1024, + ) + ) + ## URL Internet Archives Save Metadata + await dbc.add( + URLInternetArchivesSaveMetadata( + url_id=url.url_id, + ) + ) + # ANNOTATIONS + ## AUTO + ### Agency + #### Subtask + agency_subtask_id: int = await dbc.add( + URLAutoAgencyIDSubtask( + url_id=url.url_id, + task_id=1, + agencies_found=True, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + detail=SubtaskDetailCode.NO_DETAILS + ), + return_id=True + ) + ### Suggestion + await dbc.add( + AgencyIDSubtaskSuggestion( + subtask_id=agency_subtask_id, + agency_id=agency_id, + confidence=60 + ) + ) + ### Record Type + await dbc.add( + AutoRecordTypeSuggestion( + url_id=url.url_id, + record_type=RecordType.BOOKING_REPORTS.value + ) + ) + ### Relevant + await dbc.add( + AutoRelevantSuggestion( + url_id=url.url_id, + relevant=True, + confidence=0.5, + model_name="Test Model" + ) + ) + ### Location + #### Subtask + location_subtask_id: int = await dbc.add( + AutoLocationIDSubtask( + url_id=url.url_id, + task_id=task_id, + locations_found=True, + type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY, + ), + return_id=True + ) + #### Suggestion + await dbc.add( + LocationIDSubtaskSuggestion( + subtask_id=location_subtask_id, + location_id=pittsburgh_id, + confidence=50 + ) + ) + ## USER + ### Agency + await dbc.add( + UserURLAgencySuggestion( + url_id=url.url_id, + user_id=1, + agency_id=agency_id, + is_new=False + ) + ) + ### Record Type + await dbc.add( + UserRecordTypeSuggestion( + url_id=url.url_id, + user_id=1, + record_type=RecordType.BOOKING_REPORTS.value, + ) + ) + ### URL Type + await dbc.add( + UserURLTypeSuggestion( + url_id=url.url_id, + type=URLType.INDIVIDUAL_RECORD, + user_id=1 + ) + ) + ### Location + await dbc.add( + UserLocationSuggestion( + url_id=url.url_id, + location_id=pittsburgh_id, + user_id=1, + ) + ) + ### Name + name_suggestion_id: int = await dbc.add( + URLNameSuggestion( + url_id=url.url_id, + suggestion="Test Name", + source=NameSuggestionSource.USER, + ), + return_id=True + ) + await dbc.add( + LinkUserNameSuggestion( + suggestion_id=name_suggestion_id, + user_id=1, + ) + ) + ## ANONYMOUS + for model in [ + ### Agency + AnonymousAnnotationAgency( + url_id=url.url_id, + agency_id=agency_id + ), + ### Record Type + AnonymousAnnotationRecordType( + url_id=url.url_id, + record_type=RecordType.BOOKING_REPORTS.value + ), + ### URL Type + AnonymousAnnotationURLType( + 
url_id=url.url_id, + url_type=URLType.INDIVIDUAL_RECORD + ), + ### Location + AnonymousAnnotationLocation( + url_id=url.url_id, + location_id=pittsburgh_id + ) + ]: + await dbc.add(model) + + return url.url_id + + + + + + diff --git a/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py b/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py new file mode 100644 index 00000000..d551118b --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py @@ -0,0 +1,115 @@ +from datetime import date + +import pytest + +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.enums import AccessTypeEnum, RetentionScheduleEnum, UpdateMethodEnum, \ + AgencyAggregationEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_data_source_url( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper, + test_agency_id: int +): + """ + Test that deletion works properly for a URL that is a validated data source + and has all data source-only attributes. + """ + + url_id: int = await _setup( + ddc=db_data_creator, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + dbc=db_data_creator.adb_client + ) + +async def _check_results( + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## URL and all associated tables should be deleted + assert await dbc.has_no_rows(URL) + + ### Record Type should be deleted + assert await dbc.has_no_rows(URLOptionalDataSourceMetadata) + assert await dbc.has_no_rows(LinkURLAgency) + assert await dbc.has_no_rows(URLRecordType) + + ## DS App Link should not yet be deleted + app_link: DSAppLinkDataSource = await dbc.one_or_none_model(DSAppLinkDataSource) + assert app_link is not None + + ## DS App Data Source Deletion Flag should be added + flag: FlagDSDeleteDataSource = await dbc.one_or_none_model(FlagDSDeleteDataSource) + assert flag is not None + assert flag.ds_data_source_id == app_link.ds_data_source_id + + +async def _setup( + ddc: DBDataCreator, + agency_id: int +) -> int: + pass + # SETUP + ## Validated Flag - Data Source + ## Record Type + url_id: int = (await ddc.create_validated_urls( + validation_type=URLType.DATA_SOURCE, + record_type=RecordType.BOOKING_REPORTS, + count=1 + ))[0].url_id + + ## Link Agency + await ddc.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_id] + ) + + ## Optional DS Metadata + optional_ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + 
agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ) + await ddc.adb_client.add(optional_ds_metadata) + + ## DS App Link + app_link = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=1 + ) + await ddc.adb_client.add(app_link) + + return url_id diff --git a/tests/automated/integration/api/url/by_id/delete/test_meta_url.py b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py new file mode 100644 index 00000000..6c2817b6 --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py @@ -0,0 +1,78 @@ +import pytest + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_meta_url( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper, + test_agency_id: int +): + """ + Test that deletion works properly for a URL that is a validated meta url + and has all data source-only attributes. + """ + + url_id: int = await _setup( + ddc=db_data_creator, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + dbc=db_data_creator.adb_client + ) + + +async def _check_results( + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## URL and all associated tables should be deleted + assert await dbc.has_no_rows(URL) + + ## DS App Link should not yet be deleted + app_link: DSAppLinkMetaURL = await dbc.one_or_none_model(DSAppLinkMetaURL) + assert app_link is not None + + ## DS App Meta URL Deletion Flag should be added + flag: FlagDSDeleteMetaURL = await dbc.one_or_none_model(FlagDSDeleteMetaURL) + assert flag is not None + assert flag.ds_meta_url_id == app_link.ds_meta_url_id + + +async def _setup( + ddc: DBDataCreator, + agency_id: int +) -> int: + pass + # SETUP + ## Validated Flag - Meta URL + url_id: int = (await ddc.create_validated_urls( + validation_type=URLType.META_URL, + count=1 + ))[0].url_id + + ## Link Agency + await ddc.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_id] + ) + ## DS App Link + app_link = DSAppLinkMetaURL( + url_id=url_id, + agency_id=agency_id, + ds_meta_url_id=1 + ) + await ddc.adb_client.add(app_link) + return url_id + diff --git a/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py b/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py new file mode 100644 index 00000000..6e6a738d --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py @@ -0,0 +1,71 @@ +import pytest + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy 
import URL +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_validated_not_relevant( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper +): + """ + Test that deletion works properly for a URL that is a validated + as any of the non-relevant URL types + (not relevant, broken, individual record) + """ + + url_ids: list[int] = await _setup( + ddc=db_data_creator + ) + for url_id in url_ids: + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + url_ids, + dbc=db_data_creator.adb_client + ) + + + +async def _check_results( + url_ids: list[int], + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## Each URLs Validation Flags should be deleted + url_validation_flags: list[FlagURLValidated] = await dbc.get_all(FlagURLValidated) + assert len(url_validation_flags) == 0 + + ## Each URL should be deleted + urls: list[URL] = await dbc.get_all(URL) + assert len(urls) == 0 + +async def _setup( + ddc: DBDataCreator +) -> list[int]: + url_ids: list[int] = [] + # SETUP (3 URLs) + for validated_type in [ + ## Validated Flag - Individual Record + URLType.INDIVIDUAL_RECORD, + ## Validated Flag - Broken + URLType.BROKEN_PAGE, + ## Validated Flag - Not Relevant + URLType.NOT_RELEVANT + ]: + url_id: int = (await ddc.create_validated_urls( + validation_type=validated_type, + count=1 + ))[0].url_id + url_ids.append(url_id) + return url_ids + + + diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 42ab2214..6837bae0 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -151,7 +151,7 @@ async def api_test_helper( client: TestClient, db_client_test: DatabaseClient, adb_client_test: AsyncDatabaseClient -) -> AsyncGenerator[APITestHelper, Any]: + ) -> AsyncGenerator[APITestHelper, Any]: yield APITestHelper( request_validator=RequestValidator(client=client), async_core=client.app.state.async_core, @@ -170,25 +170,63 @@ def test_batch_id( @pytest_asyncio.fixture async def test_agency_id( - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo, + pennsylvania: USStateCreationInfo ) -> int: - return await db_data_creator.agency( + """Test agency linked to two locations: Pittsburgh and Pennsylvania""" + agency_id: int = await db_data_creator.agency( name="Test Agency" ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pittsburgh_locality.location_id + ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pennsylvania.location_id + ) + return agency_id + +@pytest_asyncio.fixture +async def test_agency_id_2( + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo +) -> int: + agency_id: int = await db_data_creator.agency( + name="Test Agency 2" + ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pittsburgh_locality.location_id + ) + return agency_id @pytest_asyncio.fixture async def test_url_data_source_id( - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int ) -> int: - return (await db_data_creator.create_validated_urls( + url_id: int = (await db_data_creator.create_validated_urls( record_type=RecordType.CRIME_STATISTICS, validation_type=URLType.DATA_SOURCE, ))[0].url_id + await db_data_creator.link_urls_to_agencies( + 
url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id @pytest_asyncio.fixture async def test_url_meta_url_id( - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int ) -> int: - return (await db_data_creator.create_validated_urls( + url_id: int = (await db_data_creator.create_validated_urls( validation_type=URLType.META_URL, ))[0].url_id + await db_data_creator.link_urls_to_agencies( + url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id diff --git a/tests/automated/integration/db/structure/test_upsert_new_agencies.py b/tests/automated/integration/db/structure/test_upsert_new_agencies.py index 6b377974..6adb043b 100644 --- a/tests/automated/integration/db/structure/test_upsert_new_agencies.py +++ b/tests/automated/integration/db/structure/test_upsert_new_agencies.py @@ -46,13 +46,13 @@ async def test_upsert_new_agencies( await adb_client.upsert_new_agencies([update_suggestion]) - rows = await adb_client.get_all(Agency, order_by_attribute="agency_id") + rows: list[Agency] = await adb_client.get_all(Agency, order_by_attribute="id") assert len(rows) == 3 d = {} for row in rows: - d[row.agency_id] = row.name + d[row.id] = row.name assert d[0] == "Updated Test Agency" assert d[1] == "Test Agency 1" diff --git a/tests/automated/integration/db/structure/updated_at/__init__.py b/tests/automated/integration/db/structure/updated_at/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py b/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py new file mode 100644 index 00000000..cc88f697 --- /dev/null +++ b/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.mark.asyncio +async def test_ds_optional_metadata_updated_at(): + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/db/structure/updated_at/test_urls.py b/tests/automated/integration/db/structure/updated_at/test_urls.py new file mode 100644 index 00000000..cc88f697 --- /dev/null +++ b/tests/automated/integration/db/structure/updated_at/test_urls.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.mark.asyncio +async def test_ds_optional_metadata_updated_at(): + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py index a5bcd249..c8324d04 100644 --- a/tests/automated/integration/readonly/conftest.py +++ b/tests/automated/integration/readonly/conftest.py @@ -5,7 +5,6 @@ import pytest_asyncio from starlette.testclient import TestClient -from src.db.client.async_ import AsyncDatabaseClient from src.db.helpers.connect import get_postgres_connection_string from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from tests.automated.integration.readonly.helper import ReadOnlyTestHelper diff --git a/tests/automated/integration/readonly/setup.py b/tests/automated/integration/readonly/setup.py index 20c6d537..ec8c78b1 100644 --- a/tests/automated/integration/readonly/setup.py +++ b/tests/automated/integration/readonly/setup.py @@ -156,16 +156,15 @@ async def add_agency( pittsburgh: LocalityCreationInfo ) -> int: agency_1 = Agency( - agency_id=next_int(), name="Agency 1", agency_type=AgencyType.LAW_ENFORCEMENT, jurisdiction_type=JurisdictionType.STATE, ) - await adb_client.add(agency_1) + agency_id: int = await 
adb_client.add(agency_1, return_id=True) # Add Agency location agency_1_location = LinkAgencyLocation( - agency_id=agency_1.agency_id, + agency_id=agency_id, location_id=pittsburgh.location_id, ) await adb_client.add(agency_1_location) - return agency_1.agency_id \ No newline at end of file + return agency_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py new file mode 100644 index 00000000..9a9996a1 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py @@ -0,0 +1,10 @@ +import pytest_asyncio + +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_agency( + test_agency_id: int +) -> DSAppLinkInfoModel: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py new file mode 100644 index 00000000..669a7961 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py @@ -0,0 +1,33 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_agency_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncAgenciesAddTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was caused with expected parameters + + # Check Presence of DS App Link + + + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py new file mode 100644 index 00000000..430e6645 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py @@ -0,0 +1,55 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from src.external.pdap.client import PDAPClient +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import 
DSAppLinkInfoModel +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + ds_app_linked_agency: DSAppLinkInfoModel, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncAgenciesDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Check does not currently meet prerequisite + assert not await operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link = DSAppLinkAgency( + ds_agency_id=1, + agency_id=None, + ) + await adb_client_test.add(ds_app_link) + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteAgency( + ds_agency_id=1 + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was caused with expected parameters + + # Check DS App Link Is Deleted + assert await adb_client_test.has_no_rows(DSAppLinkAgency) + + # Check DS App Agency Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteAgency) + + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py new file mode 100644 index 00000000..eafc4148 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py @@ -0,0 +1,16 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncAgenciesUpdateTaskOperator: + return DSAppSyncAgenciesUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py new file mode 100644 index 00000000..91300b04 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py @@ -0,0 +1,29 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from 
tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_location_link( + ds_app_linked_agency: DSAppLinkInfoModel, + allegheny_county: CountyCreationInfo, + operator: DSAppSyncAgenciesUpdateTaskOperator +): + + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Add location link + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py new file mode 100644 index 00000000..1e563a1f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py @@ -0,0 +1,28 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +async def test_delete_location_link( + ds_app_linked_agency: DSAppLinkInfoModel, + pittsburgh_locality: LocalityCreationInfo, + operator: DSAppSyncAgenciesUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Delete location link + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py new file mode 100644 index 00000000..5e5b78a5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py @@ -0,0 +1,26 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_agency( + ds_app_linked_agency: DSAppLinkInfoModel, + operator: DSAppSyncAgenciesUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update agency table + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git 
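# [Editor's note: illustrative sketch, not part of the patch.] The update-sync stubs above
# all follow the same Arrange/Act/Assert shape: prerequisites are unmet until a record tied
# to a synced agency changes, the task then runs, and check_ds_app_link_updated is expected
# to verify that the DS app link's bookkeeping timestamp moved forward. The rule they imply
# is essentially "was the local row touched after the last sync?"; a dependency-free sketch
# of that staleness check follows, with hypothetical parameter names.
from datetime import datetime, timezone


def needs_sync(local_updated_at: datetime, link_last_synced_at: datetime | None) -> bool:
    """True when the local record changed after the DS app link was last synced."""
    if link_last_synced_at is None:
        # Never synced: always counts as pending work for the update operator.
        return True
    return local_updated_at > link_last_synced_at


# Example: a row that has never been synced should satisfy the prerequisite check.
assert needs_sync(datetime(2025, 10, 14, tzinfo=timezone.utc), None)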
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py new file mode 100644 index 00000000..e0a1c61b --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py @@ -0,0 +1,10 @@ +import pytest_asyncio + +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_data_source_url( + test_url_data_source_id: int, +) -> DSAppLinkInfoModel: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py new file mode 100644 index 00000000..9f97b64e --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -0,0 +1,36 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_url_data_source_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncDataSourcesAddTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Check meet task prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check Presence of DS App Link + + + + + + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py new file mode 100644 index 00000000..ea202fb1 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py @@ -0,0 +1,48 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources import DSAppSyncDataSourcesDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + # Check does not currently meet prerequisite + assert not operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link = 
DSAppLinkDataSource( + url_id=None, + ds_data_source_id=1, + ) + await adb_client_test.add(ds_app_link) + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteDataSource( + ds_data_source_id=1, + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Check DS App Link Is Deleted + assert await adb_client_test.has_no_rows(DSAppLinkDataSource) + + # Check DS App Data Source Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteDataSource) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py new file mode 100644 index 00000000..f991d3c9 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py @@ -0,0 +1,16 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncDataSourcesUpdateTaskOperator: + return DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py new file mode 100644 index 00000000..1d53d364 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_agency_link( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + test_agency_id_2: int, + operator: DSAppSyncDataSourcesUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Add additional agency link + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS 
App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py new file mode 100644 index 00000000..8a16cc31 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_delete_agency_link( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + test_agency_id_1: int, + operator: DSAppSyncDataSourcesUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Delete agency ID link + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py new file mode 100644 index 00000000..ca188487 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py @@ -0,0 +1,26 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_optional_ds_metadata( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update url_optional_ds_metadata_table table + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py new file mode 100644 index 00000000..9d6cd70a --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py @@ -0,0 +1,26 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_url( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator +): + + # Check prerequisites not met + 
assert not await operator.meets_task_prerequisites() + + # Update URL table + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py new file mode 100644 index 00000000..3dfa7cf4 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py @@ -0,0 +1,20 @@ +import pytest_asyncio + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_meta_url( + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient +) -> DSAppLinkInfoModel: + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_meta_url_id, + ds_meta_url_id=1 + ) + await adb_client_test.add(ds_app_link) + return DSAppLinkInfoModel( + ds_app_id=1, + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py new file mode 100644 index 00000000..62cc9ee3 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py @@ -0,0 +1,34 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core import DSAppSyncMetaURLsAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncMetaURLsAddTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check Presence of DS Meta URL App Link + ds_app_link: DSAppLinkMetaURL = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.url_id == test_url_meta_url_id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py new file mode 100644 index 00000000..d66a8e91 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py @@ -0,0 +1,49 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import 
DSAppSyncMetaURLsDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + operator = DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Check does not currently meet prerequisite + assert not await operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link = DSAppLinkMetaURL( + ds_meta_url_id=1, + url_id=None, + ) + await adb_client_test.add(ds_app_link) + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteMetaURL( + ds_meta_url_id=1 + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Check DS App Link Is Deleted + assert await adb_client_test.has_no_rows(DSAppLinkMetaURL) + + # Check DS App Meta URL Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteMetaURL) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py new file mode 100644 index 00000000..3b2e8e7b --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py @@ -0,0 +1,16 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncMetaURLsUpdateTaskOperator: + return DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py new file mode 100644 index 00000000..e7c0b525 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import 
DSAppSyncMetaURLsUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_agency_link( + ds_app_linked_meta_url: DSAppLinkInfoModel, + test_agency_id_2: int, + operator: DSAppSyncMetaURLsUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Add agency link + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py new file mode 100644 index 00000000..a62c1d26 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_delete_agency_link( + ds_app_linked_meta_url: DSAppLinkInfoModel, + test_agency_id_1: int, + operator: DSAppSyncMetaURLsUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Delete agency link + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py new file mode 100644 index 00000000..ec71c60d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_url( + ds_app_linked_meta_url: DSAppLinkInfoModel, + operator: DSAppSyncMetaURLsUpdateTaskOperator +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update URL table + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + + # Confirm expected method was called with expected parameters + + # Check DS App Link Is Updated + + raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py new file mode 100644 index 00000000..c02a3f96 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py @@ -0,0 +1,8 @@ +from datetime import datetime + +from pydantic import BaseModel + + +class DSAppLinkInfoModel(BaseModel): + ds_app_id: int + updated_at: datetime = datetime.now() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py index 22ae8129..12e20063 100644 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py @@ -5,13 +5,13 @@ from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.external.pdap.client import PDAPClient from tests.automated.integration.tasks.url.impl.submit_approved.mock import mock_make_request from tests.automated.integration.tasks.url.impl.submit_approved.setup import setup_validated_urls - +# TODO: Marked for destruction @pytest.mark.asyncio async def test_submit_approved_url_task( db_data_creator, @@ -58,17 +58,17 @@ async def test_submit_approved_url_task( url_3: URL = urls[2] # Get URL Data Source Links - url_data_sources = await db_data_creator.adb_client.get_all(URLDataSource) + url_data_sources = await db_data_creator.adb_client.get_all(DSAppLinkDataSource) assert len(url_data_sources) == 2 url_data_source_1 = url_data_sources[0] url_data_source_2 = url_data_sources[1] assert url_data_source_1.url_id == url_1.id - assert url_data_source_1.data_source_id == 21 + assert url_data_source_1.ds_data_source_id == 21 assert url_data_source_2.url_id == url_2.id - assert url_data_source_2.data_source_id == 34 + assert url_data_source_2.ds_data_source_id == 34 # Check that errored URL has entry in url_error_info url_errors = await db_data_creator.adb_client.get_all(URLTaskError) diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py index 76754b29..43818d8c 100644 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py @@ -3,7 +3,7 @@ from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient from tests.helpers.asserts import assert_task_run_success @@ -37,5 +37,5 @@ async def test_validated_meta_url_not_included( assert_task_run_success(run_info) # 
Confirm entry not included in database - ds_urls: list[URLDataSource] = await dbdc.adb_client.get_all(URLDataSource) + ds_urls: list[DSAppLinkDataSource] = await dbdc.adb_client.get_all(DSAppLinkDataSource) assert len(ds_urls) == 0 diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py index 08914bed..dea8ca6a 100644 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py @@ -9,7 +9,7 @@ from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.external.pdap.client import PDAPClient from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus from tests.helpers.data_creator.core import DBDataCreator @@ -71,9 +71,9 @@ async def test_submit_meta_urls( url: URL = urls[0] assert url.status == URLStatus.OK - url_ds_meta_urls: list[URLDSMetaURL] = await db_data_creator.adb_client.get_all(URLDSMetaURL) + url_ds_meta_urls: list[DSAppLinkMetaURL] = await db_data_creator.adb_client.get_all(DSAppLinkMetaURL) assert len(url_ds_meta_urls) == 1 - url_ds_meta_url: URLDSMetaURL = url_ds_meta_urls[0] + url_ds_meta_url: DSAppLinkMetaURL = url_ds_meta_urls[0] assert url_ds_meta_url.url_id == url.id assert url_ds_meta_url.ds_meta_url_id == 2 assert url_ds_meta_url.agency_id == agency_id \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 575c594f..dbe5a4e6 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -522,7 +522,7 @@ async def create_url_agency_links( async def create_agency(self, agency_id: int = 1) -> None: agency = Agency( - agency_id=agency_id, + id=agency_id, name=generate_test_name(agency_id), agency_type=AgencyType.UNKNOWN ) @@ -534,7 +534,7 @@ async def create_agencies(self, count: int = 3) -> list[int]: for _ in range(count): agency_id = next_int() agency = Agency( - agency_id=agency_id, + id=agency_id, name=generate_test_name(agency_id), agency_type=AgencyType.UNKNOWN ) From 8229199175f0c98d721475aca014d2c02a0d6cae Mon Sep 17 00:00:00 2001 From: maxachis Date: Sun, 9 Nov 2025 11:58:30 -0500 Subject: [PATCH 18/84] Update draft --- ENV.md | 11 +- ...28_1539-a57c3b5b6e93_add_sync_log_table.py | 15 +- .../impl/agencies/add/queries/add_links.py | 17 ++ .../impl/agencies/add/queries/cte.py | 30 +++- .../impl/agencies/add/queries/get.py | 10 ++ .../impl/agencies/add/queries/prereq.py | 9 ++ .../impl/agencies/delete/queries/cte.py | 28 +++- .../agencies/delete/queries/delete_flags.py | 16 ++ .../agencies/delete/queries/delete_links.py | 16 ++ .../impl/agencies/delete/queries/get.py | 10 ++ .../impl/agencies/delete/queries/prereq.py | 9 ++ .../impl/agencies/update/queries/cte.py | 34 ++++ .../impl/agencies/update/queries/get.py | 10 ++ .../impl/agencies/update/queries/prereq.py | 9 ++ .../agencies/update/queries/update_links.py | 16 ++ .../data_sources/add/queries/add_links.py | 17 ++ .../impl/data_sources/add/queries/cte.py | 37 ++++- .../impl/data_sources/add/queries/get.py | 10 ++ .../impl/data_sources/add/queries/prereq.py | 9 ++ .../impl/data_sources/delete/queries/cte.py | 28 +++- 
.../delete/queries/delete_flags.py | 16 ++ .../delete/queries/delete_links.py | 16 ++ .../impl/data_sources/delete/queries/get.py | 10 ++ .../data_sources/delete/queries/prereq.py | 9 ++ .../impl/data_sources/update/queries/cte.py | 42 +++++ .../impl/data_sources/update/queries/get.py | 10 ++ .../data_sources/update/queries/prereq.py | 9 ++ .../update/queries/update_links.py | 16 ++ .../impl/meta_urls/add/queries/add_links.py | 17 ++ .../impl/meta_urls/add/queries/cte.py | 30 +++- .../impl/meta_urls/add/queries/get.py | 10 ++ .../impl/meta_urls/add/queries/prereq.py | 9 ++ .../impl/meta_urls/delete/queries/cte.py | 29 ++++ .../meta_urls/delete/queries/delete_flags.py | 16 ++ .../meta_urls/delete/queries/delete_links.py | 16 ++ .../impl/meta_urls/delete/queries/get.py | 10 ++ .../impl/meta_urls/delete/queries/prereq.py | 9 ++ .../impl/meta_urls/update/queries/cte.py | 33 ++++ .../impl/meta_urls/update/queries/get.py | 10 ++ .../impl/meta_urls/update/queries/prereq.py | 9 ++ .../meta_urls/update/queries/update_links.py | 16 ++ src/core/tasks/url/loader.py | 148 +++++++++++++++--- src/core/tasks/url/models/entry.py | 4 +- .../url/optional_ds_metadata/sqlalchemy.py | 8 +- .../request.py => _templates/__init__.py} | 0 .../pdap/_templates/request_builder.py | 9 ++ .../pdap/impl/sync/meta_urls/add/request.py | 0 .../pdap/impl/sync/shared/models/mapping.py | 6 + 48 files changed, 827 insertions(+), 31 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py create mode 100644 
src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py rename src/external/pdap/{impl/sync/meta_urls/request.py => _templates/__init__.py} (100%) create mode 100644 src/external/pdap/_templates/request_builder.py create mode 100644 src/external/pdap/impl/sync/meta_urls/add/request.py create mode 100644 src/external/pdap/impl/sync/shared/models/mapping.py diff --git a/ENV.md b/ENV.md index b957bc11..d6ea4e78 100644 --- a/ENV.md +++ b/ENV.md @@ -70,6 +70,15 @@ Note that some tasks/subtasks are themselves enabled by other tasks. | `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | | `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | | `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `DS_APP_SYNC_AGENCY_ADD_FLAG` | Adds new agencies to the Data Sources App| +| `DS_APP_SYNC_AGENCY_UPDATE_FLAG` | Updates existing agencies in the Data Sources App| +| `DS_APP_SYNC_AGENCY_DELETE_FLAG` | Deletes agencies in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_ADD_FLAG` | Adds new data sources to the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_FLAG` | Updates existing data sources in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_DELETE_FLAG` | Deletes data sources in the Data Sources App| +| `DS_APP_SYNC_META_URL_ADD_FLAG` | Adds new meta URLs to the Data Sources App| +| `DS_APP_SYNC_META_URL_UPDATE_FLAG` | Updates existing meta URLs in the Data Sources App| +| `DS_APP_SYNC_META_URL_DELETE_FLAG` | Deletes meta URLs in the Data Sources App| ### URL Task Flags @@ -81,7 +90,6 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_HTML_TASK_FLAG` | URL HTML scraping task. | | `URL_RECORD_TYPE_TASK_FLAG` | Automatically assigns Record Types to URLs. | | `URL_AGENCY_IDENTIFICATION_TASK_FLAG` | Automatically assigns and suggests Agencies for URLs. | -| `URL_SUBMIT_APPROVED_TASK_FLAG` | Submits approved URLs to the Data Sources App. | | `URL_MISC_METADATA_TASK_FLAG` | Adds misc metadata to URLs. | | `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. | | `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. | @@ -90,7 +98,6 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. | | `URL_AUTO_NAME_TASK_FLAG` | Automatically names URLs. | | `URL_SUSPEND_TASK_FLAG` | Suspends URLs meeting suspension criteria. | -| `URL_SUBMIT_META_URLS_TASK_FLAG` | Submits meta URLs to the Data Sources App. 
| ### Agency ID Subtasks diff --git a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py index 153b4fe6..181447a4 100644 --- a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py +++ b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py @@ -10,7 +10,7 @@ from alembic import op import sqlalchemy as sa -from src.util.alembic_helpers import created_at_column +from src.util.alembic_helpers import created_at_column, updated_at_column, create_updated_at_trigger # revision identifiers, used by Alembic. revision: str = 'a57c3b5b6e93' @@ -90,6 +90,9 @@ def _add_link_table_modification_triggers(): ) + + + def upgrade() -> None: _create_sync_log() _create_ds_agency_link() @@ -102,6 +105,16 @@ def upgrade() -> None: _add_flag_deletion_tables() _add_last_synced_at_columns() _add_link_table_modification_triggers() + _add_updated_at_to_optional_data_source_metadata_table() + +def _add_updated_at_to_optional_data_source_metadata_table(): + op.add_column( + "url_optional_data_source_metadata", + updated_at_column() + ) + create_updated_at_trigger( + "url_optional_data_source_metadata" + ) def _add_last_synced_at_columns(): op.add_column( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py new file mode 100644 index 00000000..c35eb463 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py @@ -0,0 +1,17 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping + + +class DSAppSyncAgenciesAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSSyncIDMapping] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py index 7ea9742b..5335ea44 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py @@ -1,4 +1,32 @@ """ Agencies to be added to the DS database must not have a ds app link entry -""" \ No newline at end of file +""" +from sqlalchemy import Column, select, exists, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency + + +class DSAppLinkSyncAgencyAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + Agency.id + ) + .where( + ~exists( + select(DSAppLinkAgency.agency_id) + .where(DSAppLinkAgency.agency_id == Agency.id) + ) + ).cte() + ) + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py new file mode 100644 index 00000000..308f564e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest + + +class DSAppSyncAgenciesAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py new file mode 100644 index 00000000..89b85367 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py index 69d6150a..311f4a26 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py @@ -1,3 +1,29 @@ """ Agencies to be deleted from the DS database must be flagged for deletion -""" \ No newline at end of file +""" +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency + + +class DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkAgency.ds_agency_id + ) + .join( + FlagDSDeleteAgency, + FlagDSDeleteAgency.ds_agency_id == DSAppLinkAgency.ds_agency_id + ).cte() + ) + + @property + def ds_agency_id(self) -> Column[int]: + return self._cte.columns.ds_agency_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py new file mode 100644 index 00000000..4dca28d3 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py new file mode 100644 index 00000000..1a386e8a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: 
AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py new file mode 100644 index 00000000..8ca4ba6c --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py new file mode 100644 index 00000000..9af2af99 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py index e69de29b..3025c7e2 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency + + +class DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkAgency.agency_id, + DSAppLinkAgency.ds_agency_id, + ) + .join( + Agency, + Agency.id == DSAppLinkAgency.agency_id, + ) + .where( + Agency.updated_at > DSAppLinkAgency.last_synced_at + ).cte() + ) + + @property + def ds_agency_id(self) -> Column[int]: + return self._cte.columns.ds_agency_id + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.agency_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py new file mode 100644 index 00000000..43df5d78 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest + + +class DSAppSyncAgenciesUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py new file mode 100644 index 00000000..8eb8bc3f --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py new file mode 100644 index 00000000..5e9288b7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesUpdateModifyLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py new file mode 100644 index 00000000..b819e4b2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py @@ -0,0 +1,17 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping + + +class DSAppSyncDataSourcesAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSSyncIDMapping] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py index a11d3d1d..51aa030f 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py @@ -1,4 +1,39 @@ """ Data sources to be added to the DS database must not have a ds app link entry -""" \ No newline at end of file +""" +from sqlalchemy import select, exists, CTE, Column + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource + + +class DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + URL.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLType.DATA_SOURCE, + ~exists( + select(DSAppLinkDataSource.url_id) + .where(DSAppLinkDataSource.url_id == URL.id) + ) + ).cte() + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py 
new file mode 100644 index 00000000..4f73973e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest + + +class DSAppSyncDataSourcesAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py new file mode 100644 index 00000000..6f8ac04e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py index 1e555125..8e5100f8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -1,3 +1,29 @@ """ Data sources to be deleted from the DS database must be flagged for deletion -""" \ No newline at end of file +""" +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource + + +class DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkDataSource.ds_data_source_id + ) + .join( + FlagDSDeleteDataSource, + FlagDSDeleteDataSource.ds_data_source_id == DSAppLinkDataSource.ds_data_source_id + ).cte() + ) + + @property + def ds_meta_url_id(self) -> Column[int]: + return self._cte.columns.ds_data_source_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py new file mode 100644 index 00000000..d9f31130 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py new file mode 100644 index 00000000..547944be --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py new file mode 100644 index 00000000..ec7724c8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py new file mode 100644 index 00000000..03843e0a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py index e69de29b..5a3d8120 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py @@ -0,0 +1,42 @@ +from sqlalchemy import select, or_, Column, CTE + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata + + +class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkDataSource.url_id, + DSAppLinkDataSource.ds_data_source_id, + ) + .join( + URL, + URL.id == DSAppLinkDataSource.url_id, + ) + .join( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .where( + or_( + URL.updated_at > DSAppLinkDataSource.last_synced_at, + URLOptionalDataSourceMetadata.updated_at > DSAppLinkDataSource.last_synced_at, + ) + ).cte() + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def ds_data_source_id(self) -> Column[int]: + return self._cte.columns.ds_data_source_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py new file mode 100644 index 
00000000..f5e0703b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest + + +class DSAppSyncDataSourcesUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py new file mode 100644 index 00000000..08965cce --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py new file mode 100644 index 00000000..42945b53 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py new file mode 100644 index 00000000..ff9ecfc7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py @@ -0,0 +1,17 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping + + +class DSAppSyncMetaURLsAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSSyncIDMapping] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py index 3776b2ed..eabd5da2 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py @@ -1,4 +1,32 @@ """ Meta URLs to be added to the DS database must not have a ds app link entry -""" \ No newline at end of file +""" +from sqlalchemy import select, exists, Column, CTE + +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.models.views.meta_url import MetaURL + + +class 
DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + MetaURL.url_id + ) + .where( + ~exists( + select(DSAppLinkMetaURL.url_id) + .where(DSAppLinkMetaURL.url_id == MetaURL.url_id) + ) + ).cte() + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py new file mode 100644 index 00000000..998ac642 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.agencies.add.core import AddMetaURLsOuterRequest + + +class DSAppSyncMetaURLsAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py new file mode 100644 index 00000000..c495f741 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py index e69de29b..94e69457 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py @@ -0,0 +1,29 @@ +""" +Meta URLs to be deleted from the DS database must be flagged for deletion +""" +from sqlalchemy import Column, CTE, select + +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL + + +class DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkMetaURL.ds_meta_url_id + ) + .join( + FlagDSDeleteMetaURL, + FlagDSDeleteMetaURL.ds_meta_url_id == DSAppLinkMetaURL.ds_meta_url_id + ).cte() + ) + + @property + def ds_meta_url_id(self) -> Column[int]: + return self._cte.columns.ds_meta_url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py new file mode 100644 index 00000000..9313d41d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + 
async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py new file mode 100644 index 00000000..d03c5512 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py new file mode 100644 index 00000000..6a1505c5 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py new file mode 100644 index 00000000..6e1824ea --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py index e69de29b..7ea81c6a 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py @@ -0,0 +1,33 @@ +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL + +class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkMetaURL.url_id, + DSAppLinkMetaURL.ds_meta_url_id, + ) + .join( + URL, + URL.id == DSAppLinkMetaURL.url_id, + ) + .where( + URL.updated_at > DSAppLinkMetaURL.last_synced_at, + ).cte() + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def ds_meta_url_id(self) -> Column[int]: + return self._cte.columns.ds_meta_url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py new file mode 100644 index 00000000..17e6742b 
--- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -0,0 +1,10 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest + + +class DSAppSyncMetaURLsUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py new file mode 100644 index 00000000..ec4b3de1 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py @@ -0,0 +1,9 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py new file mode 100644 index 00000000..cccc7471 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py @@ -0,0 +1,16 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + async def run(self, session: AsyncSession) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index b5910f5e..5632f11e 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -5,6 +5,17 @@ from environs import Env from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core import DSAppSyncMetaURLsAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.loader import 
AgencyIdentificationSubtaskLoader @@ -96,25 +107,7 @@ def _get_agency_identification_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_AGENCY_IDENTIFICATION_TASK_FLAG") ) - def _get_submit_approved_url_task_operator(self) -> URLTaskEntry: - operator = SubmitApprovedURLTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ) - return URLTaskEntry( - operator=operator, - enabled=self.setup_flag("URL_SUBMIT_APPROVED_TASK_FLAG") - ) - def _get_submit_meta_urls_task_operator(self) -> URLTaskEntry: - operator = SubmitMetaURLsTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ) - return URLTaskEntry( - operator=operator, - enabled=self.setup_flag("URL_SUBMIT_META_URLS_TASK_FLAG") - ) def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry: operator = URLMiscellaneousMetadataTaskOperator( @@ -204,6 +197,110 @@ def _get_suspend_url_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_SUSPEND_TASK_FLAG") ) + # TODO: Double check env var flags + # DS App Sync + ## Agency + ### Add + def _get_ds_app_sync_agency_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_agency_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_agency_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + ) + + ## Data Source + ### Add + def _get_ds_app_sync_data_source_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_data_source_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_data_source_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") + ) + + ## Meta URL + ### Add + def _get_ds_app_sync_meta_url_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_meta_url_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + 
operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_meta_url_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") + ) + async def load_entries(self) -> list[URLTaskEntry]: return [ @@ -213,12 +310,23 @@ async def load_entries(self) -> list[URLTaskEntry]: self._get_url_record_type_task_operator(), self._get_agency_identification_task_operator(), self._get_url_miscellaneous_metadata_task_operator(), - self._get_submit_approved_url_task_operator(), - self._get_submit_meta_urls_task_operator(), self._get_url_auto_relevance_task_operator(), self._get_url_screenshot_task_operator(), self._get_location_id_task_operator(), self._get_auto_validate_task_operator(), self._get_auto_name_task_operator(), self._get_suspend_url_task_operator(), + # DS App Sync + ## Agency + self._get_ds_app_sync_agency_add_task_operator(), + self._get_ds_app_sync_agency_update_task_operator(), + self._get_ds_app_sync_agency_delete_task_operator(), + ## Data Source + self._get_ds_app_sync_data_source_add_task_operator(), + self._get_ds_app_sync_data_source_update_task_operator(), + self._get_ds_app_sync_data_source_delete_task_operator(), + ## Meta URL + self._get_ds_app_sync_meta_url_add_task_operator(), + self._get_ds_app_sync_meta_url_update_task_operator(), + self._get_ds_app_sync_meta_url_delete_task_operator(), ] diff --git a/src/core/tasks/url/models/entry.py b/src/core/tasks/url/models/entry.py index eeb09047..69269c1e 100644 --- a/src/core/tasks/url/models/entry.py +++ b/src/core/tasks/url/models/entry.py @@ -1,5 +1,7 @@ from pydantic import BaseModel +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase @@ -8,5 +10,5 @@ class URLTaskEntry(BaseModel): class Config: arbitrary_types_allowed = True - operator: URLTaskOperatorBase + operator: URLTaskOperatorBase | DSSyncTaskOperatorBase enabled: bool \ No newline at end of file diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 3f6e239b..b3b49ce0 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -4,11 +4,15 @@ from src.db.models.helpers import enum_column from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, \ RetentionScheduleEnum, UpdateMethodEnum -from src.db.models.mixins import URLDependentMixin +from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin from src.db.models.templates_.with_id import WithIDBase -class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): +class URLOptionalDataSourceMetadata( + URLDependentMixin, + WithIDBase, + UpdatedAtMixin +): __tablename__ = 'url_optional_data_source_metadata' record_formats = Column(ARRAY(String), nullable=True) diff --git a/src/external/pdap/impl/sync/meta_urls/request.py b/src/external/pdap/_templates/__init__.py similarity index 100% rename from src/external/pdap/impl/sync/meta_urls/request.py rename to src/external/pdap/_templates/__init__.py diff --git 
a/src/external/pdap/_templates/request_builder.py b/src/external/pdap/_templates/request_builder.py new file mode 100644 index 00000000..e74c87e5 --- /dev/null +++ b/src/external/pdap/_templates/request_builder.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod + +from pdap_access_manager import AccessManager + +class PDAPClientRequestBuilderBase(ABC): + + @abstractmethod + async def run(self, access_manager: AccessManager): + raise NotImplementedError \ No newline at end of file diff --git a/src/external/pdap/impl/sync/meta_urls/add/request.py b/src/external/pdap/impl/sync/meta_urls/add/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/mapping.py b/src/external/pdap/impl/sync/shared/models/mapping.py new file mode 100644 index 00000000..fd22bca2 --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/mapping.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class DSSyncIDMapping(BaseModel): + ds_app_link_id: int + entity_id: int \ No newline at end of file From 13e92cec52b2434c9ff225de5cb1cd5554110fdc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 10 Nov 2025 15:44:58 -0500 Subject: [PATCH 19/84] Complete pre-test draft --- ENV.md | 42 +++--- src/core/tasks/base/operator.py | 8 +- .../impl/sync_to_ds/impl/agencies/add/core.py | 37 ++++- .../impl/agencies/add/queries/add_links.py | 15 +- .../impl/agencies/add/queries/get.py | 68 ++++++++- .../impl/agencies/add/queries/prereq.py | 10 +- .../sync_to_ds/impl/agencies/delete/core.py | 51 ++++++- .../agencies/delete/queries/delete_flags.py | 8 +- .../agencies/delete/queries/delete_links.py | 8 +- .../impl/agencies/delete/queries/get.py | 20 ++- .../impl/agencies/delete/queries/prereq.py | 10 +- .../sync_to_ds/impl/agencies/update/core.py | 43 +++++- .../impl/agencies/update/queries/get.py | 69 ++++++++- .../impl/agencies/update/queries/prereq.py | 10 +- .../agencies/update/queries/update_links.py | 13 +- .../sync_to_ds/impl/data_sources/add/core.py | 39 ++++- .../data_sources/add/queries/add_links.py | 15 +- .../impl/data_sources/add/queries/get.py | 111 ++++++++++++++- .../impl/data_sources/add/queries/prereq.py | 10 +- .../impl/data_sources/delete/core.py | 51 ++++++- .../impl/data_sources/delete/queries/cte.py | 2 +- .../delete/queries/delete_flags.py | 8 +- .../delete/queries/delete_links.py | 8 +- .../impl/data_sources/delete/queries/get.py | 20 ++- .../data_sources/delete/queries/prereq.py | 10 +- .../impl/data_sources/update/core.py | 43 +++++- .../impl/data_sources/update/queries/get.py | 112 ++++++++++++++- .../data_sources/update/queries/prereq.py | 10 +- .../update/queries/update_links.py | 11 +- .../sync_to_ds/impl/meta_urls/add/core.py | 37 ++++- .../impl/meta_urls/add/queries/add_links.py | 15 +- .../impl/meta_urls/add/queries/get.py | 63 +++++++- .../impl/meta_urls/add/queries/prereq.py | 10 +- .../sync_to_ds/impl/meta_urls/delete/core.py | 51 ++++++- .../impl/meta_urls/delete/queries/cte.py | 2 +- .../meta_urls/delete/queries/delete_flags.py | 8 +- .../meta_urls/delete/queries/delete_links.py | 8 +- .../impl/meta_urls/delete/queries/get.py | 20 ++- .../impl/meta_urls/delete/queries/prereq.py | 10 +- .../sync_to_ds/impl/meta_urls/update/core.py | 43 +++++- .../impl/meta_urls/update/queries/cte.py | 2 +- .../impl/meta_urls/update/queries/get.py | 63 +++++++- .../impl/meta_urls/update/queries/prereq.py | 10 +- .../meta_urls/update/queries/update_links.py | 11 +- src/core/tasks/url/loader.py | 5 - .../url/operators/submit_approved/__init__.py | 0 
.../url/operators/submit_approved/convert.py | 19 --- .../url/operators/submit_approved/core.py | 50 ------- .../url/operators/submit_approved/filter.py | 11 -- .../submit_approved/queries/__init__.py | 0 .../operators/submit_approved/queries/cte.py | 31 ---- .../operators/submit_approved/queries/get.py | 68 --------- .../submit_approved/queries/has_validated.py | 18 --- .../submit_approved/queries/mark_submitted.py | 29 ---- .../url/operators/submit_approved/tdo.py | 26 ---- .../operators/submit_meta_urls/__init__.py | 0 .../url/operators/submit_meta_urls/core.py | 78 ---------- .../submit_meta_urls/queries/__init__.py | 0 .../operators/submit_meta_urls/queries/cte.py | 61 -------- .../operators/submit_meta_urls/queries/get.py | 34 ----- .../submit_meta_urls/queries/prereq.py | 20 --- src/db/client/async_.py | 5 - src/db/client/sync.py | 2 +- .../pdap/_templates/request_builder.py | 38 ++++- src/external/pdap/client.py | 77 +--------- src/external/pdap/impl/meta_urls/__init__.py | 0 src/external/pdap/impl/meta_urls/core.py | 58 -------- src/external/pdap/impl/meta_urls/enums.py | 7 - src/external/pdap/impl/meta_urls/request.py | 7 - src/external/pdap/impl/meta_urls/response.py | 11 -- .../sync/agencies/_shared/models/content.py | 5 + .../pdap/impl/sync/agencies/add/core.py | 35 +++-- .../pdap/impl/sync/agencies/delete/core.py | 22 +++ .../pdap/impl/sync/agencies/update/core.py | 19 +++ .../impl/sync/data_sources/_shared/content.py | 23 ++- .../impl/sync/data_sources/_shared/enums.py | 11 ++ .../pdap/impl/sync/data_sources/add/core.py | 24 ++++ .../impl/sync/data_sources/delete/core.py | 22 +++ .../impl/sync/data_sources/update/core.py | 19 +++ .../pdap/impl/sync/meta_urls/add/core.py | 25 ++++ .../pdap/impl/sync/meta_urls/add/request.py | 20 +++ .../pdap/impl/sync/meta_urls/delete/core.py | 24 ++++ .../pdap/impl/sync/meta_urls/update/core.py | 21 +++ .../url/impl/submit_approved/__init__.py | 0 .../tasks/url/impl/submit_approved/mock.py | 38 ----- .../tasks/url/impl/submit_approved/setup.py | 49 ------- .../test_submit_approved_url_task.py | 134 ------------------ .../test_validated_meta_url.py | 41 ------ .../url/impl/submit_meta_urls/__init__.py | 0 .../url/impl/submit_meta_urls/test_core.py | 79 ----------- .../tasks/url/loader/test_flags.py | 5 - .../data_creator/commands/impl/urls_/query.py | 2 +- .../data_creator/commands/impl/urls_/tdo.py | 12 ++ 93 files changed, 1434 insertions(+), 1066 deletions(-) delete mode 100644 src/core/tasks/url/operators/submit_approved/__init__.py delete mode 100644 src/core/tasks/url/operators/submit_approved/convert.py delete mode 100644 src/core/tasks/url/operators/submit_approved/core.py delete mode 100644 src/core/tasks/url/operators/submit_approved/filter.py delete mode 100644 src/core/tasks/url/operators/submit_approved/queries/__init__.py delete mode 100644 src/core/tasks/url/operators/submit_approved/queries/cte.py delete mode 100644 src/core/tasks/url/operators/submit_approved/queries/get.py delete mode 100644 src/core/tasks/url/operators/submit_approved/queries/has_validated.py delete mode 100644 src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py delete mode 100644 src/core/tasks/url/operators/submit_approved/tdo.py delete mode 100644 src/core/tasks/url/operators/submit_meta_urls/__init__.py delete mode 100644 src/core/tasks/url/operators/submit_meta_urls/core.py delete mode 100644 src/core/tasks/url/operators/submit_meta_urls/queries/__init__.py delete mode 100644 
src/core/tasks/url/operators/submit_meta_urls/queries/cte.py delete mode 100644 src/core/tasks/url/operators/submit_meta_urls/queries/get.py delete mode 100644 src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py delete mode 100644 src/external/pdap/impl/meta_urls/__init__.py delete mode 100644 src/external/pdap/impl/meta_urls/core.py delete mode 100644 src/external/pdap/impl/meta_urls/enums.py delete mode 100644 src/external/pdap/impl/meta_urls/request.py delete mode 100644 src/external/pdap/impl/meta_urls/response.py create mode 100644 src/external/pdap/impl/sync/data_sources/_shared/enums.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/__init__.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/mock.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/setup.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_meta_urls/__init__.py delete mode 100644 tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py create mode 100644 tests/helpers/data_creator/commands/impl/urls_/tdo.py diff --git a/ENV.md b/ENV.md index d6ea4e78..d4496dbc 100644 --- a/ENV.md +++ b/ENV.md @@ -57,28 +57,28 @@ Note that some tasks/subtasks are themselves enabled by other tasks. ### Scheduled Task Flags -| Flag | Description | -|-------------------------------------|-------------------------------------------------------------------------------| -| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | -| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | -| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | -| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | -| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | -| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | -| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | -| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | -| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | +| Flag | Description | +|----------------------------------------|-------------------------------------------------------------------------------| +| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | +| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | +| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | +| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | +| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | +| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | +| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | +| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | | `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. 
| -| `DS_APP_SYNC_AGENCY_ADD_FLAG` | Adds new agencies to the Data Sources App| -| `DS_APP_SYNC_AGENCY_UPDATE_FLAG` | Updates existing agencies in the Data Sources App| -| `DS_APP_SYNC_AGENCY_DELETE_FLAG` | Deletes agencies in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_ADD_FLAG` | Adds new data sources to the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_UPDATE_FLAG` | Updates existing data sources in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_DELETE_FLAG` | Deletes data sources in the Data Sources App| -| `DS_APP_SYNC_META_URL_ADD_FLAG` | Adds new meta URLs to the Data Sources App| -| `DS_APP_SYNC_META_URL_UPDATE_FLAG` | Updates existing meta URLs in the Data Sources App| -| `DS_APP_SYNC_META_URL_DELETE_FLAG` | Deletes meta URLs in the Data Sources App| +| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| +| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| +| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| +| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| +| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| +| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| ### URL Task Flags diff --git a/src/core/tasks/base/operator.py b/src/core/tasks/base/operator.py index 55d8033b..719abdf5 100644 --- a/src/core/tasks/base/operator.py +++ b/src/core/tasks/base/operator.py @@ -1,5 +1,6 @@ import traceback from abc import ABC, abstractmethod +from typing import Any from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo @@ -9,6 +10,7 @@ from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall +from src.db.queries.base.builder import QueryBuilderBase class TaskOperatorBase(ABC): @@ -90,4 +92,8 @@ async def add_task_errors( ) for error in errors ] - await self.adb_client.bulk_insert(inserts) \ No newline at end of file + await self.adb_client.bulk_insert(inserts) + + # Convenience forwarder functions + async def run_query_builder(self, query_builder: QueryBuilderBase) -> Any: + return await self.adb_client.run_query_builder(query_builder) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py index 2d43202d..ecc573da 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py @@ -1,4 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.add_links import \ + DSAppSyncAgenciesAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.get import DSAppSyncAgenciesAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.prereq import \ + DSAppSyncAgenciesAddPrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from 
src.external.pdap.impl.sync.agencies.add.core import AddAgenciesRequestBuilder +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncAgenciesAddTaskOperator( @@ -6,7 +14,32 @@ class DSAppSyncAgenciesAddTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.run_query_builder( + DSAppSyncAgenciesAddPrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: AddAgenciesOuterRequest = await self.get_request_input() + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + async def get_request_input(self) -> AddAgenciesOuterRequest: + return await self.run_query_builder( + DSAppSyncAgenciesAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddAgenciesOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddAgenciesRequestBuilder(request) + ) + + async def insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py index c35eb463..68b42aa6 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py @@ -1,17 +1,26 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncAgenciesAddInsertLinksQueryBuilder(QueryBuilderBase): def __init__( self, - mappings: list[DSSyncIDMapping] + mappings: list[DSAppSyncAddResponseInnerModel] ): super().__init__() self._mappings = mappings async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + inserts: list[DSAppLinkAgency] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkAgency( + ds_agency_id=mapping.ds_app_id, + agency_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py index 308f564e..a2ac4957 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py @@ -1,10 +1,74 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping, func from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.cte import \ + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.queries.base.builder import QueryBuilderBase -from 
src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest, AddAgenciesInnerRequest class DSAppSyncAgenciesAddGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncAgencyAddPrerequisitesCTEContainer() + + location_id_cte = ( + select( + LinkAgencyLocation.agency_id, + func.array_agg(LinkAgencyLocation.location_id).label("location_ids"), + ) + .join( + Agency, + Agency.id == cte.agency_id, + ) + .group_by( + LinkAgencyLocation.agency_id, + ) + .cte() + ) + + query = ( + select( + cte.agency_id, + Agency.name, + Agency.jurisdiction_type, + Agency.agency_type, + location_id_cte.c.location_ids, + ) + .join( + Agency, + Agency.id == cte.agency_id, + ) + .join( + location_id_cte, + location_id_cte.c.agency_id == cte.agency_id, + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddAgenciesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddAgenciesInnerRequest( + request_id=mapping.agency_id, + content=AgencySyncContentModel( + name=mapping[Agency.name], + jurisdiction_type=mapping[Agency.jurisdiction_type], + agency_type=mapping[Agency.agency_type], + location_ids=mapping["location_ids"] + ) + ) + ) + + return AddAgenciesOuterRequest( + agencies=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py index 89b85367..61097fc6 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.cte import \ + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncAgenciesAddPrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer().agency_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py index 73cbf343..26e78a96 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py @@ -1,4 +1,13 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.delete_flags import \ + DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.delete_links import \ + DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.get import \ + DSAppSyncAgenciesDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.prereq import \ + 
DSAppSyncAgenciesDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.agencies.delete.core import DeleteAgenciesRequestBuilder class DSAppSyncAgenciesDeleteTaskOperator( @@ -6,7 +15,45 @@ class DSAppSyncAgenciesDeleteTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesDeletePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + ds_app_ids: list[int] = await self.get_inputs() + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def get_inputs(self) -> list[int]: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteAgenciesRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py index 4dca28d3..f1633337 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_agency_ids = ds_agency_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(FlagDSDeleteAgency) + .where(FlagDSDeleteAgency.ds_agency_id.in_(self._ds_agency_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py index 1a386e8a..0ad20ee0 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_agency_ids = ds_agency_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(DSAppLinkAgency) + .where(DSAppLinkAgency.ds_agency_id.in_(self._ds_agency_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py index 8ca4ba6c..36dddee4 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py @@ -1,5 +1,10 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.cte import \ + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -7,4 +12,17 @@ class DSAppSyncAgenciesDeleteGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[int]: """Get DS App links to delete.""" - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_agency_id, + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + return [mapping[cte.ds_agency_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py index 9af2af99..fdafab72 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.cte import \ + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncAgenciesDeletePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer().ds_agency_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py index 55eb8e3a..6ded28cc 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py @@ -1,4 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.get import \ + DSAppSyncAgenciesUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.prereq import \ + DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.update_links import \ + DSAppSyncAgenciesUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.agencies.update.core import UpdateAgenciesRequestBuilder +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest class DSAppSyncAgenciesUpdateTaskOperator( @@ -6,7 +14,38 @@ class DSAppSyncAgenciesUpdateTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.adb_client.run_query_builder( + 
DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: UpdateAgenciesOuterRequest = await self.get_inputs() + await self.make_request(request) + ds_app_ids: list[int] = [ + agency.app_id + for agency in request.agencies + ] + await self.update_links(ds_app_ids) + + async def get_inputs(self) -> UpdateAgenciesOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateAgenciesOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateAgenciesRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncAgenciesUpdateAlterLinksQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py index 43df5d78..7dc4329e 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py @@ -1,10 +1,75 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.cte import \ + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest class DSAppSyncAgenciesUpdateGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer() + + location_id_cte = ( + select( + LinkAgencyLocation.agency_id, + func.array_agg(LinkAgencyLocation.location_id).label("location_ids"), + ) + .join( + Agency, + Agency.id == cte.agency_id, + ) + .group_by( + LinkAgencyLocation.agency_id, + ) + .cte() + ) + + query = ( + select( + cte.ds_agency_id, + Agency.name, + Agency.jurisdiction_type, + Agency.agency_type, + location_id_cte.c.location_ids, + ) + .join( + Agency, + Agency.id == cte.agency_id, + ) + .join( + location_id_cte, + location_id_cte.c.agency_id == cte.agency_id, + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateAgenciesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateAgenciesInnerRequest( + app_id=mapping[DSAppLinkAgency.ds_agency_id], + content=AgencySyncContentModel( + name=mapping[Agency.name], + jurisdiction_type=mapping[Agency.jurisdiction_type], + agency_type=mapping[Agency.agency_type], + location_ids=mapping["location_ids"] + ) + ) + ) + + return UpdateAgenciesOuterRequest( + agencies=inner_requests, + ) \ No newline at end of file 
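Aside (illustrative, not part of the patch): the DS App Sync operators introduced above all share one shape: a prerequisites query builder gates the run, a "get" query builder assembles the outer request from the database, the PDAP client's request builder sends it, and a final query builder records or refreshes the DS App link rows. The sketch below shows that add-flow under simplified, assumed names (AddResponse, AddSyncOperatorSketch, and the injected callables are stand-ins, not project APIs).

from dataclasses import dataclass

# Sketch only; names are stand-ins for the patch's DSAppSyncAddResponseInnerModel,
# TaskOperatorBase.run_query_builder, and pdap_client.run_request_builder.
@dataclass
class AddResponse:
    request_id: int   # internal id sent in the request (e.g. agency_id or url_id)
    ds_app_id: int    # id assigned by the Data Sources App

class AddSyncOperatorSketch:
    """Mirrors the add operators: prereq -> build request -> call DS App -> insert links."""

    def __init__(self, run_query_builder, run_request_builder):
        self._run_query = run_query_builder
        self._run_request = run_request_builder

    async def run(self, prereq_qb, get_qb, request_builder_cls, link_qb_cls) -> None:
        if not await self._run_query(prereq_qb):        # meets_task_prerequisites()
            return
        request = await self._run_query(get_qb)         # get_request_input()
        responses: list[AddResponse] = await self._run_request(
            request_builder_cls(request)                # make_request()
        )
        await self._run_query(link_qb_cls(responses))   # insert_ds_app_links()

The update operators follow the same flow but stamp existing link rows via last_synced_at instead of inserting new ones, and the delete operators remove the delete flags and link rows after the request succeeds; each operator is gated by its DS_APP_SYNC_*_TASK_FLAG documented in ENV.md.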
diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py index 8eb8bc3f..5327f4a8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.cte import \ + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer().agency_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py index 5e9288b7..8950ccd6 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py @@ -1,9 +1,11 @@ +from sqlalchemy import update, func from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from src.db.queries.base.builder import QueryBuilderBase -class DSAppSyncAgenciesUpdateModifyLinksQueryBuilder(QueryBuilderBase): +class DSAppSyncAgenciesUpdateAlterLinksQueryBuilder(QueryBuilderBase): def __init__( self, @@ -13,4 +15,11 @@ def __init__( self._ds_agency_ids = ds_agency_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + update(DSAppLinkAgency) + .where(DSAppLinkAgency.ds_agency_id.in_(self._ds_agency_ids)) + .values({ + DSAppLinkAgency.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py index ea307cb5..1385caa0 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py @@ -1,4 +1,13 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.add_links import \ + DSAppSyncDataSourcesAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.get import \ + DSAppSyncDataSourcesAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.prereq import \ + DSAppSyncDataSourcesAddPrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.data_sources.add.core import AddDataSourcesRequestBuilder +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncDataSourcesAddTaskOperator( @@ -6,8 +15,34 @@ class DSAppSyncDataSourcesAddTaskOperator( ): async def meets_task_prerequisites(self) -> bool: -
raise NotImplementedError + return await self.run_query_builder( + DSAppSyncDataSourcesAddPrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: AddDataSourcesOuterRequest = await self.get_request_input() + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + + async def get_request_input(self) -> AddDataSourcesOuterRequest: + return await self.run_query_builder( + DSAppSyncDataSourcesAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddDataSourcesOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddDataSourcesRequestBuilder(request) + ) + + async def insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py index b819e4b2..e1bf4bf9 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py @@ -1,17 +1,26 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncDataSourcesAddInsertLinksQueryBuilder(QueryBuilderBase): def __init__( self, - mappings: list[DSSyncIDMapping] + mappings: list[DSAppSyncAddResponseInnerModel] ): super().__init__() self._mappings = mappings async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + inserts: list[DSAppLinkDataSource] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkDataSource( + ds_data_source_id=mapping.ds_app_id, + url_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 4f73973e..17ed5d04 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -1,10 +1,117 @@ +from typing import Sequence + +from sqlalchemy import RowMapping, func, select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ + DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel
+from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest class DSAppSyncDataSourcesAddGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.url_id, + # Required + URL.full_url, + URL.name, + URLRecordType.record_type, + agency_id_cte.c.agency_ids, + # Optional + URL.description, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types, + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .join( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddDataSourcesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddDataSourcesInnerRequest( + request_id=mapping[cte.url_id], + content=DataSourceSyncContentModel( + # Required + source_url=mapping[URL.full_url], + name=mapping[URL.name], + record_type=mapping[URLRecordType.record_type], + agency_ids=mapping["agency_ids"], + # Optional + description=mapping[URL.description], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types], + ) + ) + ) + + return AddDataSourcesOuterRequest( + data_sources=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py index 
6f8ac04e..d375f524 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ + DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncDataSourcesAddPrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer().url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py index b49b73c9..adccb03b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py @@ -1,4 +1,13 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.delete_flags import \ + DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.delete_links import \ + DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.get import \ + DSAppSyncDataSourcesDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.prereq import \ + DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.data_sources.delete.core import DeleteDataSourcesRequestBuilder class DSAppSyncDataSourcesDeleteTaskOperator( @@ -6,7 +15,45 @@ class DSAppSyncDataSourcesDeleteTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.run_query_builder( + DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + ds_app_ids: list[int] = await self.get_inputs() + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def get_inputs(self) -> list[int]: + return await self.run_query_builder( + DSAppSyncDataSourcesDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteDataSourcesRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py index 8e5100f8..1b9f2479 100644 --- 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -21,7 +21,7 @@ def __init__(self): ) @property - def ds_meta_url_id(self) -> Column[int]: + def ds_data_source_id(self) -> Column[int]: return self._cte.columns.ds_data_source_id @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py index d9f31130..ef869a9c 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_data_source_ids = ds_data_source_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(FlagDSDeleteDataSource) + .where(FlagDSDeleteDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py index 547944be..9b417ce8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_data_source_ids = ds_data_source_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(DSAppLinkDataSource) + .where(DSAppLinkDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py index ec7724c8..7077beac 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py @@ -1,5 +1,10 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.cte import \ + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -7,4 +12,17 @@ class DSAppSyncDataSourcesDeleteGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[int]: """Get DS App links to delete.""" - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_data_source_id, + ) + ) + + mappings: Sequence[RowMapping] = await 
self.sh.mappings( + session=session, + query=query, + ) + + return [mapping[cte.ds_data_source_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py index 03843e0a..5df5781c 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.cte import \ + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer().ds_data_source_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py index 1947c202..08a8405b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py @@ -1,4 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.get import \ + DSAppSyncDataSourcesUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.prereq import \ + DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.update_links import \ + DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.data_sources.update.core import UpdateDataSourcesRequestBuilder +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest class DSAppSyncDataSourcesUpdateTaskOperator( @@ -6,7 +14,38 @@ class DSAppSyncDataSourcesUpdateTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: UpdateDataSourcesOuterRequest = await self.get_inputs() + await self.make_request(request) + ds_app_ids: list[int] = [ + ds.app_id + for ds in request.data_sources + ] + await self.update_links(ds_app_ids) + + async def get_inputs(self) -> UpdateDataSourcesOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateDataSourcesOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateDataSourcesRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index f5e0703b..3e802656 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -1,10 +1,118 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest, \ + UpdateDataSourcesInnerRequest class DSAppSyncDataSourcesUpdateGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.ds_data_source_id, + # Required + URL.full_url, + URL.name, + URLRecordType.record_type, + agency_id_cte.c.agency_ids, + # Optional + URL.description, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types, + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .join( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateDataSourcesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateDataSourcesInnerRequest( + app_id=mapping[cte.ds_data_source_id], + content=DataSourceSyncContentModel( + # Required + source_url=mapping[URL.full_url], + name=mapping[URL.name], + record_type=mapping[URLRecordType.record_type], + agency_ids=mapping["agency_ids"], + # Optional + description=mapping[URL.description], + 
record_formats=mapping[URLOptionalDataSourceMetadata.record_formats], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types], + ) + ) + ) + + return UpdateDataSourcesOuterRequest( + data_sources=inner_requests, + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py index 08965cce..e31ff1d7 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer().ds_data_source_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py index 42945b53..ffba7ec8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py @@ -1,5 +1,7 @@ +from sqlalchemy import update, func from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,11 @@ def __init__( self._ds_data_source_ids = ds_data_source_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + update(DSAppLinkDataSource) + .where(DSAppLinkDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + .values({ + DSAppLinkDataSource.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py index 54fe1c90..9abbe11d 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py @@ -1,4 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.add_links import \ + DSAppSyncMetaURLsAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.get import DSAppSyncMetaURLsAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.prereq import \ + DSAppSyncMetaURLsAddPrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.meta_urls.add.core import AddMetaURLsRequestBuilder +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncMetaURLsAddTaskOperator( @@ -6,7 +14,32 @@ class DSAppSyncMetaURLsAddTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.run_query_builder( + DSAppSyncMetaURLsAddPrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: AddMetaURLsOuterRequest = await self.get_request_input() + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + async def get_request_input(self) -> AddMetaURLsOuterRequest: + return await self.run_query_builder( + DSAppSyncMetaURLsAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddMetaURLsOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddMetaURLsRequestBuilder(request) + ) + + async def insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py index ff9ecfc7..648b3d25 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py @@ -1,17 +1,26 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.shared.models.mapping import DSSyncIDMapping +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel class DSAppSyncMetaURLsAddInsertLinksQueryBuilder(QueryBuilderBase): def __init__( self, - mappings: list[DSSyncIDMapping] + mappings: list[DSAppSyncAddResponseInnerModel] ): super().__init__() self._mappings = mappings async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + inserts: list[DSAppLinkMetaURL] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkMetaURL( + ds_meta_url_id=mapping.ds_app_id, + url_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py index 998ac642..5493c595 100644 --- 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -1,10 +1,69 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.agencies.add.core import AddMetaURLsOuterRequest +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest class DSAppSyncMetaURLsAddGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.url_id, + URL.full_url, + agency_id_cte.c.agency_ids + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddMetaURLsInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddMetaURLsInnerRequest( + request_id=mapping[cte.url_id], + content=MetaURLSyncContentModel( + url=mapping[URL.full_url], + agency_ids=mapping["agency_ids"] + ) + ) + ) + + return AddMetaURLsOuterRequest( + meta_urls=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py index c495f741..9439b6d0 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncMetaURLsAddPrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer().url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py index 00d2c225..d67880f3 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py @@ -1,4 +1,13 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.delete_flags import \ + 
DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.delete_links import \ + DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.get import \ + DSAppSyncMetaURLsDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.prereq import \ + DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.meta_urls.delete.core import DeleteMetaURLsRequestBuilder class DSAppSyncMetaURLsDeleteTaskOperator( @@ -6,7 +15,45 @@ class DSAppSyncMetaURLsDeleteTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.run_query_builder( + DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + ds_app_ids: list[int] = await self.get_inputs() + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def get_inputs(self) -> list[int]: + return await self.run_query_builder( + DSAppSyncMetaURLsDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteMetaURLsRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py index 94e69457..8a6fe844 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py @@ -7,7 +7,7 @@ from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL -class DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer: +class DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer: def __init__(self): self._cte = ( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py index 9313d41d..4bee4ccc 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_meta_url_ids = ds_meta_url_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(FlagDSDeleteMetaURL) + .where(FlagDSDeleteMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py index d03c5512..0fb66bb5 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py @@ -1,5 +1,7 @@ +from sqlalchemy import delete from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,8 @@ def __init__( self._ds_meta_url_ids = ds_meta_url_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + delete(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py index 6a1505c5..f1d232f7 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py @@ -1,5 +1,10 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.cte import \ + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -7,4 +12,17 @@ class DSAppSyncMetaURLsDeleteGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> list[int]: """Get DS App links to delete.""" - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_meta_url_id, + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + return [mapping[cte.ds_meta_url_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py index 6e1824ea..8bc7dbd8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.cte import \ + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer().ds_meta_url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py index 387d52d2..a9f85918 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py @@ -1,4 +1,12 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.get import \ + DSAppSyncMetaURLsUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.prereq import \ + DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.update_links import \ + DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.external.pdap.impl.sync.meta_urls.update.core import UpdateMetaURLsRequestBuilder +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest class DSAppSyncMetaURLsUpdateTaskOperator( @@ -6,7 +14,38 @@ class DSAppSyncMetaURLsUpdateTaskOperator( ): async def meets_task_prerequisites(self) -> bool: - raise NotImplementedError + return await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder() + ) async def inner_task_logic(self) -> None: - raise NotImplementedError \ No newline at end of file + request: UpdateMetaURLsOuterRequest = await self.get_inputs() + await self.make_request(request) + ds_app_ids: list[int] = [ + meta_url.app_id + for meta_url in request.meta_urls + ] + await self.update_links(ds_app_ids) + + async def get_inputs(self) -> UpdateMetaURLsOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateMetaURLsOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateMetaURLsRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py index 7ea81c6a..a60d02fd 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py @@ -3,7 +3,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL -class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer: +class DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer: def __init__(self): self._cte = ( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py index 17e6742b..2460aee3 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -1,10 +1,69 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.sync.meta_urls.update.request import 
UpdateMetaURLsOuterRequest +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest class DSAppSyncMetaURLsUpdateGetQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: - raise NotImplementedError \ No newline at end of file + cte = DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.ds_meta_url_id, + URL.full_url, + agency_id_cte.c.agency_ids + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateMetaURLsInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateMetaURLsInnerRequest( + app_id=mapping[cte.ds_meta_url_id], + content=MetaURLSyncContentModel( + url=mapping[URL.full_url], + agency_ids=mapping["agency_ids"] + ) + ) + ) + + return UpdateMetaURLsOuterRequest( + meta_urls=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py index ec4b3de1..761bb2c5 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py @@ -1,9 +1,17 @@ +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase class DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: - raise NotImplementedError \ No newline at end of file + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer().ds_meta_url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py index cccc7471..baafcaa8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py @@ -1,5 +1,7 @@ +from sqlalchemy import update, func from sqlalchemy.ext.asyncio import AsyncSession +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.db.queries.base.builder import QueryBuilderBase @@ -13,4 +15,11 @@ def __init__( self._ds_meta_url_ids = ds_meta_url_ids async def run(self, session: AsyncSession) -> None: - raise NotImplementedError \ No newline at end of file + statement = ( + update(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + .values({ + DSAppLinkMetaURL.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/url/loader.py 
b/src/core/tasks/url/loader.py index 5632f11e..70c3eebe 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -32,8 +32,6 @@ from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator from src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator from src.core.tasks.url.operators.suspend.core import SuspendURLTaskOperator from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient @@ -107,8 +105,6 @@ def _get_agency_identification_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_AGENCY_IDENTIFICATION_TASK_FLAG") ) - - def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry: operator = URLMiscellaneousMetadataTaskOperator( adb_client=self.adb_client @@ -197,7 +193,6 @@ def _get_suspend_url_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_SUSPEND_TASK_FLAG") ) - # TODO: Double check env var flags # DS App Sync ## Agency ### Add diff --git a/src/core/tasks/url/operators/submit_approved/__init__.py b/src/core/tasks/url/operators/submit_approved/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/submit_approved/convert.py b/src/core/tasks/url/operators/submit_approved/convert.py deleted file mode 100644 index 1c4a8298..00000000 --- a/src/core/tasks/url/operators/submit_approved/convert.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall - - -async def convert_to_task_errors( - submitted_url_infos: list[SubmittedURLInfo] -) -> list[URLTaskErrorSmall]: - task_errors: list[URLTaskErrorSmall] = [] - error_response_objects = [ - response_object for response_object in submitted_url_infos - if response_object.request_error is not None - ] - for error_response_object in error_response_objects: - error_info = URLTaskErrorSmall( - url_id=error_response_object.url_id, - error=error_response_object.request_error, - ) - task_errors.append(error_info) - return task_errors diff --git a/src/core/tasks/url/operators/submit_approved/core.py b/src/core/tasks/url/operators/submit_approved/core.py deleted file mode 100644 index e16a1269..00000000 --- a/src/core/tasks/url/operators/submit_approved/core.py +++ /dev/null @@ -1,50 +0,0 @@ -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.submit_approved.convert import convert_to_task_errors -from src.core.tasks.url.operators.submit_approved.filter import filter_successes -from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall -from src.external.pdap.client import PDAPClient - - -class SubmitApprovedURLTaskOperator(URLTaskOperatorBase): - 
- def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self): - return TaskType.SUBMIT_APPROVED - - async def meets_task_prerequisites(self): - return await self.adb_client.run_query_builder(HasValidatedURLsQueryBuilder()) - - async def inner_task_logic(self): - # Retrieve all URLs that are validated and not submitted - tdos: list[SubmitApprovedURLTDO] = await self.get_validated_urls() - - # Link URLs to this task - await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) - - # Submit each URL, recording errors if they exist - submitted_url_infos: list[SubmittedURLInfo] = await self.pdap_client.submit_data_source_urls(tdos) - - task_errors: list[URLTaskErrorSmall] = await convert_to_task_errors(submitted_url_infos) - success_infos = await filter_successes(submitted_url_infos) - - # Update the database for successful submissions - await self.adb_client.mark_urls_as_submitted(infos=success_infos) - - # Update the database for failed submissions - await self.add_task_errors(task_errors) - - async def get_validated_urls(self) -> list[SubmitApprovedURLTDO]: - return await self.adb_client.run_query_builder(GetValidatedURLsQueryBuilder()) diff --git a/src/core/tasks/url/operators/submit_approved/filter.py b/src/core/tasks/url/operators/submit_approved/filter.py deleted file mode 100644 index 4ba2fad8..00000000 --- a/src/core/tasks/url/operators/submit_approved/filter.py +++ /dev/null @@ -1,11 +0,0 @@ -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo - - -async def filter_successes( - submitted_url_infos: list[SubmittedURLInfo] -) -> list[SubmittedURLInfo]: - success_infos = [ - response_object for response_object in submitted_url_infos - if response_object.data_source_id is not None - ] - return success_infos diff --git a/src/core/tasks/url/operators/submit_approved/queries/__init__.py b/src/core/tasks/url/operators/submit_approved/queries/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/submit_approved/queries/cte.py b/src/core/tasks/url/operators/submit_approved/queries/cte.py deleted file mode 100644 index 47aad8e3..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/cte.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlalchemy import CTE, select, exists -from sqlalchemy.orm import aliased - -from src.collectors.enums import URLStatus -from src.db.enums import TaskType -from src.db.helpers.query import not_exists_url, no_url_task_error -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource - -VALIDATED_URLS_WITHOUT_DS_SQ =( - select(URL) - .join( - FlagURLValidated, - FlagURLValidated.url_id == URL.id - ) - .where( - URL.status == URLStatus.OK, - URL.name.isnot(None), - FlagURLValidated.type == URLType.DATA_SOURCE, - not_exists_url(DSAppLinkDataSource), - no_url_task_error(TaskType.SUBMIT_APPROVED) - ) - .subquery() -) - -VALIDATED_URLS_WITHOUT_DS_ALIAS = aliased( - URL, - VALIDATED_URLS_WITHOUT_DS_SQ -) \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py deleted file mode 100644 index 96621cb8..00000000 --- 
a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ /dev/null @@ -1,68 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload - -from src.core.tasks.url.operators.submit_approved.queries.cte import VALIDATED_URLS_WITHOUT_DS_ALIAS -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class GetValidatedURLsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[SubmitApprovedURLTDO]: - query = await self._build_query() - urls = await sh.scalars(session, query) - return await self._process_results(urls) - - async def _process_results(self, urls): - results: list[SubmitApprovedURLTDO] = [] - for url in urls: - try: - tdo = await self._process_result(url) - except Exception as e: - raise ValueError(f"Failed to process url {url.id}") from e - results.append(tdo) - return results - - @staticmethod - async def _build_query(): - query = ( - select(VALIDATED_URLS_WITHOUT_DS_ALIAS) - .options( - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.optional_data_source_metadata), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.confirmed_agencies), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.reviewing_user), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.record_type), - ).limit(100) - ) - return query - - @staticmethod - async def _process_result(url: URL) -> SubmitApprovedURLTDO: - agency_ids = [] - for agency in url.confirmed_agencies: - agency_ids.append(agency.id) - optional_metadata = url.optional_data_source_metadata - if optional_metadata is None: - record_formats = None - data_portal_type = None - supplying_entity = None - else: - record_formats = optional_metadata.record_formats - data_portal_type = optional_metadata.data_portal_type - supplying_entity = optional_metadata.supplying_entity - tdo = SubmitApprovedURLTDO( - url_id=url.id, - url=url.full_url, - name=url.name, - agency_ids=agency_ids, - description=url.description, - record_type=url.record_type.record_type, - record_formats=record_formats, - data_portal_type=data_portal_type, - supplying_entity=supplying_entity, - approving_user_id=url.reviewing_user.user_id - ) - return tdo \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py deleted file mode 100644 index 2cbee486..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ /dev/null @@ -1,18 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.submit_approved.queries.cte import VALIDATED_URLS_WITHOUT_DS_ALIAS -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class HasValidatedURLsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> bool: - query = ( - select(VALIDATED_URLS_WITHOUT_DS_ALIAS) - .limit(1) - ) - url: URL | None = await sh.one_or_none(session, query=query) - return url is not None \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py deleted file 
mode 100644 index 3ad1a228..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ /dev/null @@ -1,29 +0,0 @@ -from sqlalchemy import update -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource -from src.db.queries.base.builder import QueryBuilderBase - - -class MarkURLsAsSubmittedQueryBuilder(QueryBuilderBase): - - def __init__(self, infos: list[SubmittedURLInfo]): - super().__init__() - self.infos = infos - - async def run(self, session: AsyncSession): - for info in self.infos: - url_id = info.url_id - data_source_id = info.data_source_id - - url_data_source_object = DSAppLinkDataSource( - url_id=url_id, - data_source_id=data_source_id - ) - if info.submitted_at is not None: - url_data_source_object.created_at = info.submitted_at - session.add(url_data_source_object) - diff --git a/src/core/tasks/url/operators/submit_approved/tdo.py b/src/core/tasks/url/operators/submit_approved/tdo.py deleted file mode 100644 index 89d89d9e..00000000 --- a/src/core/tasks/url/operators/submit_approved/tdo.py +++ /dev/null @@ -1,26 +0,0 @@ -from datetime import datetime - -from pydantic import BaseModel - -from src.core.enums import RecordType - - -class SubmitApprovedURLTDO(BaseModel): - url_id: int - url: str - record_type: RecordType - agency_ids: list[int] - name: str - description: str | None = None - approving_user_id: int - record_formats: list[str] | None = None - data_portal_type: str | None = None - supplying_entity: str | None = None - data_source_id: int | None = None - request_error: str | None = None - -class SubmittedURLInfo(BaseModel): - url_id: int - data_source_id: int | None - request_error: str | None - submitted_at: datetime | None = None \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_meta_urls/__init__.py b/src/core/tasks/url/operators/submit_meta_urls/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/submit_meta_urls/core.py b/src/core/tasks/url/operators/submit_meta_urls/core.py deleted file mode 100644 index ae41d56b..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/core.py +++ /dev/null @@ -1,78 +0,0 @@ -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.submit_meta_urls.queries.get import GetMetaURLsForSubmissionQueryBuilder -from src.core.tasks.url.operators.submit_meta_urls.queries.prereq import \ - MeetsMetaURLSSubmissionPrerequisitesQueryBuilder -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping_.simple import SimpleURLMapping -from src.db.enums import TaskType -from src.db.models.impl.url.ds_meta_url.pydantic import URLDSMetaURLPydantic -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall -from src.external.pdap.client import PDAPClient -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest -from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse -from src.util.url_mapper_.simple import SimpleURLMapper - - -class SubmitMetaURLsTaskOperator(URLTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - 
super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self) -> TaskType: - return TaskType.SUBMIT_META_URLS - - async def meets_task_prerequisites(self) -> bool: - return await self.adb_client.run_query_builder( - MeetsMetaURLSSubmissionPrerequisitesQueryBuilder() - ) - - async def inner_task_logic(self) -> None: - requests: list[SubmitMetaURLsRequest] = await self.adb_client.run_query_builder( - GetMetaURLsForSubmissionQueryBuilder() - ) - - url_mappings: list[SimpleURLMapping] = [ - SimpleURLMapping( - url=request.url, - url_id=request.url_id, - ) - for request in requests - ] - - mapper = SimpleURLMapper(url_mappings) - - await self.link_urls_to_task(mapper.get_all_ids()) - - responses: list[SubmitMetaURLsResponse] = \ - await self.pdap_client.submit_meta_urls(requests) - - errors: list[URLTaskErrorSmall] = [] - inserts: list[URLDSMetaURLPydantic] = [] - - for response in responses: - url_id: int = mapper.get_id(response.url) - if response.status == SubmitMetaURLsStatus.SUCCESS: - inserts.append( - URLDSMetaURLPydantic( - url_id=url_id, - agency_id=response.agency_id, - ds_meta_url_id=response.meta_url_id - ) - ) - else: - errors.append( - URLTaskErrorSmall( - url_id=url_id, - error=response.error, - ) - ) - - await self.add_task_errors(errors) - await self.adb_client.bulk_insert(inserts) diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/__init__.py b/src/core/tasks/url/operators/submit_meta_urls/queries/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py deleted file mode 100644 index d3dd7019..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py +++ /dev/null @@ -1,61 +0,0 @@ -from sqlalchemy import select, exists, Column, CTE - -from src.db.enums import TaskType -from src.db.helpers.query import no_url_task_error -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL -from src.db.models.views.meta_url import MetaURL - - -class SubmitMetaURLsPrerequisitesCTEContainer: - - def __init__(self): - - self._cte = ( - select( - URL.id.label("url_id"), - URL.full_url.label("url"), - LinkURLAgency.agency_id, - ) - # Validated as Meta URL - .join( - MetaURL, - MetaURL.url_id == URL.id - ) - .join( - LinkURLAgency, - LinkURLAgency.url_id == URL.id - ) - # Does not have a submission - .where( - ~exists( - select( - DSAppLinkMetaURL.ds_meta_url_id - ) - .where( - DSAppLinkMetaURL.url_id == URL.id, - DSAppLinkMetaURL.agency_id == LinkURLAgency.agency_id - ) - ), - no_url_task_error(TaskType.SUBMIT_META_URLS) - ) - .cte("submit_meta_urls_prerequisites") - ) - - @property - def cte(self) -> CTE: - return self._cte - - @property - def url_id(self) -> Column[int]: - return self._cte.c.url_id - - @property - def agency_id(self) -> Column[int]: - return self._cte.c.agency_id - - @property - def url(self) -> Column[str]: - return self._cte.c.url \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/get.py b/src/core/tasks/url/operators/submit_meta_urls/queries/get.py deleted file mode 100644 index 518393f6..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/get.py +++ /dev/null @@ -1,34 +0,0 @@ -from typing import Any, 
Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.submit_meta_urls.queries.cte import SubmitMetaURLsPrerequisitesCTEContainer -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest - -from src.db.helpers.session import session_helper as sh - -class GetMetaURLsForSubmissionQueryBuilder(QueryBuilderBase): - - - async def run(self, session: AsyncSession) -> list[SubmitMetaURLsRequest]: - cte = SubmitMetaURLsPrerequisitesCTEContainer() - query = ( - select( - cte.url_id, - cte.agency_id, - cte.url - ) - ) - - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - - return [ - SubmitMetaURLsRequest( - url_id=mapping["url_id"], - agency_id=mapping["agency_id"], - url=mapping["url"], - ) - for mapping in mappings - ] diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py b/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py deleted file mode 100644 index 3b5538be..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py +++ /dev/null @@ -1,20 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.submit_meta_urls.queries.cte import SubmitMetaURLsPrerequisitesCTEContainer -from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh - - -class MeetsMetaURLSSubmissionPrerequisitesQueryBuilder(QueryBuilderBase): - - - async def run(self, session: AsyncSession) -> bool: - cte = SubmitMetaURLsPrerequisitesCTEContainer() - query = ( - select( - cte.url_id, - ) - ) - - return await sh.has_results(session, query=query) \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 2d483890..0ee9db85 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -46,8 +46,6 @@ from src.core.tasks.url.operators.html.queries.get import \ GetPendingURLsWithoutHTMLDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO -from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager @@ -756,9 +754,6 @@ async def update_batch_post_collection( batch.status = batch_status.value batch.compute_time = compute_time - async def mark_urls_as_submitted(self, infos: list[SubmittedURLInfo]): - await self.run_query_builder(MarkURLsAsSubmittedQueryBuilder(infos)) - async def get_duplicates_by_batch_id(self, batch_id: int, page: int) -> list[DuplicateInfo]: return await self.run_query_builder( GetDuplicatesByBatchIDQueryBuilder( diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 90dba719..eec2ce53 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -19,7 +19,7 @@ from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo +from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from src.core.env_var_manager import EnvVarManager from 
src.core.enums import BatchStatus from src.util.models.url_and_scheme import URLAndScheme diff --git a/src/external/pdap/_templates/request_builder.py b/src/external/pdap/_templates/request_builder.py index e74c87e5..226495f1 100644 --- a/src/external/pdap/_templates/request_builder.py +++ b/src/external/pdap/_templates/request_builder.py @@ -1,9 +1,39 @@ from abc import ABC, abstractmethod +from http import HTTPStatus +from typing import Any -from pdap_access_manager import AccessManager +from pdap_access_manager import AccessManager, RequestType, RequestInfo, ResponseInfo +from pydantic import BaseModel -class PDAPClientRequestBuilderBase(ABC): + + +class PDAPRequestBuilderBase(ABC): + + def __init__(self): + self.access_manager: AccessManager | None = None + + async def run(self, access_manager: AccessManager) -> Any: + self.access_manager = access_manager + return await self.inner_logic() + + def build_url(self, path: str) -> str: + return f"{self.access_manager.data_sources_url}/{path}" + + async def post( + self, + url: str, + model: BaseModel + ) -> dict: + request_info = RequestInfo( + type_=RequestType.POST, + url=url, + json_=model.model_dump(mode='json'), + headers=await self.access_manager.jwt_header() + ) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + if response_info.status_code != HTTPStatus.OK: + raise Exception(f"Failed to make request to PDAP: {response_info.data}") + return response_info.data @abstractmethod - async def run(self, access_manager: AccessManager): - raise NotImplementedError \ No newline at end of file + async def inner_logic(self) -> Any: + raise NotImplementedError diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 1c950ad3..944f8a88 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -2,13 +2,11 @@ from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo from src.external.pdap.enums import MatchAgencyResponseStatus -from src.external.pdap.impl.meta_urls.core import submit_meta_urls -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest class PDAPClient: @@ -19,6 +17,12 @@ def __init__( ): self.access_manager = access_manager + async def run_request_builder( + self, + request_builder: PDAPRequestBuilderBase + ) -> Any: + return await request_builder.run(self.access_manager) + async def match_agency( self, name: str, @@ -90,70 +94,3 @@ async def is_url_duplicate( ] is_duplicate: bool = (len(duplicates) != 0) return is_duplicate - - async def submit_data_source_urls( - self, - tdos: list[SubmitApprovedURLTDO] - ) -> list[SubmittedURLInfo]: - """ - Submits URLs to Data Sources App, - modifying tdos in-place with data source id or error - """ - request_url = self.access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=["data-sources"] - ) - - # Build url-id dictionary - url_id_dict: dict[str, int] = {} - for tdo in tdos: - url_id_dict[tdo.url] = tdo.url_id - - data_sources_json: list[dict[str, Any]] = [] - for tdo in tdos: - data_sources_json.append( - { - "name": 
tdo.name, - "description": tdo.description, - "source_url": tdo.url, - "record_type": tdo.record_type.value, - "record_formats": tdo.record_formats, - "data_portal_type": tdo.data_portal_type, - "last_approval_editor": tdo.approving_user_id, - "supplying_entity": tdo.supplying_entity, - "agency_ids": tdo.agency_ids - } - ) - - headers: dict[str, str] = await self.access_manager.jwt_header() - request_info = RequestInfo( - type_=RequestType.POST, - url=request_url, - headers=headers, - json_={ - "data_sources": data_sources_json - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - data_sources_response_json: list[dict[str, Any]] = response_info.data["data_sources"] - - results: list[SubmittedURLInfo] = [] - for data_source in data_sources_response_json: - url: str = data_source["url"] - response_object = SubmittedURLInfo( - url_id=url_id_dict[url], - data_source_id=data_source["data_source_id"], - request_error=data_source["error"] - ) - results.append(response_object) - - return results - - async def submit_meta_urls( - self, - requests: list[SubmitMetaURLsRequest] - ): - return await submit_meta_urls( - self.access_manager, - requests=requests - ) \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/__init__.py b/src/external/pdap/impl/meta_urls/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/external/pdap/impl/meta_urls/core.py b/src/external/pdap/impl/meta_urls/core.py deleted file mode 100644 index 4a34fbeb..00000000 --- a/src/external/pdap/impl/meta_urls/core.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Any - -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo - -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest -from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse - - -async def submit_meta_urls( - access_manager: AccessManager, - requests: list[SubmitMetaURLsRequest] -) -> list[SubmitMetaURLsResponse]: - - - # Build url-id dictionary - url_id_dict: dict[str, int] = {} - for request in requests: - url_id_dict[request.url] = request.url_id - - meta_urls_json: list[dict[str, Any]] = [] - for request in requests: - meta_urls_json.append( - { - "url": request.url, - "agency_id": request.agency_id - } - ) - - headers: dict[str, str] = await access_manager.jwt_header() - url: str = access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=["meta-urls"] - ) - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - headers=headers, - json_={ - "meta_urls": meta_urls_json - } - ) - - response_info: ResponseInfo = await access_manager.make_request(request_info) - meta_urls_response_json: list[dict[str, Any]] = response_info.data["meta_urls"] - - responses: list[SubmitMetaURLsResponse] = [] - for meta_url in meta_urls_response_json: - responses.append( - SubmitMetaURLsResponse( - url=meta_url["url"], - status=SubmitMetaURLsStatus(meta_url["status"]), - agency_id=meta_url["agency_id"], - meta_url_id=meta_url["meta_url_id"], - error=meta_url["error"] - ) - ) - return responses \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/enums.py b/src/external/pdap/impl/meta_urls/enums.py deleted file mode 100644 index e49e71aa..00000000 --- a/src/external/pdap/impl/meta_urls/enums.py +++ /dev/null @@ -1,7 +0,0 @@ -from enum import Enum - - -class 
SubmitMetaURLsStatus(Enum): - SUCCESS = "success" - FAILURE = "failure" - ALREADY_EXISTS = "already_exists" \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/request.py b/src/external/pdap/impl/meta_urls/request.py deleted file mode 100644 index ac222aca..00000000 --- a/src/external/pdap/impl/meta_urls/request.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydantic import BaseModel - - -class SubmitMetaURLsRequest(BaseModel): - url_id: int - url: str - agency_id: int diff --git a/src/external/pdap/impl/meta_urls/response.py b/src/external/pdap/impl/meta_urls/response.py deleted file mode 100644 index 96d5ece7..00000000 --- a/src/external/pdap/impl/meta_urls/response.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus - - -class SubmitMetaURLsResponse(BaseModel): - url: str - status: SubmitMetaURLsStatus - meta_url_id: int | None = None - agency_id: int | None = None - error: str | None = None \ No newline at end of file diff --git a/src/external/pdap/impl/sync/agencies/_shared/models/content.py b/src/external/pdap/impl/sync/agencies/_shared/models/content.py index e815b753..124072a7 100644 --- a/src/external/pdap/impl/sync/agencies/_shared/models/content.py +++ b/src/external/pdap/impl/sync/agencies/_shared/models/content.py @@ -1,3 +1,8 @@ +from pydantic import Field, BaseModel + +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType + + class AgencySyncContentModel(BaseModel): # Required name: str diff --git a/src/external/pdap/impl/sync/agencies/add/core.py b/src/external/pdap/impl/sync/agencies/add/core.py index 109560a2..276ff39d 100644 --- a/src/external/pdap/impl/sync/agencies/add/core.py +++ b/src/external/pdap/impl/sync/agencies/add/core.py @@ -1,20 +1,27 @@ -from pydantic import BaseModel, Field, model_validator +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel -from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +class AddAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: AddAgenciesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/source-manager/agencies/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities -class AddMetaURLsInnerRequest(BaseModel): - request_id: int - content: MetaURLSyncContentModel -class AddMetaURLsOuterRequest(BaseModel): - meta_urls: list[AddMetaURLsInnerRequest] = Field(max_length=1000) - @model_validator(mode="after") - def all_request_ids_unique(self): - if len(self.meta_urls) != len( - set([meta_url.request_id for meta_url in self.meta_urls]) - ): - raise ValueError("All request_ids must be unique") - return self diff --git a/src/external/pdap/impl/sync/agencies/delete/core.py b/src/external/pdap/impl/sync/agencies/delete/core.py index e69de29b..41c0cfd0 100644 --- a/src/external/pdap/impl/sync/agencies/delete/core.py +++ b/src/external/pdap/impl/sync/agencies/delete/core.py @@ -0,0 +1,22 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from 
src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/agencies/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/agencies/update/core.py b/src/external/pdap/impl/sync/agencies/update/core.py index e69de29b..4c5673ac 100644 --- a/src/external/pdap/impl/sync/agencies/update/core.py +++ b/src/external/pdap/impl/sync/agencies/update/core.py @@ -0,0 +1,19 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest + + +class UpdateAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateAgenciesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/agencies/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py index 58f9abf1..9895223a 100644 --- a/src/external/pdap/impl/sync/data_sources/_shared/content.py +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -1,8 +1,19 @@ +from datetime import date + +from pydantic import BaseModel, Field + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.external.pdap.enums import DataSourcesURLStatus +from src.external.pdap.impl.sync.data_sources._shared.enums import DetailLevel + + class DataSourceSyncContentModel(BaseModel): # Required source_url: str name: str - record_type: RecordTypesEnum + record_type: RecordType # Optional description: str | None = None @@ -16,16 +27,16 @@ class DataSourceSyncContentModel(BaseModel): detail_level: DetailLevel | None = None agency_supplied: bool | None = None agency_originated: bool | None = None - agency_aggregation: AgencyAggregation | None = None + agency_aggregation: AgencyAggregationEnum | None = None agency_described_not_in_database: str | None = None - update_method: UpdateMethod | None = None + update_method: UpdateMethodEnum | None = None readme_url: str | None = None originating_entity: str | None = None - retention_schedule: RetentionSchedule | None = None + retention_schedule: RetentionScheduleEnum | None = None scraper_url: str | None = None access_notes: str | None = None - access_types: list[AccessType] | None = None + access_types: list[AccessTypeEnum] | None = None data_portal_type_other: str | None = None - url_status: URLStatus | None = None + url_status: DataSourcesURLStatus | None = None agency_ids: list[int] = Field(min_length=1) diff --git a/src/external/pdap/impl/sync/data_sources/_shared/enums.py b/src/external/pdap/impl/sync/data_sources/_shared/enums.py new file mode 100644 index 00000000..bc7929a2 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/_shared/enums.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class DetailLevel(Enum): + """ + Correlates to the detail_level enum in the database + """ + + INDIVIDUAL = 
"Individual record" + AGGREGATED = "Aggregated records" + SUMMARIZED = "Summarized totals" diff --git a/src/external/pdap/impl/sync/data_sources/add/core.py b/src/external/pdap/impl/sync/data_sources/add/core.py index e69de29b..8eaa1b8b 100644 --- a/src/external/pdap/impl/sync/data_sources/add/core.py +++ b/src/external/pdap/impl/sync/data_sources/add/core.py @@ -0,0 +1,24 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel + + +class AddDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: AddDataSourcesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/source-manager/data-sources/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities + diff --git a/src/external/pdap/impl/sync/data_sources/delete/core.py b/src/external/pdap/impl/sync/data_sources/delete/core.py index e69de29b..7199c0ca 100644 --- a/src/external/pdap/impl/sync/data_sources/delete/core.py +++ b/src/external/pdap/impl/sync/data_sources/delete/core.py @@ -0,0 +1,22 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/data-sources/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/data_sources/update/core.py b/src/external/pdap/impl/sync/data_sources/update/core.py index e69de29b..8bcaf57e 100644 --- a/src/external/pdap/impl/sync/data_sources/update/core.py +++ b/src/external/pdap/impl/sync/data_sources/update/core.py @@ -0,0 +1,19 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest + + +class UpdateDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateDataSourcesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/data-sources/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/src/external/pdap/impl/sync/meta_urls/add/core.py b/src/external/pdap/impl/sync/meta_urls/add/core.py index e69de29b..98d6f016 100644 --- a/src/external/pdap/impl/sync/meta_urls/add/core.py +++ b/src/external/pdap/impl/sync/meta_urls/add/core.py @@ -0,0 +1,25 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel + + +class AddMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: 
AddMetaURLsOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/source-manager/meta-urls/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities + + diff --git a/src/external/pdap/impl/sync/meta_urls/add/request.py b/src/external/pdap/impl/sync/meta_urls/add/request.py index e69de29b..109560a2 100644 --- a/src/external/pdap/impl/sync/meta_urls/add/request.py +++ b/src/external/pdap/impl/sync/meta_urls/add/request.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, Field, model_validator + +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel + + +class AddMetaURLsInnerRequest(BaseModel): + request_id: int + content: MetaURLSyncContentModel + + +class AddMetaURLsOuterRequest(BaseModel): + meta_urls: list[AddMetaURLsInnerRequest] = Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.meta_urls) != len( + set([meta_url.request_id for meta_url in self.meta_urls]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/meta_urls/delete/core.py b/src/external/pdap/impl/sync/meta_urls/delete/core.py index e69de29b..abdc3a6b 100644 --- a/src/external/pdap/impl/sync/meta_urls/delete/core.py +++ b/src/external/pdap/impl/sync/meta_urls/delete/core.py @@ -0,0 +1,24 @@ +from pdap_access_manager import AccessManager + +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/meta-urls/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/meta_urls/update/core.py b/src/external/pdap/impl/sync/meta_urls/update/core.py index e69de29b..44077e47 100644 --- a/src/external/pdap/impl/sync/meta_urls/update/core.py +++ b/src/external/pdap/impl/sync/meta_urls/update/core.py @@ -0,0 +1,21 @@ +from pdap_access_manager import AccessManager + +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest + + +class UpdateMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateMetaURLsOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/source-manager/meta-urls/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/__init__.py b/tests/automated/integration/tasks/url/impl/submit_approved/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/mock.py b/tests/automated/integration/tasks/url/impl/submit_approved/mock.py deleted file mode 100644 index 0e631d5b..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/mock.py +++ /dev/null @@ -1,38 +0,0 @@ -from http 
import HTTPStatus -from unittest.mock import AsyncMock - -from pdap_access_manager import ResponseInfo - -from src.core.enums import SubmitResponseStatus -from src.external.pdap.client import PDAPClient - - -def mock_make_request(pdap_client: PDAPClient, urls: list[str]): - assert len(urls) == 3, "Expected 3 urls" - pdap_client.access_manager.make_request = AsyncMock( - return_value=ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "data_sources": [ - { - "url": urls[0], - "status": SubmitResponseStatus.SUCCESS, - "error": None, - "data_source_id": 21, - }, - { - "url": urls[1], - "status": SubmitResponseStatus.SUCCESS, - "error": None, - "data_source_id": 34, - }, - { - "url": urls[2], - "status": SubmitResponseStatus.FAILURE, - "error": "Test Error", - "data_source_id": None - } - ] - } - ) - ) diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/setup.py b/tests/automated/integration/tasks/url/impl/submit_approved/setup.py deleted file mode 100644 index 1f9d8915..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/setup.py +++ /dev/null @@ -1,49 +0,0 @@ -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.core.enums import RecordType -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo - - -async def setup_validated_urls(db_data_creator: DBDataCreator, agency_id: int) -> list[str]: - creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls( - url_count=3, - with_html_content=True - ) - - url_1 = creation_info.url_ids[0] - url_2 = creation_info.url_ids[1] - url_3 = creation_info.url_ids[2] - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_1, - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[agency_id], - name="URL 1 Name", - description=None, - record_formats=["Record Format 1", "Record Format 2"], - data_portal_type="Data Portal Type 1", - supplying_entity="Supplying Entity 1" - ), - user_id=1 - ) - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_2, - record_type=RecordType.INCARCERATION_RECORDS, - agency_ids=[agency_id], - name="URL 2 Name", - description="URL 2 Description", - ), - user_id=2 - ) - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_3, - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[agency_id], - name="URL 3 Name", - description="URL 3 Description", - ), - user_id=3 - ) - return creation_info.urls diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py deleted file mode 100644 index 12e20063..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ /dev/null @@ -1,134 +0,0 @@ -import pytest -from deepdiff import DeepDiff -from pdap_access_manager import RequestInfo, RequestType, DataSourcesNamespaces - -from src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource -from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError -from src.external.pdap.client import PDAPClient -from 
tests.automated.integration.tasks.url.impl.submit_approved.mock import mock_make_request -from tests.automated.integration.tasks.url.impl.submit_approved.setup import setup_validated_urls - -# TODO: Marked for destruction -@pytest.mark.asyncio -async def test_submit_approved_url_task( - db_data_creator, - mock_pdap_client: PDAPClient, - monkeypatch -): - """ - The submit_approved_url_task should submit - all validated URLs to the PDAP Data Sources App - """ - - - # Get Task Operator - operator = SubmitApprovedURLTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - # Check Task Operator does not yet meet pre-requisites - assert not await operator.meets_task_prerequisites() - - # Create URLs with status 'validated' in database and all requisite URL values - # Ensure they have optional metadata as well - agency_id = await db_data_creator.agency() - urls: list[str] = await setup_validated_urls(db_data_creator, agency_id=agency_id) - mock_make_request(mock_pdap_client, urls) - - # Check Task Operator does meet pre-requisites - assert await operator.meets_task_prerequisites() - - # Run Task - run_info = await operator.run_task() - - # Check Task has been marked as completed - assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - - # Check Task Operator no longer meets pre-requisites - assert not await operator.meets_task_prerequisites() - - # Get URLs - urls: list[URL] = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") - url_1: URL = urls[0] - url_2: URL = urls[1] - url_3: URL = urls[2] - - # Get URL Data Source Links - url_data_sources = await db_data_creator.adb_client.get_all(DSAppLinkDataSource) - assert len(url_data_sources) == 2 - - url_data_source_1 = url_data_sources[0] - url_data_source_2 = url_data_sources[1] - - assert url_data_source_1.url_id == url_1.id - assert url_data_source_1.ds_data_source_id == 21 - - assert url_data_source_2.url_id == url_2.id - assert url_data_source_2.ds_data_source_id == 34 - - # Check that errored URL has entry in url_error_info - url_errors = await db_data_creator.adb_client.get_all(URLTaskError) - assert len(url_errors) == 1 - url_error = url_errors[0] - assert url_error.url_id == url_3.id - assert url_error.error == "Test Error" - - # Check mock method was called expected parameters - access_manager = mock_pdap_client.access_manager - access_manager.make_request.assert_called_once() - access_manager.build_url.assert_called_with( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=['data-sources'] - ) - - call_1 = access_manager.make_request.call_args_list[0][0][0] - expected_call_1 = RequestInfo( - type_=RequestType.POST, - url="http://example.com", - headers=access_manager.jwt_header.return_value, - json_={ - "data_sources": [ - { - "name": "URL 1 Name", - "source_url": url_1.url, - "record_type": "Accident Reports", - "description": None, - "record_formats": ["Record Format 1", "Record Format 2"], - "data_portal_type": "Data Portal Type 1", - "last_approval_editor": 1, - "supplying_entity": "Supplying Entity 1", - "agency_ids": [agency_id] - }, - { - "name": "URL 2 Name", - "source_url": url_2.url, - "record_type": "Incarceration Records", - "description": "URL 2 Description", - "last_approval_editor": 2, - "supplying_entity": None, - "record_formats": None, - "data_portal_type": None, - "agency_ids": [agency_id] - }, - { - "name": "URL 3 Name", - "source_url": url_3.url, - "record_type": "Accident Reports", - "description": "URL 3 Description", - 
"last_approval_editor": 3, - "supplying_entity": None, - "record_formats": None, - "data_portal_type": None, - "agency_ids": [agency_id] - } - ] - } - ) - assert call_1.type_ == expected_call_1.type_ - assert call_1.headers == expected_call_1.headers - diff = DeepDiff(call_1.json_, expected_call_1.json_, ignore_order=True) - assert diff == {}, f"Differences found: {diff}" diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py deleted file mode 100644 index 43818d8c..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource -from src.external.pdap.client import PDAPClient -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_validated_meta_url_not_included( - db_data_creator, - mock_pdap_client: PDAPClient, -): - """ - If a validated Meta URL is included in the database - This should not be included in the submit approved task - """ - - # Get Task Operator - operator = SubmitApprovedURLTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - dbdc = db_data_creator - url_1: int = (await dbdc.create_validated_urls( - validation_type=URLType.META_URL - ))[0].url_id - - # Test task operator does not meet prerequisites - assert not await operator.meets_task_prerequisites() - - # Run task and confirm runs without error - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - # Confirm entry not included in database - ds_urls: list[DSAppLinkDataSource] = await dbdc.adb_client.get_all(DSAppLinkDataSource) - assert len(ds_urls) == 0 diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/__init__.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py deleted file mode 100644 index dea8ca6a..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ /dev/null @@ -1,79 +0,0 @@ -from http import HTTPStatus -from unittest.mock import AsyncMock - -import pytest -from pdap_access_manager import ResponseInfo - -from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator -from src.db.dtos.url.mapping_.simple import SimpleURLMapping -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL -from src.external.pdap.client import PDAPClient -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.run import run_task_and_confirm_success - - -@pytest.mark.asyncio -async def test_submit_meta_urls( - db_data_creator: DBDataCreator, - mock_pdap_client: PDAPClient, -): - """ - Test Submit Meta URLs 
Task Operator - """ - - - operator = SubmitMetaURLsTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - assert not await operator.meets_task_prerequisites() - - # Create validated meta url - agency_id: int = (await db_data_creator.create_agencies(count=1))[0] - - mapping: SimpleURLMapping = (await db_data_creator.create_validated_urls( - validation_type=URLType.META_URL - ))[0] - await db_data_creator.link_urls_to_agencies( - url_ids=[mapping.url_id], - agency_ids=[agency_id] - ) - - mock_pdap_client.access_manager.make_request = AsyncMock( - return_value=ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "meta_urls": [ - { - "url": f"https://{mapping.url}", - "agency_id": agency_id, - "status": SubmitMetaURLsStatus.SUCCESS.value, - "meta_url_id": 2, - "error": None, - }, - ] - } - ) - ) - - - assert await operator.meets_task_prerequisites() - - await run_task_and_confirm_success(operator) - - urls: list[URL] = await db_data_creator.adb_client.get_all(URL) - assert len(urls) == 1 - url: URL = urls[0] - assert url.status == URLStatus.OK - - url_ds_meta_urls: list[DSAppLinkMetaURL] = await db_data_creator.adb_client.get_all(DSAppLinkMetaURL) - assert len(url_ds_meta_urls) == 1 - url_ds_meta_url: DSAppLinkMetaURL = url_ds_meta_urls[0] - assert url_ds_meta_url.url_id == url.id - assert url_ds_meta_url.ds_meta_url_id == 2 - assert url_ds_meta_url.agency_id == agency_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/loader/test_flags.py b/tests/automated/integration/tasks/url/loader/test_flags.py index f812c947..33014f5f 100644 --- a/tests/automated/integration/tasks/url/loader/test_flags.py +++ b/tests/automated/integration/tasks/url/loader/test_flags.py @@ -12,7 +12,6 @@ from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator class FlagTestParams(BaseModel): @@ -36,10 +35,6 @@ class Config: env_var="URL_AGENCY_IDENTIFICATION_TASK_FLAG", operator=AgencyIdentificationTaskOperator ), - FlagTestParams( - env_var="URL_SUBMIT_APPROVED_TASK_FLAG", - operator=SubmitApprovedURLTaskOperator - ), FlagTestParams( env_var="URL_MISC_METADATA_TASK_FLAG", operator=URLMiscellaneousMetadataTaskOperator diff --git a/tests/helpers/data_creator/commands/impl/urls_/query.py b/tests/helpers/data_creator/commands/impl/urls_/query.py index 7587abfb..beff749f 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/query.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,6 +1,6 @@ from datetime import datetime -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo +from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo diff --git a/tests/helpers/data_creator/commands/impl/urls_/tdo.py b/tests/helpers/data_creator/commands/impl/urls_/tdo.py new file mode 100644 index 00000000..a8991dcd --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/urls_/tdo.py @@ -0,0 +1,12 @@ +from datetime import datetime + +from pydantic import BaseModel + +from src.core.enums import RecordType + + +class SubmittedURLInfo(BaseModel): + url_id: int + 
data_source_id: int | None + request_error: str | None + submitted_at: datetime | None = None \ No newline at end of file From ce9575060b1c78db77a2e4bc428bca7620861dcd Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 08:27:19 -0500 Subject: [PATCH 20/84] Finish draft --- ...28_1539-a57c3b5b6e93_add_sync_log_table.py | 338 +++++++++++++++--- .../impl/sync_to_ds/impl/agencies/add/core.py | 5 + .../impl/agencies/add/queries/add_links.py | 2 +- .../impl/agencies/add/queries/cte.py | 2 +- .../impl/agencies/add/queries/get.py | 8 +- .../sync_to_ds/impl/agencies/delete/core.py | 5 + .../impl/agencies/delete/queries/cte.py | 2 +- .../sync_to_ds/impl/agencies/update/core.py | 5 + .../impl/agencies/update/queries/cte.py | 2 +- .../impl/agencies/update/queries/get.py | 4 +- .../sync_to_ds/impl/data_sources/add/core.py | 11 +- .../data_sources/add/queries/add_links.py | 2 +- .../impl/data_sources/add/queries/cte.py | 2 +- .../impl/data_sources/add/queries/get.py | 4 +- .../impl/data_sources/delete/core.py | 5 + .../impl/data_sources/delete/queries/cte.py | 2 +- .../data_sources/delete/queries/prereq.py | 2 +- .../impl/data_sources/update/core.py | 5 + .../impl/data_sources/update/queries/cte.py | 11 +- .../impl/data_sources/update/queries/get.py | 10 +- .../sync_to_ds/impl/meta_urls/add/core.py | 5 + .../impl/meta_urls/add/queries/add_links.py | 2 +- .../impl/meta_urls/add/queries/cte.py | 2 +- .../impl/meta_urls/add/queries/get.py | 2 +- .../sync_to_ds/impl/meta_urls/delete/core.py | 5 + .../impl/meta_urls/delete/queries/cte.py | 2 +- .../sync_to_ds/impl/meta_urls/update/core.py | 5 + .../impl/meta_urls/update/queries/cte.py | 2 +- .../impl/meta_urls/update/queries/get.py | 6 +- src/db/client/async_.py | 8 +- src/db/client/sync.py | 4 +- src/db/enums.py | 12 +- src/db/models/impl/agency/sqlalchemy.py | 2 +- .../models/impl/url/data_source/pydantic.py | 2 +- .../models/impl/url/ds_meta_url/sqlalchemy.py | 2 +- .../url/optional_ds_metadata/sqlalchemy.py | 1 + .../models/impl/url/record_type/sqlalchemy.py | 3 +- .../pdap/_templates/request_builder.py | 2 +- .../impl/sync/data_sources/_shared/content.py | 2 +- .../impl/sync/meta_urls/_shared/content.py | 2 +- .../pdap/impl/sync/meta_urls/update/core.py | 2 - .../data_sources/agencies/test_add_remove.py | 13 +- .../api/meta_urls/agencies/test_add_remove.py | 13 +- .../api/url/by_id/delete/test_any_url.py | 2 +- .../api/url/by_id/delete/test_meta_url.py | 1 - .../db/structure/updated_at/__init__.py | 0 .../updated_at/test_ds_optional_metadata.py | 6 - .../db/structure/updated_at/test_urls.py | 6 - tests/automated/integration/tasks/conftest.py | 1 + .../impl/sync_to_ds/agency/conftest.py | 16 +- .../impl/sync_to_ds/agency/test_add.py | 54 ++- .../impl/sync_to_ds/agency/test_delete.py | 29 +- .../agency/update/test_add_location_link.py | 55 ++- .../update/test_delete_location_link.py | 56 ++- .../agency/update/test_update_agency.py | 46 ++- .../impl/sync_to_ds/data_source/conftest.py | 13 +- .../impl/sync_to_ds/data_source/test_add.py | 69 +++- .../sync_to_ds/data_source/test_delete.py | 28 +- .../sync_to_ds/data_source/update/conftest.py | 3 +- .../update/test_add_agency_link.py | 47 ++- .../update/test_delete_agency_link.py | 53 ++- .../test_update_optional_ds_metadata.py | 85 ++++- .../update/test_update_record_type.py | 78 ++++ .../data_source/update/test_update_url.py | 61 +++- .../scheduled/impl/sync_to_ds/helpers.py | 38 ++ .../impl/sync_to_ds/meta_url/conftest.py | 5 +- .../impl/sync_to_ds/meta_url/test_add.py | 37 +- 
.../impl/sync_to_ds/meta_url/test_delete.py | 27 +- .../meta_url/update/test_add_agency_link.py | 42 ++- .../update/test_delete_agency_link.py | 51 ++- .../meta_url/update/test_update_url.py | 54 ++- .../sync_to_ds/models/ds_app_link_info.py | 1 + .../tasks/scheduled/impl/sync_to_ds/test_.py | 0 .../tasks/url/loader/test_happy_path.py | 2 +- tests/helpers/data_creator/core.py | 8 +- tests/helpers/data_creator/generate.py | 2 +- tests/helpers/mock.py | 5 + 77 files changed, 1311 insertions(+), 196 deletions(-) delete mode 100644 tests/automated/integration/db/structure/updated_at/__init__.py delete mode 100644 tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py delete mode 100644 tests/automated/integration/db/structure/updated_at/test_urls.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py rename src/external/pdap/impl/sync/agencies/request.py => tests/automated/integration/tasks/scheduled/impl/sync_to_ds/test_.py (100%) create mode 100644 tests/helpers/mock.py diff --git a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py index 181447a4..41b02082 100644 --- a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py +++ b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py @@ -10,7 +10,7 @@ from alembic import op import sqlalchemy as sa -from src.util.alembic_helpers import created_at_column, updated_at_column, create_updated_at_trigger +from src.util.alembic_helpers import created_at_column, updated_at_column, create_updated_at_trigger, remove_enum_value # revision identifiers, used by Alembic. 
revision: str = 'a57c3b5b6e93' @@ -19,6 +19,211 @@ depends_on: Union[str, Sequence[str], None] = None +def _add_data_portal_type_other_to_ds_optional_metadata(): + op.add_column( + 'url_optional_data_source_metadata', + sa.Column( + 'data_portal_type_other', + sa.String(), + nullable=True + ) + ) + + +def upgrade() -> None: + _create_sync_log() + _create_ds_agency_link() + _migrate_agency_ids_to_ds_agency_link() + remove_id_column_from_agencies() + rename_agency_id_to_id() + _rename_existing_tables_to_ds_app_format() + _alter_ds_app_link_data_source_table() + _alter_ds_app_link_meta_url_table() + _add_flag_deletion_tables() + _add_last_synced_at_columns() + _add_link_table_modification_triggers() + _add_updated_at_to_optional_data_source_metadata_table() + _update_sync_tasks() + _alter_agency_jurisdiction_type_column() + _add_updated_at_to_url_record_type_table() + _add_updated_at_trigger_to_url_optional_data_source_metadata() + _add_data_portal_type_other_to_ds_optional_metadata() + +def _add_updated_at_trigger_to_url_optional_data_source_metadata(): + create_updated_at_trigger( + "url_optional_data_source_metadata" + ) + +def _add_updated_at_to_url_record_type_table(): + op.add_column( + 'url_record_type', + updated_at_column() + ) + create_updated_at_trigger( + "url_record_type" + ) + + + +def _alter_agency_jurisdiction_type_column(): + op.alter_column( + 'agencies', + 'jurisdiction_type', + nullable=False, + ) + + +def _update_sync_tasks(): + + # Drop Views + op.execute("drop view url_task_count_1_day") + op.execute("drop view url_task_count_1_week") + op.execute("drop materialized view url_status_mat_view") + + + + targets: list[tuple[str, str]] = [ + ('tasks', 'task_type'), + ('url_task_error', 'task_type') + ] + + remove_enum_value( + enum_name="task_type", + value_to_remove="Sync Agencies", + targets=targets + ) + remove_enum_value( + enum_name="task_type", + value_to_remove="Sync Data Sources", + targets=targets + ) + new_enum_values: list[str] = [ + "Sync Agencies Add", + "Sync Agencies Update", + "Sync Agencies Delete", + "Sync Data Sources Add", + "Sync Data Sources Update", + "Sync Data Sources Delete", + "Sync Meta URLs Add", + "Sync Meta URLs Update", + "Sync Meta URLs Delete", + ] + for enum_value in new_enum_values: + op.execute(f"ALTER TYPE task_type ADD VALUE '{enum_value}';") + + # Recreate Views + op.execute(""" + create view url_task_count_1_day(task_type, count) as + SELECT + t.task_type, + count(ltu.url_id) AS count + FROM + tasks t + JOIN link_task_urls ltu + ON ltu.task_id = t.id + WHERE + t.updated_at > (now() - '1 day'::interval) + GROUP BY + t.task_type; + """) + + op.execute(""" + create view url_task_count_1_week(task_type, count) as + SELECT + t.task_type, + count(ltu.url_id) AS count + FROM + tasks t + JOIN link_task_urls ltu + ON ltu.task_id = t.id + WHERE + t.updated_at > (now() - '7 days'::interval) + GROUP BY + t.task_type; + """) + + op.execute( + """ + CREATE MATERIALIZED VIEW url_status_mat_view as + with + urls_with_relevant_errors as ( + select + ute.url_id + from + url_task_error ute + where + ute.task_type in ( + 'Screenshot', + 'HTML', + 'URL Probe' + ) + ) + , status_text as ( + select + u.id as url_id, + case + when ( + -- Validated as not relevant, individual record, or not found + fuv.type in ('not relevant', 'individual record', 'not found') + ) Then 'Accepted' + when ( + (fuv.type = 'data source' and uds.url_id is null) + OR + (fuv.type = 'meta url' and udmu.url_id is null) + ) Then 'Awaiting Submission' + when ( + (fuv.type = 'data 
source' and uds.url_id is not null) + OR + (fuv.type = 'meta url' and udmu.url_id is not null) + ) Then 'Submitted' + when ( + -- Has compressed HTML + uch.url_id is not null + AND + -- Has web metadata + uwm.url_id is not null + AND + -- Has screenshot + us.url_id is not null + ) THEN 'Community Labeling' + when uwre.url_id is not null then 'Error' + ELSE 'Intake' + END as status + + from + urls u + left join urls_with_relevant_errors uwre + on u.id = uwre.url_id + left join url_screenshot us + on u.id = us.url_id + left join url_compressed_html uch + on u.id = uch.url_id + left join url_web_metadata uwm + on u.id = uwm.url_id + left join flag_url_validated fuv + on u.id = fuv.url_id + left join ds_app_link_meta_url udmu + on u.id = udmu.url_id + left join ds_app_link_data_source uds + on u.id = uds.url_id + ) + select + url_id, + status, + CASE status + WHEN 'Intake' THEN 100 + WHEN 'Error' THEN 110 + WHEN 'Community Labeling' THEN 200 + WHEN 'Accepted' THEN 300 + WHEN 'Awaiting Submission' THEN 380 + WHEN 'Submitted' THEN 390 + ELSE -1 + END as code + from status_text + """ + ) + + def last_synced_at_column(): return sa.Column( 'last_synced_at', @@ -35,25 +240,56 @@ def _add_link_table_modification_triggers(): RETURNS trigger LANGUAGE plpgsql AS $$ BEGIN - -- UNION to cover INSERT/UPDATE (NEW TABLE) and DELETE (OLD TABLE) - UPDATE urls u - SET updated_at = clock_timestamp() -- better than now() for long txns - FROM ( - SELECT DISTINCT url_id FROM newtab - UNION - SELECT DISTINCT url_id FROM oldtab - ) AS hit - WHERE u.id = hit.url_id; + IF TG_OP = 'INSERT' THEN + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT url_id FROM newtab) AS hit + WHERE u.id = hit.url_id + $q$; + + ELSIF TG_OP = 'DELETE' THEN + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT url_id FROM oldtab) AS hit + WHERE u.id = hit.url_id + $q$; + + ELSE -- UPDATE + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM ( + SELECT DISTINCT url_id FROM newtab + UNION + SELECT DISTINCT url_id FROM oldtab + ) AS hit + WHERE u.id = hit.url_id + $q$; + END IF; RETURN NULL; -- statement-level trigger END $$; -- statement-level trigger with transition tables - CREATE TRIGGER trg_link_touch_parent - AFTER INSERT OR UPDATE OR DELETE ON link_parent_child + CREATE TRIGGER trg_link_urls_agency_touch_url_ins + AFTER INSERT ON link_urls_agency + REFERENCING NEW TABLE AS newtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_url_from_agency_link(); + + CREATE TRIGGER trg_link_urls_agency_touch_url_upd + AFTER UPDATE ON link_urls_agency REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab FOR EACH STATEMENT - EXECUTE FUNCTION touch_parent_from_link(); + EXECUTE FUNCTION touch_url_from_agency_link(); + + CREATE TRIGGER trg_link_urls_agency_touch_url_del + AFTER DELETE ON link_urls_agency + REFERENCING OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_url_from_agency_link(); """) @@ -65,26 +301,56 @@ def _add_link_table_modification_triggers(): LANGUAGE plpgsql AS $$ BEGIN - -- UNION to cover INSERT/UPDATE (NEW TABLE) and DELETE (OLD TABLE) - UPDATE agencies a - SET updated_at = clock_timestamp() -- better than now() for long txns - FROM (SELECT DISTINCT agency_id - FROM newtab - UNION - SELECT DISTINCT agency_id - FROM oldtab) AS hit - WHERE a.id = hit.agency_id; - - RETURN NULL; -- statement-level trigger + IF TG_OP = 'INSERT' THEN + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT 
agency_id FROM newtab) AS hit + WHERE a.id = hit.agency_id + $q$; + + ELSIF TG_OP = 'DELETE' THEN + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT agency_id FROM oldtab) AS hit + WHERE a.id = hit.agency_id + $q$; + + ELSE -- UPDATE + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM ( + SELECT DISTINCT agency_id FROM newtab + UNION + SELECT DISTINCT agency_id FROM oldtab + ) AS hit + WHERE a.id = hit.agency_id + $q$; + END IF; + + RETURN NULL; -- statement-level trigger END $$; -- statement-level trigger with transition tables - CREATE TRIGGER trg_link_touch_parent - AFTER INSERT OR UPDATE OR DELETE - ON link_agencies_locations - REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab - FOR EACH STATEMENT + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_ins + AFTER INSERT ON link_agencies_locations + REFERENCING NEW TABLE AS newtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_agency_from_location_link(); + + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_upd + AFTER UPDATE ON link_agencies_locations + REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_agency_from_location_link(); + + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_del + AFTER DELETE ON link_agencies_locations + REFERENCING OLD TABLE AS oldtab + FOR EACH STATEMENT EXECUTE FUNCTION touch_agency_from_location_link(); """ ) @@ -93,19 +359,7 @@ def _add_link_table_modification_triggers(): -def upgrade() -> None: - _create_sync_log() - _create_ds_agency_link() - _migrate_agency_ids_to_ds_agency_link() - remove_id_column_from_agencies() - rename_agency_id_to_id() - _rename_existing_tables_to_ds_app_format() - _alter_ds_app_link_data_source_table() - _alter_ds_app_link_meta_url_table() - _add_flag_deletion_tables() - _add_last_synced_at_columns() - _add_link_table_modification_triggers() - _add_updated_at_to_optional_data_source_metadata_table() + def _add_updated_at_to_optional_data_source_metadata_table(): op.add_column( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py index ecc573da..e46deed5 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py @@ -4,6 +4,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.prereq import \ DSAppSyncAgenciesAddPrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.agencies.add.core import AddAgenciesRequestBuilder from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel @@ -13,6 +14,10 @@ class DSAppSyncAgenciesAddTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_ADD + async def meets_task_prerequisites(self) -> bool: return await self.run_query_builder( DSAppSyncAgenciesAddPrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py index 68b42aa6..36a3ebc0 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> None: for mapping in self._mappings: inserts.append( DSAppLinkAgency( - ds_agency_id=mapping.ds_app_id, + ds_agency_id=mapping.app_id, agency_id=mapping.request_id, ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py index 5335ea44..b91feb11 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py @@ -20,7 +20,7 @@ def __init__(self): select(DSAppLinkAgency.agency_id) .where(DSAppLinkAgency.agency_id == Agency.id) ) - ).cte() + ).cte("ds_app_link_sync_agency_add_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py index a2ac4957..1ae9a13c 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py @@ -22,14 +22,10 @@ async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: LinkAgencyLocation.agency_id, func.array_agg(LinkAgencyLocation.location_id).label("location_ids"), ) - .join( - Agency, - Agency.id == cte.agency_id, - ) .group_by( LinkAgencyLocation.agency_id, ) - .cte() + .cte("location_id_cte") ) query = ( @@ -59,7 +55,7 @@ async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: for mapping in mappings: inner_requests.append( AddAgenciesInnerRequest( - request_id=mapping.agency_id, + request_id=mapping[cte.agency_id], content=AgencySyncContentModel( name=mapping[Agency.name], jurisdiction_type=mapping[Agency.jurisdiction_type], diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py index 26e78a96..e84d3b2b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py @@ -7,6 +7,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.prereq import \ DSAppSyncAgenciesDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.agencies.delete.core import DeleteAgenciesRequestBuilder @@ -14,6 +15,10 @@ class DSAppSyncAgenciesDeleteTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_DELETE + async def meets_task_prerequisites(self) -> bool: return await self.adb_client.run_query_builder( DSAppSyncAgenciesDeletePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py index 311f4a26..d93f6a1d 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py @@ -17,7 +17,7 @@ def __init__(self): .join( FlagDSDeleteAgency, FlagDSDeleteAgency.ds_agency_id == DSAppLinkAgency.ds_agency_id - ).cte() + ).cte("ds_app_link_sync_agency_delete_prerequisites") ) @property diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py index 6ded28cc..24481e8d 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py @@ -5,6 +5,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.update_links import \ DSAppSyncAgenciesUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.agencies.update.core import UpdateAgenciesRequestBuilder from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest @@ -13,6 +14,10 @@ class DSAppSyncAgenciesUpdateTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_UPDATE + async def meets_task_prerequisites(self) -> bool: return await self.adb_client.run_query_builder( DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py index 3025c7e2..57a9957c 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py @@ -18,7 +18,7 @@ def __init__(self): ) .where( Agency.updated_at > DSAppLinkAgency.last_synced_at - ).cte() + ).cte("ds_app_link_sync_agency_update_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py index 7dc4329e..81572a24 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py @@ -25,7 +25,7 @@ async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: ) .join( Agency, - Agency.id == cte.agency_id, + Agency.id == LinkAgencyLocation.agency_id, ) .group_by( LinkAgencyLocation.agency_id, @@ -60,7 +60,7 @@ async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: for mapping in mappings: inner_requests.append( UpdateAgenciesInnerRequest( - app_id=mapping[DSAppLinkAgency.ds_agency_id], + app_id=mapping[cte.ds_agency_id], content=AgencySyncContentModel( name=mapping[Agency.name], jurisdiction_type=mapping[Agency.jurisdiction_type], diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py index 1385caa0..760583fd 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py @@ -1,10 +1,11 @@ -from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.add_links import \ - DSAppSyncAgenciesAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.add_links import \ + DSAppSyncDataSourcesAddInsertLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.get import \ DSAppSyncDataSourcesAddGetQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.prereq import \ DSAppSyncDataSourcesAddPrerequisitesQueryBuilder from 
src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.data_sources.add.core import AddDataSourcesRequestBuilder from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel @@ -14,6 +15,10 @@ class DSAppSyncDataSourcesAddTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_ADD + async def meets_task_prerequisites(self) -> bool: return await self.run_query_builder( DSAppSyncDataSourcesAddPrerequisitesQueryBuilder() @@ -44,5 +49,5 @@ async def insert_ds_app_links( responses: list[DSAppSyncAddResponseInnerModel] ) -> None: await self.run_query_builder( - DSAppSyncAgenciesAddInsertLinksQueryBuilder(responses) + DSAppSyncDataSourcesAddInsertLinksQueryBuilder(responses) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py index e1bf4bf9..88c88d4b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> None: for mapping in self._mappings: inserts.append( DSAppLinkDataSource( - ds_data_source_id=mapping.ds_app_id, + ds_data_source_id=mapping.app_id, url_id=mapping.request_id, ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py index 51aa030f..8c8bc945 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py @@ -27,7 +27,7 @@ def __init__(self): select(DSAppLinkDataSource.url_id) .where(DSAppLinkDataSource.url_id == URL.id) ) - ).cte() + ).cte("ds_app_link_sync_data_source_add_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 17ed5d04..47beb2a3 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -62,7 +62,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: URL, URL.id == cte.url_id, ) - .join( + .outerjoin( URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) @@ -88,7 +88,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: request_id=mapping[cte.url_id], content=DataSourceSyncContentModel( # Required - source_url=mapping[URL.full_url], + source_url=mapping["full_url"], name=mapping[URL.name], record_type=mapping[URLRecordType.record_type], agency_ids=mapping["agency_ids"], diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py index adccb03b..14450a51 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py @@ -7,6 +7,7 @@ from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.prereq import \ DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.data_sources.delete.core import DeleteDataSourcesRequestBuilder @@ -14,6 +15,10 @@ class DSAppSyncDataSourcesDeleteTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_DELETE + async def meets_task_prerequisites(self) -> bool: return await self.run_query_builder( DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py index 1b9f2479..4e14dbf8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -17,7 +17,7 @@ def __init__(self): .join( FlagDSDeleteDataSource, FlagDSDeleteDataSource.ds_data_source_id == FlagDSDeleteDataSource.ds_data_source_id - ).cte() + ).cte("ds_app_link_sync_data_source_delete_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py index 5df5781c..1f3e797a 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py @@ -12,6 +12,6 @@ async def run(self, session: AsyncSession) -> bool: return await self.sh.results_exist( session=session, query=select( - DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer().ds_meta_url_id + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer().ds_data_source_id ) ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py index 08a8405b..fd925146 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py @@ -5,6 +5,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.update_links import \ DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.data_sources.update.core import UpdateDataSourcesRequestBuilder from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest @@ -13,6 +14,10 @@ class DSAppSyncDataSourcesUpdateTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_UPDATE + async def meets_task_prerequisites(self) -> bool: return await self.adb_client.run_query_builder( DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py index 5a3d8120..8f0ff65e 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py @@ -3,6 +3,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer: @@ -17,7 +18,11 @@ def __init__(self): URL, URL.id == DSAppLinkDataSource.url_id, ) - .join( + .outerjoin( + URLRecordType, + URL.id == URLRecordType.url_id, + ) + .outerjoin( URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) @@ -25,8 +30,10 @@ def __init__(self): or_( URL.updated_at > DSAppLinkDataSource.last_synced_at, URLOptionalDataSourceMetadata.updated_at > DSAppLinkDataSource.last_synced_at, + URLRecordType.created_at > DSAppLinkDataSource.last_synced_at, + URLRecordType.updated_at > DSAppLinkDataSource.last_synced_at, ) - ).cte() + ).cte("ds_app_link_sync_data_source_update_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index 3e802656..855075e3 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -55,6 +55,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URLOptionalDataSourceMetadata.scraper_url, URLOptionalDataSourceMetadata.access_notes, URLOptionalDataSourceMetadata.access_types, + URLOptionalDataSourceMetadata.data_portal_type_other ) .select_from( cte.cte @@ -63,7 +64,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URL, URL.id == cte.url_id, ) - .join( + .outerjoin( URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) @@ -71,7 +72,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URLRecordType, URLRecordType.url_id == URL.id, ) - .join( + .outerjoin( agency_id_cte, cte.url_id == agency_id_cte.c.url_id ) @@ -89,10 +90,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: app_id=mapping[cte.ds_data_source_id], content=DataSourceSyncContentModel( # Required - source_url=mapping[URL.full_url], + source_url=mapping["full_url"], name=mapping[URL.name], record_type=mapping[URLRecordType.record_type], - agency_ids=mapping["agency_ids"], + agency_ids=mapping["agency_ids"] or [], # Optional description=mapping[URL.description], record_formats=mapping[URLOptionalDataSourceMetadata.record_formats], @@ -109,6 +110,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types], + data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], ) ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py index 9abbe11d..6823c205 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py @@ -4,6 +4,7 @@ from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.prereq import \ DSAppSyncMetaURLsAddPrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.meta_urls.add.core import AddMetaURLsRequestBuilder from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel @@ -13,6 +14,10 @@ class DSAppSyncMetaURLsAddTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_ADD + async def meets_task_prerequisites(self) -> bool: return await self.run_query_builder( DSAppSyncMetaURLsAddPrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py index 648b3d25..52a288f3 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> None: for mapping in self._mappings: inserts.append( DSAppLinkMetaURL( - ds_meta_url_id=mapping.ds_app_id, + ds_meta_url_id=mapping.app_id, url_id=mapping.request_id, ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py index eabd5da2..178e19e8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py @@ -20,7 +20,7 @@ def __init__(self): select(DSAppLinkMetaURL.url_id) .where(DSAppLinkMetaURL.url_id == MetaURL.url_id) ) - ).cte() + ).cte("ds_app_link_sync_meta_url_add_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py index 5493c595..42a9149b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -58,7 +58,7 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: AddMetaURLsInnerRequest( request_id=mapping[cte.url_id], content=MetaURLSyncContentModel( - url=mapping[URL.full_url], + url=mapping["full_url"], agency_ids=mapping["agency_ids"] ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py index d67880f3..32f5ef85 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py @@ -7,6 +7,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.prereq import \ DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.meta_urls.delete.core import DeleteMetaURLsRequestBuilder @@ -14,6 +15,10 @@ class DSAppSyncMetaURLsDeleteTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_DELETE + async def 
meets_task_prerequisites(self) -> bool: return await self.run_query_builder( DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py index 8a6fe844..91887e48 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py @@ -17,7 +17,7 @@ def __init__(self): .join( FlagDSDeleteMetaURL, FlagDSDeleteMetaURL.ds_meta_url_id == DSAppLinkMetaURL.ds_meta_url_id - ).cte() + ).cte("ds_app_link_sync_meta_url_delete_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py index a9f85918..3ef8dc28 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py @@ -5,6 +5,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.update_links import \ DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType from src.external.pdap.impl.sync.meta_urls.update.core import UpdateMetaURLsRequestBuilder from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest @@ -13,6 +14,10 @@ class DSAppSyncMetaURLsUpdateTaskOperator( DSSyncTaskOperatorBase ): + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_UPDATE + async def meets_task_prerequisites(self) -> bool: return await self.adb_client.run_query_builder( DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py index a60d02fd..20123566 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py @@ -17,7 +17,7 @@ def __init__(self): ) .where( URL.updated_at > DSAppLinkMetaURL.last_synced_at, - ).cte() + ).cte("ds_app_link_sync_meta_url_update_prerequisites") ) @property diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py index 2460aee3..210909f9 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -41,7 +41,7 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: URL, URL.id == cte.url_id, ) - .join( + .outerjoin( agency_id_cte, cte.url_id == agency_id_cte.c.url_id ) @@ -58,8 +58,8 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: UpdateMetaURLsInnerRequest( app_id=mapping[cte.ds_meta_url_id], content=MetaURLSyncContentModel( - url=mapping[URL.full_url], - agency_ids=mapping["agency_ids"] + url=mapping['full_url'], + agency_ids=mapping["agency_ids"] or [] ) ) ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 0ee9db85..26165a61 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -56,7 +56,7 @@ from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import 
TaskType from src.db.helpers.session import session_helper as sh -from src.db.models.impl.agency.enums import AgencyType +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.backlog_snapshot import BacklogSnapshot from src.db.models.impl.batch.pydantic.info import BatchInfo @@ -592,7 +592,10 @@ async def upsert_new_agencies( result = await session.execute(query) agency = result.scalars().one_or_none() if agency is None: - agency = Agency(id=suggestion.pdap_agency_id) + agency = Agency( + id=suggestion.pdap_agency_id, + jurisdiction_type=JurisdictionType.LOCAL + ) agency.name = suggestion.agency_name agency.agency_type = AgencyType.UNKNOWN session.add(agency) @@ -631,6 +634,7 @@ async def add_agency_manual_suggestion( id=agency_id, name=PLACEHOLDER_AGENCY_NAME, agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) await session.merge(agency) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index eec2ce53..8cc99691 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -141,7 +141,7 @@ def insert_url(self, session, url_info: URLInfo) -> int: return url_entry.id def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: - url_mappings = [] + url_mappings: list[SimpleURLMapping] = [] duplicates = [] for url_info in url_infos: url_info.batch_id = batch_id @@ -227,7 +227,7 @@ def mark_urls_as_submitted( url_data_source_object = DSAppLinkDataSource( url_id=url_id, - data_source_id=data_source_id + ds_data_source_id=data_source_id ) if info.submitted_at is not None: url_data_source_object.created_at = info.submitted_at diff --git a/src/db/enums.py b/src/db/enums.py index b232c188..053fdace 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -55,8 +55,6 @@ class TaskType(PyEnum): # Scheduled Tasks PUSH_TO_HUGGINGFACE = "Push to Hugging Face" - SYNC_AGENCIES = "Sync Agencies" - SYNC_DATA_SOURCES = "Sync Data Sources" POPULATE_BACKLOG_SNAPSHOT = "Populate Backlog Snapshot" DELETE_OLD_LOGS = "Delete Old Logs" DELETE_STALE_SCREENSHOTS = "Delete Stale Screenshots" @@ -65,6 +63,16 @@ class TaskType(PyEnum): TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" + SYNC_AGENCIES_ADD = "Sync Agencies Add" + SYNC_AGENCIES_UPDATE = "Sync Agencies Update" + SYNC_AGENCIES_DELETE = "Sync Agencies Delete" + SYNC_DATA_SOURCES_ADD = "Sync Data Sources Add" + SYNC_DATA_SOURCES_UPDATE = "Sync Data Sources Update" + SYNC_DATA_SOURCES_DELETE = "Sync Data Sources Delete" + SYNC_META_URLS_ADD = "Sync Meta URLs Add" + SYNC_META_URLS_UPDATE = "Sync Meta URLs Update" + SYNC_META_URLS_DELETE = "Sync Meta URLs Delete" + class ChangeLogOperationType(PyEnum): INSERT = "INSERT" UPDATE = "UPDATE" diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index d0233967..e72e1038 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -23,7 +23,7 @@ class Agency( jurisdiction_type: Mapped[JurisdictionType] = enum_column( JurisdictionType, name="jurisdiction_type_enum", - nullable=True, + nullable=False, ) # Relationships diff --git a/src/db/models/impl/url/data_source/pydantic.py b/src/db/models/impl/url/data_source/pydantic.py index 72dec9c6..49a83ac8 100644 --- a/src/db/models/impl/url/data_source/pydantic.py +++ b/src/db/models/impl/url/data_source/pydantic.py @@ -3,7 +3,7 @@ class URLDataSourcePydantic(BulkInsertableModel): - data_source_id: int + 
ds_data_source_id: int url_id: int @classmethod diff --git a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py index 9f8092a9..1d74c12d 100644 --- a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py +++ b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py @@ -22,5 +22,5 @@ class DSAppLinkMetaURL( ds_meta_url_id = Column(Integer, primary_key=True) __table_args__ = ( - UniqueConstraint("url_id", "agency_id"), + UniqueConstraint("url_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index b3b49ce0..4661be7a 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -39,6 +39,7 @@ class URLOptionalDataSourceMetadata( values_callable=lambda AccessTypeEnum: [e.value for e in AccessTypeEnum] ) ), nullable=True) + data_portal_type_other = Column(String, nullable=True) # Relationships url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") diff --git a/src/db/models/impl/url/record_type/sqlalchemy.py b/src/db/models/impl/url/record_type/sqlalchemy.py index 7e8f2fac..23137fae 100644 --- a/src/db/models/impl/url/record_type/sqlalchemy.py +++ b/src/db/models/impl/url/record_type/sqlalchemy.py @@ -2,13 +2,14 @@ from src.core.enums import RecordType from src.db.models.helpers import url_id_primary_key_constraint, enum_column -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin from src.db.models.templates_.base import Base class URLRecordType( Base, CreatedAtMixin, + UpdatedAtMixin, URLDependentMixin ): __tablename__ = "url_record_type" diff --git a/src/external/pdap/_templates/request_builder.py b/src/external/pdap/_templates/request_builder.py index 226495f1..387421f4 100644 --- a/src/external/pdap/_templates/request_builder.py +++ b/src/external/pdap/_templates/request_builder.py @@ -27,7 +27,7 @@ async def post( type_=RequestType.POST, url=url, json_=model.model_dump(mode='json'), - headers=self.access_manager.jwt_header() + headers=await self.access_manager.jwt_header() ) response_info: ResponseInfo = await self.access_manager.make_request(request_info) if response_info.status_code != HTTPStatus.OK: diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py index 9895223a..d9403c63 100644 --- a/src/external/pdap/impl/sync/data_sources/_shared/content.py +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -39,4 +39,4 @@ class DataSourceSyncContentModel(BaseModel): data_portal_type_other: str | None = None url_status: DataSourcesURLStatus | None = None - agency_ids: list[int] = Field(min_length=1) + agency_ids: list[int] = [] diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/content.py b/src/external/pdap/impl/sync/meta_urls/_shared/content.py index 2145225e..9d81b3d7 100644 --- a/src/external/pdap/impl/sync/meta_urls/_shared/content.py +++ b/src/external/pdap/impl/sync/meta_urls/_shared/content.py @@ -3,4 +3,4 @@ class MetaURLSyncContentModel(BaseModel): url: str - agency_ids: list[int] + agency_ids: list[int] = [] diff --git a/src/external/pdap/impl/sync/meta_urls/update/core.py b/src/external/pdap/impl/sync/meta_urls/update/core.py index 44077e47..37e84da9 100644 --- a/src/external/pdap/impl/sync/meta_urls/update/core.py +++ 
b/src/external/pdap/impl/sync/meta_urls/update/core.py @@ -1,5 +1,3 @@ -from pdap_access_manager import AccessManager - from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest diff --git a/tests/automated/integration/api/data_sources/agencies/test_add_remove.py b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py index 7223c8ce..42a82e11 100644 --- a/tests/automated/integration/api/data_sources/agencies/test_add_remove.py +++ b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py @@ -6,21 +6,22 @@ async def test_agencies_add_remove( api_test_helper: APITestHelper, test_url_data_source_id: int, + test_agency_id_2: int, test_agency_id: int ): api_test_helper.request_validator.post_v3( - url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id}", + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id_2}", ) adb_client: AsyncDatabaseClient = api_test_helper.adb_client() links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) - assert len(links) == 1 - assert links[0].agency_id == test_agency_id - assert links[0].url_id == test_url_data_source_id + assert len(links) == 2 + assert {link.agency_id for link in links} == {test_agency_id_2, test_agency_id} + assert {link.url_id for link in links} == {test_url_data_source_id} api_test_helper.request_validator.delete_v3( - url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id}", + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id_2}", ) links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) - assert len(links) == 0 \ No newline at end of file + assert len(links) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py index 4f48ac5c..1bd90ea2 100644 --- a/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py +++ b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py @@ -5,26 +5,27 @@ async def test_agencies_add_remove( api_test_helper: APITestHelper, test_url_meta_url_id: int, - test_agency_id: int + test_agency_id: int, + test_agency_id_2: int ): api_test_helper.request_validator.post_v3( - url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id}", + url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id_2}", ) raw_response: dict = api_test_helper.request_validator.get_v3( url=f"/meta-urls/{test_url_meta_url_id}/agencies", ) response = AgencyGetOuterResponse(**raw_response) - assert len(response.results) == 1 - assert response.results[0].id == test_agency_id + assert len(response.results) == 2 + assert {result.id for result in response.results} == {test_agency_id, test_agency_id_2} api_test_helper.request_validator.delete_v3( - url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id}", + url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id_2}", ) raw_response: dict = api_test_helper.request_validator.get_v3( url=f"/meta-urls/{test_url_meta_url_id}/agencies", ) response = AgencyGetOuterResponse(**raw_response) - assert len(response.results) == 0 + assert len(response.results) == 1 diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py index 9a91f3d4..579da570 100644 --- 
a/tests/automated/integration/api/url/by_id/delete/test_any_url.py +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -315,7 +315,7 @@ async def _setup( agency_subtask_id: int = await dbc.add( URLAutoAgencyIDSubtask( url_id=url.url_id, - task_id=1, + task_id=task_id, agencies_found=True, type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, detail=SubtaskDetailCode.NO_DETAILS diff --git a/tests/automated/integration/api/url/by_id/delete/test_meta_url.py b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py index 6c2817b6..0fbee489 100644 --- a/tests/automated/integration/api/url/by_id/delete/test_meta_url.py +++ b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py @@ -70,7 +70,6 @@ async def _setup( ## DS App Link app_link = DSAppLinkMetaURL( url_id=url_id, - agency_id=agency_id, ds_meta_url_id=1 ) await ddc.adb_client.add(app_link) diff --git a/tests/automated/integration/db/structure/updated_at/__init__.py b/tests/automated/integration/db/structure/updated_at/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py b/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py deleted file mode 100644 index cc88f697..00000000 --- a/tests/automated/integration/db/structure/updated_at/test_ds_optional_metadata.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_ds_optional_metadata_updated_at(): - raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/db/structure/updated_at/test_urls.py b/tests/automated/integration/db/structure/updated_at/test_urls.py deleted file mode 100644 index cc88f697..00000000 --- a/tests/automated/integration/db/structure/updated_at/test_urls.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_ds_optional_metadata_updated_at(): - raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/conftest.py b/tests/automated/integration/tasks/conftest.py index a06da58c..937b2d12 100644 --- a/tests/automated/integration/tasks/conftest.py +++ b/tests/automated/integration/tasks/conftest.py @@ -11,6 +11,7 @@ def mock_pdap_client() -> PDAPClient: mock_access_manager = MagicMock( spec=AccessManager ) + mock_access_manager.data_sources_url = "http://example.com" mock_access_manager.build_url = MagicMock( return_value="http://example.com" ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py index 9a9996a1..4cb7a3f2 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py @@ -1,10 +1,22 @@ import pytest_asyncio +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel @pytest_asyncio.fixture async def ds_app_linked_agency( - test_agency_id: int + test_agency_id: int, + adb_client_test: AsyncDatabaseClient ) -> DSAppLinkInfoModel: - raise NotImplementedError \ No newline at end of file + # Add DS App Link + ds_app_link = DSAppLinkAgency( + agency_id=test_agency_id, + ds_agency_id=67 + ) + await adb_client_test.add(ds_app_link) + return 
DSAppLinkInfoModel( + ds_app_id=67, + db_id=test_agency_id + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py index 669a7961..f0997d65 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py @@ -1,9 +1,23 @@ +from http import HTTPStatus +from unittest.mock import AsyncMock + import pytest +from pdap_access_manager import ResponseInfo from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest, AddAgenciesInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.run import run_task_and_confirm_success @@ -12,22 +26,56 @@ async def test_add( db_data_creator: DBDataCreator, test_agency_id: int, adb_client_test: AsyncDatabaseClient, - mock_pdap_client: PDAPClient + mock_pdap_client: PDAPClient, + pittsburgh_locality: LocalityCreationInfo, + pennsylvania: USStateCreationInfo, ): operator = DSAppSyncAgenciesAddTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request to return a false DS App id + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_agency_id + ) + ] + ) + ) + # Check meets prerequisite assert await operator.meets_task_prerequisites() # Run task and confirm runs without error await run_task_and_confirm_success(operator) - # Confirm expected method was caused with expected parameters + # Confirm expected method was called with expected parameters + request: AddAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/add", + expected_model=AddAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: AddAgenciesInnerRequest = request.agencies[0] + assert agency.request_id == test_agency_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert set(content.location_ids) == { + pittsburgh_locality.location_id, + pennsylvania.location_id + } # Check Presence of DS App Link + ds_app_link: DSAppLinkAgency = await adb_client_test.one_or_none_model(DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.agency_id == test_agency_id - raise NotImplementedError \ No newline at end of file diff 
--git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py index 430e6645..e311b886 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py @@ -1,10 +1,14 @@ import pytest +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.run import run_task_and_confirm_success @@ -17,24 +21,25 @@ async def test_delete( adb_client_test: AsyncDatabaseClient, mock_pdap_client: PDAPClient ): + ds_agency_id: int = 67 operator = DSAppSyncAgenciesDeleteTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check does not currently meet prerequisite assert not await operator.meets_task_prerequisites() - # Add DS App Link - ds_app_link = DSAppLinkAgency( - ds_agency_id=1, - agency_id=None, - ) - await adb_client_test.add(ds_app_link) - # Add Task Deletion Flag for App Link flag = FlagDSDeleteAgency( - ds_agency_id=1 + ds_agency_id=ds_agency_id ) await adb_client_test.add(flag) @@ -45,11 +50,15 @@ async def test_delete( await run_task_and_confirm_success(operator) # Confirm expected method was caused with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_agency_id] # Check DS App Link Is Deleted assert await adb_client_test.has_no_rows(DSAppLinkAgency) # Check DS App Agency Deletion Flag is deleted assert await adb_client_test.has_no_rows(FlagDSDeleteAgency) - - raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py index 91300b04..4dfbaba7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py @@ -1,20 +1,47 @@ +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType 
+from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.agency.conftest import ds_app_linked_agency +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.run import run_task_and_confirm_success async def test_add_location_link( ds_app_linked_agency: DSAppLinkInfoModel, + pittsburgh_locality: LocalityCreationInfo, allegheny_county: CountyCreationInfo, - operator: DSAppSyncAgenciesUpdateTaskOperator + pennsylvania: USStateCreationInfo, + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, ): - # Check prerequisites not met assert not await operator.meets_task_prerequisites() + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + # Add location link + link = LinkAgencyLocation( + agency_id=ds_app_linked_agency.db_id, + location_id=allegheny_county.location_id + ) + await adb_client_test.add(link) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -23,7 +50,27 @@ async def test_add_location_link( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: UpdateAgenciesInnerRequest = request.agencies[0] + assert agency.app_id == ds_app_linked_agency.ds_app_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert set(content.location_ids) == { + pittsburgh_locality.location_id, + pennsylvania.location_id, + allegheny_county.location_id + } - # Check DS App Link Is Updated - raise NotImplementedError + # Check DS App Link Is Updated + ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py index 1e563a1f..7f0450fe 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py @@ -1,19 +1,52 @@ +from sqlalchemy import delete + 
+from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest +from tests.automated.integration.conftest import pennsylvania +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.run import run_task_and_confirm_success async def test_delete_location_link( ds_app_linked_agency: DSAppLinkInfoModel, pittsburgh_locality: LocalityCreationInfo, - operator: DSAppSyncAgenciesUpdateTaskOperator + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + pennsylvania: USStateCreationInfo, + adb_client_test: AsyncDatabaseClient ): # Check prerequisites not met assert not await operator.meets_task_prerequisites() - # Delete location link + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Delete location link (pittsburgh) + statement = ( + delete( + LinkAgencyLocation + ) + .where( + LinkAgencyLocation.agency_id == ds_app_linked_agency.db_id, + LinkAgencyLocation.location_id == pittsburgh_locality.location_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -22,7 +55,22 @@ async def test_delete_location_link( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: UpdateAgenciesInnerRequest = request.agencies[0] + assert agency.app_id == ds_app_linked_agency.ds_app_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert content.location_ids == [pennsylvania.location_id] # Check DS App Link Is Updated - - raise NotImplementedError + ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py index 5e5b78a5..4749b0b0 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py @@ -1,17 +1,50 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_update_agency( ds_app_linked_agency: DSAppLinkInfoModel, - operator: DSAppSyncAgenciesUpdateTaskOperator + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient ): # Check prerequisites not met assert not await operator.meets_task_prerequisites() + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + # Update agency table + statement = ( + update( + Agency + ) + .values( + name="Updated Agency Name", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.STATE + ) + .where( + Agency.id == ds_app_linked_agency.db_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -20,7 +53,14 @@ async def test_update_agency( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) # Check DS App Link Is Updated - - raise NotImplementedError + ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py index e0a1c61b..72b621b2 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py @@ -1,10 +1,21 @@ import pytest_asyncio +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel @pytest_asyncio.fixture async def ds_app_linked_data_source_url( test_url_data_source_id: int, + adb_client_test: AsyncDatabaseClient ) -> DSAppLinkInfoModel: - raise NotImplementedError \ No newline at end of file + link = DSAppLinkDataSource( + ds_data_source_id=67, + url_id=test_url_data_source_id, + ) + 
await adb_client_test.add(link) + return DSAppLinkInfoModel( + db_id=test_url_data_source_id, + ds_app_id=67, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py index 9f97b64e..060637db 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -1,8 +1,16 @@ import pytest +from src.core.enums import RecordType from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.run import run_task_and_confirm_success @@ -12,25 +20,70 @@ async def test_add( db_data_creator: DBDataCreator, test_url_data_source_id: int, adb_client_test: AsyncDatabaseClient, - mock_pdap_client: PDAPClient + mock_pdap_client: PDAPClient, + test_agency_id: int ): operator = DSAppSyncDataSourcesAddTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_url_data_source_id + ) + ] + ) + ) + # Check meet task prerequisites - assert not await operator.meets_task_prerequisites() + assert await operator.meets_task_prerequisites() # Run task and confirm runs without error await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: AddDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/add", + expected_model=AddDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: AddDataSourcesInnerRequest = request.data_sources[0] + assert data_source.request_id == test_url_data_source_id + content: DataSourceSyncContentModel = data_source.content + assert content.source_url.startswith("https://example.com/") + assert content.name.startswith("Example ") + assert content.record_type == RecordType.CRIME_STATISTICS + assert content.description is None + assert content.record_formats is None + assert content.data_portal_type is None + assert content.supplying_entity is None + assert content.coverage_start is None + assert content.coverage_end is None + assert content.detail_level is None + assert content.agency_supplied is None + assert content.agency_originated is None + assert content.agency_described_not_in_database is None + assert content.update_method is None + assert content.readme_url is None + assert content.originating_entity is None + assert content.retention_schedule is None + assert content.scraper_url is None + 
assert content.access_notes is None + assert content.access_types is None + assert content.data_portal_type_other is None + assert content.url_status is None - # Check Presence of DS App Link - - - - + assert content.agency_ids == [test_agency_id] - raise NotImplementedError \ No newline at end of file + # Check Presence of DS App Link + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.url_id == test_url_data_source_id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py index ea202fb1..a67f5db3 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py @@ -1,10 +1,15 @@ import pytest -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources import DSAppSyncDataSourcesDeleteTaskOperator +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.run import run_task_and_confirm_success @@ -15,23 +20,30 @@ async def test_delete( adb_client_test: AsyncDatabaseClient, mock_pdap_client: PDAPClient ): + ds_data_source_id: int = 67 operator = DSAppSyncDataSourcesDeleteTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + # Check does not currently meet prerequisite - assert not operator.meets_task_prerequisites() + assert not await operator.meets_task_prerequisites() # Add DS App Link ds_app_link = DSAppLinkDataSource( url_id=None, - ds_data_source_id=1, + ds_data_source_id=ds_data_source_id, ) await adb_client_test.add(ds_app_link) # Add Task Deletion Flag for App Link flag = FlagDSDeleteDataSource( - ds_data_source_id=1, + ds_data_source_id=ds_data_source_id, ) await adb_client_test.add(flag) @@ -41,6 +53,14 @@ async def test_delete( # Run task and confirm runs without error await run_task_and_confirm_success(operator) + # Confirm expected method was called with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_data_source_id] + # Check DS App Link Is Deleted assert await adb_client_test.has_no_rows(DSAppLinkDataSource) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py index f991d3c9..8a6bbfc5 100644 --- 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py @@ -1,6 +1,7 @@ import pytest -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py index 1d53d364..9852df7a 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py @@ -1,18 +1,42 @@ -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_add_agency_link( ds_app_linked_data_source_url: DSAppLinkInfoModel, + test_agency_id: int, test_agency_id_2: int, - operator: DSAppSyncDataSourcesUpdateTaskOperator + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Add additional agency link + link = LinkURLAgency( + url_id=ds_app_linked_data_source_url.db_id, + agency_id=test_agency_id_2 + ) + await adb_client_test.add(link) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -21,7 +45,24 @@ async def test_add_agency_link( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name.startswith("Example") + 
assert set(content.agency_ids) == { + test_agency_id, + test_agency_id_2 + } # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at - raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py index 8a16cc31..f0dbf204 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py @@ -1,18 +1,49 @@ -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from sqlalchemy import delete + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.conftest import adb_client_test from tests.helpers.run import run_task_and_confirm_success async def test_delete_agency_link( ds_app_linked_data_source_url: DSAppLinkInfoModel, - test_agency_id_1: int, - operator: DSAppSyncDataSourcesUpdateTaskOperator + test_agency_id: int, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Delete agency ID link + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == ds_app_linked_data_source_url.db_id, + LinkURLAgency.agency_id == test_agency_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -21,7 +52,21 @@ async def test_delete_agency_link( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert 
content.name.startswith("Example") + assert content.agency_ids == [] # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at - raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py index ca188487..94273019 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py @@ -1,17 +1,64 @@ -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from datetime import date + +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, UpdateMethodEnum, \ + RetentionScheduleEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_update_optional_ds_metadata( ds_app_linked_data_source_url: DSAppLinkInfoModel, - operator: DSAppSyncDataSourcesUpdateTaskOperator + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Update url_optional_ds_metadata_table table + insert = URLOptionalDataSourceMetadata( + url_id=ds_app_linked_data_source_url.db_id, + record_formats=["Record Format 1", "Record Format 2"], + data_portal_type="Test Data Portal Type", + supplying_entity="Test Supplying Entity", + coverage_start=date(year=2025, month=5, day=1), + coverage_end=date(year=2025, month=5, day=31), + agency_supplied=True, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.FEDERAL, + update_method=UpdateMethodEnum.OVERWRITE, + readme_url="https://example.com/readme", + originating_entity="Test originating entity", + retention_schedule=RetentionScheduleEnum.FUTURE_ONLY, + scraper_url="https://example.com/scraper", + 
submission_notes="Test submission notes", + access_notes="Test Access notes", + access_types=[AccessTypeEnum.DOWNLOAD], + data_portal_type_other="Test data portal type other" + ) + await adb_client_test.add(insert) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -20,7 +67,39 @@ async def test_update_optional_ds_metadata( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.source_url.startswith("https://example.com/") + assert content.name.startswith("Example ") + assert content.record_type == RecordType.CRIME_STATISTICS + assert content.description is None + assert content.record_formats == ["Record Format 1", "Record Format 2"] + assert content.data_portal_type == "Test Data Portal Type" + assert content.supplying_entity == "Test Supplying Entity" + assert content.coverage_start == date(year=2025, month=5, day=1) + assert content.coverage_end == date(year=2025, month=5, day=31) + assert content.detail_level is None + assert content.agency_supplied == True + assert content.agency_originated == True + assert content.update_method == UpdateMethodEnum.OVERWRITE + assert content.readme_url == "https://example.com/readme" + assert content.originating_entity == "Test originating entity" + assert content.retention_schedule == RetentionScheduleEnum.FUTURE_ONLY + assert content.scraper_url == "https://example.com/scraper" + assert content.access_notes == "Test Access notes" + assert content.access_types == [AccessTypeEnum.DOWNLOAD] + assert content.data_portal_type_other == "Test data portal type other" # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at - raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py new file mode 100644 index 00000000..66fae2cb --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py @@ -0,0 +1,78 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from 
tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_url( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update URL Record Type table + statement = ( + update( + URLRecordType + ) + .values( + record_type=RecordType.POLICIES_AND_CONTRACTS + ) + .where( + URLRecordType.url_id == ds_app_linked_data_source_url.db_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name.startswith("Example ") + assert content.record_type == RecordType.POLICIES_AND_CONTRACTS + assert content.agency_ids == [ + test_agency_id + ] + assert content.retention_schedule is None + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py index 9d6cd70a..78c095c0 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py @@ -1,17 +1,54 @@ -from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update import DSAppSyncDataSourcesUpdateTaskOperator +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from 
tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_update_url( ds_app_linked_data_source_url: DSAppLinkInfoModel, - operator: DSAppSyncDataSourcesUpdateTaskOperator + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Update URL table + statement = ( + update( + URL + ) + .values( + name="Updated URL Name", + scheme="http", + trailing_slash=True, + url="modified-example.com", + description="Updated URL Description", + ) + .where( + URL.id == ds_app_linked_data_source_url.db_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -20,7 +57,25 @@ async def test_update_url( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name == "Updated URL Name" + assert content.agency_ids == [ + test_agency_id + ] + assert content.source_url == "http://modified-example.com/" + assert content.description == "Updated URL Description" # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at - raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py new file mode 100644 index 00000000..fcc1a93c --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py @@ -0,0 +1,38 @@ +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock + +from pdap_access_manager import RequestInfo, RequestType, ResponseInfo +from pydantic import BaseModel + +from src.external.pdap.client import PDAPClient +from tests.helpers.mock import get_last_call_arguments + + +def get_last_request( + mock_pdap_client: PDAPClient +) -> RequestInfo: + return get_last_call_arguments(mock_pdap_client.access_manager.make_request)[0] + +def extract_and_validate_sync_request( + mock_pdap_client: PDAPClient, + expected_path: str, + expected_model: type[BaseModel] +) -> Any: + assert mock_pdap_client.access_manager.make_request.call_count == 1 + request_info: RequestInfo = get_last_request(mock_pdap_client) + assert request_info.type_ == RequestType.POST + full_expected_url: str = f"http://example.com/v3/source-manager/{expected_path}" + assert request_info.url == full_expected_url, f"Expected URL: 
{full_expected_url}, Actual URL: {request_info.url}" + return expected_model(**request_info.json_) + +def mock_make_request( + mock_pdap_client: PDAPClient, + data: BaseModel +) -> None: + mock_pdap_client.access_manager.make_request = AsyncMock( + return_value=ResponseInfo( + status_code=HTTPStatus.OK, + data=data.model_dump(mode='json') + ) + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py index 3dfa7cf4..69bf1287 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py @@ -12,9 +12,10 @@ async def ds_app_linked_meta_url( ) -> DSAppLinkInfoModel: ds_app_link = DSAppLinkMetaURL( url_id=test_url_meta_url_id, - ds_meta_url_id=1 + ds_meta_url_id=67 ) await adb_client_test.add(ds_app_link) return DSAppLinkInfoModel( - ds_app_id=1, + ds_app_id=67, + db_id=test_url_meta_url_id ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py index 62cc9ee3..e63e1496 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py @@ -4,6 +4,12 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.run import run_task_and_confirm_success @@ -13,13 +19,28 @@ async def test_add( db_data_creator: DBDataCreator, test_url_meta_url_id: int, adb_client_test: AsyncDatabaseClient, - mock_pdap_client: PDAPClient + mock_pdap_client: PDAPClient, + test_agency_id: int ): operator = DSAppSyncMetaURLsAddTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_url_meta_url_id + ) + ] + ) + ) + + # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -27,8 +48,20 @@ async def test_add( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: AddMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/add", + expected_model=AddMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: AddMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.request_id == test_url_meta_url_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url.startswith("https://example.com/") + assert content.agency_ids == [test_agency_id] # Check Presence of DS Meta URL App Link - ds_app_link: 
DSAppLinkMetaURL = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 assert ds_app_link.url_id == test_url_meta_url_id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py index d66a8e91..8218759f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py @@ -1,10 +1,14 @@ import pytest +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.run import run_task_and_confirm_success @@ -15,33 +19,48 @@ async def test_delete( adb_client_test: AsyncDatabaseClient, mock_pdap_client: PDAPClient ): + ds_meta_url_id: int = 67 operator = DSAppSyncMetaURLsDeleteTaskOperator( adb_client=adb_client_test, pdap_client=mock_pdap_client ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + # Check does not currently meet prerequisite - assert not operator.meets_task_prerequisites() + assert not await operator.meets_task_prerequisites() # Add DS App Link ds_app_link = DSAppLinkMetaURL( - ds_meta_url_id=1, + ds_meta_url_id=ds_meta_url_id, url_id=None, ) await adb_client_test.add(ds_app_link) # Add Task Deletion Flag for App Link flag = FlagDSDeleteMetaURL( - ds_meta_url_id=1 + ds_meta_url_id=ds_meta_url_id ) await adb_client_test.add(flag) # Check meets prerequisite - assert operator.meets_task_prerequisites() + assert await operator.meets_task_prerequisites() # Run task and confirm runs without error await run_task_and_confirm_success(operator) + # Confirm expected method was called with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_meta_url_id] + # Check DS App Link Is Deleted assert await adb_client_test.has_no_rows(DSAppLinkMetaURL) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py index e7c0b525..1caa1eab 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py @@ -1,18 +1,41 @@ +from src.api.shared.models.message_response import MessageResponse from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.conftest import adb_client_test from tests.helpers.run import run_task_and_confirm_success async def test_add_agency_link( ds_app_linked_meta_url: DSAppLinkInfoModel, + test_agency_id: int, test_agency_id_2: int, - operator: DSAppSyncMetaURLsUpdateTaskOperator + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Add agency link + link = LinkURLAgency( + url_id=ds_app_linked_meta_url.db_id, + agency_id=test_agency_id_2 + ) + await adb_client_test.add(link) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -21,7 +44,20 @@ async def test_add_agency_link( await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url.startswith("https://example.com/") + assert set(content.agency_ids) == {test_agency_id, test_agency_id_2} # Check DS App Link Is Updated - - raise NotImplementedError + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py index a62c1d26..11ef284d 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py @@ -1,27 +1,66 @@ +from sqlalchemy import delete + +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL 
+from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_delete_agency_link( ds_app_linked_meta_url: DSAppLinkInfoModel, - test_agency_id_1: int, - operator: DSAppSyncMetaURLsUpdateTaskOperator + test_agency_id: int, + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) - # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Delete agency link + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == ds_app_linked_meta_url.db_id, + LinkURLAgency.agency_id == test_agency_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met - assert operator.meets_task_prerequisites() + assert await operator.meets_task_prerequisites() # Run task and confirm runs without error await run_task_and_confirm_success(operator) # Confirm expected method was called with expected parameters + request: UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.agency_ids == [] # Check DS App Link Is Updated - - raise NotImplementedError + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py index ec71c60d..0342c388 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py @@ -1,17 +1,52 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import 
extract_and_validate_sync_request, \ + mock_make_request from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel from tests.helpers.run import run_task_and_confirm_success async def test_update_url( ds_app_linked_meta_url: DSAppLinkInfoModel, - operator: DSAppSyncMetaURLsUpdateTaskOperator + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int ): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) # Check prerequisites not met assert not await operator.meets_task_prerequisites() # Update URL table + statement = ( + update( + URL + ) + .values( + name="Updated URL Name", + scheme="http", + trailing_slash=True, + url="modified-example.com", + description="Updated URL Description", + ) + .where( + URL.id == ds_app_linked_meta_url.db_id + ) + ) + await adb_client_test.execute(statement) # Check prerequisites are met assert operator.meets_task_prerequisites() @@ -19,9 +54,22 @@ async def test_update_url( # Run task and confirm runs without error await run_task_and_confirm_success(operator) - # Confirm expected method was called with expected parameters + request: UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url == "http://modified-example.com/" + assert set(content.agency_ids) == {test_agency_id} # Check DS App Link Is Updated + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at - raise NotImplementedError diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py index c02a3f96..36e86874 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py @@ -5,4 +5,5 @@ class DSAppLinkInfoModel(BaseModel): ds_app_id: int + db_id: int updated_at: datetime = datetime.now() \ No newline at end of file diff --git a/src/external/pdap/impl/sync/agencies/request.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/test_.py similarity index 100% rename from src/external/pdap/impl/sync/agencies/request.py rename to tests/automated/integration/tasks/scheduled/impl/sync_to_ds/test_.py diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index a7b02e89..0786cb24 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS: int = 14 +NUMBER_OF_TASK_OPERATORS: int = 21 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 
dbe5a4e6..dd08a178 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -12,7 +12,7 @@ from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType -from src.db.models.impl.agency.enums import AgencyType +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL @@ -524,7 +524,8 @@ async def create_agency(self, agency_id: int = 1) -> None: agency = Agency( id=agency_id, name=generate_test_name(agency_id), - agency_type=AgencyType.UNKNOWN + agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) await self.adb_client.add_all([agency]) @@ -536,7 +537,8 @@ async def create_agencies(self, count: int = 3) -> list[int]: agency = Agency( id=agency_id, name=generate_test_name(agency_id), - agency_type=AgencyType.UNKNOWN + agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) agencies.append(agency) agency_ids.append(agency_id) diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index f1eefce2..b447888d 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -76,7 +76,7 @@ def generate_url_data_sources( return [ URLDataSourcePydantic( url_id=url_id, - data_source_id=url_id, + ds_data_source_id=url_id, ) for url_id in url_ids ] \ No newline at end of file diff --git a/tests/helpers/mock.py b/tests/helpers/mock.py new file mode 100644 index 00000000..b761887b --- /dev/null +++ b/tests/helpers/mock.py @@ -0,0 +1,5 @@ +from unittest.mock import MagicMock, AsyncMock + + +def get_last_call_arguments(mock: MagicMock | AsyncMock) -> tuple: + return mock.call_args_list[-1].args \ No newline at end of file From b9dafff2258e3dc7b79e1230ede17b031299c821 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 08:39:25 -0500 Subject: [PATCH 21/84] Bump up max postgres connections --- local_database/docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/local_database/docker-compose.yml b/local_database/docker-compose.yml index efff881a..3999db33 100644 --- a/local_database/docker-compose.yml +++ b/local_database/docker-compose.yml @@ -12,5 +12,6 @@ services: - POSTGRES_PASSWORD=HanviliciousHamiltonHilltops - POSTGRES_USER=test_source_collector_user - POSTGRES_DB=source_collector_test_db + command: ['postgres', '-c', 'max_connections=160'] volumes: dbscripts: \ No newline at end of file From c4052a55f7b2ba694b598fd3893d7c9c404053ab Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 09:08:08 -0500 Subject: [PATCH 22/84] Fix connection creation leak --- local_database/docker-compose.yml | 1 - src/api/main.py | 10 ++-- src/db/client/async_.py | 20 +++---- src/db/client/sync.py | 18 +++--- .../integration/readonly/conftest.py | 5 +- tests/conftest.py | 57 +++++++++++++------ tests/helpers/setup/wipe.py | 5 +- 7 files changed, 71 insertions(+), 45 deletions(-) diff --git a/local_database/docker-compose.yml b/local_database/docker-compose.yml index 3999db33..efff881a 100644 --- a/local_database/docker-compose.yml +++ b/local_database/docker-compose.yml @@ -12,6 +12,5 @@ services: - POSTGRES_PASSWORD=HanviliciousHamiltonHilltops - POSTGRES_USER=test_source_collector_user - POSTGRES_DB=source_collector_test_db - command: 
['postgres', '-c', 'max_connections=160'] volumes: dbscripts: \ No newline at end of file diff --git a/src/api/main.py b/src/api/main.py index 2dd7fa24..27abcb62 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -5,6 +5,7 @@ from discord_poster import DiscordPoster from fastapi import FastAPI from pdap_access_manager import AccessManager +from sqlalchemy.ext.asyncio import create_async_engine from starlette.responses import RedirectResponse from src.api.endpoints.agencies.routes import agencies_router @@ -52,12 +53,9 @@ async def lifespan(app: FastAPI): env.read_env() # Initialize shared dependencies - db_client = DatabaseClient( - db_url=env_var_manager.get_postgres_connection_string() - ) - adb_client = AsyncDatabaseClient( - db_url=env_var_manager.get_postgres_connection_string(is_async=True) - ) + + db_client = DatabaseClient() + adb_client = AsyncDatabaseClient() await setup_database(db_client) core_logger = AsyncCoreLogger(adb_client=adb_client) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 26165a61..95bc7082 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -2,8 +2,8 @@ from functools import wraps from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, func, Select, and_, update, Row, text -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker +from sqlalchemy import select, func, Select, and_, update, Row, text, Engine +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker, AsyncEngine from sqlalchemy.orm import selectinload from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse @@ -103,15 +103,15 @@ class AsyncDatabaseClient: - def __init__(self, db_url: str | None = None): - if db_url is None: + def __init__(self, engine: AsyncEngine | None = None): + if engine is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) - self.db_url = db_url - echo = ConfigManager.get_sqlalchemy_echo() - self.engine = create_async_engine( - url=db_url, - echo=echo, - ) + echo = ConfigManager.get_sqlalchemy_echo() + engine = create_async_engine( + url=db_url, + echo=echo, + ) + self.engine = engine self.session_maker = async_sessionmaker(bind=self.engine, expire_on_commit=False) self.statement_composer = StatementComposer() diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 8cc99691..2e9e6f9b 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,7 +1,7 @@ from functools import wraps from typing import List -from sqlalchemy import create_engine, Select +from sqlalchemy import create_engine, Select, Engine from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, Session @@ -28,15 +28,19 @@ # Database Client class DatabaseClient: - def __init__(self, db_url: str | None = None): + def __init__( + self, + engine: Engine | None = None + ): """Initialize the DatabaseClient.""" - if db_url is None: + if engine is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) + engine = create_engine( + url=db_url, + echo=ConfigManager.get_sqlalchemy_echo(), + ) - self.engine = create_engine( - url=db_url, - echo=ConfigManager.get_sqlalchemy_echo(), - ) + self.engine = engine self.session_maker = scoped_session(sessionmaker(bind=self.engine)) self.session = None diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py index c8324d04..4589f5b5 100644 
--- a/tests/automated/integration/readonly/conftest.py +++ b/tests/automated/integration/readonly/conftest.py @@ -3,6 +3,7 @@ import pytest import pytest_asyncio +from sqlalchemy import Engine from starlette.testclient import TestClient from src.db.helpers.connect import get_postgres_connection_string @@ -33,8 +34,10 @@ async def california_readonly( async def readonly_helper( event_loop, client: TestClient, + engine: Engine + ) -> AsyncGenerator[ReadOnlyTestHelper, Any]: - wipe_database(get_postgres_connection_string()) + wipe_database(engine) db_data_creator = DBDataCreator() api_test_helper = APITestHelper( request_validator=RequestValidator(client=client), diff --git a/tests/conftest.py b/tests/conftest.py index 8ba93200..eddb7f2d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,18 +7,19 @@ import pytest_asyncio from aiohttp import ClientSession from alembic.config import Config -from sqlalchemy import create_engine, inspect, MetaData +from sqlalchemy import create_engine, inspect, MetaData, Engine +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine from sqlalchemy.orm import scoped_session, sessionmaker from src.core.env_var_manager import EnvVarManager +from src.db.client.async_ import AsyncDatabaseClient +from src.db.client.sync import DatabaseClient +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.log.sqlalchemy import Log # noqa: F401 # Below are to prevent import errors from src.db.models.impl.missing import Missing # noqa: F401 -from src.db.models.impl.log.sqlalchemy import Log # noqa: F401 from src.db.models.impl.task.error import TaskError # noqa: F401 from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate # noqa: F401 -from src.db.client.async_ import AsyncDatabaseClient -from src.db.client.sync import DatabaseClient -from src.db.helpers.connect import get_postgres_connection_string from src.util.helper_functions import load_from_environment from tests.helpers.alembic_runner import AlembicRunner from tests.helpers.data_creator.core import DBDataCreator @@ -99,33 +100,55 @@ def setup_and_teardown(): live_connection.close() engine.dispose() +@pytest.fixture(scope="session") +def engine(): + conn = get_postgres_connection_string() + engine = create_engine(conn) + yield engine + engine.dispose() + +@pytest.fixture(scope="session") +def async_engine(): + conn = get_postgres_connection_string(is_async=True) + engine = create_async_engine(conn) + yield engine + engine.dispose() + @pytest.fixture -def wiped_database(): +def wiped_database( + engine: Engine +): """Wipe all data from database.""" - wipe_database(get_postgres_connection_string()) + wipe_database(engine) @pytest.fixture -def db_client_test(wiped_database) -> Generator[DatabaseClient, Any, None]: +def db_client_test( + wiped_database, + engine +) -> Generator[DatabaseClient, Any, None]: # Drop pre-existing table - conn = get_postgres_connection_string() - db_client = DatabaseClient(db_url=conn) + db_client = DatabaseClient(engine) yield db_client db_client.engine.dispose() @pytest_asyncio.fixture -async def populated_database(wiped_database) -> None: - conn = get_postgres_connection_string(is_async=True) - adb_client = AsyncDatabaseClient(db_url=conn) +async def populated_database( + wiped_database, + async_engine: AsyncEngine +) -> None: + adb_client = AsyncDatabaseClient(async_engine) await populate_database(adb_client) @pytest_asyncio.fixture -async def adb_client_test(wiped_database) -> AsyncGenerator[AsyncDatabaseClient, 
Any]: - conn = get_postgres_connection_string(is_async=True) - adb_client = AsyncDatabaseClient(db_url=conn) +async def adb_client_test( + wiped_database, + async_engine: AsyncEngine +) -> AsyncGenerator[AsyncDatabaseClient, Any]: + adb_client = AsyncDatabaseClient(async_engine) yield adb_client - adb_client.engine.dispose() + await adb_client.engine.dispose() @pytest.fixture def db_data_creator( diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index e81c266d..f6cd3582 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -1,11 +1,10 @@ -from sqlalchemy import create_engine +from sqlalchemy import create_engine, Engine from src.db.models.templates_.base import Base -def wipe_database(connection_string: str) -> None: +def wipe_database(engine: Engine) -> None: """Wipe all data from database.""" - engine = create_engine(connection_string) with engine.connect() as connection: for table in reversed(Base.metadata.sorted_tables): if table.info == "view": From 595f896f3177abc0df1ab0368f4724f07fdc02aa Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 14 Nov 2025 13:26:28 -0500 Subject: [PATCH 23/84] Add delete_url_ds_app_links --- ...28_1539-a57c3b5b6e93_add_sync_log_table.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py index 41b02082..03510b1c 100644 --- a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py +++ b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py @@ -19,15 +19,7 @@ depends_on: Union[str, Sequence[str], None] = None -def _add_data_portal_type_other_to_ds_optional_metadata(): - op.add_column( - 'url_optional_data_source_metadata', - sa.Column( - 'data_portal_type_other', - sa.String(), - nullable=True - ) - ) + def upgrade() -> None: @@ -37,6 +29,7 @@ def upgrade() -> None: remove_id_column_from_agencies() rename_agency_id_to_id() _rename_existing_tables_to_ds_app_format() + _delete_meta_url_ds_app_links() _alter_ds_app_link_data_source_table() _alter_ds_app_link_meta_url_table() _add_flag_deletion_tables() @@ -49,6 +42,21 @@ def upgrade() -> None: _add_updated_at_trigger_to_url_optional_data_source_metadata() _add_data_portal_type_other_to_ds_optional_metadata() +def _delete_meta_url_ds_app_links(): + op.execute( + "DELETE FROM ds_app_link_meta_url;" + ) + +def _add_data_portal_type_other_to_ds_optional_metadata(): + op.add_column( + 'url_optional_data_source_metadata', + sa.Column( + 'data_portal_type_other', + sa.String(), + nullable=True + ) + ) + def _add_updated_at_trigger_to_url_optional_data_source_metadata(): create_updated_at_trigger( "url_optional_data_source_metadata" From dd90f6c2b1d478742d71aec2c5de8f2960d9eefc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 16:20:21 -0500 Subject: [PATCH 24/84] Remove test logic from DatabaseClient --- src/db/client/sync.py | 39 +++++-------------- .../data_creator/commands/impl/urls_/query.py | 17 +++++++- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 2e9e6f9b..e29909cf 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -5,23 +5,21 @@ from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, Session +from src.core.enums import BatchStatus +from src.core.env_var_manager import EnvVarManager from src.db.config_manager import ConfigManager 
-from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.impl.log.pydantic.info import LogInfo from src.db.dtos.url.mapping_.simple import SimpleURLMapping -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.pydantic.info import URLInfo -from src.db.models.templates_.base import Base +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.duplicate.sqlalchemy import Duplicate +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.log.pydantic.info import LogInfo from src.db.models.impl.log.sqlalchemy import Log -from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.batch.sqlalchemy import Batch -from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo -from src.core.env_var_manager import EnvVarManager -from src.core.enums import BatchStatus +from src.db.models.templates_.base import Base from src.util.models.url_and_scheme import URLAndScheme from src.util.url import get_url_and_scheme @@ -219,25 +217,6 @@ def update_url( url = session.query(URL).filter_by(id=url_info.id).first() url.collector_metadata = url_info.collector_metadata - @session_manager - def mark_urls_as_submitted( - self, - session: Session, - infos: list[SubmittedURLInfo] - ): - for info in infos: - url_id = info.url_id - data_source_id = info.data_source_id - - url_data_source_object = DSAppLinkDataSource( - url_id=url_id, - ds_data_source_id=data_source_id - ) - if info.submitted_at is not None: - url_data_source_object.created_at = info.submitted_at - session.add(url_data_source_object) - - if __name__ == "__main__": client = DatabaseClient() print("Database client initialized.") diff --git a/tests/helpers/data_creator/commands/impl/urls_/query.py b/tests/helpers/data_creator/commands/impl/urls_/query.py index beff749f..1123af8e 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/query.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,5 +1,6 @@ from datetime import datetime +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.url.core.enums import URLSource @@ -64,7 +65,21 @@ def run_sync(self) -> InsertURLsInfo: submitted_at=self.created_at ) submitted_url_infos.append(submitted_url_info) - self.db_client.mark_urls_as_submitted(submitted_url_infos) + + url_data_source_objects: list[DSAppLinkDataSource] = [] + for info in submitted_url_infos: + url_id = info.url_id + data_source_id = info.data_source_id + + url_data_source_object = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=data_source_id + ) + if info.submitted_at is not None: + url_data_source_object.created_at = info.submitted_at + url_data_source_objects.append(url_data_source_object) + + self.db_client.add_all(url_data_source_objects) return url_insert_info \ No newline at end of file From ab13ea8e0def8c6cf7d2bd1d487bdbc59d698fe2 Mon Sep 17 00:00:00 2001 
From: Max Chis Date: Fri, 14 Nov 2025 16:43:58 -0500 Subject: [PATCH 25/84] Add sync loaders --- .../impl/sync_to_ds/templates/operator.py | 11 +-- src/core/tasks/scheduled/loader.py | 97 ++++++++++++++++++- .../tasks/scheduled/loader/test_happy_path.py | 2 +- 3 files changed, 99 insertions(+), 11 deletions(-) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py index 62794711..63a72a2f 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py @@ -1,14 +1,13 @@ from abc import ABC -from src.core.tasks.base.operator import TaskOperatorBase from src.core.tasks.mixins.prereq import HasPrerequisitesMixin -from src.core.tasks.url.enums import TaskOperatorOutcome +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient class DSSyncTaskOperatorBase( - TaskOperatorBase, + ScheduledTaskOperatorBase, HasPrerequisitesMixin, ABC ): @@ -20,9 +19,3 @@ def __init__( ): super().__init__(adb_client) self.pdap_client = pdap_client - - async def conclude_task(self): - return await self.run_info( - outcome=TaskOperatorOutcome.SUCCESS, - message="Task completed successfully" - ) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 82ac92cc..bbd76e6e 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -9,9 +9,19 @@ from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.mark_never_completed.operator import MarkTaskNeverCompletedOperator -from src.core.tasks.scheduled.impl.mark_never_completed.query import MarkTaskNeverCompletedQueryBuilder from src.core.tasks.scheduled.impl.refresh_materialized_views.operator import RefreshMaterializedViewsOperator from src.core.tasks.scheduled.impl.run_url_tasks.operator import RunURLTasksTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core import DSAppSyncMetaURLsAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.scheduled.impl.task_cleanup.operator import TaskCleanupOperator from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.db.client.async_ import AsyncDatabaseClient @@ -115,5 +125,90 @@ async def load_entries(self) -> 
list[ScheduledTaskEntry]: operator=RefreshMaterializedViewsOperator(adb_client=self.adb_client), interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") + ), + # Sync + ## Agency + ### Add + ScheduledTaskEntry( + operator=DSAppSyncAgenciesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_ADD_TASK_FLAG") + ), + ### Update + ScheduledTaskEntry( + operator=DSAppSyncAgenciesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_UPDATE_TASK_FLAG") + ), + ### Delete + ScheduledTaskEntry( + operator=DSAppSyncAgenciesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_DELETE_TASK_FLAG") + ), + ## Data Source + ### Add + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_ADD_TASK_FLAG") + ), + ### Update + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_UPDATE_TASK_FLAG") + ), + ### Delete + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_DELETE_TASK_FLAG") + ), + ## Meta URL + ### Add + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URLS_ADD_TASK_FLAG") + ), + ### Update + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URLS_UPDATE_TASK_FLAG") + ), + ### Delete + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URLS_DELETE_TASK_FLAG") ) ] diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index f3402f4f..63c64264 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 10 +NUMBER_OF_ENTRIES = 19 @pytest.mark.asyncio async def test_happy_path( From e63b7ab673a56ab705b25c1beae1e1f3d9ef5df7 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 16:56:58 -0500 Subject: [PATCH 26/84] Fix misnamed env vars and refine task print logic --- src/core/tasks/scheduled/loader.py | 18 +++++++++--------- src/core/tasks/scheduled/manager.py | 10 +++++++++- src/core/tasks/scheduled/registry/core.py | 6 ++---- 3 
files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index bbd76e6e..f104b84f 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -135,7 +135,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") ), ### Update ScheduledTaskEntry( @@ -144,7 +144,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") ), ### Delete ScheduledTaskEntry( @@ -153,7 +153,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCIES_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") ), ## Data Source ### Add @@ -163,7 +163,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") ), ### Update ScheduledTaskEntry( @@ -172,7 +172,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") ), ### Delete ScheduledTaskEntry( @@ -181,7 +181,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCES_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") ), ## Meta URL ### Add @@ -191,7 +191,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URLS_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") ), ### Update ScheduledTaskEntry( @@ -200,7 +200,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URLS_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") ), ### Delete ScheduledTaskEntry( @@ -209,6 +209,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URLS_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") ) ] diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index 87cb5a27..adf386a6 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -1,3 +1,5 @@ +from datetime import datetime + from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.mixins.link_urls import LinkURLsMixin @@ -5,6 +7,7 @@ from 
src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry +from src.core.tasks.scheduled.registry.format import format_job_datetime from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase @@ -44,12 +47,17 @@ async def add_scheduled_tasks(self): enabled_entries.append(entry) initial_lag: int = 1 + + print("Adding the following scheduled tasks:") + print(f"TASK_NAME | TASK_INTERVAL") for idx, entry in enumerate(enabled_entries): - await self._registry.add_job( + next_run_time: datetime = await self._registry.add_job( func=self.run_task, entry=entry, minute_lag=idx + initial_lag ) + run_time_str: str = format_job_datetime(next_run_time) + print(f"{entry.operator.task_type.value:<25}| {run_time_str}") def shutdown(self): self._registry.shutdown_scheduler() diff --git a/src/core/tasks/scheduled/registry/core.py b/src/core/tasks/scheduled/registry/core.py index e9fc205b..e85c15f0 100644 --- a/src/core/tasks/scheduled/registry/core.py +++ b/src/core/tasks/scheduled/registry/core.py @@ -25,7 +25,7 @@ async def add_job( func: Callable, entry: ScheduledTaskEntry, minute_lag: int - ) -> None: + ) -> datetime: """ Modifies: self._jobs @@ -40,10 +40,8 @@ async def add_job( misfire_grace_time=60, kwargs={"operator": entry.operator} ) - run_time_str: str = format_job_datetime(job.next_run_time) - print(f"Adding {job.id} task to scheduler. " + - f"First run at {run_time_str}") self._jobs[entry.operator.task_type] = job + return job.next_run_time def start_scheduler(self) -> None: """ From 4b418604818462607abaf9b911d1e005c12af77c Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 17:41:44 -0500 Subject: [PATCH 27/84] Update PDAP Access Client --- pyproject.toml | 2 +- src/api/main.py | 11 +++++++---- .../pdap/_templates/request_builder.py | 9 ++++++--- src/external/pdap/client.py | 18 ++++++++---------- .../pdap/impl/sync/meta_urls/delete/core.py | 2 -- tests/automated/integration/tasks/conftest.py | 6 +++--- .../impl/sync_to_ds/agency/test_add.py | 4 ---- .../tasks/scheduled/impl/sync_to_ds/helpers.py | 4 +++- uv.lock | 8 ++++---- 9 files changed, 32 insertions(+), 32 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70f54673..abcee13e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "lxml~=5.1.0", "marshmallow~=3.23.2", "openai~=1.60.1", - "pdap-access-manager==0.3.6", + "pdap-access-manager==0.4.3", "pillow>=11.3.0", "pip>=25.2", "playwright~=1.49.1", diff --git a/src/api/main.py b/src/api/main.py index 27abcb62..8f080d25 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -4,7 +4,8 @@ import uvicorn from discord_poster import DiscordPoster from fastapi import FastAPI -from pdap_access_manager import AccessManager +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.models.auth import AuthInfo from sqlalchemy.ext.asyncio import create_async_engine from starlette.responses import RedirectResponse @@ -73,10 +74,12 @@ async def lifespan(app: FastAPI): discord_poster=discord_poster ) pdap_client = PDAPClient( - access_manager=AccessManager( + access_manager=AccessManagerAsync( data_sources_url=env_var_manager.pdap_api_url, - email=env_var_manager.pdap_email, - password=env_var_manager.pdap_password, + auth=AuthInfo( + email=env_var_manager.pdap_email, + password=env_var_manager.pdap_password, + ), 
api_key=env_var_manager.pdap_api_key, session=session ) diff --git a/src/external/pdap/_templates/request_builder.py b/src/external/pdap/_templates/request_builder.py index 387421f4..2cde6c51 100644 --- a/src/external/pdap/_templates/request_builder.py +++ b/src/external/pdap/_templates/request_builder.py @@ -2,16 +2,19 @@ from http import HTTPStatus from typing import Any -from pdap_access_manager import AccessManager, RequestType, RequestInfo, ResponseInfo +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo from pydantic import BaseModel class PDAPRequestBuilderBase(ABC): def __init__(self): - self.access_manager: AccessManager | None = None + self.access_manager: AccessManagerAsync | None = None - async def run(self, access_manager: AccessManager) -> Any: + async def run(self, access_manager: AccessManagerAsync) -> Any: self.access_manager = access_manager return await self.inner_logic() diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 944f8a88..0d6d9ec7 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,6 +1,9 @@ from typing import Any -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo @@ -13,7 +16,7 @@ class PDAPClient: def __init__( self, - access_manager: AccessManager, + access_manager: AccessManagerAsync, ): self.access_manager = access_manager @@ -33,10 +36,7 @@ async def match_agency( """ Returns agencies, if any, that match or partially match the search criteria """ - url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.MATCH, - subdomains=["agency"] - ) + url: str = f"{self.access_manager.data_sources_url}/v2/match/agency" headers: dict[str, str] = await self.access_manager.jwt_header() headers['Content-Type']: str = "application/json" @@ -77,10 +77,8 @@ async def is_url_duplicate( """ Check if a URL is unique. 
Returns duplicate info otherwise """ - url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.CHECK, - subdomains=["unique-url"] - ) + url: str = f"{self.access_manager.data_sources_url}/v2/check/unique-url" + request_info = RequestInfo( type_=RequestType.GET, url=url, diff --git a/src/external/pdap/impl/sync/meta_urls/delete/core.py b/src/external/pdap/impl/sync/meta_urls/delete/core.py index abdc3a6b..08b6fd81 100644 --- a/src/external/pdap/impl/sync/meta_urls/delete/core.py +++ b/src/external/pdap/impl/sync/meta_urls/delete/core.py @@ -1,5 +1,3 @@ -from pdap_access_manager import AccessManager - from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel diff --git a/tests/automated/integration/tasks/conftest.py b/tests/automated/integration/tasks/conftest.py index 937b2d12..e14d3369 100644 --- a/tests/automated/integration/tasks/conftest.py +++ b/tests/automated/integration/tasks/conftest.py @@ -1,15 +1,15 @@ from unittest.mock import MagicMock, AsyncMock import pytest -from pdap_access_manager import AccessManager +from pdap_access_manager.access_manager.async_ import AccessManagerAsync from src.external.pdap.client import PDAPClient @pytest.fixture def mock_pdap_client() -> PDAPClient: - mock_access_manager = MagicMock( - spec=AccessManager + mock_access_manager = AsyncMock( + spec=AccessManagerAsync ) mock_access_manager.data_sources_url = "http://example.com" mock_access_manager.build_url = MagicMock( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py index f0997d65..6a1dc358 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py @@ -1,8 +1,4 @@ -from http import HTTPStatus -from unittest.mock import AsyncMock - import pytest -from pdap_access_manager import ResponseInfo from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator from src.db.client.async_ import AsyncDatabaseClient diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py index fcc1a93c..c90a9654 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py @@ -2,7 +2,9 @@ from typing import Any from unittest.mock import AsyncMock -from pdap_access_manager import RequestInfo, RequestType, ResponseInfo +from pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo from pydantic import BaseModel from src.external.pdap.client import PDAPClient diff --git a/uv.lock b/uv.lock index e7f52cfd..120be75b 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "lxml", specifier = "~=5.1.0" }, { name = "marshmallow", specifier = "~=3.23.2" }, { name = "openai", specifier = "~=1.60.1" }, - { name = "pdap-access-manager", specifier = "==0.3.6" }, + { name = "pdap-access-manager", specifier = "==0.4.3" }, { name = "pillow", specifier = ">=11.3.0" }, { name = "pip", specifier = ">=25.2" }, { name = "playwright", specifier = "~=1.49.1" }, @@ -1591,7 +1591,7 @@ wheels = [ 
[[package]] name = "pdap-access-manager" -version = "0.3.6" +version = "0.4.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1599,9 +1599,9 @@ dependencies = [ { name = "pydantic" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a4/14/d910483f08a0203a20fc2839738d9e27c83a66849fed422c3d4e804e15f5/pdap_access_manager-0.3.6.tar.gz", hash = "sha256:15c04f704e22116cd56b459e8a9d7f8514c75c36ca2c8a889b9ce2a308d88f6c", size = 4169, upload_time = "2025-06-12T20:14:55.942Z" } +sdist = { url = "https://files.pythonhosted.org/packages/92/8f/ad75b32cc91673d89510c0adb451027c29b47d09069f02ad920b8a29ff0d/pdap_access_manager-0.4.3.tar.gz", hash = "sha256:24fe43550caa2a4fb0e4ac255d4265bcfd5985f08ff55cc7dd1bc24224d80f08", size = 5995, upload_time = "2025-11-14T22:12:07.622Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/81/76803339fd732cd3eda7458d48e67487d9377197f9ea7d4583df098823b2/pdap_access_manager-0.3.6-py3-none-any.whl", hash = "sha256:a5910068f642f7548d037bcb98657ca1945997fae4e89dc4e1d47283da485b91", size = 5034, upload_time = "2025-06-12T20:14:48.452Z" }, + { url = "https://files.pythonhosted.org/packages/c1/fa/52ad971907cc54dee673effdabb6f7ee87d5beb1966bb554aebbf7b9e47e/pdap_access_manager-0.4.3-py3-none-any.whl", hash = "sha256:9d58f4065b9fea38af1fe0a6afc77c9b8030b42f7cf15068edbe7e53fe11f949", size = 10807, upload_time = "2025-11-14T22:12:06.154Z" }, ] [[package]] From 87146fdc98f011a589be88675266a5637b35d04d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 19:48:12 -0500 Subject: [PATCH 28/84] Include URL status in Sync Content --- .../impl/sync_to_ds/impl/data_sources/add/queries/get.py | 2 ++ .../impl/sync_to_ds/impl/data_sources/update/queries/get.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 47beb2a3..7c22b332 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -10,6 +10,7 @@ from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest @@ -108,6 +109,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types], + url_status=DataSourcesURLStatus.OK ) ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index 855075e3..74d79f6e 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -10,6 +10,7 @@ from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata 
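A few hunks back, the test conftest switches the access-manager mock from `MagicMock` to `AsyncMock` so awaited calls resolve properly against the async class. Consolidated, and trimmed to the parts these sync tests rely on, that fixture shape looks roughly like this (the base URL is the same placeholder the conftest uses):

```python
# Trimmed sketch of the async-aware fixture from the conftest hunk above.
# Only data_sources_url is configured here; the real fixture also stubs other
# attributes, and awaited methods simply return AsyncMock results.
from unittest.mock import AsyncMock

import pytest
from pdap_access_manager.access_manager.async_ import AccessManagerAsync

from src.external.pdap.client import PDAPClient


@pytest.fixture
def mock_pdap_client() -> PDAPClient:
    mock_access_manager = AsyncMock(spec=AccessManagerAsync)
    mock_access_manager.data_sources_url = "http://example.com"
    return PDAPClient(access_manager=mock_access_manager)
```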
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest, \ UpdateDataSourcesInnerRequest @@ -111,6 +112,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types], data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], + url_status=DataSourcesURLStatus.OK ) ) ) From ca26debbd5a682d570e75802b2c5281a15f30e98 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 19:55:29 -0500 Subject: [PATCH 29/84] Fix bug with access type --- .../impl/sync_to_ds/impl/data_sources/update/queries/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index 74d79f6e..6630d701 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -110,7 +110,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], - access_types=mapping[URLOptionalDataSourceMetadata.access_types], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], url_status=DataSourcesURLStatus.OK ) From caaaa5027474fbaf88ad6916f98b600c76e8f59d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 20:14:40 -0500 Subject: [PATCH 30/84] Fix bug with access type --- .../impl/sync_to_ds/impl/data_sources/update/queries/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index 6630d701..fe0baa86 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -97,7 +97,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: agency_ids=mapping["agency_ids"] or [], # Optional description=mapping[URL.description], - record_formats=mapping[URLOptionalDataSourceMetadata.record_formats], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], From 7e3c2b1c49cc7ca53792c3d56e849e07ea5c9712 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 14 Nov 2025 20:40:44 -0500 Subject: [PATCH 31/84] Correct bug with Data Source Sync Delete pulling too many data sources --- .../impl/data_sources/delete/queries/cte.py | 2 +- 
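The one-line `cte.py` fix in this patch (shown just below) is easy to miss: the original join compared `FlagDSDeleteDataSource.ds_data_source_id` to itself, a condition that is always true, so the CTE matched every DS app link rather than only flagged ones. A sketch of the corrected join, using the same models the accompanying test imports; the selected column is illustrative:

```python
# Corrected join: correlate each deletion flag with the DS app link it marks.
# The buggy version compared the flag's ds_data_source_id to itself, which is
# always true and therefore pulled every data source into the delete sync.
from sqlalchemy import select

from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource
from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource

delete_candidates = (
    select(DSAppLinkDataSource.ds_data_source_id)
    .join(
        FlagDSDeleteDataSource,
        FlagDSDeleteDataSource.ds_data_source_id == DSAppLinkDataSource.ds_data_source_id,
    )
)
```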
.../sync_to_ds/data_source/delete/__init__.py | 0 .../data_source/{ => delete}/test_delete.py | 23 +++++++++++++++---- 3 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/__init__.py rename tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/{ => delete}/test_delete.py (77%) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py index 4e14dbf8..12ad5c84 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -16,7 +16,7 @@ def __init__(self): ) .join( FlagDSDeleteDataSource, - FlagDSDeleteDataSource.ds_data_source_id == FlagDSDeleteDataSource.ds_data_source_id + FlagDSDeleteDataSource.ds_data_source_id == DSAppLinkDataSource.ds_data_source_id ).cte("ds_app_link_sync_data_source_delete_prerequisites") ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py similarity index 77% rename from tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py rename to tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py index a67f5db3..1987bc79 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_delete.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py @@ -5,6 +5,7 @@ DSAppSyncDataSourcesDeleteTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel @@ -18,7 +19,8 @@ async def test_delete( db_data_creator: DBDataCreator, adb_client_test: AsyncDatabaseClient, - mock_pdap_client: PDAPClient + mock_pdap_client: PDAPClient, + test_url_data_source_id: int ): ds_data_source_id: int = 67 operator = DSAppSyncDataSourcesDeleteTaskOperator( @@ -34,13 +36,23 @@ async def test_delete( # Check does not currently meet prerequisite assert not await operator.meets_task_prerequisites() - # Add DS App Link + # Add DS App Link for deleted URL ds_app_link = DSAppLinkDataSource( url_id=None, ds_data_source_id=ds_data_source_id, ) await adb_client_test.add(ds_app_link) + # Add DS App Link for extant URL + ds_app_link = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=ds_data_source_id + 1, + ) + await adb_client_test.add(ds_app_link) + + # Check does not currently meet prerequisite + assert not await operator.meets_task_prerequisites() + # Add Task Deletion Flag for App Link flag = FlagDSDeleteDataSource( ds_data_source_id=ds_data_source_id, @@ -61,8 +73,11 @@ async def test_delete( ) assert request.ids == [ds_data_source_id] - # Check DS App 
Link Is Deleted - assert await adb_client_test.has_no_rows(DSAppLinkDataSource) + # Check DS App Link has only one row + assert len(await adb_client_test.get_all(DSAppLinkDataSource)) == 1 # Check DS App Data Source Deletion Flag is deleted assert await adb_client_test.has_no_rows(FlagDSDeleteDataSource) + + # Check one row in URLs table + assert len(await adb_client_test.get_all(URL)) == 1 From 9179b8479e821489814484f179d16a1b0d4ec062 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 06:02:47 -0500 Subject: [PATCH 32/84] Add per-request entity limit --- src/core/tasks/scheduled/impl/sync_to_ds/constants.py | 3 +++ .../scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py | 3 ++- .../impl/sync_to_ds/impl/agencies/delete/queries/get.py | 3 ++- .../impl/sync_to_ds/impl/agencies/update/queries/get.py | 2 ++ .../impl/sync_to_ds/impl/data_sources/add/queries/get.py | 3 ++- .../impl/sync_to_ds/impl/data_sources/delete/queries/get.py | 3 ++- .../impl/sync_to_ds/impl/data_sources/update/queries/get.py | 3 ++- .../impl/sync_to_ds/impl/meta_urls/add/queries/get.py | 2 ++ .../impl/sync_to_ds/impl/meta_urls/delete/queries/get.py | 2 ++ .../impl/sync_to_ds/impl/meta_urls/update/queries/get.py | 2 ++ 10 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/constants.py diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/constants.py b/src/core/tasks/scheduled/impl/sync_to_ds/constants.py new file mode 100644 index 00000000..d4bb072f --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/constants.py @@ -0,0 +1,3 @@ + + +PER_REQUEST_ENTITY_LIMIT = 1000 \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py index 1ae9a13c..f037115a 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, RowMapping, func from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.cte import \ DSAppLinkSyncAgencyAddPrerequisitesCTEContainer from src.db.models.impl.agency.sqlalchemy import Agency @@ -43,7 +44,7 @@ async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: .join( location_id_cte, location_id_cte.c.agency_id == cte.agency_id, - ) + ).limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py index 36dddee4..c155f921 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.cte import \ DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -17,7 +18,7 @@ async def run(self, session: AsyncSession) -> list[int]: query = ( select( cte.ds_agency_id, - ) + 
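This patch threads a single shared cap through all of the sync GET queries so one scheduled run never gathers an unbounded payload. A minimal sketch of the pattern, using `Agency` (already imported in the add query above) as a stand-in selectable:

```python
# Every sync GET query appends the same shared cap, so at most 1000 entities
# are gathered per request; anything beyond the cap is left for a later run.
from sqlalchemy import select

from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
from src.db.models.impl.agency.sqlalchemy import Agency  # stand-in entity for the sketch

capped_query = select(Agency.id).limit(PER_REQUEST_ENTITY_LIMIT)
```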
).limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py index 81572a24..0488f51b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.cte import \ DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency @@ -49,6 +50,7 @@ async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: location_id_cte, location_id_cte.c.agency_id == cte.agency_id, ) + .limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 7c22b332..103923d2 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import RowMapping, func, select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -74,7 +75,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: .join( agency_id_cte, cte.url_id == agency_id_cte.c.url_id - ) + ).limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py index 7077beac..0e8e5732 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.cte import \ DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -18,7 +19,7 @@ async def run(self, session: AsyncSession) -> list[int]: select( cte.ds_data_source_id, ) - ) + ).limit(PER_REQUEST_ENTITY_LIMIT) mappings: Sequence[RowMapping] = await self.sh.mappings( session=session, diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index fe0baa86..b6b94779 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -3,6 
+3,7 @@ from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -77,7 +78,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: agency_id_cte, cte.url_id == agency_id_cte.c.url_id ) - ) + ).limit(PER_REQUEST_ENTITY_LIMIT) mappings: Sequence[RowMapping] = await self.sh.mappings( session=session, diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py index 42a9149b..da695cf0 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -45,6 +46,7 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: agency_id_cte, cte.url_id == agency_id_cte.c.url_id ) + .limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py index f1d232f7..0d3b09cc 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.cte import \ DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer from src.db.queries.base.builder import QueryBuilderBase @@ -18,6 +19,7 @@ async def run(self, session: AsyncSession) -> list[int]: select( cte.ds_meta_url_id, ) + .limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py index 210909f9..5dfb81bd 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -3,6 +3,7 @@ from sqlalchemy import select, func, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency @@ -45,6 +46,7 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: agency_id_cte, cte.url_id == 
agency_id_cte.c.url_id ) + .limit(PER_REQUEST_ENTITY_LIMIT) ) mappings: Sequence[RowMapping] = await self.sh.mappings( From e6353235dffa3a59d34e6b66d13d6d464b2df25f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 14:02:17 -0500 Subject: [PATCH 33/84] Add README for synchronization logic. --- .../tasks/scheduled/impl/sync_to_ds/README.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/README.md diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/README.md b/src/core/tasks/scheduled/impl/sync_to_ds/README.md new file mode 100644 index 00000000..e9dcddb8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/README.md @@ -0,0 +1,63 @@ +The Source Manager (SM) is part of a two app system, with the other app being the Data Sources (DS) App. + + +# Add, Update, and Delete + +These are the core synchronization actions. + +In order to propagate changes to DS, we synchronize additions, updates, and deletions of the following entities: +- Agencies +- Data Sources +- Meta URLs + +Each action for each entity occurs through a separate task. At the moment, there are nine tasks total. + +Each task gathers requisite information from the SM database and sends a request to one of nine corresponding endpoints in the DS API. + +Each DS endpoint follows the following format: + +```text +/v3/source-manager/{entity}/{action} +``` + +Here is a high-level description of how each action works: + +## Add + +Adds the given entities to DS. + +These are denoted with the `/{entity}/add` path in the DS API. + +When an entity is added, it returns a unique DS ID that is mapped to the internal SM database ID via the DS app link tables. + +For an entity to be added, it must meet preconditions which are distinct for each entity: +- Agencies: Must have an agency entry in the database and be linked to a location. +- Data Sources: Must be a URL that has been internally validated as a data source and linked to an agency. +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. + +## Update + +Updates the given entities in DS. + +These are denoted with the `/{entity}/update` path in the DS API. + +These consist of submitting the updated entities (in full) to the requisite endpoint, and updating the local app link to indicate that the update occurred. All updates are designed to be full overwrites of the entity. + +For an entity to be updated, it must meet preconditions which are distinct for each entity: +- Agencies: Must have either an agency row updated or an agency/location link updated or deleted. +- Data Sources: One of the following must be updated: + - The URL table + - The record type table + - The optional data sources metadata table + - The agency link table (either an addition or deletion) +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. Either the URL table or the agency link table (addition or deletion) must be updated. + +## Delete + +Deletes the given entities from DS. + +These are denoted with the `/{entity}/delete` path in the DS API. + +This consists of submitting a set of DS IDs to the requisite endpoint, and removing the associated DS app link entry in the SM database. + +When an entity with a corresponding DS App Link is deleted from the Source Manager, the core data is removed but a deletion flag is appended to the DS App Link entry, indicating that the entry is not yet removed from the DS App. 
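In code terms, that intermediate state is the pair of rows below; the models and the sample DS ID are borrowed from the delete-sync test earlier in this series:

```python
# The intermediate state after a synced data source is deleted locally:
# the DS app link survives (its url_id cleared) alongside a deletion flag,
# until the hourly delete task pushes the DS ID and removes both rows.
from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource
from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource

orphaned_link = DSAppLinkDataSource(url_id=None, ds_data_source_id=67)
deletion_flag = FlagDSDeleteDataSource(ds_data_source_id=67)
```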
The deletion task uses this flag to identify entities to be deleted, submits the deletion request to the DS API, and removes both the flag and the DS App Link. \ No newline at end of file From 7bc63484e920437ac676d94efceebdbdf3b12491 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 14:04:41 -0500 Subject: [PATCH 34/84] Add README for synchronization logic. --- src/core/tasks/scheduled/impl/sync_to_ds/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/README.md b/src/core/tasks/scheduled/impl/sync_to_ds/README.md index e9dcddb8..488b52bb 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/README.md +++ b/src/core/tasks/scheduled/impl/sync_to_ds/README.md @@ -20,6 +20,8 @@ Each DS endpoint follows the following format: /v3/source-manager/{entity}/{action} ``` +Synchronizations are designed to occur on an hourly basis. + Here is a high-level description of how each action works: ## Add From 08159e0a228e4ab870b04f1131443577957ec4b9 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 14:59:10 -0500 Subject: [PATCH 35/84] Set alter record formats and access types columns to be not null, default to empty array. --- ...update_record_formats_and_access_types_.py | 67 +++++++++++++++++++ src/api/endpoints/collector/manual/query.py | 3 +- .../contributions/user/queries/core.py | 6 +- .../endpoints/review/approve/query_/core.py | 5 +- src/core/tasks/scheduled/loader.py | 59 ++++++++-------- src/db/client/async_.py | 5 +- .../url/optional_ds_metadata/sqlalchemy.py | 4 +- .../integration/api/test_manual_batch.py | 5 +- .../impl/sync_to_ds/data_source/test_add.py | 3 +- .../test_url_miscellaneous_metadata_task.py | 10 +-- 10 files changed, 121 insertions(+), 46 deletions(-) create mode 100644 alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py diff --git a/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py new file mode 100644 index 00000000..e9e14ca8 --- /dev/null +++ b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py @@ -0,0 +1,67 @@ +"""Update record_formats and access_types to be not null + +Revision ID: de0305465e2c +Revises: a57c3b5b6e93 +Create Date: 2025-11-15 14:41:45.619148 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
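The migration beginning here, together with the query and ORM changes that follow, stops treating these array columns as nullable: existing NULLs are backfilled to `'{}'` and the Python layer coalesces with `or []`. A tiny sketch of the invariant, with a hypothetical helper name:

```python
# Hypothetical helper illustrating the invariant the migration below enforces:
# array-valued metadata (record_formats, access_types) is never None, only empty.
def coalesce_array(values: list[str] | None) -> list[str]:
    return values or []

assert coalesce_array(None) == []
assert coalesce_array(["CSV", "JSON"]) == ["CSV", "JSON"]
```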
+revision: str = 'de0305465e2c' +down_revision: Union[str, None] = 'a57c3b5b6e93' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +TABLE_NAME = "url_optional_data_source_metadata" + + +def upgrade() -> None: + _update_record_formats() + _update_access_types() + _alter_record_formats_column() + _alter_access_types_column() + +def _alter_record_formats_column(): + op.alter_column( + table_name=TABLE_NAME, + column_name="record_formats", + nullable=False, + server_default='{}' + ) + + +def _alter_access_types_column(): + op.alter_column( + table_name=TABLE_NAME, + column_name="access_types", + nullable=False, + server_default='{}' + ) + + + +def _update_access_types(): + op.execute(""" + UPDATE url_optional_data_source_metadata + SET access_types = '{}' + WHERE access_types is null + + """) + + +def _update_record_formats(): + op.execute(""" + UPDATE url_optional_data_source_metadata + SET record_formats = '{}' + WHERE record_formats is null + """) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index dff2cbed..5ebe0e4b 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -84,9 +84,10 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: optional_metadata = URLOptionalDataSourceMetadata( url_id=url.id, - record_formats=entry.record_formats, + record_formats=entry.record_formats or [], data_portal_type=entry.data_portal_type, supplying_entity=entry.supplying_entity, + access_types=[] ) session.add(optional_metadata) url_ids.append(url.id) diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index 57727215..1709776c 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -33,15 +33,15 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: agency_agree.agreement.label("agency"), url_type_agree.agreement.label("url_type") ) - .join( + .outerjoin( record_type_agree.cte, contributions_cte.user_id == record_type_agree.user_id ) - .join( + .outerjoin( agency_agree.cte, contributions_cte.user_id == agency_agree.user_id ) - .join( + .outerjoin( url_type_agree.cte, contributions_cte.user_id == url_type_agree.user_id ) diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index ff7a1c1f..b05c6c67 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -66,9 +66,10 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( - record_formats=self.approval_info.record_formats, + record_formats=self.approval_info.record_formats or [], data_portal_type=self.approval_info.data_portal_type, - supplying_entity=self.approval_info.supplying_entity + supplying_entity=self.approval_info.supplying_entity, + access_types=[] ) else: update_if_not_none( diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index f104b84f..3ea4fc94 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -127,8 +127,8 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: 
enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") ), # Sync - ## Agency - ### Add + ## Adds + ### Agency ScheduledTaskEntry( operator=DSAppSyncAgenciesAddTaskOperator( adb_client=self.adb_client, @@ -137,78 +137,79 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.HOURLY.value, enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncAgenciesUpdateTaskOperator( + operator=DSAppSyncMetaURLsAddTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") ), - ### Delete + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncAgenciesDeleteTaskOperator( + operator=DSAppSyncDataSourcesAddTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") ), - ## Data Source - ### Add + ## Updates + ### Agency ScheduledTaskEntry( - operator=DSAppSyncDataSourcesAddTaskOperator( + operator=DSAppSyncAgenciesUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncDataSourcesUpdateTaskOperator( + operator=DSAppSyncMetaURLsUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") ), - ### Delete + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncDataSourcesDeleteTaskOperator( + operator=DSAppSyncDataSourcesUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") ), - ## Meta URL - ### Add + ## Deletes + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncMetaURLsAddTaskOperator( + operator=DSAppSyncDataSourcesDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncMetaURLsUpdateTaskOperator( + operator=DSAppSyncMetaURLsDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") ), - ### Delete + ### Agency ScheduledTaskEntry( - operator=DSAppSyncMetaURLsDeleteTaskOperator( + operator=DSAppSyncAgenciesDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") - ) + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + ), + ] diff 
--git a/src/db/client/async_.py b/src/db/client/async_.py index 95bc7082..50802347 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -371,9 +371,10 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL for tdo in tdos: metadata_object = URLOptionalDataSourceMetadata( url_id=tdo.url_id, - record_formats=tdo.record_formats, + record_formats=tdo.record_formats or [], data_portal_type=tdo.data_portal_type, - supplying_entity=tdo.supplying_entity + supplying_entity=tdo.supplying_entity, + access_types=[], ) session.add(metadata_object) diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 4661be7a..32156a38 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -15,7 +15,7 @@ class URLOptionalDataSourceMetadata( ): __tablename__ = 'url_optional_data_source_metadata' - record_formats = Column(ARRAY(String), nullable=True) + record_formats = Column(ARRAY(String), nullable=False, default=[]) data_portal_type = Column(String, nullable=True) supplying_entity = Column(String, nullable=True) coverage_start = Column(Date, nullable=True) @@ -38,7 +38,7 @@ class URLOptionalDataSourceMetadata( native_enum=True, values_callable=lambda AccessTypeEnum: [e.value for e in AccessTypeEnum] ) - ), nullable=True) + ), nullable=False, default=[]) data_portal_type_other = Column(String, nullable=True) # Relationships diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9e52d358..fa3f7884 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -121,7 +121,10 @@ def check_url(url: URL, url_only: bool): def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: bool): assert metadata.url_id is not None - other_attributes = ["record_formats", "data_portal_type", "supplying_entity"] + other_attributes = [ + "data_portal_type", + "supplying_entity" + ] return check_attributes(metadata, other_attributes, no_optional) # Confirm 50 have nothing but URL id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py index 060637db..b90bb761 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -5,6 +5,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient +from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ @@ -78,7 +79,7 @@ async def test_add( assert content.access_notes is None assert content.access_types is None assert content.data_portal_type_other is None - assert content.url_status is None + assert content.url_status == DataSourcesURLStatus.OK assert content.agency_ids == [test_agency_id] diff --git 
a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index 93878562..bc3f240d 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -122,12 +122,12 @@ async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): assert url.description == expected_description, f"For url.id {url.id}, expected description {expected_description}, got {url.description}" expected_urls = { - common_crawler_url_id: (None, None, None), - auto_googler_url_id: (None, None, None), + common_crawler_url_id: ([], None, None), + auto_googler_url_id: ([], None, None), ckan_url_id: (["CSV", "JSON"], "Test Data Portal Type", "Test Supplying Entity"), - muckrock_simple_url_id: (None, None, None), - muckrock_county_url_id: (None, None, None), - muckrock_all_url_id: (None, None, None), + muckrock_simple_url_id: ([], None, None), + muckrock_county_url_id: ([], None, None), + muckrock_all_url_id: ([], None, None), } metadatas: list[URLOptionalDataSourceMetadata] = await db_data_creator.adb_client.get_all(URLOptionalDataSourceMetadata) From ce27f509f619fd3986246a017a136f3b3d9c2f35 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 15:25:52 -0500 Subject: [PATCH 36/84] Add check for duplicate entries --- src/core/tasks/scheduled/impl/internet_archives/save/mapper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py b/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py index 1d20b1c2..09a708bc 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py @@ -7,6 +7,8 @@ def __init__(self, entries: list[InternetArchivesSaveTaskEntry]): self._url_to_entry: dict[str, InternetArchivesSaveTaskEntry] = { entry.url: entry for entry in entries } + if len(self._url_to_entry) != len(entries): + raise ValueError("Duplicate URLs found in entries") def get_is_new(self, url: str) -> bool: return self._url_to_entry[url].is_new From 3b1feb35160ec44e01c6d97809b63a5f98a224d3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 15:37:23 -0500 Subject: [PATCH 37/84] Add condition to check for no extant URL Task Error --- .../internet_archives/save/queries/shared/get_valid_entries.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py index 1ce9c1d9..fa4c36f0 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py @@ -1,5 +1,7 @@ from sqlalchemy import select, or_, func, text +from src.db.enums import TaskType +from src.db.helpers.query import no_url_task_error from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata @@ -39,6 +41,7 @@ URLInternetArchivesSaveMetadata.url_id.is_(None), URLInternetArchivesSaveMetadata.last_uploaded_at < func.now() - text("INTERVAL '1 month'") ), + 
no_url_task_error(TaskType.IA_SAVE), # Must have returned a 200 status code URLWebMetadata.status_code == 200 ) From a952f1d7900489d56057531063a032be9311adf5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 15 Nov 2025 18:53:40 -0500 Subject: [PATCH 38/84] Fix bug in data sources and meta URL GET queries --- src/api/endpoints/data_source/get/query.py | 5 +- src/api/endpoints/meta_url/get/query.py | 1 - .../readonly/api/agencies/get/test_root.py | 2 +- .../readonly/api/data_sources/test_get.py | 4 +- .../api/meta_urls/agencies/test_forbid.py | 2 +- .../integration/readonly/conftest.py | 5 +- .../automated/integration/readonly/helper.py | 5 +- tests/automated/integration/readonly/setup.py | 57 ++++++++++++++++--- 8 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py index e15ce6b1..8766409d 100644 --- a/src/api/endpoints/data_source/get/query.py +++ b/src/api/endpoints/data_source/get/query.py @@ -37,7 +37,6 @@ async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: # Required Attributes URL.name, URLRecordType.record_type, - URL.confirmed_agencies, # Optional Attributes URL.description, @@ -102,7 +101,7 @@ async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: url_description: str | None = mapping[URL.description] link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] - url_record_formats: list[str] | None = mapping[URLOptionalDataSourceMetadata.record_formats] + url_record_formats: list[str] = mapping[URLOptionalDataSourceMetadata.record_formats] or [] url_data_portal_type: str | None = mapping[URLOptionalDataSourceMetadata.data_portal_type] url_supplying_entity: str | None = mapping[URLOptionalDataSourceMetadata.supplying_entity] url_coverage_start: date | None = mapping[URLOptionalDataSourceMetadata.coverage_start] @@ -118,7 +117,7 @@ async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: url_scraper_url: str | None = mapping[URLOptionalDataSourceMetadata.scraper_url] url_submission_notes: str | None = mapping[URLOptionalDataSourceMetadata.submission_notes] url_access_notes: str | None = mapping[URLOptionalDataSourceMetadata.access_notes] - url_access_types: list[AccessTypeEnum] | None = mapping[URLOptionalDataSourceMetadata.access_types] + url_access_types: list[AccessTypeEnum] = mapping[URLOptionalDataSourceMetadata.access_types] or [] responses.append( DataSourceGetResponse( diff --git a/src/api/endpoints/meta_url/get/query.py b/src/api/endpoints/meta_url/get/query.py index 740dfd69..30db1e05 100644 --- a/src/api/endpoints/meta_url/get/query.py +++ b/src/api/endpoints/meta_url/get/query.py @@ -30,7 +30,6 @@ async def run(self, session: AsyncSession) -> MetaURLGetOuterResponse: # Required Attributes URL.name, - URL.confirmed_agencies, # Optional Attributes URL.description, diff --git a/tests/automated/integration/readonly/api/agencies/get/test_root.py b/tests/automated/integration/readonly/api/agencies/get/test_root.py index a74e49da..412a9512 100644 --- a/tests/automated/integration/readonly/api/agencies/get/test_root.py +++ b/tests/automated/integration/readonly/api/agencies/get/test_root.py @@ -12,7 +12,7 @@ async def test_agency_get( responses_raw: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3( url=f"/agencies", ) - assert len(responses_raw) == 1 + assert len(responses_raw) == 2 response_raw = responses_raw[0] assert response_raw["id"] == readonly_helper.agency_1_id assert 
response_raw["name"] == "Agency 1" diff --git a/tests/automated/integration/readonly/api/data_sources/test_get.py b/tests/automated/integration/readonly/api/data_sources/test_get.py index e7bbe861..27e6ad63 100644 --- a/tests/automated/integration/readonly/api/data_sources/test_get.py +++ b/tests/automated/integration/readonly/api/data_sources/test_get.py @@ -18,13 +18,13 @@ async def test_get(readonly_helper: ReadOnlyTestHelper): ) outer_response = DataSourceGetOuterResponse(**raw_json) - assert len(outer_response.results) == 1 + assert len(outer_response.results) == 2 response: DataSourceGetResponse = outer_response.results[0] diff = DeepDiff( response.model_dump(mode='json'), DataSourceGetResponse( - url_id=readonly_helper.url_data_source_id, + url_id=readonly_helper.maximal_data_source, url="read-only-ds.com", name="Read only URL name", diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py index d62fa524..32bb08fe 100644 --- a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py +++ b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py @@ -8,7 +8,7 @@ @pytest.mark.asyncio async def test_forbid(readonly_helper: ReadOnlyTestHelper): check_forbidden_url_type( - route=f"/meta-urls/{readonly_helper.url_data_source_id}/agencies", + route=f"/meta-urls/{readonly_helper.minimal_data_source}/agencies", api_test_helper=readonly_helper.api_test_helper, method="GET" ) diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py index 4589f5b5..d98d9c19 100644 --- a/tests/automated/integration/readonly/conftest.py +++ b/tests/automated/integration/readonly/conftest.py @@ -6,7 +6,6 @@ from sqlalchemy import Engine from starlette.testclient import TestClient -from src.db.helpers.connect import get_postgres_connection_string from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from tests.automated.integration.readonly.helper import ReadOnlyTestHelper from tests.automated.integration.readonly.setup import setup_readonly_data @@ -45,6 +44,8 @@ async def readonly_helper( db_data_creator=db_data_creator, ) - helper: ReadOnlyTestHelper = await setup_readonly_data(api_test_helper=api_test_helper) + helper: ReadOnlyTestHelper = await setup_readonly_data( + api_test_helper=api_test_helper + ) yield helper \ No newline at end of file diff --git a/tests/automated/integration/readonly/helper.py b/tests/automated/integration/readonly/helper.py index 68474256..1331aa17 100644 --- a/tests/automated/integration/readonly/helper.py +++ b/tests/automated/integration/readonly/helper.py @@ -13,6 +13,9 @@ class Config: agency_1_id: int agency_1_location_id: int + agency_2_id: int + agency_2_location_id: int - url_data_source_id: int + minimal_data_source: int + maximal_data_source: int url_meta_url_id: int diff --git a/tests/automated/integration/readonly/setup.py b/tests/automated/integration/readonly/setup.py index ec8c78b1..7f5e1637 100644 --- a/tests/automated/integration/readonly/setup.py +++ b/tests/automated/integration/readonly/setup.py @@ -15,7 +15,6 @@ from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from tests.automated.integration.readonly.helper import ReadOnlyTestHelper from tests.helpers.api_test_helper import APITestHelper -from tests.helpers.counter import next_int from tests.helpers.data_creator.core import DBDataCreator from 
tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -33,7 +32,6 @@ async def setup_readonly_data( name="Pennsylvania", iso="PA" ) - allegheny_county: CountyCreationInfo = await db_data_creator.create_county( state_id=pennsylvania.us_state_id, name="Allegheny" @@ -46,10 +44,18 @@ async def setup_readonly_data( # Add Agencies - agency_1_id: int = await add_agency(adb_client, pittsburgh) + agency_1_id: int = await add_agency(adb_client, pittsburgh.location_id) + agency_2_id: int = await add_agency(adb_client, allegheny_county.location_id) # Add Data Source With Linked Agency - url_data_source_id: int = await add_data_source(agency_1_id, db_data_creator) + maximal_data_source: int = await add_maximal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) + minimal_data_source: int = await add_minimal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) # Add Meta URL with Linked Agency url_meta_url_id: int = await add_meta_url(agency_1_id, db_data_creator) @@ -61,7 +67,11 @@ async def setup_readonly_data( agency_1_id=agency_1_id, agency_1_location_id=pittsburgh.location_id, - url_data_source_id=url_data_source_id, + agency_2_id=agency_2_id, + agency_2_location_id=allegheny_county.location_id, + + maximal_data_source=maximal_data_source, + minimal_data_source=minimal_data_source, url_meta_url_id=url_meta_url_id, ) @@ -93,7 +103,7 @@ async def add_meta_url( return url_id -async def add_data_source( +async def add_maximal_data_source( agency_1_id: int, db_data_creator: DBDataCreator ) -> int: @@ -150,10 +160,41 @@ async def add_data_source( ) return url_id +async def add_minimal_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="minimal-ds.com", + name="Minimal name", + trailing_slash=False, + collector_metadata={}, + status=URLStatus.OK, + source=URLSource.ROOT_URL, + ) + url_id: int = await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.POLICIES_AND_CONTRACTS + ) + await adb_client.add(record_type) + + await db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id + async def add_agency( adb_client: AsyncDatabaseClient, - pittsburgh: LocalityCreationInfo + location_id: int ) -> int: agency_1 = Agency( name="Agency 1", @@ -164,7 +205,7 @@ async def add_agency( # Add Agency location agency_1_location = LinkAgencyLocation( agency_id=agency_id, - location_id=pittsburgh.location_id, + location_id=location_id, ) await adb_client.add(agency_1_location) return agency_id \ No newline at end of file From 94c32e37af49bbc1f4ec934194b2b6739d47d661 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 06:27:09 -0500 Subject: [PATCH 39/84] Add `data-sources/:id` `GET` endpoint --- .../data_source/_shared}/__init__.py | 0 .../endpoints/data_source/_shared/build.py | 66 ++++++++++ .../endpoints/data_source/_shared/process.py | 44 +++++++ .../data_source/by_id/get/__init__.py | 0 .../endpoints/data_source/by_id/get/query.py | 24 ++++ src/api/endpoints/data_source/get/query.py | 120 ++---------------- src/api/endpoints/data_source/routes.py | 17 ++- .../api/data_sources/by_id/__init__.py | 0 
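The new `_shared` module introduced below factors the data source SELECT into one builder that both the list endpoint and the new by-id endpoint reuse. A sketch of how the two call sites narrow it, assuming only what the diffs below show (`build_data_source_get_query()`, a `.where(URL.id == ...)` filter, and 100-row pages):

```python
# Sketch of the shared-builder reuse: one function assembles the full data
# source SELECT; each endpoint then either narrows it to one URL or pages it.
from sqlalchemy import Select

from src.api.endpoints.data_source._shared.build import build_data_source_get_query
from src.db.models.impl.url.core.sqlalchemy import URL

PAGE_SIZE = 100  # page size used by the list endpoint


def by_id_query(url_id: int) -> Select:
    return build_data_source_get_query().where(URL.id == url_id)


def paged_query(page: int) -> Select:
    return build_data_source_get_query().limit(PAGE_SIZE).offset((page - 1) * PAGE_SIZE)
```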
.../data_sources/by_id/agencies/__init__.py | 0 .../{ => by_id}/agencies/test_forbid.py | 0 .../api/data_sources/by_id/test_get.py | 12 ++ 11 files changed, 168 insertions(+), 115 deletions(-) rename {tests/automated/integration/readonly/api/data_sources/agencies => src/api/endpoints/data_source/_shared}/__init__.py (100%) create mode 100644 src/api/endpoints/data_source/_shared/build.py create mode 100644 src/api/endpoints/data_source/_shared/process.py create mode 100644 src/api/endpoints/data_source/by_id/get/__init__.py create mode 100644 src/api/endpoints/data_source/by_id/get/query.py create mode 100644 tests/automated/integration/readonly/api/data_sources/by_id/__init__.py create mode 100644 tests/automated/integration/readonly/api/data_sources/by_id/agencies/__init__.py rename tests/automated/integration/readonly/api/data_sources/{ => by_id}/agencies/test_forbid.py (100%) create mode 100644 tests/automated/integration/readonly/api/data_sources/by_id/test_get.py diff --git a/tests/automated/integration/readonly/api/data_sources/agencies/__init__.py b/src/api/endpoints/data_source/_shared/__init__.py similarity index 100% rename from tests/automated/integration/readonly/api/data_sources/agencies/__init__.py rename to src/api/endpoints/data_source/_shared/__init__.py diff --git a/src/api/endpoints/data_source/_shared/build.py b/src/api/endpoints/data_source/_shared/build.py new file mode 100644 index 00000000..35b65343 --- /dev/null +++ b/src/api/endpoints/data_source/_shared/build.py @@ -0,0 +1,66 @@ +from sqlalchemy import Select, select, and_ +from sqlalchemy.orm import selectinload + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType + + +def build_data_source_get_query() -> Select: + return ( + select( + URL, + URL.id, + URL.url, + + # Required Attributes + URL.name, + URLRecordType.record_type, + + # Optional Attributes + URL.description, + LinkBatchURL.batch_id, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_aggregation, + URLOptionalDataSourceMetadata.agency_described_not_in_database, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.submission_notes, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id + ) + .join( + FlagURLValidated, + and_( + FlagURLValidated.url_id == URL.id, + FlagURLValidated.type == URLType.DATA_SOURCE + ) + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.url_id == URL.id + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URLOptionalDataSourceMetadata.url_id == URL.id + ) + .options( + selectinload(URL.confirmed_agencies), + 
) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/_shared/process.py b/src/api/endpoints/data_source/_shared/process.py new file mode 100644 index 00000000..252ed7c0 --- /dev/null +++ b/src/api/endpoints/data_source/_shared/process.py @@ -0,0 +1,44 @@ +from sqlalchemy import RowMapping + +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType + + +def process_data_source_get_mapping( + mapping: RowMapping +) -> DataSourceGetResponse: + url: URL = mapping[URL] + + url_agency_ids: list[int] = [] + for agency in url.confirmed_agencies: + url_agency_ids.append(agency.id) + + return DataSourceGetResponse( + url_id=mapping[URL.id], + url=mapping[URL.url], + name=mapping[URL.name], + record_type=mapping[URLRecordType.record_type], + agency_ids=url_agency_ids, + description=mapping[URL.description], + batch_id=mapping[LinkBatchURL.batch_id], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_aggregation=mapping[URLOptionalDataSourceMetadata.agency_aggregation], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + agency_described_not_in_database=mapping[URLOptionalDataSourceMetadata.agency_described_not_in_database], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + submission_notes=mapping[URLOptionalDataSourceMetadata.submission_notes], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [] + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/get/__init__.py b/src/api/endpoints/data_source/by_id/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/get/query.py b/src/api/endpoints/data_source/by_id/get/query.py new file mode 100644 index 00000000..8f839543 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/get/query.py @@ -0,0 +1,24 @@ +from sqlalchemy import Select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.data_source._shared.build import build_data_source_get_query +from src.api.endpoints.data_source._shared.process import process_data_source_get_mapping +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourceByIDQueryBuilder(QueryBuilderBase): + def __init__( + self, + url_id: int, + ): + super().__init__() + self.url_id = url_id + + async def 
run(self, session: AsyncSession) -> DataSourceGetResponse: + query: Select = build_data_source_get_query() + query = query.where(URL.id == self.url_id) + + mapping: RowMapping = await self.sh.mapping(session, query=query) + return process_data_source_get_mapping(mapping=mapping) \ No newline at end of file diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py index 8766409d..cc167d62 100644 --- a/src/api/endpoints/data_source/get/query.py +++ b/src/api/endpoints/data_source/get/query.py @@ -1,10 +1,12 @@ from datetime import date from typing import Any, Sequence -from sqlalchemy import select, RowMapping, and_ +from sqlalchemy import select, RowMapping, and_, Select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload +from src.api.endpoints.data_source._shared.build import build_data_source_get_query +from src.api.endpoints.data_source._shared.process import process_data_source_get_mapping from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType @@ -18,7 +20,7 @@ from src.db.queries.base.builder import QueryBuilderBase -class GetDataSourceQueryBuilder(QueryBuilderBase): +class GetDataSourcesQueryBuilder(QueryBuilderBase): def __init__( self, @@ -28,59 +30,9 @@ def __init__( self.page = page async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: + query: Select = build_data_source_get_query() query = ( - select( - URL, - URL.id, - URL.url, - - # Required Attributes - URL.name, - URLRecordType.record_type, - - # Optional Attributes - URL.description, - LinkBatchURL.batch_id, - URLOptionalDataSourceMetadata.record_formats, - URLOptionalDataSourceMetadata.data_portal_type, - URLOptionalDataSourceMetadata.supplying_entity, - URLOptionalDataSourceMetadata.coverage_start, - URLOptionalDataSourceMetadata.coverage_end, - URLOptionalDataSourceMetadata.agency_supplied, - URLOptionalDataSourceMetadata.agency_aggregation, - URLOptionalDataSourceMetadata.agency_described_not_in_database, - URLOptionalDataSourceMetadata.agency_originated, - URLOptionalDataSourceMetadata.update_method, - URLOptionalDataSourceMetadata.readme_url, - URLOptionalDataSourceMetadata.originating_entity, - URLOptionalDataSourceMetadata.retention_schedule, - URLOptionalDataSourceMetadata.scraper_url, - URLOptionalDataSourceMetadata.submission_notes, - URLOptionalDataSourceMetadata.access_notes, - URLOptionalDataSourceMetadata.access_types - ) - .join( - URLRecordType, - URLRecordType.url_id == URL.id - ) - .join( - FlagURLValidated, - and_( - FlagURLValidated.url_id == URL.id, - FlagURLValidated.type == URLType.DATA_SOURCE - ) - ) - .outerjoin( - LinkBatchURL, - LinkBatchURL.url_id == URL.id - ) - .outerjoin( - URLOptionalDataSourceMetadata, - URLOptionalDataSourceMetadata.url_id == URL.id - ) - .options( - selectinload(URL.confirmed_agencies), - ) + query .limit(100) .offset((self.page - 1) * 100) ) @@ -89,64 +41,8 @@ async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: responses: list[DataSourceGetResponse] = [] for mapping in mappings: - url: URL = mapping[URL] - url_id: int = mapping[URL.id] - url_url: str = mapping[URL.url] - url_name: str = mapping[URL.name] - url_record_type: RecordType = mapping[URLRecordType.record_type] - - url_agency_ids: list[int] = [] - for agency in url.confirmed_agencies: - url_agency_ids.append(agency.id) - - url_description: str 
| None = mapping[URL.description] - link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] - url_record_formats: list[str] = mapping[URLOptionalDataSourceMetadata.record_formats] or [] - url_data_portal_type: str | None = mapping[URLOptionalDataSourceMetadata.data_portal_type] - url_supplying_entity: str | None = mapping[URLOptionalDataSourceMetadata.supplying_entity] - url_coverage_start: date | None = mapping[URLOptionalDataSourceMetadata.coverage_start] - url_coverage_end: date | None = mapping[URLOptionalDataSourceMetadata.coverage_end] - url_agency_supplied: bool | None = mapping[URLOptionalDataSourceMetadata.agency_supplied] - url_agency_aggregation: AgencyAggregationEnum | None = mapping[URLOptionalDataSourceMetadata.agency_aggregation] - url_agency_originated: bool | None = mapping[URLOptionalDataSourceMetadata.agency_originated] - url_agency_described_not_in_database: bool | None = mapping[URLOptionalDataSourceMetadata.agency_described_not_in_database] - url_update_method: UpdateMethodEnum | None = mapping[URLOptionalDataSourceMetadata.update_method] - url_readme_url: str | None = mapping[URLOptionalDataSourceMetadata.readme_url] - url_originating_entity: str | None = mapping[URLOptionalDataSourceMetadata.originating_entity] - url_retention_schedule: RetentionScheduleEnum | None = mapping[URLOptionalDataSourceMetadata.retention_schedule] - url_scraper_url: str | None = mapping[URLOptionalDataSourceMetadata.scraper_url] - url_submission_notes: str | None = mapping[URLOptionalDataSourceMetadata.submission_notes] - url_access_notes: str | None = mapping[URLOptionalDataSourceMetadata.access_notes] - url_access_types: list[AccessTypeEnum] = mapping[URLOptionalDataSourceMetadata.access_types] or [] - - responses.append( - DataSourceGetResponse( - url_id=url_id, - url=url_url, - name=url_name, - record_type=url_record_type, - agency_ids=url_agency_ids, - description=url_description, - batch_id=link_batch_url_batch_id, - record_formats=url_record_formats, - data_portal_type=url_data_portal_type, - supplying_entity=url_supplying_entity, - coverage_start=url_coverage_start, - coverage_end=url_coverage_end, - agency_supplied=url_agency_supplied, - agency_aggregation=url_agency_aggregation, - agency_originated=url_agency_originated, - agency_described_not_in_database=url_agency_described_not_in_database, - update_method=url_update_method, - readme_url=url_readme_url, - originating_entity=url_originating_entity, - retention_schedule=url_retention_schedule, - scraper_url=url_scraper_url, - submission_notes=url_submission_notes, - access_notes=url_access_notes, - access_types=url_access_types - ) - ) + response: DataSourceGetResponse = process_data_source_get_mapping(mapping) + responses.append(response) return DataSourceGetOuterResponse( results=responses, diff --git a/src/api/endpoints/data_source/routes.py b/src/api/endpoints/data_source/routes.py index 2464ceea..04d81f10 100644 --- a/src/api/endpoints/data_source/routes.py +++ b/src/api/endpoints/data_source/routes.py @@ -6,8 +6,9 @@ from src.api.endpoints.data_source.by_id.agency.get.wrapper import get_data_source_agencies_wrapper from src.api.endpoints.data_source.by_id.agency.post.wrapper import add_data_source_agency_link from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url -from src.api.endpoints.data_source.get.query import GetDataSourceQueryBuilder -from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse +from src.api.endpoints.data_source.by_id.get.query 
import GetDataSourceByIDQueryBuilder +from src.api.endpoints.data_source.get.query import GetDataSourcesQueryBuilder +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse from src.api.endpoints.data_source.by_id.put.query import UpdateDataSourceQueryBuilder from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest from src.api.shared.models.message_response import MessageResponse @@ -28,7 +29,16 @@ async def get_data_sources( ), ) -> DataSourceGetOuterResponse: return await async_core.adb_client.run_query_builder( - GetDataSourceQueryBuilder(page=page) + GetDataSourcesQueryBuilder(page=page) + ) + +@data_sources_router.get("/{url_id}") +async def get_data_source_by_id( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> DataSourceGetResponse: + return await async_core.adb_client.run_query_builder( + GetDataSourceByIDQueryBuilder(url_id) ) @data_sources_router.put("/{url_id}") @@ -81,3 +91,4 @@ async def remove_agency_from_data_source( adb_client=async_core.adb_client ) return MessageResponse(message="Agency removed from data source.") + diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/__init__.py b/tests/automated/integration/readonly/api/data_sources/by_id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/agencies/__init__.py b/tests/automated/integration/readonly/api/data_sources/by_id/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py b/tests/automated/integration/readonly/api/data_sources/by_id/agencies/test_forbid.py similarity index 100% rename from tests/automated/integration/readonly/api/data_sources/agencies/test_forbid.py rename to tests/automated/integration/readonly/api/data_sources/by_id/agencies/test_forbid.py diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py new file mode 100644 index 00000000..a874d054 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py @@ -0,0 +1,12 @@ +import pytest + +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + +@pytest.mark.asyncio +async def test_get_by_id(readonly_helper: ReadOnlyTestHelper): + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/data-sources/{readonly_helper.maximal_data_source}", + ) + # Test response is in expected form. 
+ DataSourceGetResponse(**raw_json) \ No newline at end of file From 9da9a8f8afa6106653489d034e3f6cd34c8285a1 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 07:19:06 -0500 Subject: [PATCH 40/84] Add more strict requirements to record_formats, access_types, url_status --- .../impl/sync_to_ds/impl/data_sources/add/queries/get.py | 4 ++-- src/external/pdap/impl/sync/data_sources/_shared/content.py | 6 +++--- .../tasks/scheduled/impl/sync_to_ds/data_source/test_add.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index 103923d2..ae0a01ec 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -96,7 +96,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: agency_ids=mapping["agency_ids"], # Optional description=mapping[URL.description], - record_formats=mapping[URLOptionalDataSourceMetadata.record_formats], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], @@ -109,7 +109,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], - access_types=mapping[URLOptionalDataSourceMetadata.access_types], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], url_status=DataSourcesURLStatus.OK ) ) diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py index d9403c63..914b6d1e 100644 --- a/src/external/pdap/impl/sync/data_sources/_shared/content.py +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -19,7 +19,7 @@ class DataSourceSyncContentModel(BaseModel): description: str | None = None # Optional data source metadata - record_formats: list[str] | None = None + record_formats: list[str] = [] data_portal_type: str | None = None supplying_entity: str | None = None coverage_start: date | None = None @@ -35,8 +35,8 @@ class DataSourceSyncContentModel(BaseModel): retention_schedule: RetentionScheduleEnum | None = None scraper_url: str | None = None access_notes: str | None = None - access_types: list[AccessTypeEnum] | None = None + access_types: list[AccessTypeEnum] = [] data_portal_type_other: str | None = None - url_status: DataSourcesURLStatus | None = None + url_status: DataSourcesURLStatus agency_ids: list[int] = [] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py index b90bb761..fa31dc40 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -62,7 +62,7 @@ async def test_add( assert content.name.startswith("Example ") assert content.record_type == RecordType.CRIME_STATISTICS assert content.description is None - assert 
content.record_formats is None + assert content.record_formats == [] assert content.data_portal_type is None assert content.supplying_entity is None assert content.coverage_start is None @@ -77,7 +77,7 @@ async def test_add( assert content.retention_schedule is None assert content.scraper_url is None assert content.access_notes is None - assert content.access_types is None + assert content.access_types == [] assert content.data_portal_type_other is None assert content.url_status == DataSourcesURLStatus.OK From 4f41bdd9807d1710a79e2f6752fc6d060bf375f4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 11:28:32 -0500 Subject: [PATCH 41/84] Address bug in get user contributions --- .../annotate/all/get/queries/convert.py | 2 +- .../annotate/all/get/queries/core.py | 2 +- .../endpoints/annotate/all/post/requester.py | 2 +- .../contributions/shared/contributions.py | 2 +- .../user/queries/agreement/url_type.py | 2 +- .../user/queries/annotated_and_validated.py | 2 +- .../contributions/user/queries/core.py | 9 +- .../metrics/urls/breakdown/query/core.py | 2 +- .../tasks/url/operators/auto_relevant/core.py | 2 +- .../operators/auto_relevant/queries/cte.py | 2 +- .../operators/auto_relevant/queries/get.py | 2 +- .../queries/ctes/counts/impl/url_type.py | 3 +- src/db/client/async_.py | 6 +- src/db/client/types.py | 2 +- src/db/constants.py | 2 +- src/db/dto_converter.py | 4 +- .../{relevant => url_type}/__init__.py | 0 .../{relevant => url_type}/auto/__init__.py | 0 .../auto/pydantic/__init__.py | 0 .../auto/pydantic/input.py | 0 .../{relevant => url_type}/auto/sqlalchemy.py | 0 .../suggestion/{relevant => url_type}/user.py | 0 .../common/annotation_exists_/constants.py | 4 +- .../core/metrics/urls/aggregated/pending.py | 2 +- src/db/types.py | 2 +- .../api/annotate/all/test_happy_path.py | 2 +- .../api/url/by_id/delete/test_any_url.py | 4 +- .../readonly/api/contributions/__init__.py | 0 .../api/contributions/test_leaderboard.py | 14 ++ .../readonly/api/contributions/test_user.py | 17 ++ .../api/data_sources/by_id/test_get.py | 2 +- .../readonly/api/data_sources/test_get.py | 2 +- .../api/meta_urls/agencies/test_forbid.py | 2 +- .../integration/readonly/conftest.py | 2 +- .../automated/integration/readonly/helper.py | 14 +- tests/automated/integration/readonly/setup.py | 211 ------------------ .../integration/readonly/setup/__init__.py | 0 .../integration/readonly/setup/agency.py | 23 ++ .../integration/readonly/setup/annotations.py | 72 ++++++ .../integration/readonly/setup/core.py | 99 ++++++++ .../integration/readonly/setup/data_source.py | 103 +++++++++ .../integration/readonly/setup/meta_url.py | 33 +++ .../tasks/url/impl/auto_relevant/test_task.py | 2 +- .../commands/impl/suggestion/auto/relevant.py | 2 +- 44 files changed, 410 insertions(+), 248 deletions(-) rename src/db/models/impl/url/suggestion/{relevant => url_type}/__init__.py (100%) rename src/db/models/impl/url/suggestion/{relevant => url_type}/auto/__init__.py (100%) rename src/db/models/impl/url/suggestion/{relevant => url_type}/auto/pydantic/__init__.py (100%) rename src/db/models/impl/url/suggestion/{relevant => url_type}/auto/pydantic/input.py (100%) rename src/db/models/impl/url/suggestion/{relevant => url_type}/auto/sqlalchemy.py (100%) rename src/db/models/impl/url/suggestion/{relevant => url_type}/user.py (100%) create mode 100644 tests/automated/integration/readonly/api/contributions/__init__.py create mode 100644 tests/automated/integration/readonly/api/contributions/test_leaderboard.py create mode 100644 
tests/automated/integration/readonly/api/contributions/test_user.py delete mode 100644 tests/automated/integration/readonly/setup.py create mode 100644 tests/automated/integration/readonly/setup/__init__.py create mode 100644 tests/automated/integration/readonly/setup/agency.py create mode 100644 tests/automated/integration/readonly/setup/annotations.py create mode 100644 tests/automated/integration/readonly/setup/core.py create mode 100644 tests/automated/integration/readonly/setup/data_source.py create mode 100644 tests/automated/integration/readonly/setup/meta_url.py diff --git a/src/api/endpoints/annotate/all/get/queries/convert.py b/src/api/endpoints/annotate/all/get/queries/convert.py index 535a7d15..386389a5 100644 --- a/src/api/endpoints/annotate/all/get/queries/convert.py +++ b/src/api/endpoints/annotate/all/get/queries/convert.py @@ -5,7 +5,7 @@ from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion def convert_user_url_type_suggestion_to_url_type_annotation_suggestion( diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index 9b905870..5b239db0 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -11,7 +11,7 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL from src.db.models.views.url_anno_count import URLAnnotationCount from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView diff --git a/src/api/endpoints/annotate/all/post/requester.py b/src/api/endpoints/annotate/all/post/requester.py index 2d9cfeca..8834ff76 100644 --- a/src/api/endpoints/annotate/all/post/requester.py +++ b/src/api/endpoints/annotate/all/post/requester.py @@ -11,7 +11,7 @@ from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.templates.requester import RequesterBase diff --git a/src/api/endpoints/contributions/shared/contributions.py b/src/api/endpoints/contributions/shared/contributions.py index 477f0365..ae72fc00 100644 --- a/src/api/endpoints/contributions/shared/contributions.py +++ b/src/api/endpoints/contributions/shared/contributions.py @@ -1,6 +1,6 @@ from sqlalchemy import select, func, CTE, Column -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class ContributionsCTEContainer: diff --git 
a/src/api/endpoints/contributions/user/queries/agreement/url_type.py b/src/api/endpoints/contributions/user/queries/agreement/url_type.py index cf028bf1..12feb834 100644 --- a/src/api/endpoints/contributions/user/queries/agreement/url_type.py +++ b/src/api/endpoints/contributions/user/queries/agreement/url_type.py @@ -3,7 +3,7 @@ from src.api.endpoints.contributions.user.queries.annotated_and_validated import AnnotatedAndValidatedCTEContainer from src.api.endpoints.contributions.user.queries.templates.agreement import AgreementCTEContainer from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion def get_url_type_agreement_cte_container( diff --git a/src/api/endpoints/contributions/user/queries/annotated_and_validated.py b/src/api/endpoints/contributions/user/queries/annotated_and_validated.py index a9740328..9c7c48f6 100644 --- a/src/api/endpoints/contributions/user/queries/annotated_and_validated.py +++ b/src/api/endpoints/contributions/user/queries/annotated_and_validated.py @@ -1,7 +1,7 @@ from sqlalchemy import select, Column, CTE from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class AnnotatedAndValidatedCTEContainer: diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index 1709776c..025815ed 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -45,6 +45,9 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: url_type_agree.cte, contributions_cte.user_id == url_type_agree.user_id ) + .where( + contributions_cte.user_id == self.user_id + ) ) mapping: RowMapping = await sh.mapping(session, query=query) @@ -52,8 +55,8 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: return ContributionsUserResponse( count_validated=mapping.count, agreement=ContributionsUserAgreement( - record_type=mapping.record_type, - agency=mapping.agency, - url_type=mapping.url_type + record_type=mapping.record_type or 0, + agency=mapping.agency or 0, + url_type=mapping.url_type or 0 ) ) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py index 2606a079..bccc7d68 100644 --- a/src/api/endpoints/metrics/urls/breakdown/query/core.py +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -10,7 +10,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/url/operators/auto_relevant/core.py b/src/core/tasks/url/operators/auto_relevant/core.py index 86cc179e..3acff217 100644 --- a/src/core/tasks/url/operators/auto_relevant/core.py +++ b/src/core/tasks/url/operators/auto_relevant/core.py @@ -5,7 +5,7 @@ from 
src.core.tasks.url.operators.auto_relevant.sort import separate_success_and_error_subsets from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.enums import TaskType from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall diff --git a/src/core/tasks/url/operators/auto_relevant/queries/cte.py b/src/core/tasks/url/operators/auto_relevant/queries/cte.py index 8ad33867..c8b816fd 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/cte.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/cte.py @@ -6,7 +6,7 @@ from src.db.helpers.query import not_exists_url, no_url_task_error from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion class AutoRelevantPrerequisitesCTEContainer: diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get.py b/src/core/tasks/url/operators/auto_relevant/queries/get.py index 6f6c59b0..b566bb42 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get.py @@ -9,7 +9,7 @@ from src.core.tasks.url.operators.auto_relevant.queries.cte import AutoRelevantPrerequisitesCTEContainer from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py index 0e3de946..f0d340e7 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py @@ -1,8 +1,7 @@ from sqlalchemy import select, func from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer -from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL URL_TYPES_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 50802347..10ee5b6c 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -81,9 +81,9 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base diff --git a/src/db/client/types.py b/src/db/client/types.py index 18b32b88..e4f70301 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,5 +1,5 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion UserSuggestionModel = UserURLTypeSuggestion or UserRecordTypeSuggestion or UserURLAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 67ff66a5..c8821e7e 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,6 +1,6 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index dab6b496..4c91a353 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -16,8 +16,8 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class DTOConverter: diff --git a/src/db/models/impl/url/suggestion/relevant/__init__.py b/src/db/models/impl/url/suggestion/url_type/__init__.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/__init__.py rename to src/db/models/impl/url/suggestion/url_type/__init__.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/__init__.py b/src/db/models/impl/url/suggestion/url_type/auto/__init__.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/__init__.py rename to src/db/models/impl/url/suggestion/url_type/auto/__init__.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py b/src/db/models/impl/url/suggestion/url_type/auto/pydantic/__init__.py similarity index 100% rename 
from src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py rename to src/db/models/impl/url/suggestion/url_type/auto/pydantic/__init__.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py b/src/db/models/impl/url/suggestion/url_type/auto/pydantic/input.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py rename to src/db/models/impl/url/suggestion/url_type/auto/pydantic/input.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py rename to src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py diff --git a/src/db/models/impl/url/suggestion/relevant/user.py b/src/db/models/impl/url/suggestion/url_type/user.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/user.py rename to src/db/models/impl/url/suggestion/url_type/user.py diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py index b5adfad9..190291ef 100644 --- a/src/db/queries/implementations/core/common/annotation_exists_/constants.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -2,8 +2,8 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion ALL_ANNOTATION_MODELS = [ AutoRecordTypeSuggestion, diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 395fe3f9..d609e2b3 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -8,7 +8,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder diff --git a/src/db/types.py b/src/db/types.py index 073fec7c..c224a36c 100644 --- a/src/db/types.py +++ b/src/db/types.py @@ -2,7 +2,7 @@ from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from 
src.db.queries.base.labels import LabelsBase UserSuggestionType = UserURLAgencySuggestion | UserURLTypeSuggestion | UserRecordTypeSuggestion diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index e9fae81e..007e87f7 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -14,7 +14,7 @@ from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py index 579da570..bd17141b 100644 --- a/tests/automated/integration/api/url/by_id/delete/test_any_url.py +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -40,8 +40,8 @@ from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from tests.helpers.api_test_helper import APITestHelper diff --git a/tests/automated/integration/readonly/api/contributions/__init__.py b/tests/automated/integration/readonly/api/contributions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/contributions/test_leaderboard.py b/tests/automated/integration/readonly/api/contributions/test_leaderboard.py new file mode 100644 index 00000000..140cc777 --- /dev/null +++ b/tests/automated/integration/readonly/api/contributions/test_leaderboard.py @@ -0,0 +1,14 @@ +import pytest + +from src.api.endpoints.contributions.leaderboard.query import GetContributionsLeaderboardQueryBuilder +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_leaderboard( + readonly_helper: ReadOnlyTestHelper +): + await readonly_helper.adb_client.run_query_builder( + GetContributionsLeaderboardQueryBuilder() + ) + diff --git a/tests/automated/integration/readonly/api/contributions/test_user.py b/tests/automated/integration/readonly/api/contributions/test_user.py new file mode 100644 index 00000000..170797df --- /dev/null +++ b/tests/automated/integration/readonly/api/contributions/test_user.py @@ -0,0 +1,17 @@ +import pytest + +from src.api.endpoints.contributions.user.queries.core import 
GetUserContributionsQueryBuilder +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_user( + readonly_helper: ReadOnlyTestHelper +): + for user_id in [ + readonly_helper.user_1_id, + readonly_helper.user_2_id, + ]: + await readonly_helper.adb_client.run_query_builder( + GetUserContributionsQueryBuilder(user_id) + ) diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py index a874d054..16c30869 100644 --- a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py +++ b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py @@ -6,7 +6,7 @@ @pytest.mark.asyncio async def test_get_by_id(readonly_helper: ReadOnlyTestHelper): raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( - url=f"/data-sources/{readonly_helper.maximal_data_source}", + url=f"/data-sources/{readonly_helper.maximal_data_source_url_id}", ) # Test response is in expected form. DataSourceGetResponse(**raw_json) \ No newline at end of file diff --git a/tests/automated/integration/readonly/api/data_sources/test_get.py b/tests/automated/integration/readonly/api/data_sources/test_get.py index 27e6ad63..c23d2177 100644 --- a/tests/automated/integration/readonly/api/data_sources/test_get.py +++ b/tests/automated/integration/readonly/api/data_sources/test_get.py @@ -24,7 +24,7 @@ async def test_get(readonly_helper: ReadOnlyTestHelper): diff = DeepDiff( response.model_dump(mode='json'), DataSourceGetResponse( - url_id=readonly_helper.maximal_data_source, + url_id=readonly_helper.maximal_data_source_url_id, url="read-only-ds.com", name="Read only URL name", diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py index 32bb08fe..28d5e45e 100644 --- a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py +++ b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py @@ -8,7 +8,7 @@ @pytest.mark.asyncio async def test_forbid(readonly_helper: ReadOnlyTestHelper): check_forbidden_url_type( - route=f"/meta-urls/{readonly_helper.minimal_data_source}/agencies", + route=f"/meta-urls/{readonly_helper.minimal_data_source_url_id}/agencies", api_test_helper=readonly_helper.api_test_helper, method="GET" ) diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py index d98d9c19..3fdd0598 100644 --- a/tests/automated/integration/readonly/conftest.py +++ b/tests/automated/integration/readonly/conftest.py @@ -8,7 +8,7 @@ from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from tests.automated.integration.readonly.helper import ReadOnlyTestHelper -from tests.automated.integration.readonly.setup import setup_readonly_data +from tests.automated.integration.readonly.setup.core import setup_readonly_data from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo diff --git a/tests/automated/integration/readonly/helper.py b/tests/automated/integration/readonly/helper.py index 1331aa17..b0ffcd9e 100644 --- a/tests/automated/integration/readonly/helper.py +++ b/tests/automated/integration/readonly/helper.py @@ -8,14 +8,24 @@ class ReadOnlyTestHelper(BaseModel): class 
Config: arbitrary_types_allowed = True + # Clients adb_client: AsyncDatabaseClient api_test_helper: APITestHelper + # Agencies agency_1_id: int agency_1_location_id: int agency_2_id: int agency_2_location_id: int - minimal_data_source: int - maximal_data_source: int + # URLs + minimal_data_source_url_id: int + maximal_data_source_url_id: int url_meta_url_id: int + unvalidated_url_id: int + + # Users + user_1_id: int + user_2_id: int + + diff --git a/tests/automated/integration/readonly/setup.py b/tests/automated/integration/readonly/setup.py deleted file mode 100644 index 7f5e1637..00000000 --- a/tests/automated/integration/readonly/setup.py +++ /dev/null @@ -1,211 +0,0 @@ -from datetime import date - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.enums import AgencyType, JurisdictionType -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ - RetentionScheduleEnum, AccessTypeEnum -from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata -from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType -from tests.automated.integration.readonly.helper import ReadOnlyTestHelper -from tests.helpers.api_test_helper import APITestHelper -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo -from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo -from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo - - -async def setup_readonly_data( - api_test_helper: APITestHelper -) -> ReadOnlyTestHelper: - db_data_creator = api_test_helper.db_data_creator - adb_client = db_data_creator.adb_client - - # Pennsylvania - pennsylvania: USStateCreationInfo = await db_data_creator.create_us_state( - name="Pennsylvania", - iso="PA" - ) - allegheny_county: CountyCreationInfo = await db_data_creator.create_county( - state_id=pennsylvania.us_state_id, - name="Allegheny" - ) - pittsburgh: LocalityCreationInfo = await db_data_creator.create_locality( - state_id=pennsylvania.us_state_id, - county_id=allegheny_county.county_id, - name="Pittsburgh" - ) - - - # Add Agencies - agency_1_id: int = await add_agency(adb_client, pittsburgh.location_id) - agency_2_id: int = await add_agency(adb_client, allegheny_county.location_id) - - # Add Data Source With Linked Agency - maximal_data_source: int = await add_maximal_data_source( - agency_1_id=agency_1_id, - db_data_creator=db_data_creator - ) - minimal_data_source: int = await add_minimal_data_source( - agency_1_id=agency_1_id, - db_data_creator=db_data_creator - ) - - # Add Meta URL with Linked Agency - url_meta_url_id: int = await add_meta_url(agency_1_id, db_data_creator) - - return ReadOnlyTestHelper( - adb_client=adb_client, - api_test_helper=api_test_helper, - - agency_1_id=agency_1_id, - agency_1_location_id=pittsburgh.location_id, - - agency_2_id=agency_2_id, - agency_2_location_id=allegheny_county.location_id, - - maximal_data_source=maximal_data_source, - 
minimal_data_source=minimal_data_source, - url_meta_url_id=url_meta_url_id, - ) - - -async def add_meta_url( - agency_1_id: int, - db_data_creator: DBDataCreator -) -> int: - adb_client: AsyncDatabaseClient = db_data_creator.adb_client - url = URL( - scheme=None, - url="read-only-meta-url.com", - name="Read only URL Name", - trailing_slash=False, - description="Read only URL", - collector_metadata={ - "url": "https://read-only-meta-url.com/" - }, - status=URLStatus.OK, - source=URLSource.REDIRECT, - ) - url_id: int = await adb_client.add(url, return_id=True) - - await db_data_creator.create_validated_flags( - url_ids=[url_id], - validation_type=URLType.META_URL - ) - - return url_id - - -async def add_maximal_data_source( - agency_1_id: int, - db_data_creator: DBDataCreator -) -> int: - adb_client: AsyncDatabaseClient = db_data_creator.adb_client - url = URL( - scheme="https", - url="read-only-ds.com", - name="Read only URL name", - trailing_slash=True, - description="Read only URL", - collector_metadata={ - "url": "https://read-only.com/" - }, - status=URLStatus.OK, - source=URLSource.COLLECTOR, - ) - url_id: int = await adb_client.add(url, return_id=True) - await db_data_creator.create_validated_flags( - url_ids=[url_id], - validation_type=URLType.DATA_SOURCE - ) - record_type = URLRecordType( - url_id=url_id, - record_type=RecordType.CRIME_STATISTICS - ) - await adb_client.add(record_type) - - optional_ds_metadata = URLOptionalDataSourceMetadata( - url_id=url_id, - record_formats=["csv", "pdf"], - data_portal_type="CKAN", - supplying_entity="ReadOnly Agency", - coverage_start=date(year=2025, month=6, day=1), - coverage_end=date(year=2025, month=8, day=20), - agency_supplied=False, - agency_originated=True, - agency_aggregation=AgencyAggregationEnum.LOCALITY, - agency_described_not_in_database="ReadOnly Agency Not In DB", - update_method=UpdateMethodEnum.NO_UPDATES, - readme_url="https://read-only-readme.com", - originating_entity="ReadOnly Agency Originating", - retention_schedule=RetentionScheduleEnum.GT_10_YEARS, - scraper_url="https://read-only-scraper.com", - submission_notes="Read Only Submission Notes", - access_notes="Read Only Access Notes", - access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], - ) - - await adb_client.add(optional_ds_metadata) - - await db_data_creator.create_url_agency_links( - url_ids=[url_id], - agency_ids=[agency_1_id] - ) - return url_id - -async def add_minimal_data_source( - agency_1_id: int, - db_data_creator: DBDataCreator -) -> int: - adb_client: AsyncDatabaseClient = db_data_creator.adb_client - url = URL( - scheme="https", - url="minimal-ds.com", - name="Minimal name", - trailing_slash=False, - collector_metadata={}, - status=URLStatus.OK, - source=URLSource.ROOT_URL, - ) - url_id: int = await adb_client.add(url, return_id=True) - await db_data_creator.create_validated_flags( - url_ids=[url_id], - validation_type=URLType.DATA_SOURCE - ) - record_type = URLRecordType( - url_id=url_id, - record_type=RecordType.POLICIES_AND_CONTRACTS - ) - await adb_client.add(record_type) - - await db_data_creator.create_url_agency_links( - url_ids=[url_id], - agency_ids=[agency_1_id] - ) - return url_id - - -async def add_agency( - adb_client: AsyncDatabaseClient, - location_id: int -) -> int: - agency_1 = Agency( - name="Agency 1", - agency_type=AgencyType.LAW_ENFORCEMENT, - jurisdiction_type=JurisdictionType.STATE, - ) - agency_id: int = await adb_client.add(agency_1, return_id=True) - # Add Agency location - agency_1_location = LinkAgencyLocation( - 
agency_id=agency_id, - location_id=location_id, - ) - await adb_client.add(agency_1_location) - return agency_id \ No newline at end of file diff --git a/tests/automated/integration/readonly/setup/__init__.py b/tests/automated/integration/readonly/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/setup/agency.py b/tests/automated/integration/readonly/setup/agency.py new file mode 100644 index 00000000..366bc43d --- /dev/null +++ b/tests/automated/integration/readonly/setup/agency.py @@ -0,0 +1,23 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation + + +async def add_agency( + adb_client: AsyncDatabaseClient, + location_id: int +) -> int: + agency_1 = Agency( + name="Agency 1", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + agency_id: int = await adb_client.add(agency_1, return_id=True) + # Add Agency location + agency_1_location = LinkAgencyLocation( + agency_id=agency_id, + location_id=location_id, + ) + await adb_client.add(agency_1_location) + return agency_id diff --git a/tests/automated/integration/readonly/setup/annotations.py b/tests/automated/integration/readonly/setup/annotations.py new file mode 100644 index 00000000..b07bbd9f --- /dev/null +++ b/tests/automated/integration/readonly/setup/annotations.py @@ -0,0 +1,72 @@ +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion + + +async def add_full_data_sources_annotations( + url_id: int, + user_id: int, + agency_id: int, + location_id: int, + adb_client: AsyncDatabaseClient +) -> None: + name_suggestion = URLNameSuggestion( + url_id=url_id, + suggestion="Name suggestion", + source=NameSuggestionSource.USER + ) + name_suggestion_id: int = await adb_client.add( + name_suggestion, + return_id=True + ) + url_type_suggestion = UserURLTypeSuggestion( + url_id=url_id, + user_id=user_id, + type=URLType.DATA_SOURCE + ) + record_type_suggestion = UserRecordTypeSuggestion( + user_id=user_id, + url_id=url_id, + record_type=RecordType.RECORDS_REQUEST_INFO.value + ) + user_name_suggestion = LinkUserNameSuggestion( + user_id=user_id, + suggestion_id=name_suggestion_id, + ) + agency_suggestion = UserURLAgencySuggestion( + agency_id=agency_id, + url_id=url_id, + user_id=user_id, + ) + location_suggestion = UserLocationSuggestion( + location_id=location_id, + url_id=url_id, + user_id=user_id, + ) + for suggestion in [ + url_type_suggestion, + record_type_suggestion, + user_name_suggestion, + agency_suggestion, + location_suggestion + ]: + await adb_client.add(suggestion) + +async def add_minimal_not_relevant_annotation( + url_id: int, 
+ user_id: int, + adb_client: AsyncDatabaseClient +) -> None: + url_type_suggestion = UserURLTypeSuggestion( + url_id=url_id, + user_id=user_id, + type=URLType.NOT_RELEVANT + ) + await adb_client.add(url_type_suggestion) \ No newline at end of file diff --git a/tests/automated/integration/readonly/setup/core.py b/tests/automated/integration/readonly/setup/core.py new file mode 100644 index 00000000..d3584929 --- /dev/null +++ b/tests/automated/integration/readonly/setup/core.py @@ -0,0 +1,99 @@ +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.automated.integration.readonly.setup.agency import add_agency +from tests.automated.integration.readonly.setup.annotations import add_full_data_sources_annotations, \ + add_minimal_not_relevant_annotation +from tests.automated.integration.readonly.setup.data_source import add_maximal_data_source, add_minimal_data_source +from tests.automated.integration.readonly.setup.meta_url import add_meta_url +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo + + +async def setup_readonly_data( + api_test_helper: APITestHelper +) -> ReadOnlyTestHelper: + db_data_creator = api_test_helper.db_data_creator + adb_client = db_data_creator.adb_client + + # Pennsylvania + pennsylvania: USStateCreationInfo = await db_data_creator.create_us_state( + name="Pennsylvania", + iso="PA" + ) + allegheny_county: CountyCreationInfo = await db_data_creator.create_county( + state_id=pennsylvania.us_state_id, + name="Allegheny" + ) + pittsburgh: LocalityCreationInfo = await db_data_creator.create_locality( + state_id=pennsylvania.us_state_id, + county_id=allegheny_county.county_id, + name="Pittsburgh" + ) + + # Add Agencies + agency_1_id: int = await add_agency(adb_client, pittsburgh.location_id) + agency_2_id: int = await add_agency(adb_client, allegheny_county.location_id) + + + # Add users with varying contributions + user_id_1: int = 1 + user_id_2: int = 2 + # Add unvalidated URL + unvalidated_url_id: int = (await db_data_creator.create_urls( + record_type=None, + count=1 + ))[0].url_id + # Have User 1 give a full set of data sources annotations + await add_full_data_sources_annotations( + url_id=unvalidated_url_id, + user_id=user_id_1, + agency_id=agency_1_id, + location_id=pittsburgh.location_id, + adb_client=adb_client + ) + # Have User 2 give a single rejected annotation + await add_minimal_not_relevant_annotation( + url_id=unvalidated_url_id, + user_id=user_id_2, + adb_client=adb_client + ) + + + + + # Add Data Source With Linked Agency + maximal_data_source: int = await add_maximal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) + minimal_data_source: int = await add_minimal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) + + # Add Meta URL with Linked Agency + url_meta_url_id: int = await add_meta_url(agency_1_id, db_data_creator) + + return ReadOnlyTestHelper( + adb_client=adb_client, + api_test_helper=api_test_helper, + + # Agencies + agency_1_id=agency_1_id, + agency_1_location_id=pittsburgh.location_id, + agency_2_id=agency_2_id, + agency_2_location_id=allegheny_county.location_id, + + # URLs + maximal_data_source_url_id=maximal_data_source, + minimal_data_source_url_id=minimal_data_source, + 
url_meta_url_id=url_meta_url_id, + unvalidated_url_id=unvalidated_url_id, + + # Users + user_1_id=user_id_1, + user_2_id=user_id_2, + ) + + diff --git a/tests/automated/integration/readonly/setup/data_source.py b/tests/automated/integration/readonly/setup/data_source.py new file mode 100644 index 00000000..e22929ee --- /dev/null +++ b/tests/automated/integration/readonly/setup/data_source.py @@ -0,0 +1,103 @@ +from datetime import date + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.data_creator.core import DBDataCreator + + +async def add_maximal_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="read-only-ds.com", + name="Read only URL name", + trailing_slash=True, + description="Read only URL", + collector_metadata={ + "url": "https://read-only.com/" + }, + status=URLStatus.OK, + source=URLSource.COLLECTOR, + ) + url_id: int = await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.CRIME_STATISTICS + ) + await adb_client.add(record_type) + + optional_ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ) + + await adb_client.add(optional_ds_metadata) + + await db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id + + +async def add_minimal_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="minimal-ds.com", + name="Minimal name", + trailing_slash=False, + collector_metadata={}, + status=URLStatus.OK, + source=URLSource.ROOT_URL, + ) + url_id: int = await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.POLICIES_AND_CONTRACTS + ) + await adb_client.add(record_type) + + await 
db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id diff --git a/tests/automated/integration/readonly/setup/meta_url.py b/tests/automated/integration/readonly/setup/meta_url.py new file mode 100644 index 00000000..837274bb --- /dev/null +++ b/tests/automated/integration/readonly/setup/meta_url.py @@ -0,0 +1,33 @@ +from src.collectors.enums import URLStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.data_creator.core import DBDataCreator + + +async def add_meta_url( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme=None, + url="read-only-meta-url.com", + name="Read only URL Name", + trailing_slash=False, + description="Read only URL", + collector_metadata={ + "url": "https://read-only-meta-url.com/" + }, + status=URLStatus.OK, + source=URLSource.REDIRECT, + ) + url_id: int = await adb_client.add(url, return_id=True) + + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.META_URL + ) + + return url_id diff --git a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py index 5de999ec..3f4873f4 100644 --- a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_prereqs_met from tests.automated.integration.tasks.url.impl.auto_relevant.setup import setup_operator, setup_urls diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py index 2e31491d..d85b5a1b 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py @@ -1,4 +1,4 @@ -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase From 169037b65a2a4d57476ffc318ca12edd7ffa2965 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 17:24:15 -0500 Subject: [PATCH 42/84] Add task logs to app syncs --- ...25_11_16_1130-88ac26c3b025_add_task_log.py | 37 +++++++++++++++++++ src/core/tasks/base/operator.py | 11 ++++++ .../impl/sync_to_ds/impl/agencies/add/core.py | 2 + .../sync_to_ds/impl/agencies/delete/core.py | 4 ++ .../sync_to_ds/impl/agencies/update/core.py | 6 ++- .../sync_to_ds/impl/data_sources/add/core.py | 6 ++- .../impl/data_sources/delete/core.py | 4 ++ .../impl/data_sources/update/core.py | 6 
++- .../sync_to_ds/impl/meta_urls/add/core.py | 7 +++- .../sync_to_ds/impl/meta_urls/delete/core.py | 4 ++ .../sync_to_ds/impl/meta_urls/update/core.py | 6 ++- src/db/models/impl/task/log.py | 17 +++++++++ 12 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py create mode 100644 src/db/models/impl/task/log.py diff --git a/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py b/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py new file mode 100644 index 00000000..ed7f9e49 --- /dev/null +++ b/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py @@ -0,0 +1,37 @@ +"""Add task log + +Revision ID: 88ac26c3b025 +Revises: de0305465e2c +Create Date: 2025-11-16 11:30:25.742630 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import task_id_column, created_at_column + +# revision identifiers, used by Alembic. +revision: str = '88ac26c3b025' +down_revision: Union[str, None] = 'de0305465e2c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "tasks__log", + task_id_column(), + sa.Column( + "log", + sa.Text, + nullable=False, + ), + created_at_column(), + sa.PrimaryKeyConstraint("task_id"), + ) + + +def downgrade() -> None: + pass diff --git a/src/core/tasks/base/operator.py b/src/core/tasks/base/operator.py index 719abdf5..ff5ec4e5 100644 --- a/src/core/tasks/base/operator.py +++ b/src/core/tasks/base/operator.py @@ -8,6 +8,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.db.models.impl.task.enums import TaskStatus +from src.db.models.impl.task.log import TaskLog from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall from src.db.queries.base.builder import QueryBuilderBase @@ -94,6 +95,16 @@ async def add_task_errors( ] await self.adb_client.bulk_insert(inserts) + async def add_task_log( + self, + log: str + ) -> None: + task_log = TaskLog( + task_id=self.task_id, + log=log + ) + await self.adb_client.add(task_log) + # Convenience forwarder functions async def run_query_builder(self, query_builder: QueryBuilderBase) -> Any: return await self.adb_client.run_query_builder(query_builder) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py index e46deed5..d21f1259 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py @@ -25,6 +25,8 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: AddAgenciesOuterRequest = await self.get_request_input() + db_ids: list[int] = [r.request_id for r in request.agencies] + await self.add_task_log(f"Adding agencies with the following db_ids: {db_ids}") responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) await self.insert_ds_app_links(responses) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py index e84d3b2b..806ba230 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py @@ -26,10 +26,14 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: ds_app_ids: list[int] = await self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) await self.make_request(ds_app_ids) await self.delete_flags(ds_app_ids) await self.delete_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting agencies with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> list[int]: return await self.adb_client.run_query_builder( DSAppSyncAgenciesDeleteGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py index 24481e8d..814f9a1e 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py @@ -25,13 +25,17 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: UpdateAgenciesOuterRequest = await self.get_inputs() - await self.make_request(request) ds_app_ids: list[int] = [ agency.app_id for agency in request.agencies ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) await self.update_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating agencies with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> UpdateAgenciesOuterRequest: return await self.adb_client.run_query_builder( DSAppSyncAgenciesUpdateGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py index 760583fd..6acd74fd 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py @@ -7,7 +7,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase from src.db.enums import TaskType from src.external.pdap.impl.sync.data_sources.add.core import AddDataSourcesRequestBuilder -from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel @@ -27,9 +27,13 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: AddDataSourcesOuterRequest = await self.get_request_input() + await self.log_db_ids(request.data_sources) responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) await self.insert_ds_app_links(responses) + async def log_db_ids(self, data_sources: list[AddDataSourcesInnerRequest]): + db_ids: list[int] = [d.request_id for d in data_sources] + await self.add_task_log(f"Adding data sources with the following db_ids: {db_ids}") async def get_request_input(self) -> AddDataSourcesOuterRequest: return await self.run_query_builder( diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py index 14450a51..0c5bd53e 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py @@ -26,10 +26,14 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: ds_app_ids: list[int] = await self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) await self.make_request(ds_app_ids) await self.delete_flags(ds_app_ids) await self.delete_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting data sources with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> list[int]: return await self.run_query_builder( DSAppSyncDataSourcesDeleteGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py index fd925146..0a0c4d21 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py @@ -25,13 +25,17 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: UpdateDataSourcesOuterRequest = await self.get_inputs() - await self.make_request(request) ds_app_ids: list[int] = [ ds.app_id for ds in request.data_sources ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) await self.update_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating data sources with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> UpdateDataSourcesOuterRequest: return await self.adb_client.run_query_builder( DSAppSyncDataSourcesUpdateGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py index 6823c205..08ee031d 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py @@ -6,7 +6,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase from src.db.enums import TaskType from src.external.pdap.impl.sync.meta_urls.add.core import AddMetaURLsRequestBuilder -from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel @@ -25,9 +25,14 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: AddMetaURLsOuterRequest = await self.get_request_input() + await self.log_db_ids(request.meta_urls) responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) await self.insert_ds_app_links(responses) + async def log_db_ids(self, meta_urls: list[AddMetaURLsInnerRequest]): + db_ids: list[int] = [m.request_id for m in meta_urls] + await self.add_task_log(f"Adding meta urls with the following db_ids: {db_ids}") + async def get_request_input(self) -> AddMetaURLsOuterRequest: return await self.run_query_builder( DSAppSyncMetaURLsAddGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py index 32f5ef85..76fc9c4b 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py @@ -26,10 +26,14 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: ds_app_ids: list[int] = await self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) await self.make_request(ds_app_ids) await self.delete_flags(ds_app_ids) await self.delete_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting meta urls with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> list[int]: return await self.run_query_builder( DSAppSyncMetaURLsDeleteGetQueryBuilder() diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py index 3ef8dc28..ff0b06ec 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py @@ -25,13 +25,17 @@ async def meets_task_prerequisites(self) -> bool: async def inner_task_logic(self) -> None: request: UpdateMetaURLsOuterRequest = await self.get_inputs() - await self.make_request(request) ds_app_ids: list[int] = [ meta_url.app_id for meta_url in request.meta_urls ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) await self.update_links(ds_app_ids) + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating meta urls with the following ds_app_ids: {ds_app_ids}") + async def get_inputs(self) -> UpdateMetaURLsOuterRequest: return await self.adb_client.run_query_builder( DSAppSyncMetaURLsUpdateGetQueryBuilder() diff --git a/src/db/models/impl/task/log.py b/src/db/models/impl/task/log.py new file mode 100644 index 00000000..9efd86da --- /dev/null +++ b/src/db/models/impl/task/log.py @@ -0,0 +1,17 @@ +from sqlalchemy import Column, Text, PrimaryKeyConstraint + +from src.db.models.mixins import TaskDependentMixin, UpdatedAtMixin, CreatedAtMixin +from src.db.models.templates_.base import Base + + +class TaskLog( + Base, + TaskDependentMixin, + CreatedAtMixin, +): + __tablename__ = "tasks__log" + __table_args__ = ( + PrimaryKeyConstraint("task_id"), + ) + + log = Column(Text, nullable=False) From 37d5bf4a6eaf3c3ff89538a0d8accb3415c97571 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 17:53:39 -0500 Subject: [PATCH 43/84] Add handling for when no results found --- .../endpoints/contributions/user/queries/core.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index 025815ed..c7d4afef 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -1,4 +1,5 @@ from sqlalchemy import select, RowMapping +from sqlalchemy.exc import NoResultFound from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.contributions.shared.contributions import ContributionsCTEContainer @@ -50,7 +51,17 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: ) ) - mapping: RowMapping = await sh.mapping(session, query=query) + try: + mapping: RowMapping = await sh.mapping(session, query=query) + except NoResultFound: + return ContributionsUserResponse( + count_validated=0, + agreement=ContributionsUserAgreement( + record_type=0, + agency=0, + url_type=0 + ) + ) return ContributionsUserResponse( 
count_validated=mapping.count, From 772ef34ca150eac46215b6dd24b8fbb43a6a65af Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 16 Nov 2025 18:17:55 -0500 Subject: [PATCH 44/84] Bump PDAP Access Manager to latest version --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index abcee13e..eda8cd67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "lxml~=5.1.0", "marshmallow~=3.23.2", "openai~=1.60.1", - "pdap-access-manager==0.4.3", + "pdap-access-manager==0.4.4", "pillow>=11.3.0", "pip>=25.2", "playwright~=1.49.1", diff --git a/uv.lock b/uv.lock index 120be75b..50ae00e8 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "lxml", specifier = "~=5.1.0" }, { name = "marshmallow", specifier = "~=3.23.2" }, { name = "openai", specifier = "~=1.60.1" }, - { name = "pdap-access-manager", specifier = "==0.4.3" }, + { name = "pdap-access-manager", specifier = "==0.4.4" }, { name = "pillow", specifier = ">=11.3.0" }, { name = "pip", specifier = ">=25.2" }, { name = "playwright", specifier = "~=1.49.1" }, @@ -1591,7 +1591,7 @@ wheels = [ [[package]] name = "pdap-access-manager" -version = "0.4.3" +version = "0.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1599,9 +1599,9 @@ dependencies = [ { name = "pydantic" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/92/8f/ad75b32cc91673d89510c0adb451027c29b47d09069f02ad920b8a29ff0d/pdap_access_manager-0.4.3.tar.gz", hash = "sha256:24fe43550caa2a4fb0e4ac255d4265bcfd5985f08ff55cc7dd1bc24224d80f08", size = 5995, upload_time = "2025-11-14T22:12:07.622Z" } +sdist = { url = "https://files.pythonhosted.org/packages/75/60/743b8d5e2478e911c421f5cc4a8dec0f051542c12958776a4a96fef73ee5/pdap_access_manager-0.4.4.tar.gz", hash = "sha256:b824cf8014c6eb6ca29e797788f26d44d96ca0bc033309d5af2d878bb913d08f", size = 6004, upload_time = "2025-11-16T23:15:13.835Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/fa/52ad971907cc54dee673effdabb6f7ee87d5beb1966bb554aebbf7b9e47e/pdap_access_manager-0.4.3-py3-none-any.whl", hash = "sha256:9d58f4065b9fea38af1fe0a6afc77c9b8030b42f7cf15068edbe7e53fe11f949", size = 10807, upload_time = "2025-11-14T22:12:06.154Z" }, + { url = "https://files.pythonhosted.org/packages/bd/48/4fe13370886bcc2ac8b6e35d508c733a6eae05e2fe684e6aacc26d7ddea2/pdap_access_manager-0.4.4-py3-none-any.whl", hash = "sha256:b65203daec4a5bffe0be5d3577b6e515c644b3a9ca583c39f912f32e0bca11ef", size = 10808, upload_time = "2025-11-16T23:15:12.406Z" }, ] [[package]] From 5f8248c775514fdb854993c55112e52d0e74d1f8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 18 Nov 2025 12:59:33 -0500 Subject: [PATCH 45/84] Continue draft --- ENV.md | 1 + ...783268bd3daa_add_update_url_status_task.py | 34 ++++++++ src/collectors/enums.py | 1 + .../impl/data_sources/add/queries/get.py | 13 +++- .../impl/data_sources/update/queries/get.py | 15 +++- .../impl/meta_urls/add/queries/get.py | 17 +++- .../impl/meta_urls/update/queries/get.py | 16 +++- .../impl/sync_to_ds/shared/__init__.py | 0 .../impl/sync_to_ds/shared/convert.py | 14 ++++ .../impl/update_url_status/__init__.py | 0 .../impl/update_url_status/operator.py | 15 ++++ .../scheduled/impl/update_url_status/query.py | 49 ++++++++++++ src/core/tasks/scheduled/loader.py | 9 +++ src/db/enums.py | 2 + .../impl/sync/data_sources/_shared/content.py | 3 +- 
.../impl/sync/meta_urls/_shared/content.py | 4 + .../impl/update_url_status/__init__.py | 0 .../impl/update_url_status/test_core.py | 77 +++++++++++++++++++ 18 files changed, 262 insertions(+), 8 deletions(-) create mode 100644 alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py create mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py create mode 100644 src/core/tasks/scheduled/impl/update_url_status/__init__.py create mode 100644 src/core/tasks/scheduled/impl/update_url_status/operator.py create mode 100644 src/core/tasks/scheduled/impl/update_url_status/query.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py diff --git a/ENV.md b/ENV.md index d4496dbc..a4ae17a7 100644 --- a/ENV.md +++ b/ENV.md @@ -70,6 +70,7 @@ Note that some tasks/subtasks are themselves enabled by other tasks. | `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | | `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | | `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | | `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| | `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| | `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| diff --git a/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py b/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py new file mode 100644 index 00000000..986d6187 --- /dev/null +++ b/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py @@ -0,0 +1,34 @@ +"""Add update_url_status task + +Revision ID: 783268bd3daa +Revises: 88ac26c3b025 +Create Date: 2025-11-18 09:02:54.985705 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import add_enum_value + +# revision identifiers, used by Alembic. 
+revision: str = '783268bd3daa' +down_revision: Union[str, None] = '88ac26c3b025' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + add_enum_value( + enum_name="url_status", + enum_value="broken" + ) + add_enum_value( + enum_name="task_type", + enum_value="Update URL Status" + ) + + +def downgrade() -> None: + pass diff --git a/src/collectors/enums.py b/src/collectors/enums.py index f40e5f19..16711a0c 100644 --- a/src/collectors/enums.py +++ b/src/collectors/enums.py @@ -14,3 +14,4 @@ class URLStatus(Enum): OK = "ok" ERROR = "error" DUPLICATE = "duplicate" + BROKEN = "broken" diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py index ae0a01ec..04710ba6 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -6,8 +6,10 @@ from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase @@ -38,6 +40,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: # Required URL.full_url, URL.name, + URL.status, URLRecordType.record_type, agency_id_cte.c.agency_ids, # Optional @@ -56,6 +59,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: URLOptionalDataSourceMetadata.scraper_url, URLOptionalDataSourceMetadata.access_notes, URLOptionalDataSourceMetadata.access_types, + URLInternetArchivesProbeMetadata.archive_url, ) .select_from( cte.cte @@ -68,6 +72,10 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) .join( URLRecordType, URLRecordType.url_id == URL.id, @@ -110,7 +118,10 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], - url_status=DataSourcesURLStatus.OK + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, ) ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py index b6b94779..a710b6f7 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -6,8 +6,10 @@ from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase @@ -39,6 +41,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: # Required URL.full_url, URL.name, + URL.status, URLRecordType.record_type, agency_id_cte.c.agency_ids, # Optional @@ -57,7 +60,8 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URLOptionalDataSourceMetadata.scraper_url, URLOptionalDataSourceMetadata.access_notes, URLOptionalDataSourceMetadata.access_types, - URLOptionalDataSourceMetadata.data_portal_type_other + URLOptionalDataSourceMetadata.data_portal_type_other, + URLInternetArchivesProbeMetadata.archive_url, ) .select_from( cte.cte @@ -70,6 +74,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: URLOptionalDataSourceMetadata, URL.id == URLOptionalDataSourceMetadata.url_id, ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) .join( URLRecordType, URLRecordType.url_id == URL.id, @@ -113,7 +121,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], - url_status=DataSourcesURLStatus.OK + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, ) ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py index da695cf0..5a784295 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -6,8 +6,10 @@ from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel from src.external.pdap.impl.sync.meta_urls.add.request import 
AddMetaURLsOuterRequest, AddMetaURLsInnerRequest @@ -21,7 +23,8 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: agency_id_cte = ( select( LinkURLAgency.url_id, - func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + func.array_agg(LinkURLAgency.agency_id).label("agency_ids"), + ) .group_by( LinkURLAgency.url_id @@ -33,6 +36,8 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: select( cte.url_id, URL.full_url, + URL.status, + URLInternetArchivesProbeMetadata.archive_url, agency_id_cte.c.agency_ids ) .select_from( @@ -42,6 +47,10 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: URL, URL.id == cte.url_id, ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) .join( agency_id_cte, cte.url_id == agency_id_cte.c.url_id @@ -61,7 +70,11 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest: request_id=mapping[cte.url_id], content=MetaURLSyncContentModel( url=mapping["full_url"], - agency_ids=mapping["agency_ids"] + agency_ids=mapping["agency_ids"], + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), ) ) ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py index 5dfb81bd..8cdb8ed6 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -6,8 +6,10 @@ from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest @@ -33,7 +35,9 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: select( cte.ds_meta_url_id, URL.full_url, - agency_id_cte.c.agency_ids + URL.status, + agency_id_cte.c.agency_ids, + URLInternetArchivesProbeMetadata.archive_url, ) .select_from( cte.cte @@ -42,6 +46,10 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: URL, URL.id == cte.url_id, ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) .outerjoin( agency_id_cte, cte.url_id == agency_id_cte.c.url_id @@ -61,7 +69,11 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: app_id=mapping[cte.ds_meta_url_id], content=MetaURLSyncContentModel( url=mapping['full_url'], - agency_ids=mapping["agency_ids"] or [] + agency_ids=mapping["agency_ids"] or [], + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), ) ) ) 
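
The add and update query builders above all left-join `URLInternetArchivesProbeMetadata`, fall back to `None` when no probe row exists, and route `URL.status` through `convert_sm_url_status_to_ds_url_status` (added in `shared/convert.py` just below). A minimal usage sketch of that conversion, assuming only the OK and BROKEN statuses have Data Sources counterparts:

```python
from src.collectors.enums import URLStatus
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import (
    convert_sm_url_status_to_ds_url_status,
)
from src.external.pdap.enums import DataSourcesURLStatus

# OK and BROKEN translate directly to their Data Sources equivalents.
assert convert_sm_url_status_to_ds_url_status(URLStatus.OK) is DataSourcesURLStatus.OK
assert convert_sm_url_status_to_ds_url_status(URLStatus.BROKEN) is DataSourcesURLStatus.BROKEN

# Statuses with no Data Sources counterpart (e.g. ERROR, DUPLICATE) raise.
try:
    convert_sm_url_status_to_ds_url_status(URLStatus.ERROR)
except ValueError:
    pass  # expected: no corresponding DS status
```
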
diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py new file mode 100644 index 00000000..3f586b20 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py @@ -0,0 +1,14 @@ +from src.collectors.enums import URLStatus +from src.external.pdap.enums import DataSourcesURLStatus + + +def convert_sm_url_status_to_ds_url_status( + sm_url_status: URLStatus +) -> DataSourcesURLStatus: + match sm_url_status: + case URLStatus.OK: + return DataSourcesURLStatus.OK + case URLStatus.BROKEN: + return DataSourcesURLStatus.BROKEN + case _: + raise ValueError(f"URL status has no corresponding DS Status: {sm_url_status}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/update_url_status/__init__.py b/src/core/tasks/scheduled/impl/update_url_status/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/update_url_status/operator.py b/src/core/tasks/scheduled/impl/update_url_status/operator.py new file mode 100644 index 00000000..82285996 --- /dev/null +++ b/src/core/tasks/scheduled/impl/update_url_status/operator.py @@ -0,0 +1,15 @@ +from src.core.tasks.scheduled.impl.update_url_status.query import UpdateURLStatusQueryBuilder +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType + + +class UpdateURLStatusOperator(ScheduledTaskOperatorBase): + + @property + def task_type(self) -> TaskType: + return TaskType.UPDATE_URL_STATUS + + async def inner_task_logic(self) -> None: + await self.adb_client.run_query_builder( + UpdateURLStatusQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/update_url_status/query.py b/src/core/tasks/scheduled/impl/update_url_status/query.py new file mode 100644 index 00000000..963405b6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/update_url_status/query.py @@ -0,0 +1,49 @@ +from sqlalchemy import update, exists, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateURLStatusQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> None: + + # Update broken URLs to nonbroken if their status is not 404 + query_broken_to_ok = ( + update(URL) + .values( + status=URLStatus.OK + ) + .where( + exists( + select(1).where( + URLWebMetadata.url_id == URL.id, # <-- correlate + URLWebMetadata.status_code != 404, + URL.status == URLStatus.BROKEN + ) + ) + ) + ) + + # Update ok URLs to broken if their status is 404 + query_ok_to_broken = ( + update(URL) + .values( + status=URLStatus.BROKEN + ) + .where( + exists( + select(1).where( + URLWebMetadata.url_id == URL.id, # <-- correlate + URLWebMetadata.status_code == 404, + URL.status == URLStatus.OK + ) + ) + ) + ) + + await session.execute(query_broken_to_ok) + await session.execute(query_ok_to_broken) \ No newline at end of file diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 3ea4fc94..394a60ce 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ 
-23,6 +23,7 @@ from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.scheduled.impl.task_cleanup.operator import TaskCleanupOperator +from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.hub.client import HuggingFaceHubClient @@ -211,5 +212,13 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.HOURLY.value, enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") ), + ### URL + ScheduledTaskEntry( + operator=UpdateURLStatusOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("UPDATE_URL_STATUS_TASK_FLAG") + ), ] diff --git a/src/db/enums.py b/src/db/enums.py index 053fdace..034ec0b8 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -62,7 +62,9 @@ class TaskType(PyEnum): RUN_URL_TASKS = "Run URL Task Cycles" TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" + UPDATE_URL_STATUS = "Update URL Status" + # Sync Tasks SYNC_AGENCIES_ADD = "Sync Agencies Add" SYNC_AGENCIES_UPDATE = "Sync Agencies Update" SYNC_AGENCIES_DELETE = "Sync Agencies Delete" diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py index 914b6d1e..59d0bcc6 100644 --- a/src/external/pdap/impl/sync/data_sources/_shared/content.py +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -37,6 +37,7 @@ class DataSourceSyncContentModel(BaseModel): access_notes: str | None = None access_types: list[AccessTypeEnum] = [] data_portal_type_other: str | None = None - url_status: DataSourcesURLStatus + url_status: DataSourcesURLStatus = DataSourcesURLStatus.OK + internet_archives_url: str | None = None agency_ids: list[int] = [] diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/content.py b/src/external/pdap/impl/sync/meta_urls/_shared/content.py index 9d81b3d7..5db804cd 100644 --- a/src/external/pdap/impl/sync/meta_urls/_shared/content.py +++ b/src/external/pdap/impl/sync/meta_urls/_shared/content.py @@ -1,6 +1,10 @@ from pydantic import BaseModel +from src.external.pdap.enums import DataSourcesURLStatus + class MetaURLSyncContentModel(BaseModel): url: str + url_status: DataSourcesURLStatus = DataSourcesURLStatus.OK + internet_archives_url: str | None = None agency_ids: list[int] = [] diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py new file mode 100644 index 00000000..6b06fe31 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py @@ -0,0 +1,77 @@ +import pytest +from sqlalchemy import update + +from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator +from src.db.client.async_ import AsyncDatabaseClient +from 
src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_update_url_status_task( + test_url_data_source_id: int, + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator +): + + # Create Operator + operator = UpdateURLStatusOperator( + adb_client=adb_client_test, + ) + + # Add web metadata to URLs + ## Data Source URL: Add 404 + await db_data_creator.create_web_metadata( + url_ids=[test_url_data_source_id], + status_code=404 + ) + + ## Meta URL: Add 200 + await db_data_creator.create_web_metadata( + url_ids=[test_url_meta_url_id], + status_code=200 + ) + + # Run Task + await operator.run_task() + + # Check URLs + urls: list[URL] = await adb_client_test.get_all(URL) + id_status_set_tuple: set[tuple[int, URLStatus]] = { + (url.id, url.status) + for url in urls + } + ## Data Source URL: Status should now be broken + ## Meta URL: Status should be unchanged + assert id_status_set_tuple == { + (test_url_data_source_id, URLStatus.BROKEN), + (test_url_meta_url_id, URLStatus.OK) + } + + # Update Web Metadata for Data Source URL to be 404 + statement = update(URLWebMetadata).where( + URLWebMetadata.url_id == test_url_data_source_id, + ).values( + status_code=200 + ) + await adb_client_test.execute(statement) + + # Run Task + await operator.run_task() + + # Check URLs + urls: list[URL] = await adb_client_test.get_all(URL) + id_status_set_tuple: set[tuple[int, URLStatus]] = { + (url.id, url.status) + for url in urls + } + ## Data Source URL: Status should now be ok + ## Meta URL: Status should be unchanged + assert id_status_set_tuple == { + (test_url_data_source_id, URLStatus.OK), + (test_url_meta_url_id, URLStatus.OK) + } + From 309b105668f13c8b7640bf9785b3e2e9703417f4 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 18 Nov 2025 14:16:13 -0500 Subject: [PATCH 46/84] Remove unused import --- src/api/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api/main.py b/src/api/main.py index 8f080d25..141d4e38 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -6,7 +6,6 @@ from fastapi import FastAPI from pdap_access_manager.access_manager.async_ import AccessManagerAsync from pdap_access_manager.models.auth import AuthInfo -from sqlalchemy.ext.asyncio import create_async_engine from starlette.responses import RedirectResponse from src.api.endpoints.agencies.routes import agencies_router From 968b064e39ef8d2cbf9d5d56f65b6fc252aba4fb Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 18 Nov 2025 14:23:02 -0500 Subject: [PATCH 47/84] Update number of entries --- .../integration/tasks/scheduled/loader/test_happy_path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index 63c64264..ae41bc30 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 19 +NUMBER_OF_ENTRIES = 20 @pytest.mark.asyncio async def test_happy_path( From 2cb3b535bb61cb255d22ac4e974b23df6ecc9022 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 18 Nov 2025 19:03:51 -0500 Subject: [PATCH 48/84] Change namespace `source-manager` to `sync` --- 
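Only the path prefix handed to `build_url` changes; the entity and action segments stay the same. A quick sketch of the resulting request URLs, using the `http://example.com` base from the test helpers (the `sync_endpoint` helper and `BASE_URL` constant below are illustrative only, not part of the codebase):

```python
BASE_URL = "http://example.com"  # base URL used by the test helpers; real deployments differ


def sync_endpoint(entity: str, action: str) -> str:
    """Illustrative helper: build a Data Sources sync URL in the new namespace."""
    return f"{BASE_URL}/v3/sync/{entity}/{action}"


# Formerly v3/source-manager/agencies/add, v3/source-manager/meta-urls/delete, etc.
assert sync_endpoint("agencies", "add") == "http://example.com/v3/sync/agencies/add"
assert sync_endpoint("meta-urls", "delete") == "http://example.com/v3/sync/meta-urls/delete"
```
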
src/core/tasks/scheduled/impl/sync_to_ds/README.md | 2 +- src/external/pdap/impl/sync/agencies/add/core.py | 2 +- src/external/pdap/impl/sync/agencies/delete/core.py | 2 +- src/external/pdap/impl/sync/agencies/update/core.py | 2 +- src/external/pdap/impl/sync/data_sources/add/core.py | 2 +- src/external/pdap/impl/sync/data_sources/delete/core.py | 2 +- src/external/pdap/impl/sync/data_sources/update/core.py | 2 +- src/external/pdap/impl/sync/meta_urls/add/core.py | 2 +- src/external/pdap/impl/sync/meta_urls/delete/core.py | 2 +- src/external/pdap/impl/sync/meta_urls/update/core.py | 2 +- .../integration/tasks/scheduled/impl/sync_to_ds/helpers.py | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/README.md b/src/core/tasks/scheduled/impl/sync_to_ds/README.md index 488b52bb..3af42af8 100644 --- a/src/core/tasks/scheduled/impl/sync_to_ds/README.md +++ b/src/core/tasks/scheduled/impl/sync_to_ds/README.md @@ -17,7 +17,7 @@ Each task gathers requisite information from the SM database and sends a request Each DS endpoint follows the following format: ```text -/v3/source-manager/{entity}/{action} +/v3/sync/{entity}/{action} ``` Synchronizations are designed to occur on an hourly basis. diff --git a/src/external/pdap/impl/sync/agencies/add/core.py b/src/external/pdap/impl/sync/agencies/add/core.py index 276ff39d..a8f190ce 100644 --- a/src/external/pdap/impl/sync/agencies/add/core.py +++ b/src/external/pdap/impl/sync/agencies/add/core.py @@ -14,7 +14,7 @@ def __init__( self.request = request async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: - url: str = self.build_url("v3/source-manager/agencies/add") + url: str = self.build_url("v3/sync/agencies/add") raw_results = await self.post( url=url, model=self.request, diff --git a/src/external/pdap/impl/sync/agencies/delete/core.py b/src/external/pdap/impl/sync/agencies/delete/core.py index 41c0cfd0..f48e1b11 100644 --- a/src/external/pdap/impl/sync/agencies/delete/core.py +++ b/src/external/pdap/impl/sync/agencies/delete/core.py @@ -12,7 +12,7 @@ def __init__( self.ds_app_ids = ds_app_ids async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/agencies/delete") + url: str = self.build_url("v3/sync/agencies/delete") await self.post( url=url, model=DSAppSyncDeleteRequestModel( diff --git a/src/external/pdap/impl/sync/agencies/update/core.py b/src/external/pdap/impl/sync/agencies/update/core.py index 4c5673ac..6589c8b0 100644 --- a/src/external/pdap/impl/sync/agencies/update/core.py +++ b/src/external/pdap/impl/sync/agencies/update/core.py @@ -12,7 +12,7 @@ def __init__( self.request = request async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/agencies/update") + url: str = self.build_url("v3/sync/agencies/update") await self.post( url=url, model=self.request diff --git a/src/external/pdap/impl/sync/data_sources/add/core.py b/src/external/pdap/impl/sync/data_sources/add/core.py index 8eaa1b8b..c3576961 100644 --- a/src/external/pdap/impl/sync/data_sources/add/core.py +++ b/src/external/pdap/impl/sync/data_sources/add/core.py @@ -14,7 +14,7 @@ def __init__( self.request = request async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: - url: str = self.build_url("v3/source-manager/data-sources/add") + url: str = self.build_url("v3/sync/data-sources/add") raw_results = await self.post( url=url, model=self.request, diff --git a/src/external/pdap/impl/sync/data_sources/delete/core.py 
b/src/external/pdap/impl/sync/data_sources/delete/core.py index 7199c0ca..e58d1741 100644 --- a/src/external/pdap/impl/sync/data_sources/delete/core.py +++ b/src/external/pdap/impl/sync/data_sources/delete/core.py @@ -12,7 +12,7 @@ def __init__( self.ds_app_ids = ds_app_ids async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/data-sources/delete") + url: str = self.build_url("v3/sync/data-sources/delete") await self.post( url=url, model=DSAppSyncDeleteRequestModel( diff --git a/src/external/pdap/impl/sync/data_sources/update/core.py b/src/external/pdap/impl/sync/data_sources/update/core.py index 8bcaf57e..491f1676 100644 --- a/src/external/pdap/impl/sync/data_sources/update/core.py +++ b/src/external/pdap/impl/sync/data_sources/update/core.py @@ -12,7 +12,7 @@ def __init__( self.request = request async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/data-sources/update") + url: str = self.build_url("v3/sync/data-sources/update") await self.post( url=url, model=self.request diff --git a/src/external/pdap/impl/sync/meta_urls/add/core.py b/src/external/pdap/impl/sync/meta_urls/add/core.py index 98d6f016..8f1b3752 100644 --- a/src/external/pdap/impl/sync/meta_urls/add/core.py +++ b/src/external/pdap/impl/sync/meta_urls/add/core.py @@ -14,7 +14,7 @@ def __init__( self.request = request async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: - url: str = self.build_url("v3/source-manager/meta-urls/add") + url: str = self.build_url("v3/sync/meta-urls/add") raw_results = await self.post( url=url, model=self.request, diff --git a/src/external/pdap/impl/sync/meta_urls/delete/core.py b/src/external/pdap/impl/sync/meta_urls/delete/core.py index 08b6fd81..bde1dc8d 100644 --- a/src/external/pdap/impl/sync/meta_urls/delete/core.py +++ b/src/external/pdap/impl/sync/meta_urls/delete/core.py @@ -12,7 +12,7 @@ def __init__( self.ds_app_ids = ds_app_ids async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/meta-urls/delete") + url: str = self.build_url("v3/sync/meta-urls/delete") await self.post( url=url, model=DSAppSyncDeleteRequestModel( diff --git a/src/external/pdap/impl/sync/meta_urls/update/core.py b/src/external/pdap/impl/sync/meta_urls/update/core.py index 37e84da9..0c917535 100644 --- a/src/external/pdap/impl/sync/meta_urls/update/core.py +++ b/src/external/pdap/impl/sync/meta_urls/update/core.py @@ -12,7 +12,7 @@ def __init__( self.request = request async def inner_logic(self) -> None: - url: str = self.build_url("v3/source-manager/meta-urls/update") + url: str = self.build_url("v3/sync/meta-urls/update") await self.post( url=url, model=self.request diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py index c90a9654..f6d1bd68 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py @@ -24,7 +24,7 @@ def extract_and_validate_sync_request( assert mock_pdap_client.access_manager.make_request.call_count == 1 request_info: RequestInfo = get_last_request(mock_pdap_client) assert request_info.type_ == RequestType.POST - full_expected_url: str = f"http://example.com/v3/source-manager/{expected_path}" + full_expected_url: str = f"http://example.com/v3/sync/{expected_path}" assert request_info.url == full_expected_url, f"Expected URL: {full_expected_url}, Actual URL: {request_info.url}" return 
expected_model(**request_info.json_) From bdcd211cb376743cc1b16080e2916ca7b60a14d7 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 18 Nov 2025 19:17:06 -0500 Subject: [PATCH 49/84] Rename link tables --- ...18_1907-b8a68f4260a4_rename_link_tables.py | 35 +++++++++++++++++++ src/db/models/impl/agency/sqlalchemy.py | 2 +- src/db/models/impl/batch/sqlalchemy.py | 2 +- .../impl/link/agency_batch/sqlalchemy.py | 2 +- .../impl/link/agency_location/sqlalchemy.py | 2 +- .../models/impl/link/batch_url/sqlalchemy.py | 2 +- .../impl/link/location_batch/sqlalchemy.py | 2 +- src/db/models/impl/link/task_url.py | 2 +- .../models/impl/link/url_agency/sqlalchemy.py | 2 +- src/db/models/impl/task/core.py | 2 +- src/db/models/impl/url/core/sqlalchemy.py | 6 ++-- src/db/models/views/batch_url_status/core.py | 6 ++-- src/db/models/views/url_annotations_flags.py | 2 +- 13 files changed, 51 insertions(+), 16 deletions(-) create mode 100644 alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py diff --git a/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py b/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py new file mode 100644 index 00000000..fb927bf6 --- /dev/null +++ b/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py @@ -0,0 +1,35 @@ +"""Rename link tables + +Revision ID: b8a68f4260a4 +Revises: 783268bd3daa +Create Date: 2025-11-18 19:07:48.518828 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'b8a68f4260a4' +down_revision: Union[str, None] = '783268bd3daa' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + old_name_new_name = { + "link_task_urls": "link_tasks__urls", + "link_agencies_locations": "link_agencies__locations", + "link_agency_batches": "link_agencies__batches", + "link_batch_urls": "link_batches__urls", + "link_location_batches": "link_batches__locations", + "link_urls_agency": "link_agencies__urls", + } + for old_name, new_name in old_name_new_name.items(): + op.rename_table(old_name, new_name) + + +def downgrade() -> None: + pass diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index e72e1038..9e99a0be 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -35,5 +35,5 @@ class Agency( "LocationExpandedView", primaryjoin="Agency.id == LinkAgencyLocation.agency_id", secondaryjoin="LocationExpandedView.id == LinkAgencyLocation.location_id", - secondary="link_agencies_locations", + secondary="link_agencies__locations", ) diff --git a/src/db/models/impl/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py index fb44396b..72c8b39b 100644 --- a/src/db/models/impl/batch/sqlalchemy.py +++ b/src/db/models/impl/batch/sqlalchemy.py @@ -41,7 +41,7 @@ class Batch(WithIDBase): # Relationships urls = relationship( "URL", - secondary="link_batch_urls", + secondary="link_batches__urls", back_populates="batch", overlaps="url" ) diff --git a/src/db/models/impl/link/agency_batch/sqlalchemy.py b/src/db/models/impl/link/agency_batch/sqlalchemy.py index dcb670d3..32518ed4 100644 --- a/src/db/models/impl/link/agency_batch/sqlalchemy.py +++ b/src/db/models/impl/link/agency_batch/sqlalchemy.py @@ -10,7 +10,7 @@ class LinkAgencyBatch( BatchDependentMixin, AgencyDependentMixin, ): - __tablename__ = "link_agency_batches" + __tablename__ = "link_agencies__batches" 
__table_args__ = ( PrimaryKeyConstraint( 'batch_id', diff --git a/src/db/models/impl/link/agency_location/sqlalchemy.py b/src/db/models/impl/link/agency_location/sqlalchemy.py index 18a3ae5f..fb7f34da 100644 --- a/src/db/models/impl/link/agency_location/sqlalchemy.py +++ b/src/db/models/impl/link/agency_location/sqlalchemy.py @@ -7,4 +7,4 @@ class LinkAgencyLocation( AgencyDependentMixin, LocationDependentMixin, ): - __tablename__ = "link_agencies_locations" \ No newline at end of file + __tablename__ = "link_agencies__locations" \ No newline at end of file diff --git a/src/db/models/impl/link/batch_url/sqlalchemy.py b/src/db/models/impl/link/batch_url/sqlalchemy.py index 951ac539..ac747e01 100644 --- a/src/db/models/impl/link/batch_url/sqlalchemy.py +++ b/src/db/models/impl/link/batch_url/sqlalchemy.py @@ -11,5 +11,5 @@ class LinkBatchURL( BatchDependentMixin, WithIDBase ): - __tablename__ = "link_batch_urls" + __tablename__ = "link_batches__urls" diff --git a/src/db/models/impl/link/location_batch/sqlalchemy.py b/src/db/models/impl/link/location_batch/sqlalchemy.py index e73a5ec8..e3ea5e55 100644 --- a/src/db/models/impl/link/location_batch/sqlalchemy.py +++ b/src/db/models/impl/link/location_batch/sqlalchemy.py @@ -11,7 +11,7 @@ class LinkLocationBatch( CreatedAtMixin ): - __tablename__ = "link_location_batches" + __tablename__ = "link_batches__locations" __table_args__ = ( PrimaryKeyConstraint( 'batch_id', diff --git a/src/db/models/impl/link/task_url.py b/src/db/models/impl/link/task_url.py index 2535d317..d04d8275 100644 --- a/src/db/models/impl/link/task_url.py +++ b/src/db/models/impl/link/task_url.py @@ -4,7 +4,7 @@ class LinkTaskURL(Base): - __tablename__ = 'link_task_urls' + __tablename__ = 'link_tasks__urls' __table_args__ = (UniqueConstraint( "task_id", "url_id", diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index 92d1c37b..c4ca6124 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -7,7 +7,7 @@ class LinkURLAgency(URLDependentMixin, WithIDBase): - __tablename__ = "link_urls_agency" + __tablename__ = "link_agencies__urls" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/impl/task/core.py b/src/db/models/impl/task/core.py index 566dd116..2eb8fd44 100644 --- a/src/db/models/impl/task/core.py +++ b/src/db/models/impl/task/core.py @@ -32,7 +32,7 @@ class Task(UpdatedAtMixin, WithIDBase): # Relationships urls = relationship( "URL", - secondary="link_task_urls", + secondary="link_tasks__urls", back_populates="tasks" ) errors = relationship(TaskError) diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 5bdcdadb..56681e3d 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -66,7 +66,7 @@ def full_url(cls): # Relationships batch = relationship( "Batch", - secondary="link_batch_urls", + secondary="link_batches__urls", back_populates="urls", uselist=False, ) @@ -82,7 +82,7 @@ def full_url(cls): ) tasks = relationship( "Task", - secondary="link_task_urls", + secondary="link_tasks__urls", back_populates="urls", ) auto_agency_subtasks = relationship( @@ -110,7 +110,7 @@ def full_url(cls): "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( "Agency", - secondary="link_urls_agency" + secondary="link_agencies__urls" ) data_source = relationship( diff --git 
a/src/db/models/views/batch_url_status/core.py b/src/db/models/views/batch_url_status/core.py index 888ca169..1ec0711d 100644 --- a/src/db/models/views/batch_url_status/core.py +++ b/src/db/models/views/batch_url_status/core.py @@ -11,7 +11,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu where lbu.batch_id = b.id ) @@ -26,7 +26,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu left join flag_url_validated fuv on fuv.url_id = lbu.url_id where lbu.batch_id = b.id @@ -36,7 +36,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu left join flag_url_validated fuv on fuv.url_id = lbu.url_id where lbu.batch_id = b.id diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 57d8e866..47250d1b 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -18,7 +18,7 @@ LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id - LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + LEFT JOIN public.link_agencies__urls cua on u.id = cua.url_id ) """ From 67b4e47a1a09c8095107f508f4f7cea06380f409 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 19 Nov 2025 23:25:27 -0500 Subject: [PATCH 50/84] Add descriptions to data source submissions --- src/api/endpoints/submit/data_source/query.py | 1 + src/api/endpoints/submit/data_source/request.py | 1 + src/api/endpoints/submit/url/models/request.py | 3 ++- src/api/endpoints/submit/url/queries/core.py | 1 + .../automated/integration/api/submit/data_source/test_core.py | 2 ++ tests/automated/integration/api/submit/test_url_maximal.py | 2 ++ 6 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/api/endpoints/submit/data_source/query.py b/src/api/endpoints/submit/data_source/query.py index 6d7360f5..c14bd560 100644 --- a/src/api/endpoints/submit/data_source/query.py +++ b/src/api/endpoints/submit/data_source/query.py @@ -34,6 +34,7 @@ async def run(self, session: AsyncSession) -> Any: scheme=full_url.scheme, trailing_slash=full_url.has_trailing_slash, name=self.request.name, + description=self.request.description, status=URLStatus.OK, source=URLSource.MANUAL, ) diff --git a/src/api/endpoints/submit/data_source/request.py b/src/api/endpoints/submit/data_source/request.py index 409fe254..fe541923 100644 --- a/src/api/endpoints/submit/data_source/request.py +++ b/src/api/endpoints/submit/data_source/request.py @@ -11,6 +11,7 @@ class DataSourceSubmissionRequest(RequestBase): name: str record_type: RecordType source_url: str + description: str | None = None # Optional URL DS Metadata coverage_start: date | None = None diff --git a/src/api/endpoints/submit/url/models/request.py b/src/api/endpoints/submit/url/models/request.py index 34ec9df9..4e5656b0 100644 --- a/src/api/endpoints/submit/url/models/request.py +++ b/src/api/endpoints/submit/url/models/request.py @@ -9,4 +9,5 @@ class URLSubmissionRequest(RequestBase): record_type: RecordType | None = None name: str | None = None location_id: int | None = None - agency_id: int | None = None \ No newline at end of file + agency_id: int | None = None + description: str | None = None \ No newline at end of file diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 9f3e7117..0d2c1c84 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ 
b/src/api/endpoints/submit/url/queries/core.py @@ -62,6 +62,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, + description=self.request.description, trailing_slash=url_and_scheme.url.endswith('/'), ) session.add(url_insert) diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py index 49df1dd4..eed0cd00 100644 --- a/tests/automated/integration/api/submit/data_source/test_core.py +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -32,6 +32,7 @@ async def test_submit_data_source( json=DataSourceSubmissionRequest( source_url="https://example.com/", name="Example name", + description="Example description", record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, coverage_start=date(year=2025, month=8, day=9), coverage_end=date(year=2025, month=8, day=10), @@ -74,6 +75,7 @@ async def test_submit_data_source( assert url.trailing_slash == True assert url.source == URLSource.MANUAL assert url.status == URLStatus.OK + assert url.description == "Example description" # Check for Batch batch: Batch = await adb_client.one_or_none_model(Batch) diff --git a/tests/automated/integration/api/submit/test_url_maximal.py b/tests/automated/integration/api/submit/test_url_maximal.py index 150b5409..e57770fb 100644 --- a/tests/automated/integration/api/submit/test_url_maximal.py +++ b/tests/automated/integration/api/submit/test_url_maximal.py @@ -32,6 +32,7 @@ async def test_maximal( request=URLSubmissionRequest( url="www.example.com", record_type=RecordType.INCARCERATION_RECORDS, + description="Example description", name="Example URL", location_id=pittsburgh_locality.location_id, agency_id=agency_id, @@ -48,6 +49,7 @@ async def test_maximal( url: URL = urls[0] assert url.id == url_id assert url.url == "www.example.com" + assert url.description == "Example description" links: list[LinkUserSubmittedURL] = await adb_client.get_all(LinkUserSubmittedURL) assert len(links) == 1 From 052343b531c1e9fa18edd3ffc24d499a35f69976 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 20 Nov 2025 06:41:57 -0500 Subject: [PATCH 51/84] Continue draft --- .../submit/data_source/models/__init__.py | 0 .../data_source/models/response/__init__.py | 0 .../data_source/models/response/duplicate.py | 11 +++++ .../response/standard.py} | 0 .../submit/data_source/queries/__init__.py | 0 .../data_source/{query.py => queries/core.py} | 16 +++++-- .../submit/data_source/queries/duplicate.py | 43 +++++++++++++++++++ .../submit/data_source/queries/standard.py | 24 +++++++++++ .../endpoints/submit/data_source/wrapper.py | 2 +- src/api/endpoints/submit/routes.py | 18 +++++++- .../api/submit/data_source/test_duplicate.py | 34 +++++++++++++++ tests/automated/integration/conftest.py | 16 +++++++ 12 files changed, 158 insertions(+), 6 deletions(-) create mode 100644 src/api/endpoints/submit/data_source/models/__init__.py create mode 100644 src/api/endpoints/submit/data_source/models/response/__init__.py create mode 100644 src/api/endpoints/submit/data_source/models/response/duplicate.py rename src/api/endpoints/submit/data_source/{response.py => models/response/standard.py} (100%) create mode 100644 src/api/endpoints/submit/data_source/queries/__init__.py rename src/api/endpoints/submit/data_source/{query.py => queries/core.py} (88%) create mode 100644 src/api/endpoints/submit/data_source/queries/duplicate.py create mode 100644 
src/api/endpoints/submit/data_source/queries/standard.py create mode 100644 tests/automated/integration/api/submit/data_source/test_duplicate.py diff --git a/src/api/endpoints/submit/data_source/models/__init__.py b/src/api/endpoints/submit/data_source/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/models/response/__init__.py b/src/api/endpoints/submit/data_source/models/response/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/models/response/duplicate.py b/src/api/endpoints/submit/data_source/models/response/duplicate.py new file mode 100644 index 00000000..12367372 --- /dev/null +++ b/src/api/endpoints/submit/data_source/models/response/duplicate.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLType + + +class SubmitDataSourceURLDuplicateSubmissionResponse(BaseModel): + message: str + url_id: int + url_type: URLType | None + url_status: URLStatus \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/response.py b/src/api/endpoints/submit/data_source/models/response/standard.py similarity index 100% rename from src/api/endpoints/submit/data_source/response.py rename to src/api/endpoints/submit/data_source/models/response/standard.py diff --git a/src/api/endpoints/submit/data_source/queries/__init__.py b/src/api/endpoints/submit/data_source/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/query.py b/src/api/endpoints/submit/data_source/queries/core.py similarity index 88% rename from src/api/endpoints/submit/data_source/query.py rename to src/api/endpoints/submit/data_source/queries/core.py index c14bd560..a372a630 100644 --- a/src/api/endpoints/submit/data_source/query.py +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -1,9 +1,11 @@ from typing import Any +from sqlalchemy.exc import IntegrityError from sqlalchemy.ext.asyncio import AsyncSession +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.queries.duplicate import GetDataSourceDuplicateQueryBuilder from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.api.endpoints.submit.data_source.response import SubmitDataSourceURLProposalResponse from src.collectors.enums import URLStatus from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch @@ -26,9 +28,10 @@ def __init__(self, request: DataSourceSubmissionRequest): super().__init__() self.request = request - async def run(self, session: AsyncSession) -> Any: + async def run(self, session: AsyncSession) -> SubmitDataSourceURLProposalResponse: full_url = FullURL(full_url=self.request.source_url) + # Begin by attempting to submit the full URL url = URL( url=full_url.id_form, scheme=full_url.scheme, @@ -40,7 +43,14 @@ async def run(self, session: AsyncSession) -> Any: ) session.add(url) - await session.flush() + try: + await session.flush() + except IntegrityError: + qb = GetDataSourceDuplicateQueryBuilder( + url=full_url.id_form + ) + await qb.run(session=session) + url_id: int = url.id diff --git a/src/api/endpoints/submit/data_source/queries/duplicate.py b/src/api/endpoints/submit/data_source/queries/duplicate.py new file mode 100644 index 00000000..13a3626a --- /dev/null +++ 
b/src/api/endpoints/submit/data_source/queries/duplicate.py @@ -0,0 +1,43 @@ +from typing import Any + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourceDuplicateQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url: str + ): + super().__init__() + self.url = url + + async def run(self, session: AsyncSession) -> None: + """ + Raises: + HTTPException including details on the duplicate result. + """ + + query = ( + select( + URL.status, + FlagURLValidated.type + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + URL.id == self.url_id + ) + ) + mapping: RowMapping = await self.sh.mapping( + query=query, + session=session + ) + diff --git a/src/api/endpoints/submit/data_source/queries/standard.py b/src/api/endpoints/submit/data_source/queries/standard.py new file mode 100644 index 00000000..ef01f5ec --- /dev/null +++ b/src/api/endpoints/submit/data_source/queries/standard.py @@ -0,0 +1,24 @@ +from typing import Any + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class SubmitDataSourceURLProposalStandardQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + request: DataSourceSubmissionRequest + ): + super().__init__() + self.url_id = url_id + self.request = request + + async def run(self, session: AsyncSession) -> SubmitDataSourceURLProposalResponse: diff --git a/src/api/endpoints/submit/data_source/wrapper.py b/src/api/endpoints/submit/data_source/wrapper.py index 32794150..1ab4d919 100644 --- a/src/api/endpoints/submit/data_source/wrapper.py +++ b/src/api/endpoints/submit/data_source/wrapper.py @@ -1,8 +1,8 @@ from fastapi import HTTPException +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.api.endpoints.submit.data_source.response import SubmitDataSourceURLProposalResponse from src.db.client.async_ import AsyncDatabaseClient from src.db.queries.urls_exist.model import URLExistsResult from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder diff --git a/src/api/endpoints/submit/routes.py b/src/api/endpoints/submit/routes.py index ee315493..ca1def43 100644 --- a/src/api/endpoints/submit/routes.py +++ b/src/api/endpoints/submit/routes.py @@ -2,6 +2,10 @@ from src.api.dependencies import get_async_core from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder + +from src.api.endpoints.submit.data_source.models.response.duplicate import \ + SubmitDataSourceURLDuplicateSubmissionResponse +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse from src.api.endpoints.submit.data_source.request import 
DataSourceSubmissionRequest from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse @@ -12,7 +16,9 @@ submit_router = APIRouter(prefix="/submit", tags=["submit"]) -@submit_router.post("/url") +@submit_router.post( + "/url" +) async def submit_url( request: URLSubmissionRequest, access_info: AccessInfo = Depends(get_access_info), @@ -25,7 +31,15 @@ async def submit_url( ) ) -@submit_router.post("/data-source") +@submit_router.post( + "/data-source", + response_model=SubmitDataSourceURLProposalResponse, + responses={ + 409: { + "model": SubmitDataSourceURLDuplicateSubmissionResponse + } + } +) async def submit_data_source( request: DataSourceSubmissionRequest, async_core: AsyncCore = Depends(get_async_core), diff --git a/tests/automated/integration/api/submit/data_source/test_duplicate.py b/tests/automated/integration/api/submit/data_source/test_duplicate.py new file mode 100644 index 00000000..c1d5a88e --- /dev/null +++ b/tests/automated/integration/api/submit/data_source/test_duplicate.py @@ -0,0 +1,34 @@ +import pytest + +from src.api.endpoints.submit.data_source.models.response.duplicate import SubmitDataSourceURLDuplicateSubmissionResponse +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.dtos.url.mapping_.simple import SimpleURLMapping +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_submit_data_source_duplicate( + api_test_helper: APITestHelper, + test_agency_id: int, + pittsburgh_locality: LocalityCreationInfo, + test_url_data_source_mapping: SimpleURLMapping +): + + ath = api_test_helper + response: dict = ath.request_validator.post_v3( + url="submit/data-source", + json=DataSourceSubmissionRequest( + source_url=test_url_data_source_mapping.url, + name="Test Name", + record_type=RecordType.RECORDS_REQUEST_INFO + ).model_dump(mode='json') + ) + model = SubmitDataSourceURLDuplicateSubmissionResponse(**response) + assert model.url_id == test_url_data_source_mapping.url_id + assert model.url_type == URLType.DATA_SOURCE + assert model.url_status == URLStatus.OK + assert model.message == "Duplicate URL Found" diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 6837bae0..6e2be0f0 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -12,6 +12,7 @@ from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions @@ -217,6 +218,21 @@ async def test_url_data_source_id( ) return url_id +@pytest_asyncio.fixture +async def test_url_data_source_mapping( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> SimpleURLMapping: + url_mapping: SimpleURLMapping = (await db_data_creator.create_validated_urls( + record_type=RecordType.CRIME_STATISTICS, + validation_type=URLType.DATA_SOURCE, + ))[0] + await db_data_creator.link_urls_to_agencies( + url_ids=[url_mapping.url_id], + 
agency_ids=[test_agency_id] + ) + return url_mapping + @pytest_asyncio.fixture async def test_url_meta_url_id( db_data_creator: DBDataCreator, From 90ecbaee64d37022bcfdbf0130f54c16a4aa6b05 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 20 Nov 2025 08:00:16 -0500 Subject: [PATCH 52/84] Add logic for handling duplicates in data source submission --- .../submit/data_source/queries/core.py | 16 ++++------ .../submit/data_source/queries/duplicate.py | 21 +++++++++++-- .../submit/data_source/queries/standard.py | 24 --------------- .../endpoints/submit/data_source/wrapper.py | 15 ++++++---- src/api/endpoints/submit/routes.py | 10 +++---- .../api/submit/data_source/test_duplicate.py | 30 +++++++++++-------- 6 files changed, 56 insertions(+), 60 deletions(-) delete mode 100644 src/api/endpoints/submit/data_source/queries/standard.py diff --git a/src/api/endpoints/submit/data_source/queries/core.py b/src/api/endpoints/submit/data_source/queries/core.py index a372a630..b3d1ff46 100644 --- a/src/api/endpoints/submit/data_source/queries/core.py +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -4,7 +4,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse -from src.api.endpoints.submit.data_source.queries.duplicate import GetDataSourceDuplicateQueryBuilder from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest from src.collectors.enums import URLStatus from src.core.enums import BatchStatus @@ -28,7 +27,10 @@ def __init__(self, request: DataSourceSubmissionRequest): super().__init__() self.request = request - async def run(self, session: AsyncSession) -> SubmitDataSourceURLProposalResponse: + async def run( + self, + session: AsyncSession + ) -> SubmitDataSourceURLProposalResponse: full_url = FullURL(full_url=self.request.source_url) # Begin by attempting to submit the full URL @@ -43,15 +45,9 @@ async def run(self, session: AsyncSession) -> SubmitDataSourceURLProposalRespons ) session.add(url) - try: - await session.flush() - except IntegrityError: - qb = GetDataSourceDuplicateQueryBuilder( - url=full_url.id_form - ) - await qb.run(session=session) - + await session.flush() + # Standard Path url_id: int = url.id # Add Batch diff --git a/src/api/endpoints/submit/data_source/queries/duplicate.py b/src/api/endpoints/submit/data_source/queries/duplicate.py index 13a3626a..75346cf6 100644 --- a/src/api/endpoints/submit/data_source/queries/duplicate.py +++ b/src/api/endpoints/submit/data_source/queries/duplicate.py @@ -1,8 +1,11 @@ -from typing import Any +from http import HTTPStatus -from sqlalchemy import select +from fastapi import HTTPException +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession +from src.api.endpoints.submit.data_source.models.response.duplicate import \ + SubmitDataSourceURLDuplicateSubmissionResponse from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -25,6 +28,7 @@ async def run(self, session: AsyncSession) -> None: query = ( select( + URL.id, URL.status, FlagURLValidated.type ) @@ -33,7 +37,7 @@ async def run(self, session: AsyncSession) -> None: FlagURLValidated.url_id == URL.id ) .where( - URL.id == self.url_id + URL.url == self.url ) ) mapping: RowMapping = await self.sh.mapping( @@ -41,3 +45,14 @@ async def run(self, session: AsyncSession) -> None: 
session=session ) + model = SubmitDataSourceURLDuplicateSubmissionResponse( + message="Duplicate URL found", + url_id=mapping[URL.id], + url_status=mapping[URL.status], + url_type=mapping[FlagURLValidated.type] + ) + raise HTTPException( + detail=model.model_dump(mode='json'), + status_code=HTTPStatus.CONFLICT + ) + diff --git a/src/api/endpoints/submit/data_source/queries/standard.py b/src/api/endpoints/submit/data_source/queries/standard.py deleted file mode 100644 index ef01f5ec..00000000 --- a/src/api/endpoints/submit/data_source/queries/standard.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse -from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class SubmitDataSourceURLProposalStandardQueryBuilder(QueryBuilderBase): - - def __init__( - self, - url_id: int, - request: DataSourceSubmissionRequest - ): - super().__init__() - self.url_id = url_id - self.request = request - - async def run(self, session: AsyncSession) -> SubmitDataSourceURLProposalResponse: diff --git a/src/api/endpoints/submit/data_source/wrapper.py b/src/api/endpoints/submit/data_source/wrapper.py index 1ab4d919..20e5e158 100644 --- a/src/api/endpoints/submit/data_source/wrapper.py +++ b/src/api/endpoints/submit/data_source/wrapper.py @@ -1,7 +1,9 @@ from fastapi import HTTPException from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse -from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder +from src.api.endpoints.submit.data_source.queries.core import SubmitDataSourceURLProposalQueryBuilder + +from src.api.endpoints.submit.data_source.queries.duplicate import GetDataSourceDuplicateQueryBuilder from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest from src.db.client.async_ import AsyncDatabaseClient from src.db.queries.urls_exist.model import URLExistsResult @@ -21,15 +23,18 @@ async def submit_data_source_url_proposal( detail="Invalid URL" ) + full_url = FullURL(request.source_url) + url_exists_results: URLExistsResult = (await adb_client.run_query_builder( URLsExistInDBQueryBuilder( - full_urls=[FullURL(request.source_url)] + full_urls=[full_url] ) ))[0] if url_exists_results.exists: - raise HTTPException( - status_code=400, - detail="URL already exists in database." 
+ await adb_client.run_query_builder( + GetDataSourceDuplicateQueryBuilder( + url=full_url.id_form + ) ) return await adb_client.run_query_builder( diff --git a/src/api/endpoints/submit/routes.py b/src/api/endpoints/submit/routes.py index ca1def43..37f4a3c9 100644 --- a/src/api/endpoints/submit/routes.py +++ b/src/api/endpoints/submit/routes.py @@ -1,12 +1,13 @@ from fastapi import APIRouter, Depends from src.api.dependencies import get_async_core -from src.api.endpoints.submit.data_source.query import SubmitDataSourceURLProposalQueryBuilder from src.api.endpoints.submit.data_source.models.response.duplicate import \ SubmitDataSourceURLDuplicateSubmissionResponse from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.queries.core import SubmitDataSourceURLProposalQueryBuilder from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.api.endpoints.submit.data_source.wrapper import submit_data_source_url_proposal from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse from src.api.endpoints.submit.url.queries.core import SubmitURLQueryBuilder @@ -44,8 +45,7 @@ async def submit_data_source( request: DataSourceSubmissionRequest, async_core: AsyncCore = Depends(get_async_core), ): - return await async_core.adb_client.run_query_builder( - SubmitDataSourceURLProposalQueryBuilder( - request=request, - ) + return await submit_data_source_url_proposal( + request=request, + adb_client=async_core.adb_client ) diff --git a/tests/automated/integration/api/submit/data_source/test_duplicate.py b/tests/automated/integration/api/submit/data_source/test_duplicate.py index c1d5a88e..ea16e1ec 100644 --- a/tests/automated/integration/api/submit/data_source/test_duplicate.py +++ b/tests/automated/integration/api/submit/data_source/test_duplicate.py @@ -1,4 +1,5 @@ import pytest +from fastapi import HTTPException from src.api.endpoints.submit.data_source.models.response.duplicate import SubmitDataSourceURLDuplicateSubmissionResponse from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest @@ -19,16 +20,19 @@ async def test_submit_data_source_duplicate( ): ath = api_test_helper - response: dict = ath.request_validator.post_v3( - url="submit/data-source", - json=DataSourceSubmissionRequest( - source_url=test_url_data_source_mapping.url, - name="Test Name", - record_type=RecordType.RECORDS_REQUEST_INFO - ).model_dump(mode='json') - ) - model = SubmitDataSourceURLDuplicateSubmissionResponse(**response) - assert model.url_id == test_url_data_source_mapping.url_id - assert model.url_type == URLType.DATA_SOURCE - assert model.url_status == URLStatus.OK - assert model.message == "Duplicate URL Found" + try: + ath.request_validator.post_v3( + url="submit/data-source", + json=DataSourceSubmissionRequest( + source_url=test_url_data_source_mapping.url, + name="Test Name", + record_type=RecordType.RECORDS_REQUEST_INFO + ).model_dump(mode='json') + ) + except HTTPException as e: + response = e.detail['detail'] + model = SubmitDataSourceURLDuplicateSubmissionResponse(**response) + assert model.url_id == test_url_data_source_mapping.url_id + assert model.url_type == URLType.DATA_SOURCE + assert model.url_status == URLStatus.OK + assert model.message == "Duplicate URL found" From fe67257ec4a3e25a61b414430c208c4e5d47f6dc Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 
20 Nov 2025 16:01:10 -0500 Subject: [PATCH 53/84] Begin draft --- ...30-c4edeb795134_remove_url_error_status.py | 33 +++++++++++++++++++ .../aggregated/query/url_error/query.py | 6 +++- .../metrics/batches/breakdown/error/cte_.py | 11 ++++--- .../url_counts/cte/error.py | 4 ++- .../api/metrics/batches/test_aggregated.py | 8 ++--- .../tasks/url/impl/html/setup/data.py | 4 +-- .../commands/impl/urls_/convert.py | 2 +- 7 files changed, 55 insertions(+), 13 deletions(-) create mode 100644 alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py diff --git a/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py new file mode 100644 index 00000000..32c977e2 --- /dev/null +++ b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py @@ -0,0 +1,33 @@ +"""Remove URL Error Status + +Revision ID: c4edeb795134 +Revises: b8a68f4260a4 +Create Date: 2025-11-20 15:30:15.783191 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import remove_enum_value + +# revision identifiers, used by Alembic. +revision: str = 'c4edeb795134' +down_revision: Union[str, None] = 'b8a68f4260a4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + remove_enum_value( + enum_name="url_status", + value_to_remove="error", + targets=[ + ("urls", "status") + ] + ) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py index 9bcc3a57..a7b9e27a 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -5,10 +5,12 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.base.builder import QueryBuilderBase @@ -23,7 +25,9 @@ async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse] .select_from(Batch) .join(LinkBatchURL) .join(URL) - .where(URL.status == URLStatus.ERROR) + .where( + exists_url(URLTaskError) + ) .group_by(Batch.strategy, URL.status) ) diff --git a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py index ed2ff44f..6c54e45b 100644 --- a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py @@ -1,10 +1,11 @@ -from sqlalchemy import select, func, CTE, Column +from sqlalchemy import select, func -from src.collectors.enums import URLStatus +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import 
BatchesBreakdownURLCTE from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError URL_ERROR_CTE = BatchesBreakdownURLCTE( select( @@ -19,7 +20,9 @@ URL, URL.id == LinkBatchURL.url_id ) - .where(URL.status == URLStatus.ERROR) + .where( + exists_url(URLTaskError) + ) .group_by(Batch.id) .cte("error") ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py index b74020c4..953a5c0d 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -1,9 +1,11 @@ from sqlalchemy import select, func from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ URLCountsCTEContainer @@ -21,7 +23,7 @@ URL.id == LinkBatchURL.url_id, ) .where( - URL.status == URLStatus.ERROR + exists_url(URLTaskError) ) .group_by( Batch.id diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 97cd805e..3d84d6d7 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -23,9 +23,9 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_error: list[SimpleURLMapping] = await create_urls( + url_mappings_broken: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, - status=URLStatus.ERROR, + status=URLStatus.BROKEN, count=4, ) url_mappings_ok: list[SimpleURLMapping] = await create_urls( @@ -33,7 +33,7 @@ async def test_get_batches_aggregated_metrics( status=URLStatus.OK, count=11, ) - url_mappings_all: list[SimpleURLMapping] = url_mappings_error + url_mappings_ok + url_mappings_all: list[SimpleURLMapping] = url_mappings_broken + url_mappings_ok url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, @@ -88,5 +88,5 @@ async def test_get_batches_aggregated_metrics( assert inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 - assert inner_dto_manual.count_urls_errors == 12 + assert inner_dto_manual.count_urls_errors == 0 # TODO: Change by adding URL Task Errors assert inner_dto_manual.count_urls_validated == 30 diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 203eb34b..a3a43f8b 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -29,7 +29,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="not-found-path.com/submitted", - status=URLStatus.ERROR + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -48,7 +48,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( 
url="error-path.com/submitted", - status=URLStatus.ERROR + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py index 66747e6c..c1e2db31 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/convert.py +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -14,7 +14,7 @@ def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) case URLCreationEnum.NOT_RELEVANT: return URLStatus.OK case URLCreationEnum.ERROR: - return URLStatus.ERROR + raise ValueError("Invalid URL Status") case URLCreationEnum.DUPLICATE: return URLStatus.DUPLICATE case _: From 77c18c6cec86911a1f97e3cf701499b90eba399e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 20 Nov 2025 16:28:18 -0500 Subject: [PATCH 54/84] Add check/unique-url endpoint --- src/api/endpoints/check/__init__.py | 0 src/api/endpoints/check/routes.py | 21 ++++++++++++ .../endpoints/check/unique_url/__init__.py | 0 .../endpoints/check/unique_url/response.py | 6 ++++ src/api/endpoints/check/unique_url/wrapper.py | 23 +++++++++++++ src/api/main.py | 4 ++- .../readonly/api/check/__init__.py | 0 .../readonly/api/check/test_unique_url.py | 33 +++++++++++++++++++ .../integration/readonly/setup/core.py | 3 -- 9 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 src/api/endpoints/check/__init__.py create mode 100644 src/api/endpoints/check/routes.py create mode 100644 src/api/endpoints/check/unique_url/__init__.py create mode 100644 src/api/endpoints/check/unique_url/response.py create mode 100644 src/api/endpoints/check/unique_url/wrapper.py create mode 100644 tests/automated/integration/readonly/api/check/__init__.py create mode 100644 tests/automated/integration/readonly/api/check/test_unique_url.py diff --git a/src/api/endpoints/check/__init__.py b/src/api/endpoints/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/check/routes.py b/src/api/endpoints/check/routes.py new file mode 100644 index 00000000..09870f15 --- /dev/null +++ b/src/api/endpoints/check/routes.py @@ -0,0 +1,21 @@ +from fastapi import APIRouter, Depends + +from src.api.dependencies import get_async_core +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from src.api.endpoints.check.unique_url.wrapper import check_unique_url_wrapper +from src.core.core import AsyncCore + +check_router = APIRouter( + prefix="/check", + tags=["check"] +) + +@check_router.get("/unique-url") +async def check_unique_url( + url: str, + async_core: AsyncCore = Depends(get_async_core), +) -> CheckUniqueURLResponse: + return await check_unique_url_wrapper( + adb_client=async_core.adb_client, + url=url + ) diff --git a/src/api/endpoints/check/unique_url/__init__.py b/src/api/endpoints/check/unique_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/check/unique_url/response.py b/src/api/endpoints/check/unique_url/response.py new file mode 100644 index 00000000..f9a15ddd --- /dev/null +++ b/src/api/endpoints/check/unique_url/response.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CheckUniqueURLResponse(BaseModel): + unique_url: bool + url_id: int | None \ No newline at end of file diff --git a/src/api/endpoints/check/unique_url/wrapper.py b/src/api/endpoints/check/unique_url/wrapper.py new file mode 100644 index 00000000..63deddf1 --- /dev/null +++ 
b/src/api/endpoints/check/unique_url/wrapper.py @@ -0,0 +1,23 @@ +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from src.db.client.async_ import AsyncDatabaseClient +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder +from src.util.models.full_url import FullURL + + +async def check_unique_url_wrapper( + adb_client: AsyncDatabaseClient, + url: str +) -> CheckUniqueURLResponse: + result: URLExistsResult = (await adb_client.run_query_builder( + URLsExistInDBQueryBuilder(full_urls=[FullURL(url)]) + ))[0] + if result.exists: + return CheckUniqueURLResponse( + unique_url=False, + url_id=result.url_id + ) + return CheckUniqueURLResponse( + unique_url=True, + url_id=None + ) diff --git a/src/api/main.py b/src/api/main.py index 141d4e38..826d3013 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -11,6 +11,7 @@ from src.api.endpoints.agencies.routes import agencies_router from src.api.endpoints.annotate.routes import annotate_router from src.api.endpoints.batch.routes import batch_router +from src.api.endpoints.check.routes import check_router from src.api.endpoints.collector.routes import collector_router from src.api.endpoints.contributions.routes import contributions_router from src.api.endpoints.data_source.routes import data_sources_router @@ -183,7 +184,8 @@ async def redirect_docs(): contributions_router, agencies_router, data_sources_router, - meta_urls_router + meta_urls_router, + check_router ] for router in routers: diff --git a/tests/automated/integration/readonly/api/check/__init__.py b/tests/automated/integration/readonly/api/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/check/test_unique_url.py b/tests/automated/integration/readonly/api/check/test_unique_url.py new file mode 100644 index 00000000..12123b99 --- /dev/null +++ b/tests/automated/integration/readonly/api/check/test_unique_url.py @@ -0,0 +1,33 @@ +import pytest + +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_check_unique_url( + readonly_helper: ReadOnlyTestHelper +): + + ath: APITestHelper = readonly_helper.api_test_helper + response_not_unique_url = ath.request_validator.get_v3( + url="/check/unique-url", + params={ + "url": "https://read-only-ds.com" + } + ) + model_not_unique_url = CheckUniqueURLResponse(**response_not_unique_url) + assert not model_not_unique_url.unique_url + assert model_not_unique_url.url_id == readonly_helper.maximal_data_source_url_id + + + response_unique_url = ath.request_validator.get_v3( + url="/check/unique-url", + params={ + "url": "https://nonexistent-url.com" + } + ) + model_unique_url = CheckUniqueURLResponse(**response_unique_url) + assert model_unique_url.unique_url + assert model_unique_url.url_id is None \ No newline at end of file diff --git a/tests/automated/integration/readonly/setup/core.py b/tests/automated/integration/readonly/setup/core.py index d3584929..c938b523 100644 --- a/tests/automated/integration/readonly/setup/core.py +++ b/tests/automated/integration/readonly/setup/core.py @@ -59,9 +59,6 @@ async def setup_readonly_data( adb_client=adb_client ) - - - # Add Data Source With Linked Agency maximal_data_source: int = await add_maximal_data_source( agency_1_id=agency_1_id, From 
0f3de3cad9b40bda1fc3ecc407921ed70e856cec Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 21 Nov 2025 16:35:47 -0500 Subject: [PATCH 55/84] Migrate select endpoints --- src/api/endpoints/locations/__init__.py | 0 src/api/endpoints/locations/post/__init__.py | 0 src/api/endpoints/locations/post/query.py | 46 +++++++++++++++++++ src/api/endpoints/locations/post/request.py | 6 +++ src/api/endpoints/locations/post/response.py | 5 ++ src/api/endpoints/locations/routes.py | 22 +++++++++ src/api/main.py | 4 +- .../integration/api/locations/__init__.py | 0 .../api/locations/post/__init__.py | 0 .../api/locations/post/test_locality.py | 38 +++++++++++++++ 10 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 src/api/endpoints/locations/__init__.py create mode 100644 src/api/endpoints/locations/post/__init__.py create mode 100644 src/api/endpoints/locations/post/query.py create mode 100644 src/api/endpoints/locations/post/request.py create mode 100644 src/api/endpoints/locations/post/response.py create mode 100644 src/api/endpoints/locations/routes.py create mode 100644 tests/automated/integration/api/locations/__init__.py create mode 100644 tests/automated/integration/api/locations/post/__init__.py create mode 100644 tests/automated/integration/api/locations/post/test_locality.py diff --git a/src/api/endpoints/locations/__init__.py b/src/api/endpoints/locations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/locations/post/__init__.py b/src/api/endpoints/locations/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/locations/post/query.py b/src/api/endpoints/locations/post/query.py new file mode 100644 index 00000000..61345191 --- /dev/null +++ b/src/api/endpoints/locations/post/query.py @@ -0,0 +1,46 @@ +from typing import Any + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.db import Locality, Location +from src.db.queries.base.builder import QueryBuilderBase + + +class AddLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + request: AddLocationRequestModel + ): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> AddLocationResponseModel: + locality = Locality( + name=self.request.locality_name, + county_id=self.request.county_id + ) + session.add(locality) + await session.flush() + locality_id: int = locality.id + + query = ( + select( + Location.id + ) + .where( + Location.locality_id == locality_id + ) + ) + + mapping: RowMapping = await self.sh.mapping( + session=session, + query=query + ) + + return AddLocationResponseModel( + location_id=mapping[Location.id] + ) diff --git a/src/api/endpoints/locations/post/request.py b/src/api/endpoints/locations/post/request.py new file mode 100644 index 00000000..1f8eba3d --- /dev/null +++ b/src/api/endpoints/locations/post/request.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AddLocationRequestModel(BaseModel): + locality_name: str + county_id: int diff --git a/src/api/endpoints/locations/post/response.py b/src/api/endpoints/locations/post/response.py new file mode 100644 index 00000000..6cd6a249 --- /dev/null +++ b/src/api/endpoints/locations/post/response.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class AddLocationResponseModel(BaseModel): + location_id: 
int \ No newline at end of file diff --git a/src/api/endpoints/locations/routes.py b/src/api/endpoints/locations/routes.py new file mode 100644 index 00000000..4a0ef096 --- /dev/null +++ b/src/api/endpoints/locations/routes.py @@ -0,0 +1,22 @@ +from fastapi import APIRouter, Depends + +from src.api.dependencies import get_async_core +from src.api.endpoints.locations.post.query import AddLocationQueryBuilder +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.core.core import AsyncCore + +location_url_router = APIRouter( + prefix="/locations", + tags=["Locations"], + responses={404: {"description": "Not found"}} +) + +@location_url_router.post("") +async def create_location( + request: AddLocationRequestModel, + async_core: AsyncCore = Depends(get_async_core), +) -> AddLocationResponseModel: + return await async_core.adb_client.run_query_builder( + AddLocationQueryBuilder(request) + ) diff --git a/src/api/main.py b/src/api/main.py index 826d3013..ca6e56c4 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -15,6 +15,7 @@ from src.api.endpoints.collector.routes import collector_router from src.api.endpoints.contributions.routes import contributions_router from src.api.endpoints.data_source.routes import data_sources_router +from src.api.endpoints.locations.routes import location_url_router from src.api.endpoints.meta_url.routes import meta_urls_router from src.api.endpoints.metrics.routes import metrics_router from src.api.endpoints.root import root_router @@ -185,7 +186,8 @@ async def redirect_docs(): agencies_router, data_sources_router, meta_urls_router, - check_router + check_router, + location_url_router ] for router in routers: diff --git a/tests/automated/integration/api/locations/__init__.py b/tests/automated/integration/api/locations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/locations/post/__init__.py b/tests/automated/integration/api/locations/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/locations/post/test_locality.py b/tests/automated/integration/api/locations/post/test_locality.py new file mode 100644 index 00000000..6a1bc4b0 --- /dev/null +++ b/tests/automated/integration/api/locations/post/test_locality.py @@ -0,0 +1,38 @@ +import pytest + +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.db import Locality, Location +from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo + + +@pytest.mark.asyncio +async def test_add_locality( + allegheny_county: CountyCreationInfo, + adb_client_test: AsyncDatabaseClient, + api_test_helper: APITestHelper +): + # Add Locality + locality_response: dict = api_test_helper.request_validator.post_v3( + "/locations", + json=AddLocationRequestModel( + locality_name="Test Locality", + county_id=allegheny_county.county_id + ).model_dump(mode='json') + ) + response_model = AddLocationResponseModel( + **locality_response + ) + + # Confirm exists in database + localities: list[Locality] = await adb_client_test.get_all(Locality) + assert len(localities) == 1 + assert localities[0].name == "Test Locality" + assert localities[0].county_id == 
allegheny_county.county_id + + locations: list[Location] = await adb_client_test.get_all(Location) + assert len(locations) == 3 + location_ids = {location.id for location in locations} + assert response_model.location_id in location_ids From e10624cd36e01dd70971b7c3ebddf4abec669bfc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 21 Nov 2025 17:46:34 -0500 Subject: [PATCH 56/84] Remove URL Error Status --- .../api/batch/summaries/test_happy_path.py | 10 +++------- .../api/metrics/batches/test_breakdown.py | 14 ++++---------- .../integration/api/metrics/test_backlog.py | 6 ------ .../api/metrics/urls/aggregated/test_core.py | 1 - .../api/metrics/urls/breakdown/test_pending.py | 4 ---- .../api/metrics/urls/breakdown/test_submitted.py | 4 ---- .../manual/agency_identifier/test_nlp_processor.py | 3 +-- .../core/lifecycle/test_auto_googler_lifecycle.py | 3 +-- 8 files changed, 9 insertions(+), 36 deletions(-) diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py index f6e28238..6af9ce2b 100644 --- a/tests/automated/integration/api/batch/summaries/test_happy_path.py +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -29,10 +29,6 @@ async def test_get_batch_summaries(api_test_helper): count=4, status=URLCreationEnum.NOT_RELEVANT ), - TestURLCreationParameters( - count=3, - status=URLCreationEnum.ERROR - ) ] ), TestBatchCreationParameters( @@ -78,10 +74,10 @@ async def test_get_batch_summaries(api_test_helper): result_2 = results[1] assert result_2.id == batch_2_id counts_2 = result_2.url_counts - assert counts_2.total == 7 + assert counts_2.total == 4 assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 3 + assert counts_2.errored == 0 + assert counts_2.pending == 0 assert counts_2.submitted == 0 assert counts_2.duplicate == 0 diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index ca05eaa1..6921c3c1 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -49,12 +49,6 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_mappings: list[SimpleURLMapping] = await create_urls( - adb_client=adb_client, - status=URLStatus.ERROR, - count=4, - ) - error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] validated_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=8, @@ -73,7 +67,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): await create_batch_url_links( adb_client=adb_client, batch_id=batch_id_3, - url_ids=error_url_ids + validated_url_ids, + url_ids=validated_url_ids, ) @@ -107,11 +101,11 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 5 + assert dto_batch_3.count_url_total == 8 + assert dto_batch_3.count_url_pending == 1 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 - assert dto_batch_3.count_url_error == 4 + assert dto_batch_3.count_url_error == 0 assert 
dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index 09f687f5..181c295e 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -46,12 +46,6 @@ async def test_get_backlog_metrics(api_test_helper): url_ids=not_relevant_url_ids_2[:4], validation_type=URLType.NOT_RELEVANT ) - error_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls( - status=URLStatus.ERROR, - count=2 - ) - error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings_2] - await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 1d8eb947..e203b722 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -47,7 +47,6 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, ) url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_mappings_2_error: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index 3e906a8c..9bdf59ba 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -64,10 +64,6 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.OK, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index cbd30f8b..d0a25ab1 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -47,10 +47,6 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.VALIDATED diff --git a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py index 30978a56..0786b830 100644 --- a/tests/manual/agency_identifier/test_nlp_processor.py +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -1,7 +1,6 @@ import pytest -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor SAMPLE_HTML: str = """ diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index bc9b5dfa..22203910 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -3,9 +3,8 @@ import dotenv from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.collectors import CollectorType +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion def test_auto_googler_collector_lifecycle(test_core): From 89b0955244c1b00a9aa3a90bce2b3c1e0186cab4 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 21 Nov 2025 19:45:16 -0500 Subject: [PATCH 57/84] Change get Location Suggestions to return full display name --- .../endpoints/annotate/all/get/queries/location_/requester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index c60c8efe..6ad56c56 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -20,7 +20,7 @@ async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnot query = ( select( UserLocationSuggestion.location_id, - LocationExpandedView.display_name.label("location_name"), + LocationExpandedView.full_display_name.label("location_name"), func.count(UserLocationSuggestion.user_id).label('user_count') ) .join( @@ -32,7 +32,7 @@ async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnot ) .group_by( UserLocationSuggestion.location_id, - LocationExpandedView.display_name + LocationExpandedView.full_display_name ) .order_by( func.count(UserLocationSuggestion.user_id).desc() From b6c9cf52f32a79f8e5cd40d6f8f7f73be49e79e7 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 22 Nov 2025 20:47:08 -0500 Subject: [PATCH 58/84] Update URL to set rows with `error` status to `ok` --- .../2025_11_20_1530-c4edeb795134_remove_url_error_status.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py index 32c977e2..faa827b4 100644 --- a/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py +++ b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py @@ -20,6 +20,12 @@ def upgrade() -> None: + op.execute(""" + UPDATE urls + SET status = 'ok' + WHERE status = 'error'; + """) + remove_enum_value( enum_name="url_status", value_to_remove="error", From 1c4e373aa9fb462e7028c79fe4b89123f7113595 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 23 Nov 2025 09:35:09 -0500 Subject: [PATCH 59/84] Create Jenkinsfile --- alembic/Jenkinsfile | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 alembic/Jenkinsfile diff --git a/alembic/Jenkinsfile b/alembic/Jenkinsfile new file mode 100644 index 00000000..b5a330c7 --- /dev/null +++ b/alembic/Jenkinsfile @@ -0,0 +1,30 @@ +pipeline { 
+ agent { + dockerfile { + filename 'Dockerfile' + args '-e POSTGRES_USER=POSTGRES_USER -e POSTGRES_PASSWORD=POSTGRES_PASSWORD -e POSTGRES_DB=POSTGRES_DB -e POSTGRES_HOST=POSTGRES_HOST -e POSTGRES_PORT=POSTGRES_PORT' + } + } + + stages { + stage('Migrate using Alembic') { + steps { + echo 'Building..' + sh 'python apply_migrations.py' + } + } + } + post { + failure { + script { + def payload = """{ + "content": "🚨 Build Failed: ${env.JOB_NAME} #${env.BUILD_NUMBER}" + }""" + + sh """ + curl -X POST -H "Content-Type: application/json" -d '${payload}' ${env.WEBHOOK_URL} + """ + } + } + } +} \ No newline at end of file From 1816c4e23a663129644a50c38071bdbd0a0f1193 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 23 Nov 2025 09:53:40 -0500 Subject: [PATCH 60/84] Remove dependencies from apply_migrations --- apply_migrations.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apply_migrations.py b/apply_migrations.py index 2b217c8b..cbacf0a4 100644 --- a/apply_migrations.py +++ b/apply_migrations.py @@ -1,15 +1,19 @@ from alembic import command from alembic.config import Config -from src.db.helpers.connect import get_postgres_connection_string +from src.util.helper_functions import get_from_env def apply_migrations(): print("Applying migrations...") alembic_config = Config("alembic.ini") + connection_string = ( + f"postgresql://{get_from_env('POSTGRES_USER')}:{get_from_env('POSTGRES_PASSWORD')}" + + f"@{get_from_env('POSTGRES_HOST')}:{get_from_env('POSTGRES_PORT')}/{get_from_env('POSTGRES_DB')}") + alembic_config.set_main_option( "sqlalchemy.url", - get_postgres_connection_string() + connection_string ) command.upgrade(alembic_config, "head") print("Migrations applied.") From 0aef8d3f896439069a4a6ea5fd3906cafce48bc3 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 23 Nov 2025 17:21:13 -0500 Subject: [PATCH 61/84] Update Record Task type not to repeat on error. 
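The Record Type task previously kept picking the same URLs back up after a
classification failure. The prerequisite check and the work query now go
through a shared CTE that skips any URL which already has an auto record
type suggestion or a recorded Record Type task error, so a URL that errors
once is not retried on every run. A rough SQL sketch of that selection
(table and enum names below are illustrative assumptions; the real filters
are the not_exists_url() and no_url_task_error() helpers used in the CTE):

    SELECT u.id
    FROM urls u
    JOIN url_compressed_html h ON h.url_id = u.id           -- table name assumed
    WHERE NOT EXISTS (
            SELECT 1 FROM auto_record_type_suggestions s    -- table name assumed
            WHERE s.url_id = u.id)
      AND NOT EXISTS (
            SELECT 1 FROM url_task_errors e                 -- table name assumed
            WHERE e.url_id = u.id
              AND e.task_type = 'Record Type');             -- enum label assumed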
--- .../tasks/url/operators/record_type/core.py | 57 +++++++++++---- .../operators/record_type/queries/__init__.py | 0 .../url/operators/record_type/queries/cte.py | 31 ++++++++ .../url/operators/record_type/queries/get.py | 36 ++++++++++ .../operators/record_type/queries/prereq.py | 18 +++++ src/db/client/async_.py | 71 +------------------ .../url/impl/test_url_record_type_task.py | 7 ++ 7 files changed, 139 insertions(+), 81 deletions(-) create mode 100644 src/core/tasks/url/operators/record_type/queries/__init__.py create mode 100644 src/core/tasks/url/operators/record_type/queries/cte.py create mode 100644 src/core/tasks/url/operators/record_type/queries/get.py create mode 100644 src/core/tasks/url/operators/record_type/queries/prereq.py diff --git a/src/core/tasks/url/operators/record_type/core.py b/src/core/tasks/url/operators/record_type/core.py index 8e31fa8d..9f63a6a5 100644 --- a/src/core/tasks/url/operators/record_type/core.py +++ b/src/core/tasks/url/operators/record_type/core.py @@ -1,9 +1,13 @@ from src.core.enums import RecordType from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier +from src.core.tasks.url.operators.record_type.queries.get import GetRecordTypeTaskURLsQueryBuilder +from src.core.tasks.url.operators.record_type.queries.prereq import RecordTypeTaskPrerequisiteQueryBuilder from src.core.tasks.url.operators.record_type.tdo import URLRecordTypeTDO from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.with_html import URLWithHTML from src.db.enums import TaskType +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -18,18 +22,22 @@ def __init__( self.classifier = classifier @property - def task_type(self): + def task_type(self) -> TaskType: return TaskType.RECORD_TYPE - async def meets_task_prerequisites(self): - return await self.adb_client.has_urls_with_html_data_and_without_auto_record_type_suggestion() + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + RecordTypeTaskPrerequisiteQueryBuilder() + ) async def get_tdos(self) -> list[URLRecordTypeTDO]: - urls_with_html = await self.adb_client.get_urls_with_html_data_and_without_auto_record_type_suggestion() + urls_with_html: list[URLWithHTML] = await self.run_query_builder( + GetRecordTypeTaskURLsQueryBuilder() + ) tdos = [URLRecordTypeTDO(url_with_html=url_with_html) for url_with_html in urls_with_html] return tdos - async def inner_task_logic(self): + async def inner_task_logic(self) -> None: # Get pending urls from Source Collector # with HTML data and without Record Type Metadata tdos = await self.get_tdos() @@ -41,7 +49,10 @@ async def inner_task_logic(self): await self.put_results_into_database(success_subset) await self.update_errors_in_database(error_subset) - async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): + async def update_errors_in_database( + self, + tdos: list[URLRecordTypeTDO] + ) -> None: task_errors: list[URLTaskErrorSmall] = [] for tdo in tdos: error_info = URLTaskErrorSmall( @@ -51,20 +62,42 @@ async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): task_errors.append(error_info) await self.add_task_errors(task_errors) - async def put_results_into_database(self, tdos: list[URLRecordTypeTDO]): - suggestions = [] + async def put_results_into_database( + self, 
+ tdos: list[URLRecordTypeTDO] + ) -> None: + url_and_record_type_list = [] for tdo in tdos: url_id = tdo.url_with_html.url_id record_type = tdo.record_type - suggestions.append((url_id, record_type)) - await self.adb_client.add_auto_record_type_suggestions(suggestions) + url_and_record_type_list.append((url_id, record_type)) + # Add to database + suggestions: list[AutoRecordTypeSuggestion] = [] + for url_id, record_type in url_and_record_type_list: + suggestion = AutoRecordTypeSuggestion( + url_id=url_id, + record_type=record_type.value + ) + suggestions.append(suggestion) + await self.adb_client.add_all(suggestions) - async def separate_success_and_error_subsets(self, tdos: list[URLRecordTypeTDO]): + @staticmethod + async def separate_success_and_error_subsets( + tdos: list[URLRecordTypeTDO] + ) -> tuple[list[URLRecordTypeTDO], list[URLRecordTypeTDO]]: success_subset = [tdo for tdo in tdos if not tdo.is_errored()] error_subset = [tdo for tdo in tdos if tdo.is_errored()] return success_subset, error_subset - async def get_ml_classifications(self, tdos: list[URLRecordTypeTDO]): + async def get_ml_classifications( + self, + tdos: list[URLRecordTypeTDO] + ) -> None: + """ + Modifies: + - tdo.record_type + - tdo.error + """ for tdo in tdos: try: record_type_str = await self.classifier.classify_url(tdo.url_with_html.html_infos) diff --git a/src/core/tasks/url/operators/record_type/queries/__init__.py b/src/core/tasks/url/operators/record_type/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/record_type/queries/cte.py b/src/core/tasks/url/operators/record_type/queries/cte.py new file mode 100644 index 00000000..22d3db10 --- /dev/null +++ b/src/core/tasks/url/operators/record_type/queries/cte.py @@ -0,0 +1,31 @@ +from sqlalchemy import select, CTE, Column + +from src.db.enums import TaskType +from src.db.helpers.query import not_exists_url, no_url_task_error +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion + + +class RecordTypeTaskPrerequisiteCTEContainer: + + def __init__(self): + self.cte: CTE = ( + select( + URL.id + ) + .join( + URLCompressedHTML + ) + .where( + not_exists_url(AutoRecordTypeSuggestion), + no_url_task_error( + TaskType.RECORD_TYPE + ) + ) + .cte("record_type_task_prerequisite") + ) + + @property + def url_id(self) -> Column[int]: + return self.cte.columns.id \ No newline at end of file diff --git a/src/core/tasks/url/operators/record_type/queries/get.py b/src/core/tasks/url/operators/record_type/queries/get.py new file mode 100644 index 00000000..c5b7e7e0 --- /dev/null +++ b/src/core/tasks/url/operators/record_type/queries/get.py @@ -0,0 +1,36 @@ +from typing import Sequence + +from sqlalchemy import select, Row +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.core.tasks.url.operators.record_type.queries.cte import RecordTypeTaskPrerequisiteCTEContainer +from src.db.dto_converter import DTOConverter +from src.db.dtos.url.with_html import URLWithHTML +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetRecordTypeTaskURLsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[URLWithHTML]: + cte = RecordTypeTaskPrerequisiteCTEContainer() + query = ( + select( + URL + ) + .join( + cte.cte, + 
cte.url_id == URL.id + ) + .options( + selectinload(URL.html_content) + ) + .limit(100) + .order_by(URL.id) + ) + urls: Sequence[Row[URL]] = await self.sh.scalars( + session=session, + query=query + ) + return DTOConverter.url_list_to_url_with_html_list(urls) diff --git a/src/core/tasks/url/operators/record_type/queries/prereq.py b/src/core/tasks/url/operators/record_type/queries/prereq.py new file mode 100644 index 00000000..32b70adb --- /dev/null +++ b/src/core/tasks/url/operators/record_type/queries/prereq.py @@ -0,0 +1,18 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.record_type.queries.cte import RecordTypeTaskPrerequisiteCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class RecordTypeTaskPrerequisiteQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + container = RecordTypeTaskPrerequisiteCTEContainer() + query = ( + select( + container.url_id + ) + ) + return await self.sh.results_exist(session=session, query=query) + diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 10ee5b6c..913a0a35 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -1,10 +1,9 @@ from datetime import datetime from functools import wraps -from typing import Optional, Type, Any, List, Sequence +from typing import Optional, Any, List -from sqlalchemy import select, func, Select, and_, update, Row, text, Engine +from sqlalchemy import select, func, Select, and_, update, Row, text from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker, AsyncEngine -from sqlalchemy.orm import selectinload from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder @@ -50,7 +49,6 @@ from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.raw_html import RawHTMLInfo @@ -286,18 +284,6 @@ async def add_user_relevant_suggestion( # region record_type - @session_manager - async def add_auto_record_type_suggestions( - self, - session: AsyncSession, - url_and_record_type_list: list[tuple[int, RecordType]] - ): - for url_id, record_type in url_and_record_type_list: - suggestion = AutoRecordTypeSuggestion( - url_id=url_id, - record_type=record_type.value - ) - session.add(suggestion) async def add_auto_record_type_suggestion( self, @@ -381,59 +367,6 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL async def get_non_errored_urls_without_html_data(self) -> list[URLInfo]: return await self.run_query_builder(GetPendingURLsWithoutHTMLDataQueryBuilder()) - async def get_urls_with_html_data_and_without_models( - self, - session: AsyncSession, - model: Type[Base] - ): - statement = (select(URL) - .options(selectinload(URL.html_content)) - .where(URL.status == URLStatus.OK.value)) - statement = self.statement_composer.exclude_urls_with_extant_model( - statement=statement, - model=model - ) - statement = statement.limit(100).order_by(URL.id) - raw_result = await session.execute(statement) - urls: Sequence[Row[URL]] = raw_result.unique().scalars().all() - final_results = 
DTOConverter.url_list_to_url_with_html_list(urls) - - return final_results - - @session_manager - async def get_urls_with_html_data_and_without_auto_record_type_suggestion( - self, - session: AsyncSession - ): - return await self.get_urls_with_html_data_and_without_models( - session=session, - model=AutoRecordTypeSuggestion - ) - - async def has_urls_with_html_data_and_without_models( - self, - session: AsyncSession, - model: Type[Base] - ) -> bool: - statement = (select(URL) - .join(URLCompressedHTML) - .where(URL.status == URLStatus.OK.value)) - # Exclude URLs with auto suggested record types - statement = self.statement_composer.exclude_urls_with_extant_model( - statement=statement, - model=model - ) - statement = statement.limit(1) - scalar_result = await session.scalars(statement) - return bool(scalar_result.first()) - - @session_manager - async def has_urls_with_html_data_and_without_auto_record_type_suggestion(self, session: AsyncSession) -> bool: - return await self.has_urls_with_html_data_and_without_models( - session=session, - model=AutoRecordTypeSuggestion - ) - @session_manager async def one_or_none_model( self, diff --git a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py index 1373f3fa..57f41ded 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py @@ -7,6 +7,7 @@ from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.enums import RecordType +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from tests.helpers.data_creator.core import DBDataCreator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.deepseek import DeepSeekRecordClassifier @@ -52,3 +53,9 @@ async def test_url_record_type_task(db_data_creator: DBDataCreator): for suggestion in suggestions: assert suggestion.record_type == RecordType.ACCIDENT_REPORTS.value + # Get URL Error Tasks + url_error_tasks: list[URLTaskError] = await db_data_creator.adb_client.get_all(URLTaskError) + assert len(url_error_tasks) == 1 + url_error_task = url_error_tasks[0] + assert url_error_task.url_id == url_ids[1] + assert url_error_task.task_type == TaskType.RECORD_TYPE \ No newline at end of file From bed608847c0baea00e4d56fd5f7b8fc8706856da Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 23 Nov 2025 18:52:25 -0500 Subject: [PATCH 62/84] Add script for deleting hanging app links --- ...dfad3275_eliminate_hanging_data_sources.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py diff --git a/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py b/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py new file mode 100644 index 00000000..65982106 --- /dev/null +++ b/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py @@ -0,0 +1,32 @@ +"""Eliminate hanging data sources + +Revision ID: 1bb2dfad3275 +Revises: c4edeb795134 +Create Date: 2025-11-23 18:50:55.557428 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '1bb2dfad3275' +down_revision: Union[str, None] = 'c4edeb795134' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + DELETE FROM ds_app_link_data_source ds + USING ds_app_link_meta_url mu, + flag_url_validated fuv + WHERE ds.url_id = mu.url_id + AND ds.url_id = fuv.url_id; + """) + + +def downgrade() -> None: + pass From a0d0e5ebc61dda4ae372f39cbd0ac1f8fb4693ad Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 09:35:41 -0500 Subject: [PATCH 63/84] Begin draft --- ...1923-5ac9d50b91c5_add_integrity_monitor.py | 98 +++++++++++++++++++ .../scheduled/impl/integrity/__init__.py | 0 .../scheduled/impl/integrity/operator.py | 9 ++ src/db/enums.py | 1 + src/db/models/helpers.py | 3 + src/db/models/mixins.py | 9 +- src/db/models/views/dependent_locations.py | 3 +- src/db/models/views/integrity/__init__.py | 0 .../integrity/incomplete_data_sources.py | 35 +++++++ .../views/integrity/incomplete_meta_urls.py | 34 +++++++ .../non_federal_agencies_no_location.py | 25 +++++ .../url_both_data_source_and_meta_url.py | 21 ++++ src/db/models/views/meta_url.py | 9 +- src/db/models/views/unvalidated_url.py | 11 +-- src/db/models/views/url_anno_count.py | 9 +- src/db/models/views/url_annotations_flags.py | 9 +- src/db/models/views/url_status/core.py | 9 +- .../scheduled/impl/integrity/__init__.py | 0 .../scheduled/impl/integrity/conftest.py | 11 +++ .../integrity/test_incomplete_data_sources.py | 10 ++ .../integrity/test_incomplete_meta_urls.py | 10 ++ .../test_non_federal_agencies_no_location.py | 26 +++++ .../test_url_both_data_source_and_meta_url.py | 10 ++ 23 files changed, 314 insertions(+), 38 deletions(-) create mode 100644 alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py create mode 100644 src/core/tasks/scheduled/impl/integrity/__init__.py create mode 100644 src/core/tasks/scheduled/impl/integrity/operator.py create mode 100644 src/db/models/views/integrity/__init__.py create mode 100644 src/db/models/views/integrity/incomplete_data_sources.py create mode 100644 src/db/models/views/integrity/incomplete_meta_urls.py create mode 100644 src/db/models/views/integrity/non_federal_agencies_no_location.py create mode 100644 src/db/models/views/integrity/url_both_data_source_and_meta_url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py new file mode 100644 index 00000000..75f41186 --- /dev/null +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -0,0 +1,98 @@ +"""Add integrity monitor + +Revision ID: 5ac9d50b91c5 +Revises: 1bb2dfad3275 +Create Date: 2025-11-23 19:23:45.487445 + +""" +from typing import Sequence, Union + +from alembic import op + +from src.util.alembic_helpers import add_enum_value + 
+# revision identifiers, used by Alembic. +revision: str = '5ac9d50b91c5' +down_revision: Union[str, None] = '1bb2dfad3275' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + + + +def upgrade() -> None: + _create_integrity_task() + _create_incomplete_data_sources_view() + _create_incomplete_meta_urls_view() + _create_url_both_data_source_and_meta_url_view() + _create_non_federal_agencies_no_location_view() + +def _create_non_federal_agencies_no_location_view(): + op.execute(""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.name + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """) + + +def _create_url_both_data_source_and_meta_url_view(): + op.execute(""" + create view integrity__url_both_data_source_and_meta_url_view + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id + """) + + +def _create_incomplete_meta_urls_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or urt.url_id is null + """) + + +def _create_incomplete_data_sources_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type + + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + """) + + +def _create_integrity_task(): + add_enum_value( + enum_name="task_type", + enum_value="Integrity Monitor", + ) + + +def downgrade() -> None: + pass diff --git a/src/core/tasks/scheduled/impl/integrity/__init__.py b/src/core/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py new file mode 100644 index 00000000..c9796c6d --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -0,0 +1,9 @@ +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase + + +class IntegrityMonitorTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): + pass \ No newline at end of file diff --git a/src/db/enums.py b/src/db/enums.py index 034ec0b8..65f446c5 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -63,6 +63,7 @@ class TaskType(PyEnum): TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" UPDATE_URL_STATUS = "Update URL Status" + INTEGRITY_MONITOR = "Integrity Monitor" # Sync Tasks SYNC_AGENCIES_ADD = "Sync Agencies Add" diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 592973a6..e1c77978 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -46,9 +46,12 @@ def location_id_column() -> Column[int]: CURRENT_TIME_SERVER_DEFAULT = func.now() +VIEW_ARG = {"info": "view"} + 
def url_id_primary_key_constraint() -> PrimaryKeyConstraint: return PrimaryKeyConstraint('url_id') + def county_column(nullable: bool = False) -> Column[int]: return Column( Integer(), diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 417eae40..7a7d6460 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -3,7 +3,8 @@ from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event from src.db.models.exceptions import WriteToViewError -from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT +from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT, url_id_primary_key_constraint, \ + VIEW_ARG class URLDependentMixin: @@ -90,3 +91,9 @@ def __declare_last__(cls) -> None: @staticmethod def _block_write(mapper, connection, target): raise WriteToViewError(f"{type(target).__name__} is a read-only view.") + +class URLDependentViewMixin(URLDependentMixin, ViewMixin): + __table_args__ = ( + url_id_primary_key_constraint(), + VIEW_ARG + ) \ No newline at end of file diff --git a/src/db/models/views/dependent_locations.py b/src/db/models/views/dependent_locations.py index 95f3db98..425e25a6 100644 --- a/src/db/models/views/dependent_locations.py +++ b/src/db/models/views/dependent_locations.py @@ -31,6 +31,7 @@ """ from sqlalchemy import Column, Integer, ForeignKey +from src.db.models.helpers import VIEW_ARG from src.db.models.mixins import ViewMixin from src.db.models.templates_.base import Base @@ -39,7 +40,7 @@ class DependentLocationView(Base, ViewMixin): __tablename__ = "dependent_locations" __table_args__ = ( - {"info": "view"} + VIEW_ARG, ) parent_location_id = Column( diff --git a/src/db/models/views/integrity/__init__.py b/src/db/models/views/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py new file mode 100644 index 00000000..8444b2e6 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -0,0 +1,35 @@ +""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type + + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteDataSource( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_data_sources_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_record_type = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py new file mode 100644 index 00000000..4c7ec01d --- /dev/null +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -0,0 +1,34 @@ +""" + create view integrity__incomplete_data_sources_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type + from ds_app_link_meta_url mu + 
left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join url_record_type urt on urt.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or urt.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_meta_urls_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + + diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py new file mode 100644 index 00000000..e45882fe --- /dev/null +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -0,0 +1,25 @@ +""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.name + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """ +from sqlalchemy import String, Column + +from src.db.models.helpers import VIEW_ARG +from src.db.models.mixins import ViewMixin +from src.db.models.templates_.base import Base + +class IntegrityNonFederalAgenciesNoLocation( + Base, + ViewMixin +): + __tablename__ = "integrity__non_federal_agencies_no_location_view" + __table_args__ = ( + VIEW_ARG, + ) + + name = Column(String) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..eac08d03 --- /dev/null +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -0,0 +1,21 @@ +""" + create view integrity__url_both_data_source_and_meta_url_view + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id +""" + +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class IntegrityURLBothDataSourceAndMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__url_both_data_source_and_meta_url_view" + + diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py index 20437075..a2d64ca9 100644 --- a/src/db/models/views/meta_url.py +++ b/src/db/models/views/meta_url.py @@ -9,18 +9,13 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class MetaURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): __tablename__ = "meta_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py index bcfa9293..baf5f071 100644 --- a/src/db/models/views/unvalidated_url.py +++ b/src/db/models/views/unvalidated_url.py @@ -11,18 +11,13 @@ """ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class 
UnvalidatedURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): - __tablename__ = "unvalidated_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file + __tablename__ = "unvalidated_url_view" \ No newline at end of file diff --git a/src/db/models/views/url_anno_count.py b/src/db/models/views/url_anno_count.py index 232f0d21..2e910afb 100644 --- a/src/db/models/views/url_anno_count.py +++ b/src/db/models/views/url_anno_count.py @@ -98,21 +98,16 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Integer from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationCount( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_count_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) auto_agency_count = Column(Integer, nullable=False) auto_location_count = Column(Integer, nullable=False) diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 47250d1b..c133fbfc 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -24,20 +24,15 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Boolean -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationFlagsView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_flags" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) has_auto_record_type_suggestion = Column(Boolean, nullable=False) has_auto_relevant_suggestion = Column(Boolean, nullable=False) diff --git a/src/db/models/views/url_status/core.py b/src/db/models/views/url_status/core.py index 77a01139..be771fe5 100644 --- a/src/db/models/views/url_status/core.py +++ b/src/db/models/views/url_status/core.py @@ -59,19 +59,14 @@ from sqlalchemy import String, Column from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLStatusMatView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_status_mat_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) status = Column(String) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py b/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py new file mode 100644 index 00000000..a3d3cd22 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -0,0 +1,11 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: 
AsyncDatabaseClient +) -> IntegrityMonitorTaskOperator: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py new file mode 100644 index 00000000..41ea653d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -0,0 +1,26 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass + + # Check does not meet prerequisites + + # Add federal agency + + # Check does not meet prerequisites + + # Add non-federal agency + + # Check meets prerequisites + + # Run task and confirm produces error + + # Add location to non-federal agency + + # Check no longer meets task prerequisites diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..1553c409 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -0,0 +1,10 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator +): + pass From 5ed52f37193c872fcf1d8a960c398acec386164f Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 24 Nov 2025 12:00:23 -0500 Subject: [PATCH 64/84] Continue draft --- .../impl/flag/url_validated/sqlalchemy.py | 3 +- .../tasks/scheduled/impl/integrity/helpers.py | 12 +++++ .../integrity/test_incomplete_data_sources.py | 54 +++++++++++++++++++ .../integrity/test_incomplete_meta_urls.py | 22 ++++++++ .../test_non_federal_agencies_no_location.py | 9 ++++ .../test_url_both_data_source_and_meta_url.py | 35 ++++++++++++ 6 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py index 
97abf056..081441d8 100644 --- a/src/db/models/impl/flag/url_validated/sqlalchemy.py +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -1,4 +1,5 @@ from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType @@ -19,7 +20,7 @@ class FlagURLValidated( ), ) - type = enum_column( + type: Mapped[URLType] = enum_column( enum_type=URLType, name="url_type", ) diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py new file mode 100644 index 00000000..60177c3f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -0,0 +1,12 @@ +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.core.tasks.url.enums import TaskOperatorOutcome + + +async def run_task_and_confirm_error( + operator: IntegrityMonitorTaskOperator, + expected_error: str +) -> None: + run_info: TaskOperatorRunInfo = await operator.run_task() + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert run_info.error_message == expected_error \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py index 1553c409..d716a7da 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -1,6 +1,14 @@ import pytest +from src.core.enums import RecordType from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +16,49 @@ async def test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source but without record type or validated flag + ## URL + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False + ) + url_id: int = await operator.adb_client.add(url, return_id=True) + + ## App Link + ds_app_link = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add validated URL flag + flag = FlagURLValidated( + url_id=url_id, + type=URLType.DATA_SOURCE + ) + await operator.adb_client.add(flag) + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Add record type to data source + record_type = URLRecordType( + url_id=url_id, + 
record_type=RecordType.INCARCERATION_RECORDS + ) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py index 1553c409..def7cf9f 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -1,6 +1,7 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +9,24 @@ async def test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add Meta URL without linking an agency to it + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Add agency to Meta URL + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py index 41ea653d..e77c0b31 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -1,6 +1,7 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -10,17 +11,25 @@ async def test_core( pass # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() # Add federal agency # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() # Add non-federal agency # Check meets prerequisites + assert await operator.meets_task_prerequisites() # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) # Add location to non-federal agency # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py index 1553c409..cce6269e 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -1,6 +1,9 @@ import pytest +from sqlalchemy import delete from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio @@ -8,3 +11,35 @@ async def 
test_core( operator: IntegrityMonitorTaskOperator ): pass + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add same URL as Meta URL + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_error="" + ) + + # Delete data source link + statement = ( + delete( + DSAppLinkDataSource + ).where( + DSAppLinkDataSource.url_id == url_id + ) + ) + await operator.adb_client.execute(statement) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() From 5ffda4768b66df9a84247c716c52f11cfc421813 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 17:46:02 -0500 Subject: [PATCH 65/84] Finish integrity monitor draft --- ENV.md | 47 +++++++------- ...1923-5ac9d50b91c5_add_integrity_monitor.py | 35 ++++++----- .../scheduled/impl/integrity/exceptions.py | 4 ++ .../scheduled/impl/integrity/operator.py | 23 ++++++- .../impl/integrity/queries/__init__.py | 0 .../scheduled/impl/integrity/queries/cte.py | 61 +++++++++++++++++++ .../scheduled/impl/integrity/queries/get.py | 20 ++++++ .../impl/integrity/queries/prereq.py | 16 +++++ src/core/tasks/scheduled/loader.py | 8 +++ .../integrity/incomplete_data_sources.py | 15 +++-- .../views/integrity/incomplete_meta_urls.py | 14 +++-- .../non_federal_agencies_no_location.py | 10 +-- .../url_both_data_source_and_meta_url.py | 2 +- tests/automated/integration/conftest.py | 15 +++++ .../scheduled/impl/integrity/conftest.py | 4 +- .../tasks/scheduled/impl/integrity/helpers.py | 4 +- .../integrity/test_incomplete_data_sources.py | 36 +++++------ .../integrity/test_incomplete_meta_urls.py | 30 +++++++-- .../test_non_federal_agencies_no_location.py | 28 +++++++-- .../test_url_both_data_source_and_meta_url.py | 27 +++++--- .../tasks/scheduled/loader/test_happy_path.py | 2 +- 21 files changed, 304 insertions(+), 97 deletions(-) create mode 100644 src/core/tasks/scheduled/impl/integrity/exceptions.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/__init__.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/cte.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/get.py create mode 100644 src/core/tasks/scheduled/impl/integrity/queries/prereq.py diff --git a/ENV.md b/ENV.md index a4ae17a7..386dbdae 100644 --- a/ENV.md +++ b/ENV.md @@ -57,29 +57,30 @@ Note that some tasks/subtasks are themselves enabled by other tasks. ### Scheduled Task Flags -| Flag | Description | -|----------------------------------------|-------------------------------------------------------------------------------| -| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | -| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | -| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | -| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | -| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | -| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | -| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). 
| -| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | -| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | -| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | -| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | -| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| -| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| -| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| -| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| -| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| -| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| -| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| Flag | Description | +|--------------------------------------------|-------------------------------------------------------------------------------| +| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | +| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | +| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | +| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | +| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | +| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | +| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | +| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | +| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | +| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App| +| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App| +| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App| +| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App| +| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App| +| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App| +| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App| +| `INTEGRITY_MONITOR_TASK_FLAG` | Runs integrity checks. 
| ### URL Task Flags diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py index 75f41186..1f44dd25 100644 --- a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -17,9 +17,6 @@ branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None - - - def upgrade() -> None: _create_integrity_task() _create_incomplete_data_sources_view() @@ -31,7 +28,7 @@ def _create_non_federal_agencies_no_location_view(): op.execute(""" create view integrity__non_federal_agencies_no_location_view as select - ag.name + ag.id as agency_id from agencies ag left join link_agencies__locations link on ag.id = link.agency_id where ag.jurisdiction_type != 'federal' @@ -41,7 +38,7 @@ def _create_non_federal_agencies_no_location_view(): def _create_url_both_data_source_and_meta_url_view(): op.execute(""" - create view integrity__url_both_data_source_and_meta_url_view + create view integrity__url_both_data_source_and_meta_url_view as select ds.url_id from @@ -53,18 +50,19 @@ def _create_url_both_data_source_and_meta_url_view(): def _create_incomplete_meta_urls_view(): op.execute(""" - create view integrity__incomplete_data_sources_view as + create view integrity__incomplete_meta_urls_view as select mu.url_id, fuv.url_id is not null as has_validated_flag, - fuv.type as validated_type + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id where fuv.url_id is null - or fuv.type != 'meta url' - or urt.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null """) @@ -72,18 +70,20 @@ def _create_incomplete_data_sources_view(): op.execute(""" create view integrity__incomplete_data_sources_view as select - mu.url_id, + ds.url_id, fuv.url_id is not null as has_validated_flag, fuv.type as validated_type, - urt.url_id is not null as has_record_type - - from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id where fuv.url_id is null or fuv.type != 'data source' or urt.url_id is null + or lau.url_id is null """) @@ -93,6 +93,5 @@ def _create_integrity_task(): enum_value="Integrity Monitor", ) - def downgrade() -> None: pass diff --git a/src/core/tasks/scheduled/impl/integrity/exceptions.py b/src/core/tasks/scheduled/impl/integrity/exceptions.py new file mode 100644 index 00000000..3e9f797e --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/exceptions.py @@ -0,0 +1,4 @@ + + +class IntegrityMonitorTaskException(Exception): + pass \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py index c9796c6d..42ca43bb 100644 --- a/src/core/tasks/scheduled/impl/integrity/operator.py +++ 
b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -1,9 +1,30 @@ from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.integrity.exceptions import IntegrityMonitorTaskException +from src.core.tasks.scheduled.impl.integrity.queries.get import GetIntegrityTaskDataQueryBuilder +from src.core.tasks.scheduled.impl.integrity.queries.prereq import GetIntegrityTaskPrerequisitesQueryBuilder from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType class IntegrityMonitorTaskOperator( ScheduledTaskOperatorBase, HasPrerequisitesMixin ): - pass \ No newline at end of file + + @property + def task_type(self) -> TaskType: + return TaskType.INTEGRITY_MONITOR + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + query_builder=GetIntegrityTaskPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + failing_views: list[str] = await self.run_query_builder( + query_builder=GetIntegrityTaskDataQueryBuilder() + ) + raise IntegrityMonitorTaskException( + f"Integrity Monitor Task failed for the following views {failing_views}", + ) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/__init__.py b/src/core/tasks/scheduled/impl/integrity/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/queries/cte.py b/src/core/tasks/scheduled/impl/integrity/queries/cte.py new file mode 100644 index 00000000..dc894ea7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/cte.py @@ -0,0 +1,61 @@ +from sqlalchemy import select, literal, Exists, Label, or_ + +from src.db.models.templates_.base import Base +from src.db.models.views.integrity.incomplete_data_sources import IntegrityIncompleteDataSource +from src.db.models.views.integrity.incomplete_meta_urls import IntegrityIncompleteMetaURL +from src.db.models.views.integrity.non_federal_agencies_no_location import IntegrityNonFederalAgenciesNoLocation +from src.db.models.views.integrity.url_both_data_source_and_meta_url import IntegrityURLBothDataSourceAndMetaURL + + +def any_row_exists( + model: type[Base] +) -> Exists: + return ( + select( + literal(1) + ) + .select_from( + model + ) + .exists() + ) + +class IntegrityTaskCTEContainer: + + def __init__( + self, + ): + self.models: list[type[Base]] = [ + IntegrityURLBothDataSourceAndMetaURL, + IntegrityNonFederalAgenciesNoLocation, + IntegrityIncompleteMetaURL, + IntegrityIncompleteDataSource, + ] + + expressions: list[Label[bool]] = [ + any_row_exists(model) + .label(model.__tablename__) + for model in self.models + ] + + self.cte = ( + select( + *expressions + ) + .cte( + name="integrity_task_cte", + ) + ) + + @property + def any_rows_exist_query(self) -> select: + expression = [ + getattr(self.cte.c, model.__tablename__) + for model in self.models + ] + return select(or_(*expression)) + + @property + def select_all_columns_query(self) -> select: + return select(self.cte) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/get.py b/src/core/tasks/scheduled/impl/integrity/queries/get.py new file mode 100644 index 00000000..b8632fa2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/get.py @@ -0,0 +1,20 @@ +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + 
+class GetIntegrityTaskDataQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[str]: + cte = IntegrityTaskCTEContainer() + mapping: RowMapping = await self.sh.mapping( + session=session, + query=cte.select_all_columns_query + ) + return [ + model.__tablename__ + for model in cte.models + if mapping[model.__tablename__] + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/queries/prereq.py b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py new file mode 100644 index 00000000..12a6fa33 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py @@ -0,0 +1,16 @@ +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> Any: + cte = IntegrityTaskCTEContainer() + return await self.sh.scalar( + session=session, + query=cte.any_rows_exist_query + ) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 394a60ce..116bf56d 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -6,6 +6,7 @@ from src.core.tasks.scheduled.impl.delete_logs.operator import DeleteOldLogsTaskOperator from src.core.tasks.scheduled.impl.delete_stale_screenshots.operator import DeleteStaleScreenshotsTaskOperator from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.mark_never_completed.operator import MarkTaskNeverCompletedOperator @@ -127,6 +128,13 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") ), + ScheduledTaskEntry( + operator=IntegrityMonitorTaskOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.DAILY.value, + enabled=self.setup_flag("INTEGRITY_MONITOR_TASK_FLAG") + ), # Sync ## Adds ### Agency diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py index 8444b2e6..06efa3b4 100644 --- a/src/db/models/views/integrity/incomplete_data_sources.py +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -1,18 +1,20 @@ """ create view integrity__incomplete_data_sources_view as select - mu.url_id, + ds.url_id, fuv.url_id is not null as has_validated_flag, fuv.type as validated_type, - urt.url_id is not null as has_record_type - - from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id where fuv.url_id is null or fuv.type != 'data source' or urt.url_id is null + or lau.url_id is null """ from 
sqlalchemy import Column, Boolean @@ -33,3 +35,4 @@ class IntegrityIncompleteDataSource( name="url_type", ) has_record_type = Column(Boolean) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py index 4c7ec01d..a837c156 100644 --- a/src/db/models/views/integrity/incomplete_meta_urls.py +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -1,16 +1,17 @@ """ - create view integrity__incomplete_data_sources_view as + create view integrity__incomplete_meta_urls_view as select mu.url_id, fuv.url_id is not null as has_validated_flag, - fuv.type as validated_type + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag from ds_app_link_meta_url mu - left join flag_url_validated fuv on fuv.url_id = mu.url_id - left join url_record_type urt on urt.url_id = mu.url_id + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id where fuv.url_id is null - or fuv.type != 'meta url' - or urt.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null """ from sqlalchemy import Column, Boolean @@ -30,5 +31,6 @@ class IntegrityIncompleteMetaURL( enum_type=URLType, name="url_type", ) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py index e45882fe..73e547b9 100644 --- a/src/db/models/views/integrity/non_federal_agencies_no_location.py +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -1,24 +1,26 @@ """ create view integrity__non_federal_agencies_no_location_view as select - ag.name + ag.id as agency_id from agencies ag left join link_agencies__locations link on ag.id = link.agency_id where ag.jurisdiction_type != 'federal' and link.location_id is null """ -from sqlalchemy import String, Column +from sqlalchemy import String, Column, PrimaryKeyConstraint from src.db.models.helpers import VIEW_ARG -from src.db.models.mixins import ViewMixin +from src.db.models.mixins import ViewMixin, AgencyDependentMixin from src.db.models.templates_.base import Base class IntegrityNonFederalAgenciesNoLocation( Base, - ViewMixin + ViewMixin, + AgencyDependentMixin, ): __tablename__ = "integrity__non_federal_agencies_no_location_view" __table_args__ = ( + PrimaryKeyConstraint("agency_id"), VIEW_ARG, ) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py index eac08d03..0de88314 100644 --- a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -1,5 +1,5 @@ """ - create view integrity__url_both_data_source_and_meta_url_view + create view integrity__url_both_data_source_and_meta_url_view as select ds.url_id from diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 6e2be0f0..4c6a76d0 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -6,6 +6,7 @@ from starlette.testclient import TestClient from src.api.main import app +from src.collectors.enums import URLStatus from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore from src.core.enums import RecordType @@ -14,6 +15,8 @@ from src.db.client.sync import DatabaseClient from src.db.dtos.url.mapping_.simple import 
SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL from src.security.dtos.access_info import AccessInfo from src.security.enums import Permissions from src.security.manager import get_access_info @@ -218,6 +221,18 @@ async def test_url_data_source_id( ) return url_id +@pytest_asyncio.fixture +async def test_url_id( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + @pytest_asyncio.fixture async def test_url_data_source_mapping( db_data_creator: DBDataCreator, diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py index a3d3cd22..9106f1b7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -8,4 +8,6 @@ def operator( adb_client_test: AsyncDatabaseClient ) -> IntegrityMonitorTaskOperator: - raise NotImplementedError \ No newline at end of file + return IntegrityMonitorTaskOperator( + adb_client=adb_client_test, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py index 60177c3f..2b617ca2 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -5,8 +5,8 @@ async def run_task_and_confirm_error( operator: IntegrityMonitorTaskOperator, - expected_error: str + expected_view: str ) -> None: run_info: TaskOperatorRunInfo = await operator.run_task() assert run_info.outcome == TaskOperatorOutcome.ERROR - assert run_info.error_message == expected_error \ No newline at end of file + assert expected_view in run_info.message \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py index d716a7da..3381d7f0 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -4,8 +4,7 @@ from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @@ -13,25 +12,17 @@ @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_url_id: int, + test_agency_id: int ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add URL as data 
source but without record type or validated flag - ## URL - url = URL( - url="example.com", - source=URLSource.COLLECTOR, - trailing_slash=False - ) - url_id: int = await operator.adb_client.add(url, return_id=True) - ## App Link ds_app_link = DSAppLinkDataSource( - url_id=url_id, + url_id=test_url_id, ds_data_source_id=1 ) await operator.adb_client.add(ds_app_link) @@ -41,7 +32,7 @@ async def test_core( # Add validated URL flag flag = FlagURLValidated( - url_id=url_id, + url_id=test_url_id, type=URLType.DATA_SOURCE ) await operator.adb_client.add(flag) @@ -51,14 +42,25 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__incomplete_data_sources_view" ) # Add record type to data source record_type = URLRecordType( - url_id=url_id, + url_id=test_url_id, record_type=RecordType.INCARCERATION_RECORDS ) + await operator.adb_client.add(record_type) + + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add agency to data source + agency = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(agency) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py index def7cf9f..9c3a147d 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -1,19 +1,36 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_agency_id: int, + test_url_id: int ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add Meta URL without linking an agency to it + ## Validated Flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.META_URL + ) + await operator.adb_client.add(flag) + + ## App Link + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -21,10 +38,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__incomplete_meta_urls_view" ) # Add agency to Meta URL + link = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(link) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py 
b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py index e77c0b31..ee189f64 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -1,24 +1,39 @@ import pytest from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + pittsburgh_locality: LocalityCreationInfo ): - pass - # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add federal agency + agency = Agency( + name="Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.FEDERAL + ) + await operator.adb_client.add(agency) # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() # Add non-federal agency + agency = Agency( + name="Non-Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.LOCAL + ) + agency_id: int =await operator.adb_client.add(agency, return_id=True) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -26,10 +41,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__non_federal_agencies_no_location_view" ) # Add location to non-federal agency + link = LinkAgencyLocation( + agency_id=agency_id, + location_id=pittsburgh_locality.location_id + ) + await operator.adb_client.add(link) # Check no longer meets task prerequisites assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py index cce6269e..fa36a269 100644 --- a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -3,24 +3,33 @@ from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error @pytest.mark.asyncio async def test_core( - operator: IntegrityMonitorTaskOperator + operator: IntegrityMonitorTaskOperator, + test_url_data_source_id: int ): - pass # Check does not meet prerequisites assert not await operator.meets_task_prerequisites() - # Add URL as data source - - # Check does not meet prerequisites - assert not await operator.meets_task_prerequisites() + # Add DS App Link + ds_app_link_ds = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=1 + ) + await 
operator.adb_client.add(ds_app_link_ds) # Add same URL as Meta URL + ## App Link + ds_app_link_mu = DSAppLinkMetaURL( + url_id=test_url_data_source_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link_mu) # Check meets prerequisites assert await operator.meets_task_prerequisites() @@ -28,15 +37,15 @@ async def test_core( # Run task and confirm produces error await run_task_and_confirm_error( operator=operator, - expected_error="" + expected_view="integrity__url_both_data_source_and_meta_url_view" ) # Delete data source link statement = ( delete( - DSAppLinkDataSource + DSAppLinkMetaURL ).where( - DSAppLinkDataSource.url_id == url_id + DSAppLinkMetaURL.url_id == test_url_data_source_id ) ) await operator.adb_client.execute(statement) diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index ae41bc30..4e5bb551 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 20 +NUMBER_OF_ENTRIES = 21 @pytest.mark.asyncio async def test_happy_path( From 09f7a770d648c5b1a302d3fab00b00b7f54af21d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 18:19:38 -0500 Subject: [PATCH 66/84] Migrate sync information into main README --- README.md | 67 +++++++++++++++++++ .../tasks/scheduled/impl/sync_to_ds/README.md | 65 ------------------ 2 files changed, 67 insertions(+), 65 deletions(-) delete mode 100644 src/core/tasks/scheduled/impl/sync_to_ds/README.md diff --git a/README.md b/README.md index 4fa95b40..56e8182d 100644 --- a/README.md +++ b/README.md @@ -157,3 +157,70 @@ These will *not* block any Pull request, but exist primarily as advisory comment Note that `python_checks.yml` will only function on pull requests made from within the repo, not from a forked repo. +# Syncing to Data Sources App + +The Source Manager (SM) is part of a two app system, with the other app being the Data Sources (DS) App. + + +## Add, Update, and Delete + +These are the core synchronization actions. + +In order to propagate changes to DS, we synchronize additions, updates, and deletions of the following entities: +- Agencies +- Data Sources +- Meta URLs + +Each action for each entity occurs through a separate task. At the moment, there are nine tasks total. + +Each task gathers requisite information from the SM database and sends a request to one of nine corresponding endpoints in the DS API. + +Each DS endpoint follows the following format: + +```text +/v3/sync/{entity}/{action} +``` + +Synchronizations are designed to occur on an hourly basis. + +Here is a high-level description of how each action works: + +### Add + +Adds the given entities to DS. + +These are denoted with the `/{entity}/add` path in the DS API. + +When an entity is added, it returns a unique DS ID that is mapped to the internal SM database ID via the DS app link tables. + +For an entity to be added, it must meet preconditions which are distinct for each entity: +- Agencies: Must have an agency entry in the database and be linked to a location. +- Data Sources: Must be a URL that has been internally validated as a data source and linked to an agency. +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. + +### Update + +Updates the given entities in DS. 
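The three entities crossed with the three actions are what yield the nine tasks and endpoints described above. A minimal sketch of that enumeration follows; the entity path segments used here are assumptions for illustration, not values taken from the DS API.

```python
from itertools import product

# Segment names below are placeholders; only the /v3/sync/{entity}/{action}
# pattern itself comes from the description above.
ENTITIES = ("agencies", "data-sources", "meta-urls")
ACTIONS = ("add", "update", "delete")


def sync_path(entity: str, action: str) -> str:
    """Build one DS sync endpoint path in the /v3/sync/{entity}/{action} form."""
    return f"/v3/sync/{entity}/{action}"


# Three entities x three actions -> nine endpoint paths, one per sync task.
for entity, action in product(ENTITIES, ACTIONS):
    print(sync_path(entity, action))
```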
+ +These are denoted with the `/{entity}/update` path in the DS API. + +These consist of submitting the updated entities (in full) to the requisite endpoint, and updating the local app link to indicate that the update occurred. All updates are designed to be full overwrites of the entity. + +For an entity to be updated, it must meet preconditions which are distinct for each entity: +- Agencies: Must have either an agency row updated or an agency/location link updated or deleted. +- Data Sources: One of the following must be updated: + - The URL table + - The record type table + - The optional data sources metadata table + - The agency link table (either an addition or deletion) +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. Either the URL table or the agency link table (addition or deletion) must be updated. + +### Delete + +Deletes the given entities from DS. + +These are denoted with the `/{entity}/delete` path in the DS API. + +This consists of submitting a set of DS IDs to the requisite endpoint, and removing the associated DS app link entry in the SM database. + +When an entity with a corresponding DS App Link is deleted from the Source Manager, the core data is removed but a deletion flag is appended to the DS App Link entry, indicating that the entry is not yet removed from the DS App. The deletion task uses this flag to identify entities to be deleted, submits the deletion request to the DS API, and removes both the flag and the DS App Link. \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/README.md b/src/core/tasks/scheduled/impl/sync_to_ds/README.md deleted file mode 100644 index 3af42af8..00000000 --- a/src/core/tasks/scheduled/impl/sync_to_ds/README.md +++ /dev/null @@ -1,65 +0,0 @@ -The Source Manager (SM) is part of a two app system, with the other app being the Data Sources (DS) App. - - -# Add, Update, and Delete - -These are the core synchronization actions. - -In order to propagate changes to DS, we synchronize additions, updates, and deletions of the following entities: -- Agencies -- Data Sources -- Meta URLs - -Each action for each entity occurs through a separate task. At the moment, there are nine tasks total. - -Each task gathers requisite information from the SM database and sends a request to one of nine corresponding endpoints in the DS API. - -Each DS endpoint follows the following format: - -```text -/v3/sync/{entity}/{action} -``` - -Synchronizations are designed to occur on an hourly basis. - -Here is a high-level description of how each action works: - -## Add - -Adds the given entities to DS. - -These are denoted with the `/{entity}/add` path in the DS API. - -When an entity is added, it returns a unique DS ID that is mapped to the internal SM database ID via the DS app link tables. - -For an entity to be added, it must meet preconditions which are distinct for each entity: -- Agencies: Must have an agency entry in the database and be linked to a location. -- Data Sources: Must be a URL that has been internally validated as a data source and linked to an agency. -- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. - -## Update - -Updates the given entities in DS. - -These are denoted with the `/{entity}/update` path in the DS API. - -These consist of submitting the updated entities (in full) to the requisite endpoint, and updating the local app link to indicate that the update occurred. 
All updates are designed to be full overwrites of the entity. - -For an entity to be updated, it must meet preconditions which are distinct for each entity: -- Agencies: Must have either an agency row updated or an agency/location link updated or deleted. -- Data Sources: One of the following must be updated: - - The URL table - - The record type table - - The optional data sources metadata table - - The agency link table (either an addition or deletion) -- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. Either the URL table or the agency link table (addition or deletion) must be updated. - -## Delete - -Deletes the given entities from DS. - -These are denoted with the `/{entity}/delete` path in the DS API. - -This consists of submitting a set of DS IDs to the requisite endpoint, and removing the associated DS app link entry in the SM database. - -When an entity with a corresponding DS App Link is deleted from the Source Manager, the core data is removed but a deletion flag is appended to the DS App Link entry, indicating that the entry is not yet removed from the DS App. The deletion task uses this flag to identify entities to be deleted, submits the deletion request to the DS API, and removes both the flag and the DS App Link. \ No newline at end of file From 5fd69046e55c2fc2b08f620faa59a883422fe64d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 24 Nov 2025 18:56:56 -0500 Subject: [PATCH 67/84] Ensure consistent router capitalization --- src/api/endpoints/agencies/routes.py | 2 +- src/api/endpoints/check/routes.py | 2 +- src/api/endpoints/data_source/routes.py | 2 +- src/api/endpoints/meta_url/routes.py | 2 +- src/api/endpoints/root.py | 2 +- src/api/endpoints/search/routes.py | 2 +- src/api/endpoints/submit/routes.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/api/endpoints/agencies/routes.py b/src/api/endpoints/agencies/routes.py index 6edfdf03..b0a756aa 100644 --- a/src/api/endpoints/agencies/routes.py +++ b/src/api/endpoints/agencies/routes.py @@ -17,7 +17,7 @@ from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore -agencies_router = APIRouter(prefix="/agencies", tags=["agencies"]) +agencies_router = APIRouter(prefix="/agencies", tags=["Agencies"]) @agencies_router.get("") async def get_agencies( diff --git a/src/api/endpoints/check/routes.py b/src/api/endpoints/check/routes.py index 09870f15..9ea309a7 100644 --- a/src/api/endpoints/check/routes.py +++ b/src/api/endpoints/check/routes.py @@ -7,7 +7,7 @@ check_router = APIRouter( prefix="/check", - tags=["check"] + tags=["Check"] ) @check_router.get("/unique-url") diff --git a/src/api/endpoints/data_source/routes.py b/src/api/endpoints/data_source/routes.py index 04d81f10..25787b85 100644 --- a/src/api/endpoints/data_source/routes.py +++ b/src/api/endpoints/data_source/routes.py @@ -16,7 +16,7 @@ data_sources_router = APIRouter( prefix="/data-sources", - tags=["data-source"] + tags=["Data Sources"] ) diff --git a/src/api/endpoints/meta_url/routes.py b/src/api/endpoints/meta_url/routes.py index 79a5ab03..82a36756 100644 --- a/src/api/endpoints/meta_url/routes.py +++ b/src/api/endpoints/meta_url/routes.py @@ -15,7 +15,7 @@ meta_urls_router = APIRouter( prefix="/meta-urls", - tags=["meta-url"] + tags=["Meta Urls"] ) @meta_urls_router.get("") diff --git a/src/api/endpoints/root.py b/src/api/endpoints/root.py index b42a84d3..03b05ed4 100644 --- a/src/api/endpoints/root.py +++ b/src/api/endpoints/root.py @@ 
-3,7 +3,7 @@ from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo -root_router = APIRouter(prefix="", tags=["root"]) +root_router = APIRouter(prefix="", tags=["Root"]) @root_router.get("/") async def root( diff --git a/src/api/endpoints/search/routes.py b/src/api/endpoints/search/routes.py index f2abb93c..dfbeeacd 100644 --- a/src/api/endpoints/search/routes.py +++ b/src/api/endpoints/search/routes.py @@ -11,7 +11,7 @@ from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo -search_router = APIRouter(prefix="/search", tags=["search"]) +search_router = APIRouter(prefix="/search", tags=["Search"]) @search_router.get("/url") diff --git a/src/api/endpoints/submit/routes.py b/src/api/endpoints/submit/routes.py index 37f4a3c9..2eb46c15 100644 --- a/src/api/endpoints/submit/routes.py +++ b/src/api/endpoints/submit/routes.py @@ -15,7 +15,7 @@ from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info -submit_router = APIRouter(prefix="/submit", tags=["submit"]) +submit_router = APIRouter(prefix="/submit", tags=["Submit"]) @submit_router.post( "/url" From a61ccd69bae64313811d69875f75a59de282585f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 26 Nov 2025 20:29:36 -0500 Subject: [PATCH 68/84] Adjust CKAN/Muckrock Agency ID Logic --- .../agency_identification/subtasks/convert.py | 44 +++------------ .../subtasks/impl/ckan_/core.py | 15 +++--- .../subtasks/impl/muckrock_/core.py | 15 +++--- .../subtasks/queries/match_agency.py | 44 +++++++++++++++ src/external/pdap/client.py | 47 ---------------- .../pdap/dtos/match_agency/__init__.py | 0 src/external/pdap/dtos/match_agency/post.py | 11 ---- .../pdap/dtos/match_agency/response.py | 11 ---- src/external/pdap/enums.py | 6 --- .../subtasks/ckan/test_core.py | 38 +++---------- .../subtasks/muckrock/test_core.py | 53 +++---------------- .../manual/external/pdap/test_match_agency.py | 6 --- 12 files changed, 84 insertions(+), 206 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py delete mode 100644 src/external/pdap/dtos/match_agency/__init__.py delete mode 100644 src/external/pdap/dtos/match_agency/post.py delete mode 100644 src/external/pdap/dtos/match_agency/response.py delete mode 100644 tests/manual/external/pdap/test_match_agency.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py index 95c9e704..5cead5d3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -2,21 +2,15 @@ from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus -def convert_match_agency_response_to_subtask_data( + +def convert_agency_suggestions_to_subtask_data( url_id: int, - response: MatchAgencyResponse, + agency_suggestions: list[AgencySuggestion], subtask_type: AutoAgencyIDSubtaskType, - task_id: int -): - suggestions: 
list[AgencySuggestion] = \ - _convert_match_agency_response_to_suggestions( - response - ) - agencies_found: bool = len(suggestions) > 0 + task_id: int, +) -> AutoAgencyIDSubtaskData: + agencies_found: bool = len(agency_suggestions) > 0 subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( url_id=url_id, type=subtask_type, @@ -25,30 +19,6 @@ def convert_match_agency_response_to_subtask_data( ) return AutoAgencyIDSubtaskData( pydantic_model=subtask_pydantic, - suggestions=suggestions + suggestions=agency_suggestions ) -def _convert_match_agency_response_to_suggestions( - match_response: MatchAgencyResponse, -) -> list[AgencySuggestion]: - if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: - match_info: MatchAgencyInfo = match_response.matches[0] - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=100 - ) - ] - if match_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [] - if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}") - total_confidence: int = 100 - confidence_per_match: int = total_confidence // len(match_response.matches) - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=confidence_per_match - ) - for match_info in match_response.matches - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py index d1af5391..2603191a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -3,17 +3,18 @@ from typing_extensions import override from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ GetCKANAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -35,12 +36,14 @@ async def inner_logic(self) -> None: subtask_data_list: list[AutoAgencyIDSubtaskData] = [] for param in params: agency_name: str = param.collector_metadata["agency_name"] - response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = 
convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.CKAN, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py index 4fa92c2e..030139ad 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -6,18 +6,19 @@ from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ MuckrockAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ GetMuckrockAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -52,12 +53,14 @@ async def inner_logic(self) -> None: ) subtask_data_list.append(data) continue - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_lookup_response.name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=match_agency_response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py new file mode 100644 index 00000000..4b5d6516 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py @@ -0,0 +1,44 @@ +from typing import Sequence + +from sqlalchemy import select, func, desc, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class MatchAgencyQueryBuilder(QueryBuilderBase): + + def 
__init__( + self, + agency_name: str + ): + super().__init__() + self.agency_name = agency_name + + async def run(self, session: AsyncSession) -> list[AgencySuggestion]: + query = ( + select( + Agency.id, + func.similarity(Agency.name, self.agency_name).label("similarity") + ) + .where( + func.similarity(Agency.name, self.agency_name) > 0.5 + ) + .order_by( + desc("similarity") + ) + .limit(10) + ) + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query + ) + return [ + AgencySuggestion( + agency_id=mapping[Agency.id], + confidence=int(mapping["similarity"] * 100) + ) + for mapping in mappings + ] \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 0d6d9ec7..38c67e08 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -6,10 +6,7 @@ from pdap_access_manager.models.response import ResponseInfo from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo -from src.external.pdap.enums import MatchAgencyResponseStatus class PDAPClient: @@ -26,50 +23,6 @@ async def run_request_builder( ) -> Any: return await request_builder.run(self.access_manager) - async def match_agency( - self, - name: str, - state: str | None = None, - county: str | None = None, - locality: str | None = None - ) -> MatchAgencyResponse: - """ - Returns agencies, if any, that match or partially match the search criteria - """ - url: str = f"{self.access_manager.data_sources_url}/v2/match/agency" - - headers: dict[str, str] = await self.access_manager.jwt_header() - headers['Content-Type']: str = "application/json" - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - headers=headers, - json_={ - "name": name, - "state": state, - "county": county, - "locality": locality - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - matches: list[MatchAgencyInfo] = [] - for agency in response_info.data["agencies"]: - mai = MatchAgencyInfo( - id=agency['id'], - submitted_name=agency['name'] - ) - if len(agency['locations']) > 0: - first_location: dict[str, Any] = agency['locations'][0] - mai.state = first_location['state'] - mai.county = first_location['county'] - mai.locality = first_location['locality'] - matches.append(mai) - - return MatchAgencyResponse( - status=MatchAgencyResponseStatus(response_info.data["status"]), - matches=matches - ) - async def is_url_duplicate( self, url_to_check: str diff --git a/src/external/pdap/dtos/match_agency/__init__.py b/src/external/pdap/dtos/match_agency/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/external/pdap/dtos/match_agency/post.py b/src/external/pdap/dtos/match_agency/post.py deleted file mode 100644 index 2be0b90e..00000000 --- a/src/external/pdap/dtos/match_agency/post.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class MatchAgencyInfo(BaseModel): - id: int - submitted_name: str - state: str | None = None - county: str | None = None - locality: str | None = None diff --git a/src/external/pdap/dtos/match_agency/response.py b/src/external/pdap/dtos/match_agency/response.py deleted file mode 100644 index aa4d9ec3..00000000 --- a/src/external/pdap/dtos/match_agency/response.py +++ /dev/null @@ 
-1,11 +0,0 @@ -from typing import List - -from pydantic import BaseModel - -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.enums import MatchAgencyResponseStatus - - -class MatchAgencyResponse(BaseModel): - status: MatchAgencyResponseStatus - matches: List[MatchAgencyInfo] diff --git a/src/external/pdap/enums.py b/src/external/pdap/enums.py index c532f820..55819619 100644 --- a/src/external/pdap/enums.py +++ b/src/external/pdap/enums.py @@ -1,12 +1,6 @@ from enum import Enum -class MatchAgencyResponseStatus(Enum): - EXACT_MATCH = "Exact Match" - PARTIAL_MATCH = "Partial Matches" - NO_MATCH = "No Match" - - class ApprovalStatus(Enum): APPROVED = "approved" REJECTED = "rejected" diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py index 90aacfa5..4ec99967 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py @@ -1,5 +1,3 @@ -from unittest.mock import AsyncMock - import pytest from src.collectors.enums import CollectorType @@ -9,11 +7,6 @@ from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator -from src.core.enums import SuggestionType -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -21,7 +14,9 @@ @pytest.mark.asyncio async def test_ckan_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): # Test that ckan subtask correctly sends agency id to # CKANAPIInterface, sends resultant agency name to @@ -53,25 +48,6 @@ async def test_ckan_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.CKAN - pdap_client_mock = operator.loader._pdap_client - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) - - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() assert_task_run_success(run_info) @@ -92,9 +68,9 @@ async def test_ckan_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - # Assert methods 
called as expected - pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency") diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py index 7cf72c5e..af41354d 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py @@ -1,24 +1,14 @@ -from unittest.mock import MagicMock - import pytest from src.collectors.enums import CollectorType -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.enums import SuggestionType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -26,7 +16,9 @@ @pytest.mark.asyncio async def test_muckrock_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): adb_client: AsyncDatabaseClient = operator.adb_client @@ -81,38 +73,16 @@ async def test_muckrock_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.MUCKROCK - # Test that muckrock subtask correctly sends agency name to - # MatchAgenciesInterface and adds received suggestions to - # url_agency_suggestions - # Create mock instances for dependency injections muckrock_api_interface_mock = operator.loader._muckrock_api_interface - pdap_client_mock = operator.loader._pdap_client # Set up mock return values for method calls muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( type=AgencyLookupResponseType.FOUND, - name="Mock Agency Name", + name="Test Agency", error=None ) - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() @@ -134,15 +104,8 @@ async def 
test_muckrock_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - - - # # Assert methods called as expected - muckrock_api_interface_mock.lookup_agency.assert_called_once_with( - muckrock_agency_id=123 - ) - pdap_client_mock.match_agency.assert_called_once_with( - name="Mock Agency Name" - ) diff --git a/tests/manual/external/pdap/test_match_agency.py b/tests/manual/external/pdap/test_match_agency.py deleted file mode 100644 index a637dad0..00000000 --- a/tests/manual/external/pdap/test_match_agency.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_match_agency(pdap_client): - response = await pdap_client.match_agency(name="police") From 3ce56428cb076f50c9be95f018b391479efe7559 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 27 Nov 2025 09:57:21 -0500 Subject: [PATCH 69/84] Add logic to prevent HTML content duplicates from being sent to HuggingFace --- ...dd_html_duplicate_url_materialized_view.py | 57 +++++++++++++++++++ .../scheduled/impl/huggingface/operator.py | 2 +- .../scheduled/impl/huggingface/queries/cte.py | 38 +++++++++++++ .../impl/huggingface/queries/get/core.py | 12 +++- .../queries/{check => prereq}/__init__.py | 0 .../queries/{check => prereq}/core.py | 2 +- .../queries/{check => prereq}/requester.py | 19 +++---- src/db/client/async_.py | 3 + src/db/models/materialized_views/__init__.py | 0 .../materialized_views/html_duplicate_url.py | 9 +++ tests/automated/integration/conftest.py | 28 +++++++++ ...st_duplicate_html_content_not_picked_up.py | 38 +++++++++++++ 12 files changed, 192 insertions(+), 16 deletions(-) create mode 100644 alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py create mode 100644 src/core/tasks/scheduled/impl/huggingface/queries/cte.py rename src/core/tasks/scheduled/impl/huggingface/queries/{check => prereq}/__init__.py (100%) rename src/core/tasks/scheduled/impl/huggingface/queries/{check => prereq}/core.py (78%) rename src/core/tasks/scheduled/impl/huggingface/queries/{check => prereq}/requester.py (75%) create mode 100644 src/db/models/materialized_views/__init__.py create mode 100644 src/db/models/materialized_views/html_duplicate_url.py create mode 100644 tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py diff --git a/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py new file mode 100644 index 00000000..1eeb1eb3 --- /dev/null +++ b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py @@ -0,0 +1,57 @@ +"""Add html duplicate url materialized view + +Revision ID: d5f0cc2be6b6 +Revises: 5ac9d50b91c5 +Create Date: 2025-11-27 09:07:28.767553 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'd5f0cc2be6b6' +down_revision: Union[str, None] = '5ac9d50b91c5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + create extension pgcrypto; + """) + + op.execute(""" + CREATE MATERIALIZED VIEW mat_view__html_duplicate_url AS + WITH + hashes AS ( + SELECT + url_id, + digest(compressed_html, 'sha256') AS hash + FROM + url_compressed_html + ) + , duplicate_hashes as ( + SELECT + hash AS content_hash, + COUNT(*) AS n, + ARRAY_AGG(url_id ORDER BY url_id) AS url_ids + FROM + hashes + GROUP BY + hash + HAVING + COUNT(*) > 1 + ) + select + urls.id as url_id + from urls + join hashes h on h.url_id = urls.id + join duplicate_hashes dh on dh.content_hash = h.hash; + """) + + +def downgrade() -> None: + pass diff --git a/src/core/tasks/scheduled/impl/huggingface/operator.py b/src/core/tasks/scheduled/impl/huggingface/operator.py index 9bb7a85e..f644ff94 100644 --- a/src/core/tasks/scheduled/impl/huggingface/operator.py +++ b/src/core/tasks/scheduled/impl/huggingface/operator.py @@ -1,7 +1,7 @@ from itertools import count from src.core.tasks.mixins.prereq import HasPrerequisitesMixin -from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.prereq.core import CheckValidURLsUpdatedQueryBuilder from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/cte.py b/src/core/tasks/scheduled/impl/huggingface/queries/cte.py new file mode 100644 index 00000000..8ea75b0c --- /dev/null +++ b/src/core/tasks/scheduled/impl/huggingface/queries/cte.py @@ -0,0 +1,38 @@ +from datetime import datetime + +from sqlalchemy import select, Column + +from src.db.enums import TaskType +from src.db.helpers.query import exists_url, no_url_task_error, not_exists_url +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.materialized_views.html_duplicate_url import HTMLDuplicateURLMaterializedView + + +class HuggingfacePrereqCTEContainer: + + def __init__(self): + self.cte = ( + select( + URL.id, + URL.updated_at + ) + .join( + URLCompressedHTML, + URL.id == URLCompressedHTML.url_id + ) + .where( + exists_url(FlagURLValidated), + not_exists_url(HTMLDuplicateURLMaterializedView), + no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE) + ) + ) + + @property + def url_id(self) -> Column[int]: + return self.cte.c.id + + @property + def updated_at(self) -> Column[datetime]: + return self.cte.c.updated_at \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 802e8ea5..10986a05 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -1,6 +1,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer from 
src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_fine_to_coarse_record_type, \ convert_validated_type_to_relevant from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput @@ -23,21 +24,26 @@ def __init__(self, page: int): async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]: - label_url_id = 'url_id' label_url = 'url' label_record_type_fine = 'record_type_fine' label_html = 'html' label_type = 'type' + cte = HuggingfacePrereqCTEContainer() + query = ( select( - URL.id.label(label_url_id), + cte.url_id, URL.full_url.label(label_url), URLRecordType.record_type.label(label_record_type_fine), URLCompressedHTML.compressed_html.label(label_html), FlagURLValidated.type.label(label_type) ) + .join( + URL, + cte.url_id == URL.id + ) .join( URLRecordType, URL.id == URLRecordType.url_id @@ -65,7 +71,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut final_results = [] for result in db_results: output = GetForLoadingToHuggingFaceOutput( - url_id=result[label_url_id], + url_id=result[cte.url_id], url=result[label_url], relevant=convert_validated_type_to_relevant( URLType(result[label_type]) diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/__init__.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/__init__.py rename to src/core/tasks/scheduled/impl/huggingface/queries/prereq/__init__.py diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py similarity index 78% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/core.py rename to src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py index c76fa2e1..fdf82ba9 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.huggingface.queries.check.requester import CheckValidURLsUpdatedRequester +from src.core.tasks.scheduled.impl.huggingface.queries.prereq.requester import CheckValidURLsUpdatedRequester from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py similarity index 75% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py rename to src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py index ef43bd3d..1eaa306d 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py @@ -6,6 +6,7 @@ from sqlalchemy.sql.functions import count from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer from src.db.enums import TaskType from src.db.helpers.query import not_exists_url, no_url_task_error, exists_url from src.db.helpers.session import session_helper as sh @@ -32,21 +33,17 @@ async def latest_upload(self) -> datetime: ) async def has_valid_urls(self, last_upload_at: datetime | None) -> bool: + cte = HuggingfacePrereqCTEContainer() query = ( - select(count(URL.id)) - .join( - URLCompressedHTML, - URL.id == 
URLCompressedHTML.url_id - ) - .where( - exists_url(FlagURLValidated), - no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE) + select( + cte.url_id ) ) if last_upload_at is not None: - query = query.where(URL.updated_at > last_upload_at) - url_count = await sh.scalar( + query = query.where(cte.updated_at > last_upload_at) + query = query.limit(1) + result = await sh.one_or_none( session=self.session, query=query ) - return url_count > 0 + return result is not None \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 913a0a35..0fb99f76 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -919,4 +919,7 @@ async def refresh_materialized_views(self): ) await self.execute( text("REFRESH MATERIALIZED VIEW batch_url_status_mat_view") + ) + await self.execute( + text("REFRESH MATERIALIZED VIEW mat_view__html_duplicate_url") ) \ No newline at end of file diff --git a/src/db/models/materialized_views/__init__.py b/src/db/models/materialized_views/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/materialized_views/html_duplicate_url.py b/src/db/models/materialized_views/html_duplicate_url.py new file mode 100644 index 00000000..703bbbea --- /dev/null +++ b/src/db/models/materialized_views/html_duplicate_url.py @@ -0,0 +1,9 @@ +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class HTMLDuplicateURLMaterializedView( + Base, + URLDependentViewMixin +): + __tablename__ = "mat_view__html_duplicate_url" \ No newline at end of file diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 4c6a76d0..19a9fe19 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -221,6 +221,21 @@ async def test_url_data_source_id( ) return url_id +@pytest_asyncio.fixture +async def test_url_data_source_id_2( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> int: + url_id: int = (await db_data_creator.create_validated_urls( + record_type=RecordType.CAR_GPS, + validation_type=URLType.DATA_SOURCE, + ))[0].url_id + await db_data_creator.link_urls_to_agencies( + url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id + @pytest_asyncio.fixture async def test_url_id( db_data_creator: DBDataCreator, @@ -233,6 +248,19 @@ async def test_url_id( ) return await db_data_creator.adb_client.add(url, return_id=True) +@pytest_asyncio.fixture +async def test_url_id_2( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com/2", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + + @pytest_asyncio.fixture async def test_url_data_source_mapping( db_data_creator: DBDataCreator, diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py new file mode 100644 index 00000000..be84ffd4 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py @@ -0,0 +1,38 @@ +import pytest + +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML + + +@pytest.mark.asyncio +async def 
test_huggingface_task_duplicate_html_content_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator, + test_url_data_source_id: int, + test_url_data_source_id_2: int +): + + # Add HTML content with the same hash + uch_1 = URLCompressedHTML( + url_id=test_url_data_source_id, + compressed_html=b"test" + ) + uch_2 = URLCompressedHTML( + url_id=test_url_data_source_id_2, + compressed_html=b"test" + ) + await adb_client_test.add_all([ + uch_1, + uch_2 + ]) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Refresh materialized view + await adb_client_test.refresh_materialized_views() + + # Confirm task does not meet prerequisites + assert not await operator.meets_task_prerequisites() + From e8b00234f7bc5573d44602ae9ffac0659dc5477b Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 27 Nov 2025 11:39:17 -0500 Subject: [PATCH 70/84] Add logic to prevent HTML content duplicates from being sent to HuggingFace --- ...907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py index 1eeb1eb3..ec726c07 100644 --- a/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py +++ b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py @@ -20,7 +20,7 @@ def upgrade() -> None: op.execute(""" - create extension pgcrypto; + create extension if not exists pgcrypto; """) op.execute(""" From 6d79fb899204e367795927f6e62ba01733f3d497 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 27 Nov 2025 11:56:51 -0500 Subject: [PATCH 71/84] Change `UPDATE URL STATUS` task interval to daily --- src/core/tasks/scheduled/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 116bf56d..61169a66 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -225,7 +225,7 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: operator=UpdateURLStatusOperator( adb_client=self.adb_client ), - interval_minutes=IntervalEnum.HOURLY.value, + interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("UPDATE_URL_STATUS_TASK_FLAG") ), From 4ef61c6c995140cddb5b3ea01ac07ce184b272c6 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 29 Nov 2025 14:24:32 -0500 Subject: [PATCH 72/84] Remove redundant ID columns --- ..._29_0717-5d6412540aba_remove_id_columns.py | 240 ++++++++++++++++++ .../annotate/all/get/queries/core.py | 4 +- .../queries/ctes/subtask/impl/nlp_location.py | 2 +- .../url/operators/html/content_info_getter.py | 9 +- .../operators/html/queries/insert/convert.py | 5 +- .../operators/html/queries/insert/query.py | 15 +- .../probe/queries/urls/not_probed/exists.py | 2 +- .../queries/urls/not_probed/get/query.py | 2 +- src/db/client/async_.py | 4 - src/db/models/impl/duplicate/sqlalchemy.py | 8 +- .../impl/link/agency_location/sqlalchemy.py | 11 +- .../models/impl/link/url_agency/sqlalchemy.py | 9 +- .../impl/link/url_redirect_url/sqlalchemy.py | 15 +- .../impl/link/urls_root_url/sqlalchemy.py | 8 +- src/db/models/impl/task/error.py | 9 +- .../models/impl/url/checked_for_duplicate.py | 6 +- src/db/models/impl/url/core/sqlalchemy.py | 2 +- .../impl/url/html/compressed/sqlalchemy.py | 8 +- 
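The `mat_view__html_duplicate_url` materialized view and the test above rest on the same idea: two URLs count as duplicates when their compressed HTML bytes hash to the same SHA-256 digest. A minimal Python sketch of that grouping, using hypothetical in-memory rows in place of the `url_compressed_html` table:

```python
# Minimal sketch (not part of the patch): group url_ids by the SHA-256 digest of
# their compressed HTML, mirroring digest(compressed_html, 'sha256') in the view.
import hashlib
from collections import defaultdict

# Hypothetical stand-in for rows of url_compressed_html: (url_id, compressed_html)
rows = [
    (1, b"test"),
    (2, b"test"),
    (3, b"other"),
]

by_hash: dict[bytes, list[int]] = defaultdict(list)
for url_id, compressed_html in rows:
    by_hash[hashlib.sha256(compressed_html).digest()].append(url_id)

# url_ids whose content hash appears more than once are the "duplicate" URLs.
duplicate_url_ids = sorted(
    url_id
    for url_ids in by_hash.values()
    if len(url_ids) > 1
    for url_id in url_ids
)
print(duplicate_url_ids)  # [1, 2]
```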
.../impl/url/html/content/sqlalchemy.py | 11 +- .../url/internet_archives/probe/sqlalchemy.py | 8 +- .../url/internet_archives/save/sqlalchemy.py | 9 +- .../url/optional_ds_metadata/sqlalchemy.py | 9 +- src/db/models/impl/url/reviewing_user.py | 11 +- .../models/impl/url/scrape_info/sqlalchemy.py | 13 +- .../agency/suggestion/sqlalchemy.py | 8 +- .../models/impl/url/suggestion/agency/user.py | 12 +- .../impl/url/suggestion/record_type/auto.py | 7 +- .../impl/url/suggestion/record_type/user.py | 17 +- .../suggestion/url_type/auto/sqlalchemy.py | 6 +- .../impl/url/suggestion/url_type/user.py | 12 +- .../impl/url/web_metadata/sqlalchemy.py | 9 +- src/db/statement_composer.py | 2 +- .../data_creator/commands/impl/html_data.py | 11 +- 33 files changed, 406 insertions(+), 98 deletions(-) create mode 100644 alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py diff --git a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py new file mode 100644 index 00000000..34ae8506 --- /dev/null +++ b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py @@ -0,0 +1,240 @@ +"""Remove ID columns + +Revision ID: 5d6412540aba +Revises: d5f0cc2be6b6 +Create Date: 2025-11-29 07:17:32.794305 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5d6412540aba' +down_revision: Union[str, None] = 'd5f0cc2be6b6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +TABLES = [ + "task_errors", # + "agency_id_subtask_suggestions", # + "auto_record_type_suggestions", # + "auto_relevant_suggestions", # + "duplicates", # + "flag_url_validated", # + "link_agencies__locations", # + "link_urls_redirect_url", # + "link_urls_root_url", # + "reviewing_user_url", # + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_html_content", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "user_record_type_suggestions", # + "user_url_type_suggestions", # +] + +URL_ONLY_PRIMARY_KEY_TABLES = [ + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "auto_relevant_suggestions", # + "auto_record_type_suggestions", # + "flag_url_validated" # +] + + + +USER_URL_ID_PRIMARY_KEY_TABLES = [ + "user_record_type_suggestions", # + "user_url_type_suggestions", # + "reviewing_user_url" # +] + +BESPOKE_UNIQUE_IDS: dict[str, list[str]] = { + "task_errors": ["task_id"], # + "agency_id_subtask_suggestions": ["agency_id", "subtask_id"], # + "link_agencies__locations": ["agency_id", "location_id"], # + "link_urls_redirect_url": ["source_url_id", "destination_url_id"], # + "link_urls_root_url": ["url_id", "root_url_id"], # + "url_html_content": ["url_id", "content_type"], # + "duplicates": ["batch_id", "original_url_id"] +} + +def drop_views(): + op.execute("drop materialized view if exists url_status_mat_view") + op.execute("drop materialized view if exists batch_url_status_mat_view") + +def recreate_views(): + op.execute(""" + create materialized view url_status_mat_view as + WITH + urls_with_relevant_errors AS ( + SELECT + ute.url_id + FROM + url_task_error ute + WHERE + 
ute.task_type = ANY (ARRAY ['Screenshot'::task_type, 'HTML'::task_type, 'URL Probe'::task_type]) + ) + , status_text AS ( + SELECT + u.id AS url_id, + CASE + WHEN fuv.type = ANY + (ARRAY ['not relevant'::url_type, 'individual record'::url_type, 'not found'::url_type]) + THEN 'Accepted'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NULL THEN 'Awaiting Submission'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NOT NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NOT NULL THEN 'Submitted'::text + WHEN uch.url_id IS NOT NULL AND uwm.url_id IS NOT NULL AND us.url_id IS NOT NULL + THEN 'Community Labeling'::text + WHEN uwre.url_id IS NOT NULL THEN 'Error'::text + ELSE 'Intake'::text + END AS status + FROM + urls u + LEFT JOIN urls_with_relevant_errors uwre + ON u.id = uwre.url_id + LEFT JOIN url_screenshot us + ON u.id = us.url_id + LEFT JOIN url_compressed_html uch + ON u.id = uch.url_id + LEFT JOIN url_web_metadata uwm + ON u.id = uwm.url_id + LEFT JOIN flag_url_validated fuv + ON u.id = fuv.url_id + LEFT JOIN ds_app_link_meta_url udmu + ON u.id = udmu.url_id + LEFT JOIN ds_app_link_data_source uds + ON u.id = uds.url_id + ) + SELECT + status_text.url_id, + status_text.status, + CASE status_text.status + WHEN 'Intake'::text THEN 100 + WHEN 'Error'::text THEN 110 + WHEN 'Community Labeling'::text THEN 200 + WHEN 'Accepted'::text THEN 300 + WHEN 'Awaiting Submission'::text THEN 380 + WHEN 'Submitted'::text THEN 390 + ELSE '-1'::integer + END AS code + FROM + status_text; + """) + + op.execute(""" + create materialized view batch_url_status_mat_view as + WITH + batches_with_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + WHERE + lbu.batch_id = b_1.id + )) + ) + , batches_with_only_validated_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NOT NULL + )) + AND NOT (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NULL + )) + ) + SELECT + b.id AS batch_id, + CASE + WHEN b.status = 'error'::batch_status THEN 'Error'::text + WHEN bwu.id IS NULL THEN 'No URLs'::text + WHEN bwovu.id IS NOT NULL THEN 'Labeling Complete'::text + ELSE 'Has Unlabeled URLs'::text + END AS batch_url_status + FROM + batches b + LEFT JOIN batches_with_urls bwu + ON bwu.id = b.id + LEFT JOIN batches_with_only_validated_urls bwovu + ON bwovu.id = b.id; + """) + + + +def upgrade() -> None: + drop_views() + + for table in TABLES: + op.drop_column(table, "id") + + # Add new primary keys + for table, columns in BESPOKE_UNIQUE_IDS.items(): + suffix = "_".join(columns) + op.create_primary_key( + f"pk_{table}_{suffix}", + table, + columns + ) + + for table in URL_ONLY_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["url_id"] + ) + + for table in USER_URL_ID_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["user_id", "url_id"] + ) + + recreate_views() + + + + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index 5b239db0..89975a08 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ 
b/src/api/endpoints/annotate/all/get/queries/core.py @@ -61,14 +61,14 @@ async def run( URL.status == URLStatus.OK.value, # Must not have been previously annotated by user ~exists( - select(UserURLTypeSuggestion.id) + select(UserURLTypeSuggestion.url_id) .where( UserURLTypeSuggestion.url_id == URL.id, UserURLTypeSuggestion.user_id == self.user_id, ) ), ~exists( - select(UserURLAgencySuggestion.id) + select(UserURLAgencySuggestion.url_id) .where( UserURLAgencySuggestion.url_id == URL.id, UserURLAgencySuggestion.user_id == self.user_id, diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index 17055d1a..7a15b67a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -29,7 +29,7 @@ # One of the locations must be linked to an agency exists( select( - LinkAgencyLocation.id + LinkAgencyLocation.location_id ) .join( LocationIDSubtaskSuggestion, diff --git a/src/core/tasks/url/operators/html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py index bee7183c..a2d554ff 100644 --- a/src/core/tasks/url/operators/html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,6 +1,7 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent class HTMLContentInfoGetter: @@ -10,7 +11,7 @@ def __init__(self, response_html_info: ResponseHTMLInfo, url_id: int): self.url_id = url_id self.html_content_infos = [] - def get_all_html_content(self) -> list[URLHTMLContentInfo]: + def get_all_html_content(self) -> list[URLHTMLContent]: for content_type in HTMLContentType: self.add_html_content(content_type) return self.html_content_infos @@ -20,9 +21,9 @@ def add_html_content(self, content_type: HTMLContentType): val = getattr(self.response_html_info, lower_str) if val is None or val.strip() == "": return - uhci = URLHTMLContentInfo( + uhc = URLHTMLContent( url_id=self.url_id, - content_type=content_type, + content_type=content_type.value, content=val ) - self.html_content_infos.append(uhci) + self.html_content_infos.append(uhc) diff --git a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py index ca827c7e..e55b9843 100644 --- a/src/core/tasks/url/operators/html/queries/insert/convert.py +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -6,6 +6,7 @@ from src.db.enums import TaskType from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic @@ -33,8 +34,8 @@ def _convert_to_html_content_info_getter(tdo: UrlHtmlTDO) -> 
HTMLContentInfoGett url_id=tdo.url_info.id ) -def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]: - html_content_infos = [] +def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContent]: + html_content_infos: list[URLHTMLContent] = [] for tdo in tdos: if tdo.url_response_info.status != HTTPStatus.OK: continue diff --git a/src/core/tasks/url/operators/html/queries/insert/query.py b/src/core/tasks/url/operators/html/queries/insert/query.py index e0bff2e6..86f04e72 100644 --- a/src/core/tasks/url/operators/html/queries/insert/query.py +++ b/src/core/tasks/url/operators/html/queries/insert/query.py @@ -3,6 +3,10 @@ from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \ convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -14,17 +18,20 @@ def __init__(self, tdos: list[UrlHtmlTDO], task_id: int): self.task_id = task_id async def run(self, session: AsyncSession) -> None: - compressed_html_models = convert_to_compressed_html(self.tdos) - url_html_content_list = convert_to_html_content_info_list(self.tdos) - scrape_info_list = convert_to_scrape_infos(self.tdos) + compressed_html_models: list[URLCompressedHTMLPydantic] = convert_to_compressed_html(self.tdos) + url_html_content_list: list[URLHTMLContent] = convert_to_html_content_info_list(self.tdos) + scrape_info_list: list[URLScrapeInfoInsertModel] = convert_to_scrape_infos(self.tdos) url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id) for models in [ compressed_html_models, - url_html_content_list, scrape_info_list, url_errors ]: await sh.bulk_insert(session, models=models) + await sh.add_all(session=session, models=url_html_content_list) + + + diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py index 5954c197..087bef65 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py @@ -26,7 +26,7 @@ async def run(self, session: AsyncSession) -> bool: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ), no_url_task_error(TaskType.PROBE_URL) diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 7011a8de..e8eafd15 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[FullURLMapping]: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ) ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 0fb99f76..5ec64ad7 100644 --- a/src/db/client/async_.py +++ 
b/src/db/client/async_.py @@ -324,10 +324,6 @@ async def add_user_record_type_suggestion( # endregion record_type - @session_manager - async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - await self._add_models(session, URLHTMLContent, html_content_infos) - @session_manager async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: statement = self.statement_composer.has_non_errored_urls_without_html_data() diff --git a/src/db/models/impl/duplicate/sqlalchemy.py b/src/db/models/impl/duplicate/sqlalchemy.py index 03c492e3..2b50409d 100644 --- a/src/db/models/impl/duplicate/sqlalchemy.py +++ b/src/db/models/impl/duplicate/sqlalchemy.py @@ -1,15 +1,19 @@ -from sqlalchemy import Column, Integer, ForeignKey +from sqlalchemy import Column, Integer, ForeignKey, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class Duplicate(BatchDependentMixin, WithIDBase): +class Duplicate(BatchDependentMixin, Base): """ Identifies duplicates which occur within a batch """ __tablename__ = 'duplicates' + __table_args__ = ( + PrimaryKeyConstraint("batch_id"), + ) original_url_id = Column( Integer, diff --git a/src/db/models/impl/link/agency_location/sqlalchemy.py b/src/db/models/impl/link/agency_location/sqlalchemy.py index fb7f34da..c4203d44 100644 --- a/src/db/models/impl/link/agency_location/sqlalchemy.py +++ b/src/db/models/impl/link/agency_location/sqlalchemy.py @@ -1,10 +1,15 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.mixins import AgencyDependentMixin, LocationDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkAgencyLocation( - WithIDBase, + Base, AgencyDependentMixin, LocationDependentMixin, ): - __tablename__ = "link_agencies__locations" \ No newline at end of file + __tablename__ = "link_agencies__locations" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "location_id"), + ) \ No newline at end of file diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index c4ca6124..7111bc6d 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -1,19 +1,20 @@ -from sqlalchemy import UniqueConstraint +from sqlalchemy import UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class LinkURLAgency(URLDependentMixin, WithIDBase): __tablename__ = "link_agencies__urls" + __table_args__ = ( + UniqueConstraint("url_id", "agency_id"), + ) agency_id: Mapped[int] = get_agency_id_foreign_column() url = relationship("URL") agency = relationship("Agency") - __table_args__ = ( - UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), - ) diff --git a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py index 534c7213..c470e323 100644 --- a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py +++ b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py @@ -1,12 +1,21 @@ +from sqlalchemy import PrimaryKeyConstraint from 
sqlalchemy.orm import Mapped from src.db.models.helpers import url_id_column -from src.db.models.templates_.standard import StandardBase +from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base - -class LinkURLRedirectURL(StandardBase): +class LinkURLRedirectURL( + Base, + CreatedAtMixin, + UpdatedAtMixin +): __tablename__ = "link_urls_redirect_url" + __table_args__ = ( + PrimaryKeyConstraint("source_url_id", "destination_url_id"), + ) + source_url_id: Mapped[int] = url_id_column() destination_url_id: Mapped[int] = url_id_column() diff --git a/src/db/models/impl/link/urls_root_url/sqlalchemy.py b/src/db/models/impl/link/urls_root_url/sqlalchemy.py index 8dcd7085..d55a181f 100644 --- a/src/db/models/impl/link/urls_root_url/sqlalchemy.py +++ b/src/db/models/impl/link/urls_root_url/sqlalchemy.py @@ -1,16 +1,20 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.helpers import url_id_column from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkURLRootURL( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "link_urls_root_url" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "root_url_id"), + ) root_url_id: Mapped[int] = url_id_column() \ No newline at end of file diff --git a/src/db/models/impl/task/error.py b/src/db/models/impl/task/error.py index 2de0c66a..cd04a2ea 100644 --- a/src/db/models/impl/task/error.py +++ b/src/db/models/impl/task/error.py @@ -1,11 +1,12 @@ -from sqlalchemy import Column, Text, UniqueConstraint +from sqlalchemy import Column, Text, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): +class TaskError(UpdatedAtMixin, TaskDependentMixin, Base): __tablename__ = 'task_errors' error = Column(Text, nullable=False) @@ -13,8 +14,8 @@ class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): # Relationships task = relationship("Task") - __table_args__ = (UniqueConstraint( + __table_args__ = (PrimaryKeyConstraint( "task_id", "error", - name="uq_task_id_error"), + ), ) diff --git a/src/db/models/impl/url/checked_for_duplicate.py b/src/db/models/impl/url/checked_for_duplicate.py index bb7cf666..89192573 100644 --- a/src/db/models/impl/url/checked_for_duplicate.py +++ b/src/db/models/impl/url/checked_for_duplicate.py @@ -1,11 +1,13 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, WithIDBase): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'url_checked_for_duplicate' + __table_args__ = (PrimaryKeyConstraint("url_id"),) # Relationships url = relationship("URL", uselist=False, back_populates="checked_for_duplicate") diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 56681e3d..8ee51a43 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ 
b/src/db/models/impl/url/core/sqlalchemy.py @@ -75,7 +75,7 @@ def full_url(cls): uselist=False, ) duplicates = relationship("Duplicate", back_populates="original_url") - html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") + html_content = relationship("URLHTMLContent", cascade="all, delete-orphan") task_errors = relationship( URLTaskError, cascade="all, delete-orphan" diff --git a/src/db/models/impl/url/html/compressed/sqlalchemy.py b/src/db/models/impl/url/html/compressed/sqlalchemy.py index 995c5b25..4974e5f0 100644 --- a/src/db/models/impl/url/html/compressed/sqlalchemy.py +++ b/src/db/models/impl/url/html/compressed/sqlalchemy.py @@ -1,16 +1,20 @@ -from sqlalchemy import Column, LargeBinary +from sqlalchemy import Column, LargeBinary, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - WithIDBase + Base ): __tablename__ = 'url_compressed_html' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) compressed_html: Mapped[bytes] = Column(LargeBinary, nullable=False) diff --git a/src/db/models/impl/url/html/content/sqlalchemy.py b/src/db/models/impl/url/html/content/sqlalchemy.py index 63e4da76..ded0957b 100644 --- a/src/db/models/impl/url/html/content/sqlalchemy.py +++ b/src/db/models/impl/url/html/content/sqlalchemy.py @@ -1,21 +1,20 @@ -from sqlalchemy import UniqueConstraint, Column, Text +from sqlalchemy import UniqueConstraint, Column, Text, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLHTMLContent( UpdatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = 'url_html_content' - __table_args__ = (UniqueConstraint( - "url_id", - "content_type", - name="uq_url_id_content_type"), + __table_args__ = ( + PrimaryKeyConstraint("url_id", "content_type"), ) content_type = Column( diff --git a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py index 122905a7..ca9d1b0a 100644 --- a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py +++ b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py @@ -1,14 +1,18 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.templates_.base import Base class URLInternetArchivesProbeMetadata( - StandardBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_probe_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) archive_url: Mapped[str] digest: Mapped[str] diff --git a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py index 791f4077..f0aff36f 100644 --- a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py +++ b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py @@ -1,14 +1,17 @@ -from sqlalchemy import Column, DateTime, func +from sqlalchemy import Column, DateTime, func, PrimaryKeyConstraint from src.db.models.mixins import URLDependentMixin -from 
src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLInternetArchivesSaveMetadata( - WithIDBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_save_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) created_at = Column(DateTime, nullable=False, server_default=func.now()) last_uploaded_at = Column(DateTime, nullable=False, server_default=func.now()) diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 32156a38..04541ad6 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -1,19 +1,22 @@ -from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum +from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, \ RetentionScheduleEnum, UpdateMethodEnum from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLOptionalDataSourceMetadata( URLDependentMixin, - WithIDBase, + Base, UpdatedAtMixin ): __tablename__ = 'url_optional_data_source_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) record_formats = Column(ARRAY(String), nullable=False, default=[]) data_portal_type = Column(String, nullable=True) diff --git a/src/db/models/impl/url/reviewing_user.py b/src/db/models/impl/url/reviewing_user.py index 9213a157..379cfee5 100644 --- a/src/db/models/impl/url/reviewing_user.py +++ b/src/db/models/impl/url/reviewing_user.py @@ -1,16 +1,17 @@ -from sqlalchemy import UniqueConstraint, Column, Integer +from sqlalchemy import UniqueConstraint, Column, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, WithIDBase): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'reviewing_user_url' __table_args__ = ( - UniqueConstraint( - "url_id", - name="approving_user_url_uq_user_id_url_id"), + PrimaryKeyConstraint( + "url_id", + ), ) user_id = Column(Integer, nullable=False) diff --git a/src/db/models/impl/url/scrape_info/sqlalchemy.py b/src/db/models/impl/url/scrape_info/sqlalchemy.py index b50f2903..bd59c6ff 100644 --- a/src/db/models/impl/url/scrape_info/sqlalchemy.py +++ b/src/db/models/impl/url/scrape_info/sqlalchemy.py @@ -1,15 +1,22 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.helpers import enum_column from src.db.models.impl.url.scrape_info.enums import ScrapeStatus -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base class URLScrapeInfo( - StandardBase, + Base, + CreatedAtMixin, + UpdatedAtMixin, URLDependentMixin ): __tablename__ = 'url_scrape_info' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) status = enum_column( enum_type=ScrapeStatus, diff --git 
a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index de6ee029..fea75df8 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -1,16 +1,20 @@ import sqlalchemy as sa +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class AgencyIDSubtaskSuggestion( - WithIDBase, + Base, CreatedAtMixin, AgencyDependentMixin, ): __tablename__ = "agency_id_subtask_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "subtask_id"), + ) subtask_id = sa.Column( sa.Integer, diff --git a/src/db/models/impl/url/suggestion/agency/user.py b/src/db/models/impl/url/suggestion/agency/user.py index 2cd18851..79fa933c 100644 --- a/src/db/models/impl/url/suggestion/agency/user.py +++ b/src/db/models/impl/url/suggestion/agency/user.py @@ -1,13 +1,17 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, Integer +from sqlalchemy import Column, Boolean, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class UserURLAgencySuggestion(URLDependentMixin, WithIDBase): +class UserURLAgencySuggestion(URLDependentMixin, Base): __tablename__ = "user_url_agency_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "url_id", "user_id"), + ) agency_id: Mapped[int] = get_agency_id_foreign_column(nullable=True) user_id = Column(Integer, nullable=False) @@ -15,7 +19,3 @@ class UserURLAgencySuggestion(URLDependentMixin, WithIDBase): agency = relationship("Agency") url = relationship("URL") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), - ) diff --git a/src/db/models/impl/url/suggestion/record_type/auto.py b/src/db/models/impl/url/suggestion/record_type/auto.py index 2aaed526..1c2c68d1 100644 --- a/src/db/models/impl/url/suggestion/record_type/auto.py +++ b/src/db/models/impl/url/suggestion/record_type/auto.py @@ -1,8 +1,9 @@ -from sqlalchemy import Column, UniqueConstraint +from sqlalchemy import Column, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values @@ -11,13 +12,13 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) __table_args__ = ( - UniqueConstraint("url_id", name="auto_record_type_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/record_type/user.py b/src/db/models/impl/url/suggestion/record_type/user.py index 5b9dde8c..4e271225 100644 --- a/src/db/models/impl/url/suggestion/record_type/user.py +++ 
b/src/db/models/impl/url/suggestion/record_type/user.py @@ -1,22 +1,27 @@ -from sqlalchemy import Column, Integer, UniqueConstraint +from sqlalchemy import Column, Integer, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): +class UserRecordTypeSuggestion( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + Base, +): __tablename__ = "user_record_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) user_id = Column(Integer, nullable=False) record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_record_type_suggestions"), - ) # Relationships - url = relationship("URL", back_populates="user_record_type_suggestions") diff --git a/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py index dd109269..19b5dc09 100644 --- a/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py @@ -1,7 +1,8 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float +from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase @@ -9,7 +10,7 @@ class AutoRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "auto_relevant_suggestions" @@ -19,6 +20,7 @@ class AutoRelevantSuggestion( __table_args__ = ( UniqueConstraint("url_id", name="auto_relevant_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/url_type/user.py b/src/db/models/impl/url/suggestion/url_type/user.py index c7070b5e..52bbc4eb 100644 --- a/src/db/models/impl/url/suggestion/url_type/user.py +++ b/src/db/models/impl/url/suggestion/url_type/user.py @@ -1,10 +1,11 @@ -from sqlalchemy import Column, UniqueConstraint, Integer +from sqlalchemy import Column, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase @@ -12,9 +13,12 @@ class UserURLTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "user_url_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) user_id = Column(Integer, nullable=False) type: Mapped[URLType | None] = enum_column( @@ -23,10 +27,6 @@ class UserURLTypeSuggestion( nullable=True ) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_relevant_suggestions"), - ) 
- # Relationships url = relationship("URL", back_populates="user_relevant_suggestions") diff --git a/src/db/models/impl/url/web_metadata/sqlalchemy.py b/src/db/models/impl/url/web_metadata/sqlalchemy.py index 45f5233c..3170a189 100644 --- a/src/db/models/impl/url/web_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/web_metadata/sqlalchemy.py @@ -1,17 +1,20 @@ -from sqlalchemy import Column, Text, Boolean, Integer +from sqlalchemy import Column, Text, Boolean, Integer, PrimaryKeyConstraint from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLWebMetadata( - WithIDBase, + Base, URLDependentMixin, CreatedAtMixin, UpdatedAtMixin ): """Contains information about the web page.""" __tablename__ = "url_web_metadata" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) accessed = Column( Boolean(), diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 31d6c7f9..faa965a8 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -38,7 +38,7 @@ def has_non_errored_urls_without_html_data() -> Select: .join(URLWebMetadata) .outerjoin(URLScrapeInfo) .where( - URLScrapeInfo.id == None, + URLScrapeInfo.url_id == None, ~exists(exclude_subquery), URLWebMetadata.status_code == HTTPStatus.OK.value, URLWebMetadata.content_type.like("%html%"), diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index c548eb5a..38ecb4bd 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,6 +1,7 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase @@ -22,16 +23,16 @@ async def run(self) -> None: scraper_info_list = [] for url_id in self.url_ids: html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.TITLE, + content_type=HTMLContentType.TITLE.value, content="test html content" ) ) html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.DESCRIPTION, + content_type=HTMLContentType.DESCRIPTION.value, content="test description" ) ) @@ -47,5 +48,5 @@ async def run(self) -> None: scraper_info_list.append(scraper_info) await self.adb_client.add_raw_html(raw_html_info_list) - await self.adb_client.add_html_content_infos(html_content_infos) + await self.adb_client.add_all(html_content_infos) From 9fe1818d343b444b64aa827759894b085fb13862 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 29 Nov 2025 16:14:05 -0500 Subject: [PATCH 73/84] Revise agency agreement logic --- .../user/queries/agreement/agency.py | 84 ++++++++++++------- .../contributions/user/queries/core.py | 2 +- 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/src/api/endpoints/contributions/user/queries/agreement/agency.py b/src/api/endpoints/contributions/user/queries/agreement/agency.py index 488e5c19..01000bf2 100644 --- 
a/src/api/endpoints/contributions/user/queries/agreement/agency.py +++ b/src/api/endpoints/contributions/user/queries/agreement/agency.py @@ -1,56 +1,80 @@ -from sqlalchemy import select, func, exists, and_ +from sqlalchemy import select, func, exists, and_, or_, any_, cast, Float -from src.api.endpoints.contributions.user.queries.annotated_and_validated import AnnotatedAndValidatedCTEContainer from src.api.endpoints.contributions.user.queries.templates.agreement import AgreementCTEContainer +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion -def get_agency_agreement_cte_container( - inner_cte: AnnotatedAndValidatedCTEContainer -) -> AgreementCTEContainer: +def get_agency_agreement_cte_container() -> AgreementCTEContainer: - count_cte = ( + uuas = UserURLAgencySuggestion + fuv = FlagURLValidated + lau = LinkURLAgency + # CTE 1: All validated Meta URLs/Data Sources and their agencies + validated_urls_with_agencies = ( select( - inner_cte.user_id, - func.count() + uuas.url_id, + func.array_agg(lau.agency_id).label("agency_ids"), + ) + .join(fuv, fuv.url_id == uuas.url_id) + .join(lau, lau.url_id == uuas.url_id, isouter=True) + .where( + or_( + uuas.is_new.is_(None), + uuas.is_new.is_(False) + ), + or_( + fuv.type == "meta url", + fuv.type == "data source" + ), + ) + .group_by(uuas.url_id) + .cte("validated_urls_with_agencies") + ) + + # CTE 2 + cte_2 = ( + select( + validated_urls_with_agencies.c.url_id, + validated_urls_with_agencies.c.agency_ids, + uuas.is_new, + uuas.user_id, + uuas.agency_id.label("suggested_agency_id"), + (uuas.agency_id == any_(validated_urls_with_agencies.c.agency_ids)).label( + "is_suggested_agency_validated" + ), ) .join( - UserURLAgencySuggestion, - and_( - inner_cte.user_id == UserURLAgencySuggestion.user_id, - inner_cte.url_id == UserURLAgencySuggestion.url_id - ) + validated_urls_with_agencies, + validated_urls_with_agencies.c.url_id == uuas.url_id, + ) + .cte("final") + ) + + count_cte = ( + select( + cte_2.c.user_id, + func.count() ) .group_by( - inner_cte.user_id + cte_2.c.user_id ) - .cte("agency_count_total") + .cte("count_cte") ) agreed_cte = ( select( - inner_cte.user_id, + cte_2.c.user_id, func.count() ) - .join( - UserURLAgencySuggestion, - and_( - inner_cte.user_id == UserURLAgencySuggestion.user_id, - inner_cte.url_id == UserURLAgencySuggestion.url_id - ) - ) .where( - exists() - .where( - LinkURLAgency.url_id == UserURLAgencySuggestion.url_id, - LinkURLAgency.agency_id == UserURLAgencySuggestion.agency_id - ) + cte_2.c.is_suggested_agency_validated.is_(True) ) .group_by( - inner_cte.user_id + cte_2.c.user_id ) - .cte("agency_count_agreed") + .cte("agreed_cte") ) return AgreementCTEContainer( diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index c7d4afef..d905026c 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -24,7 +24,7 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: contributions_cte = ContributionsCTEContainer() record_type_agree: AgreementCTEContainer = get_record_type_agreement_cte_container(inner_cte) - agency_agree: AgreementCTEContainer = get_agency_agreement_cte_container(inner_cte) + agency_agree: AgreementCTEContainer = get_agency_agreement_cte_container() url_type_agree: 
AgreementCTEContainer = get_url_type_agreement_cte_container(inner_cte) query = ( From 9f3047ef02daa074a09dd0faff42cfcf86df2b4e Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 29 Nov 2025 18:21:51 -0500 Subject: [PATCH 74/84] Add CORSMiddleware --- src/api/main.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/api/main.py b/src/api/main.py index ca6e56c4..87fa0d3a 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -6,6 +6,7 @@ from fastapi import FastAPI from pdap_access_manager.access_manager.async_ import AccessManagerAsync from pdap_access_manager.models.auth import AuthInfo +from starlette.middleware.cors import CORSMiddleware from starlette.responses import RedirectResponse from src.api.endpoints.agencies.routes import agencies_router @@ -166,6 +167,17 @@ async def setup_database(db_client): version="0.1.0", lifespan=lifespan ) +app.add_middleware( + CORSMiddleware, + allow_origins=[ + "http://localhost:8888", # For local development + "https://pdap.io", + "https://pdap.dev" + ], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) @app.get("/docs", include_in_schema=False) async def redirect_docs(): From fa8359f7962cc718f14fac54a36d10f68f3b4376 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 30 Nov 2025 07:35:12 -0500 Subject: [PATCH 75/84] Revert AgencyIDSubtaskSuggestion --- .../2025_11_29_0717-5d6412540aba_remove_id_columns.py | 2 -- .../impl/url/suggestion/agency/suggestion/sqlalchemy.py | 8 +++----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py index 34ae8506..cd636a3b 100644 --- a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py +++ b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py @@ -19,7 +19,6 @@ TABLES = [ "task_errors", # - "agency_id_subtask_suggestions", # "auto_record_type_suggestions", # "auto_relevant_suggestions", # "duplicates", # @@ -63,7 +62,6 @@ BESPOKE_UNIQUE_IDS: dict[str, list[str]] = { "task_errors": ["task_id"], # - "agency_id_subtask_suggestions": ["agency_id", "subtask_id"], # "link_agencies__locations": ["agency_id", "location_id"], # "link_urls_redirect_url": ["source_url_id", "destination_url_id"], # "link_urls_root_url": ["url_id", "root_url_id"], # diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index fea75df8..b6b2cc01 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -3,18 +3,16 @@ from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin -from src.db.models.templates_.base import Base +from src.db.models.templates_.with_id import WithIDBase class AgencyIDSubtaskSuggestion( - Base, + WithIDBase, CreatedAtMixin, AgencyDependentMixin, ): __tablename__ = "agency_id_subtask_suggestions" - __table_args__ = ( - PrimaryKeyConstraint("agency_id", "subtask_id"), - ) + subtask_id = sa.Column( sa.Integer, From c2ddbe3381e8f9f761d52f615898366131e42b29 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 30 Nov 2025 07:53:41 -0500 Subject: [PATCH 76/84] Remove adding primary key to duplicates (ironically) --- .../versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py index cd636a3b..9a20bafb 100644 --- a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py +++ b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py @@ -66,7 +66,6 @@ "link_urls_redirect_url": ["source_url_id", "destination_url_id"], # "link_urls_root_url": ["url_id", "root_url_id"], # "url_html_content": ["url_id", "content_type"], # - "duplicates": ["batch_id", "original_url_id"] } def drop_views(): From a6f27db2874ffabccc24484c616824b48a0ace75 Mon Sep 17 00:00:00 2001 From: maxachis Date: Mon, 1 Dec 2025 12:55:32 -0500 Subject: [PATCH 77/84] Begin draft --- .../annotate/all/get/models/agency.py | 18 ++- .../annotate/all/get/models/location.py | 20 +++- .../annotate/all/get/queries/agency/core.py | 1 + .../all/get/queries/location_/core.py | 3 + .../all/get/queries/location_/requester.py | 110 +++++++++++++++++- src/db/models/impl/url/core/sqlalchemy.py | 26 ++++- .../suggestion/location/user/sqlalchemy.py | 5 +- src/db/templates/requester.py | 7 ++ 8 files changed, 176 insertions(+), 14 deletions(-) diff --git a/src/api/endpoints/annotate/all/get/models/agency.py b/src/api/endpoints/annotate/all/get/models/agency.py index 45806d98..2c685e6e 100644 --- a/src/api/endpoints/annotate/all/get/models/agency.py +++ b/src/api/endpoints/annotate/all/get/models/agency.py @@ -1,6 +1,16 @@ from pydantic import BaseModel, Field +class AgencyAnnotationSuggestion(BaseModel): + agency_id: int + agency_name: str + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", + ge=0, + le=100, + ) +# TODO: Replace Usages and Delete class AgencyAnnotationAutoSuggestion(BaseModel): agency_id: int agency_name: str @@ -10,11 +20,13 @@ class AgencyAnnotationAutoSuggestion(BaseModel): le=100, ) +# TODO: Replace Usages and Delete class AgencyAnnotationUserSuggestion(BaseModel): agency_id: int agency_name: str user_count: int +# TODO: Replace Usages and Delete class AgencyAnnotationUserSuggestionOuterInfo(BaseModel): suggestions: list[AgencyAnnotationUserSuggestion] not_found_count: int = Field( @@ -23,5 +35,7 @@ class AgencyAnnotationUserSuggestionOuterInfo(BaseModel): ) class AgencyAnnotationResponseOuterInfo(BaseModel): - user: AgencyAnnotationUserSuggestionOuterInfo - auto: list[AgencyAnnotationAutoSuggestion] \ No newline at end of file + suggestions: list[AgencyAnnotationSuggestion] + not_found_count: int = Field( + description="How many users indicated the agency could not be found." + ) diff --git a/src/api/endpoints/annotate/all/get/models/location.py b/src/api/endpoints/annotate/all/get/models/location.py index fb467004..4660ee52 100644 --- a/src/api/endpoints/annotate/all/get/models/location.py +++ b/src/api/endpoints/annotate/all/get/models/location.py @@ -1,6 +1,17 @@ from pydantic import BaseModel, Field +class LocationAnnotationSuggestion(BaseModel): + location_id: int + location_name: str + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. 
Null if no robo-label occurred.", + ge=0, + le=100, + ) + +# TODO: Replace Usages and Delete class LocationAnnotationAutoSuggestion(BaseModel): location_id: int location_name: str = Field( @@ -12,7 +23,7 @@ class LocationAnnotationAutoSuggestion(BaseModel): le=100, ) - +# TODO: Replace Usages and Delete class LocationAnnotationUserSuggestion(BaseModel): location_id: int location_name: str = Field( @@ -23,6 +34,7 @@ class LocationAnnotationUserSuggestion(BaseModel): ge=1, ) +# TODO: Replace Usages and Delete class LocationAnnotationUserSuggestionOuterInfo(BaseModel): suggestions: list[LocationAnnotationUserSuggestion] not_found_count: int = Field( @@ -31,5 +43,7 @@ class LocationAnnotationUserSuggestionOuterInfo(BaseModel): ) class LocationAnnotationResponseOuterInfo(BaseModel): - user: LocationAnnotationUserSuggestionOuterInfo - auto: list[LocationAnnotationAutoSuggestion] \ No newline at end of file + suggestions: list[LocationAnnotationSuggestion] + not_found_count: int = Field( + description="How many users indicated the location could not be found." + ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/agency/core.py b/src/api/endpoints/annotate/all/get/queries/agency/core.py index 28cfbd2d..d3502b96 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/core.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/core.py @@ -30,6 +30,7 @@ async def run(self, session: AsyncSession) -> AgencyAnnotationResponseOuterInfo: location_id=self.location_id ) + # TODO: Pull both in single query user_suggestions: list[AgencyAnnotationUserSuggestion] = \ await requester.get_user_agency_suggestions() auto_suggestions: list[AgencyAnnotationAutoSuggestion] = \ diff --git a/src/api/endpoints/annotate/all/get/queries/location_/core.py b/src/api/endpoints/annotate/all/get/queries/location_/core.py index 85db523c..3ef0fb99 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/core.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/core.py @@ -24,6 +24,9 @@ def __init__( async def run(self, session: AsyncSession) -> LocationAnnotationResponseOuterInfo: requester = GetLocationSuggestionsRequester(session) + + # TODO: Pull both in single query + suggestions user_suggestions: list[LocationAnnotationUserSuggestion] = \ await requester.get_user_location_suggestions(self.url_id) auto_suggestions: list[LocationAnnotationAutoSuggestion] = \ diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index 6ad56c56..abae28ee 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -1,9 +1,11 @@ from typing import Sequence -from sqlalchemy import select, func, RowMapping +from sqlalchemy import select, func, RowMapping, or_, and_ from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion, \ - LocationAnnotationAutoSuggestion + LocationAnnotationAutoSuggestion, LocationAnnotationSuggestion +from src.db.helpers.query import exists_url +from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion @@ -11,10 +13,112 @@ 
from src.db.models.views.location_expanded import LocationExpandedView from src.db.templates.requester import RequesterBase -from src.db.helpers.session import session_helper as sh class GetLocationSuggestionsRequester(RequesterBase): + async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotationSuggestion]: + # All locations with either a user or robo annotation + valid_locations_cte = ( + select( + LocationExpandedView.id, + ) + .where( + or_( + exists_url( + UserLocationSuggestion + ), + exists_url( + AutoLocationIDSubtask + ) + ) + ) + .cte("valid_locations") + ) + # Number of users who suggested each location + user_suggestions_cte = ( + select( + UserLocationSuggestion.url_id, + LocationExpandedView.id, + func.count(UserLocationSuggestion.user_id).label('user_count') + ) + .outerjoin( + LocationExpandedView, + LocationExpandedView.id == UserLocationSuggestion.location_id + ) + .group_by( + UserLocationSuggestion.location_id, + UserLocationSuggestion.url_id, + ) + .cte("user_suggestions") + ) + # Maximum confidence of robo annotation, if any + robo_suggestions_cte = ( + select( + AutoLocationIDSubtask.url_id, + LocationExpandedView.id, + func.max(LocationIDSubtaskSuggestion.confidence).label('robo_confidence') + ) + .outerjoin( + LocationExpandedView, + LocationExpandedView.id == LocationIDSubtaskSuggestion.location_id + ) + .join( + AutoLocationIDSubtask, + AutoLocationIDSubtask.id == LocationIDSubtaskSuggestion.subtask_id + ) + .group_by( + LocationExpandedView.id, + AutoLocationIDSubtask.url_id, + ) + .cte("robo_suggestions") + ) + # Join user and robo suggestions + joined_suggestions_query = ( + select( + valid_locations_cte.c.id.label("location_id"), + LocationExpandedView.full_display_name.label("location_name"), + user_suggestions_cte.c.user_count, + robo_suggestions_cte.c.robo_confidence, + ) + .join( + LocationExpandedView, + LocationExpandedView.id == valid_locations_cte.c.id + ) + .outerjoin( + user_suggestions_cte, + and_( + user_suggestions_cte.c.url_id == url_id, + user_suggestions_cte.c.location_id == LocationExpandedView.id + ) + ) + .outerjoin( + robo_suggestions_cte, + and_( + robo_suggestions_cte.c.url_id == url_id, + robo_suggestions_cte.c.location_id == LocationExpandedView.id + ) + ) + ) + + mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) + suggestions: list[LocationAnnotationSuggestion] = [ + LocationAnnotationSuggestion( + **mapping + ) + for mapping in mappings + ] + return suggestions + + async def get_location_not_found_suggestions(self, url_id: int ) -> int: + query = ( + select( + func.count(LinkUserSuggestionLocationNotFound.user_id) + ) + .where( + LinkUserSuggestionLocationNotFound.url_id == url_id + ) + ) + return await self.scalar(query) async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnotationUserSuggestion]: query = ( diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 8ee51a43..de4af177 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -5,11 +5,13 @@ from src.collectors.enums import URLStatus from src.db.models.helpers import enum_column +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.html.compressed.sqlalchemy import 
URLCompressedHTML from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin @@ -85,25 +87,39 @@ def full_url(cls): secondary="link_tasks__urls", back_populates="urls", ) - auto_agency_subtasks = relationship( - "URLAutoAgencyIDSubtask" + + + name_suggestions = relationship( + URLNameSuggestion + ) + # Location + user_location_suggestions = relationship( + UserLocationSuggestion + ) + user_location_suggestion_not_found = relationship( + LinkUserSuggestionLocationNotFound ) auto_location_subtasks = relationship( AutoLocationIDSubtask ) - name_suggestions = relationship( - URLNameSuggestion - ) + + # Agency user_agency_suggestions = relationship( "UserURLAgencySuggestion", back_populates="url") + auto_agency_subtasks = relationship( + "URLAutoAgencyIDSubtask" + ) + # Record Type auto_record_type_suggestion = relationship( "AutoRecordTypeSuggestion", uselist=False, back_populates="url") user_record_type_suggestions = relationship( "UserRecordTypeSuggestion", back_populates="url") + # Relevant/URL Type auto_relevant_suggestion = relationship( "AutoRelevantSuggestion", uselist=False, back_populates="url") user_relevant_suggestions = relationship( "UserURLTypeSuggestion", back_populates="url") + reviewing_user = relationship( "ReviewingUserURL", uselist=False, back_populates="url") optional_data_source_metadata = relationship( diff --git a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py index a9d4ae8b..18ac3851 100644 --- a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py @@ -1,5 +1,7 @@ from sqlalchemy import Integer, Column, PrimaryKeyConstraint +from sqlalchemy.orm import relationship +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LocationDependentMixin from src.db.models.templates_.base import Base @@ -18,4 +20,5 @@ class UserLocationSuggestion( user_id = Column( Integer, nullable=False, - ) \ No newline at end of file + ) + diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py index b56af87f..9588ea9d 100644 --- a/src/db/templates/requester.py +++ b/src/db/templates/requester.py @@ -4,6 +4,7 @@ """ from abc import ABC +from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession import src.db.helpers.session.session_helper as sh @@ -16,5 +17,11 @@ def __init__(self, session: AsyncSession): self.session = session self.session_helper = sh + async def scalar(self, query: Select): + return await sh.scalar(self.session, query=query) + + async def mappings(self, query: Select): + return await sh.mappings(self.session, query=query) + async def run_query_builder(self, query_builder: QueryBuilderBase): return await query_builder.run(session=self.session) \ No newline at end of file From 6981bf939704a956990e8fa89e9c585ebd86fb2d Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 1 Dec 2025 14:51:35 -0500 Subject: [PATCH 78/84] Update annotations to join user 
and robo suggestions for locations and agencies --- .../annotate/all/get/models/agency.py | 24 --- .../annotate/all/get/models/location.py | 31 ---- .../annotate/all/get/queries/agency/core.py | 24 +-- .../all/get/queries/agency/requester.py | 148 +++++++++--------- .../all/get/queries/location_/core.py | 27 +--- .../all/get/queries/location_/requester.py | 92 +---------- .../agency/suggestion/sqlalchemy.py | 1 - .../api/annotate/all/test_happy_path.py | 23 ++- 8 files changed, 105 insertions(+), 265 deletions(-) diff --git a/src/api/endpoints/annotate/all/get/models/agency.py b/src/api/endpoints/annotate/all/get/models/agency.py index 2c685e6e..fc568af3 100644 --- a/src/api/endpoints/annotate/all/get/models/agency.py +++ b/src/api/endpoints/annotate/all/get/models/agency.py @@ -10,30 +10,6 @@ class AgencyAnnotationSuggestion(BaseModel): le=100, ) -# TODO: Replace Usages and Delete -class AgencyAnnotationAutoSuggestion(BaseModel): - agency_id: int - agency_name: str - confidence: int = Field( - title="The confidence of the location", - ge=0, - le=100, - ) - -# TODO: Replace Usages and Delete -class AgencyAnnotationUserSuggestion(BaseModel): - agency_id: int - agency_name: str - user_count: int - -# TODO: Replace Usages and Delete -class AgencyAnnotationUserSuggestionOuterInfo(BaseModel): - suggestions: list[AgencyAnnotationUserSuggestion] - not_found_count: int = Field( - title="How many users listed the agency as not found.", - ge=0, - ) - class AgencyAnnotationResponseOuterInfo(BaseModel): suggestions: list[AgencyAnnotationSuggestion] not_found_count: int = Field( diff --git a/src/api/endpoints/annotate/all/get/models/location.py b/src/api/endpoints/annotate/all/get/models/location.py index 4660ee52..0100bbc4 100644 --- a/src/api/endpoints/annotate/all/get/models/location.py +++ b/src/api/endpoints/annotate/all/get/models/location.py @@ -11,37 +11,6 @@ class LocationAnnotationSuggestion(BaseModel): le=100, ) -# TODO: Replace Usages and Delete -class LocationAnnotationAutoSuggestion(BaseModel): - location_id: int - location_name: str = Field( - title="The full name of the location" - ) - confidence: int = Field( - title="The confidence of the location", - ge=0, - le=100, - ) - -# TODO: Replace Usages and Delete -class LocationAnnotationUserSuggestion(BaseModel): - location_id: int - location_name: str = Field( - title="The full name of the location" - ) - user_count: int = Field( - title="The number of users who suggested this location", - ge=1, - ) - -# TODO: Replace Usages and Delete -class LocationAnnotationUserSuggestionOuterInfo(BaseModel): - suggestions: list[LocationAnnotationUserSuggestion] - not_found_count: int = Field( - title="How many users listed the location as not found.", - ge=0, - ) - class LocationAnnotationResponseOuterInfo(BaseModel): suggestions: list[LocationAnnotationSuggestion] not_found_count: int = Field( diff --git a/src/api/endpoints/annotate/all/get/queries/agency/core.py b/src/api/endpoints/annotate/all/get/queries/agency/core.py index d3502b96..d9a86717 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/core.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/core.py @@ -1,13 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo, \ - AgencyAnnotationUserSuggestionOuterInfo, AgencyAnnotationUserSuggestion, AgencyAnnotationAutoSuggestion -from src.api.endpoints.annotate.all.get.queries.agency.requester import GetAgencySuggestionsRequester -from 
src.db.queries.base.builder import QueryBuilderBase -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo, \ - AgencyAnnotationUserSuggestionOuterInfo, AgencyAnnotationUserSuggestion, AgencyAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationSuggestion from src.api.endpoints.annotate.all.get.queries.agency.requester import GetAgencySuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -30,19 +24,13 @@ async def run(self, session: AsyncSession) -> AgencyAnnotationResponseOuterInfo: location_id=self.location_id ) - # TODO: Pull both in single query - user_suggestions: list[AgencyAnnotationUserSuggestion] = \ - await requester.get_user_agency_suggestions() - auto_suggestions: list[AgencyAnnotationAutoSuggestion] = \ - await requester.get_auto_agency_suggestions() + suggestions: list[AgencyAnnotationSuggestion] = \ + await requester.get_agency_suggestions() not_found_count: int = \ await requester.get_not_found_count() return AgencyAnnotationResponseOuterInfo( - user=AgencyAnnotationUserSuggestionOuterInfo( - suggestions=user_suggestions, - not_found_count=not_found_count - ), - auto=auto_suggestions, + suggestions=suggestions, + not_found_count=not_found_count ) diff --git a/src/api/endpoints/annotate/all/get/queries/agency/requester.py b/src/api/endpoints/annotate/all/get/queries/agency/requester.py index e6ffb817..28923cf2 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/requester.py @@ -1,16 +1,15 @@ from typing import Sequence -from sqlalchemy import func, select, RowMapping +from sqlalchemy import func, select, RowMapping, or_, and_ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationAutoSuggestion, \ - AgencyAnnotationUserSuggestion -from src.api.endpoints.annotate.all.get.queries.agency.suggestions_with_highest_confidence import \ - SuggestionsWithHighestConfidenceCTE +from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationSuggestion +from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.templates.requester import RequesterBase @@ -27,102 +26,97 @@ def __init__( self.url_id = url_id self.location_id = location_id - async def get_user_agency_suggestions(self) -> list[AgencyAnnotationUserSuggestion]: - query = ( + async def get_agency_suggestions(self) -> list[AgencyAnnotationSuggestion]: + # All agencies with either a user or robo annotation + valid_agencies_cte = ( select( - UserURLAgencySuggestion.agency_id, - func.count(UserURLAgencySuggestion.user_id).label("count"), - Agency.name.label("agency_name"), - ) - .join( - Agency, - Agency.id == 
UserURLAgencySuggestion.agency_id + Agency.id, ) - - ) - - if self.location_id is not None: - query = ( - query.join( - LinkAgencyLocation, - LinkAgencyLocation.agency_id == UserURLAgencySuggestion.agency_id - ) - .where( - LinkAgencyLocation.location_id == self.location_id + .where( + or_( + exists_url( + UserURLAgencySuggestion + ), + exists_url( + URLAutoAgencyIDSubtask + ) ) ) + .cte("valid_agencies") + ) - query = ( - query.where( - UserURLAgencySuggestion.url_id == self.url_id + # Number of users who suggested each agency + user_suggestions_cte = ( + select( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id, + func.count(UserURLAgencySuggestion.user_id).label('user_count') ) .group_by( UserURLAgencySuggestion.agency_id, - Agency.name + UserURLAgencySuggestion.url_id, ) - .order_by( - func.count(UserURLAgencySuggestion.user_id).desc() - ) - .limit(3) + .cte("user_suggestions") ) - results: Sequence[RowMapping] = await sh.mappings(self.session, query=query) - - return [ - AgencyAnnotationUserSuggestion( - agency_id=autosuggestion["agency_id"], - user_count=autosuggestion["count"], - agency_name=autosuggestion["agency_name"], + # Maximum confidence of robo annotation, if any + robo_suggestions_cte = ( + select( + URLAutoAgencyIDSubtask.url_id, + Agency.id.label("agency_id"), + func.max(AgencyIDSubtaskSuggestion.confidence).label('robo_confidence') ) - for autosuggestion in results - ] - - - async def get_auto_agency_suggestions(self) -> list[AgencyAnnotationAutoSuggestion]: - cte = SuggestionsWithHighestConfidenceCTE() - query = ( + .join( + AgencyIDSubtaskSuggestion, + AgencyIDSubtaskSuggestion.subtask_id == URLAutoAgencyIDSubtask.id + ) + .join( + Agency, + Agency.id == AgencyIDSubtaskSuggestion.agency_id + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + Agency.id + ) + .cte("robo_suggestions") + ) + # Join user and robo suggestions + joined_suggestions_query = ( select( - cte.agency_id, - cte.confidence, + valid_agencies_cte.c.id.label("agency_id"), Agency.name.label("agency_name"), + func.coalesce(user_suggestions_cte.c.user_count, 0).label('user_count'), + func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label('robo_confidence'), ) .join( Agency, - Agency.id == cte.agency_id + Agency.id == valid_agencies_cte.c.id ) - ) - - if self.location_id is not None: - query = ( - query.join( - LinkAgencyLocation, - LinkAgencyLocation.agency_id == cte.agency_id - ) - .where( - LinkAgencyLocation.location_id == self.location_id + .outerjoin( + user_suggestions_cte, + and_( + user_suggestions_cte.c.url_id == self.url_id, + user_suggestions_cte.c.agency_id == Agency.id ) ) - - query = ( - query.where( - cte.url_id == self.url_id - ) - .order_by( - cte.confidence.desc() + .outerjoin( + robo_suggestions_cte, + and_( + robo_suggestions_cte.c.url_id == self.url_id, + robo_suggestions_cte.c.agency_id == Agency.id + ) ) - .limit(3) ) - results: Sequence[RowMapping] = await sh.mappings(self.session, query=query) - - return [ - AgencyAnnotationAutoSuggestion( - agency_id=autosuggestion["agency_id"], - confidence=autosuggestion["confidence"], - agency_name=autosuggestion["agency_name"], + # Return suggestions + mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) + suggestions: list[AgencyAnnotationSuggestion] = [ + AgencyAnnotationSuggestion( + **mapping ) - for autosuggestion in results + for mapping in mappings ] + return suggestions async def get_not_found_count(self) -> int: query = ( diff --git 
a/src/api/endpoints/annotate/all/get/queries/location_/core.py b/src/api/endpoints/annotate/all/get/queries/location_/core.py index 3ef0fb99..e1909b77 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/core.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/core.py @@ -1,13 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ - LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion, LocationAnnotationUserSuggestionOuterInfo -from src.api.endpoints.annotate.all.get.queries.location_.requester import GetLocationSuggestionsRequester -from src.db.queries.base.builder import QueryBuilderBase -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ - LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion from src.api.endpoints.annotate.all.get.queries.location_.requester import GetLocationSuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -21,24 +15,17 @@ def __init__( super().__init__() self.url_id = url_id - + # TODO: Test async def run(self, session: AsyncSession) -> LocationAnnotationResponseOuterInfo: requester = GetLocationSuggestionsRequester(session) - # TODO: Pull both in single query - suggestions - user_suggestions: list[LocationAnnotationUserSuggestion] = \ - await requester.get_user_location_suggestions(self.url_id) - auto_suggestions: list[LocationAnnotationAutoSuggestion] = \ - await requester.get_auto_location_suggestions(self.url_id) + suggestions: list[LocationAnnotationSuggestion] = \ + await requester.get_location_suggestions(self.url_id) not_found_count: int = \ await requester.get_not_found_count(self.url_id) return LocationAnnotationResponseOuterInfo( - user=LocationAnnotationUserSuggestionOuterInfo( - suggestions=user_suggestions, - not_found_count=not_found_count - ), - auto=auto_suggestions + suggestions=suggestions, + not_found_count=not_found_count ) diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index abae28ee..26175322 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -2,8 +2,7 @@ from sqlalchemy import select, func, RowMapping, or_, and_ -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion, \ - LocationAnnotationAutoSuggestion, LocationAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound @@ -38,13 +37,9 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation user_suggestions_cte = ( select( UserLocationSuggestion.url_id, - LocationExpandedView.id, + UserLocationSuggestion.location_id, func.count(UserLocationSuggestion.user_id).label('user_count') ) - .outerjoin( - LocationExpandedView, - LocationExpandedView.id == UserLocationSuggestion.location_id - ) 
.group_by( UserLocationSuggestion.location_id, UserLocationSuggestion.url_id, @@ -55,10 +50,10 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation robo_suggestions_cte = ( select( AutoLocationIDSubtask.url_id, - LocationExpandedView.id, + LocationExpandedView.id.label("location_id"), func.max(LocationIDSubtaskSuggestion.confidence).label('robo_confidence') ) - .outerjoin( + .join( LocationExpandedView, LocationExpandedView.id == LocationIDSubtaskSuggestion.location_id ) @@ -77,8 +72,8 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation select( valid_locations_cte.c.id.label("location_id"), LocationExpandedView.full_display_name.label("location_name"), - user_suggestions_cte.c.user_count, - robo_suggestions_cte.c.robo_confidence, + func.coalesce(user_suggestions_cte.c.user_count, 0).label("user_count"), + func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label("robo_confidence"), ) .join( LocationExpandedView, @@ -109,81 +104,6 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation ] return suggestions - async def get_location_not_found_suggestions(self, url_id: int ) -> int: - query = ( - select( - func.count(LinkUserSuggestionLocationNotFound.user_id) - ) - .where( - LinkUserSuggestionLocationNotFound.url_id == url_id - ) - ) - return await self.scalar(query) - - async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnotationUserSuggestion]: - query = ( - select( - UserLocationSuggestion.location_id, - LocationExpandedView.full_display_name.label("location_name"), - func.count(UserLocationSuggestion.user_id).label('user_count') - ) - .join( - LocationExpandedView, - LocationExpandedView.id == UserLocationSuggestion.location_id - ) - .where( - UserLocationSuggestion.url_id == url_id - ) - .group_by( - UserLocationSuggestion.location_id, - LocationExpandedView.full_display_name - ) - .order_by( - func.count(UserLocationSuggestion.user_id).desc() - ) - ) - raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) - return [ - LocationAnnotationUserSuggestion( - **raw_result - ) - for raw_result in raw_results - ] - - - - async def get_auto_location_suggestions( - self, - url_id: int - ) -> list[LocationAnnotationAutoSuggestion]: - query = ( - select( - LocationExpandedView.full_display_name.label("location_name"), - LocationIDSubtaskSuggestion.location_id, - LocationIDSubtaskSuggestion.confidence, - ) - .join( - LocationExpandedView, - LocationExpandedView.id == LocationIDSubtaskSuggestion.location_id - ) - .join( - AutoLocationIDSubtask, - AutoLocationIDSubtask.id == LocationIDSubtaskSuggestion.subtask_id - ) - .where( - AutoLocationIDSubtask.url_id == url_id - ) - .order_by( - LocationIDSubtaskSuggestion.confidence.desc() - ) - ) - raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) - return [ - LocationAnnotationAutoSuggestion( - **raw_result - ) - for raw_result in raw_results - ] async def get_not_found_count(self, url_id: int) -> int: query = ( diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index b6b2cc01..3f8b8186 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -1,5 +1,4 @@ import sqlalchemy as sa -from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import 
CreatedAtMixin, AgencyDependentMixin diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 007e87f7..1505d0b7 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -1,6 +1,6 @@ import pytest -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder from src.api.endpoints.annotate.all.post.models.agency import AnnotationPostAgencyInfo @@ -140,20 +140,27 @@ async def test_annotate_all( user_id=99, ) ) - user_suggestions: list[LocationAnnotationUserSuggestion] = \ - response.next_annotation.location_suggestions.user.suggestions - assert len(user_suggestions) == 2 + suggestions: list[LocationAnnotationSuggestion] = response.next_annotation.location_suggestions.suggestions + assert len(suggestions) == 2 - response_location_ids: list[int] = [location_suggestion.location_id for location_suggestion in user_suggestions] - assert set(response_location_ids) == {california.location_id, pennsylvania.location_id} + response_location_ids: list[int] = [ + location_suggestion.location_id + for location_suggestion in suggestions] - response_location_names: list[str] = [location_suggestion.location_name for location_suggestion in user_suggestions] + assert set(response_location_ids) == { + california.location_id, + pennsylvania.location_id + } + + response_location_names: list[str] = [ + location_suggestion.location_name + for location_suggestion in suggestions] assert set(response_location_names) == { "California", "Pennsylvania" } - for user_suggestion in user_suggestions: + for user_suggestion in suggestions: assert user_suggestion.user_count == 1 # Confirm 3 name suggestions From 3ed9106b0d840072694233e739002866c69e4d76 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 1 Dec 2025 15:21:04 -0500 Subject: [PATCH 79/84] Update source collector permission --- src/security/enums.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/security/enums.py b/src/security/enums.py index c10c346b..0090c3bc 100644 --- a/src/security/enums.py +++ b/src/security/enums.py @@ -2,5 +2,5 @@ class Permissions(Enum): - SOURCE_COLLECTOR = "source_collector" + SOURCE_COLLECTOR = "access_source_collector" SOURCE_COLLECTOR_FINAL_REVIEW = "source_collector_final_review" From a0c2dae63c9d8518f911eaf4b88373157fb519ca Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 1 Dec 2025 16:14:47 -0500 Subject: [PATCH 80/84] Fix bug where locations/agencies without annotations were being returned --- .../all/get/queries/agency/requester.py | 6 ++ .../suggestions_with_highest_confidence.py | 62 ------------------- .../all/get/queries/location_/requester.py | 6 ++ .../api/annotate/all/test_happy_path.py | 2 + 4 files changed, 14 insertions(+), 62 deletions(-) delete mode 100644 src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py diff --git a/src/api/endpoints/annotate/all/get/queries/agency/requester.py b/src/api/endpoints/annotate/all/get/queries/agency/requester.py index 28923cf2..136b5ee5 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/requester.py +++ 
b/src/api/endpoints/annotate/all/get/queries/agency/requester.py @@ -106,6 +106,12 @@ async def get_agency_suggestions(self) -> list[AgencyAnnotationSuggestion]: robo_suggestions_cte.c.agency_id == Agency.id ) ) + .where( + or_( + user_suggestions_cte.c.user_count > 0, + robo_suggestions_cte.c.robo_confidence > 0 + ) + ) ) # Return suggestions diff --git a/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py deleted file mode 100644 index 6d389b11..00000000 --- a/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py +++ /dev/null @@ -1,62 +0,0 @@ -from sqlalchemy import CTE, select, func, Column - -from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion - -SUGGESTIONS_WITH_HIGHEST_CONFIDENCE_CTE: CTE = ( - select( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id, - func.max(AgencyIDSubtaskSuggestion.confidence) - ) - .select_from(URLAutoAgencyIDSubtask) - .join( - AgencyIDSubtaskSuggestion, - URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id - ) - .group_by( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id - ) - .cte("suggestions_with_highest_confidence") -) - -class SuggestionsWithHighestConfidenceCTE: - - def __init__(self): - self._cte = ( - select( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id, - func.max(AgencyIDSubtaskSuggestion.confidence).label("confidence") - ) - .select_from(URLAutoAgencyIDSubtask) - .join( - AgencyIDSubtaskSuggestion, - URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id - ) - .where( - AgencyIDSubtaskSuggestion.agency_id.isnot(None) - ) - .group_by( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id - ) - .cte("suggestions_with_highest_confidence") - ) - - @property - def cte(self) -> CTE: - return self._cte - - @property - def url_id(self) -> Column[int]: - return self._cte.columns.url_id - - @property - def agency_id(self) -> Column[int]: - return self._cte.columns.agency_id - - @property - def confidence(self) -> Column[float]: - return self._cte.columns.confidence \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index 26175322..66942661 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -93,6 +93,12 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation robo_suggestions_cte.c.location_id == LocationExpandedView.id ) ) + .where( + or_( + user_suggestions_cte.c.user_count > 0, + robo_suggestions_cte.c.robo_confidence > 0 + ) + ) ) mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 1505d0b7..7250de89 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -23,7 +23,9 @@ async def test_annotate_all( api_test_helper, pennsylvania: USStateCreationInfo, + allegheny_county: USStateCreationInfo, california: USStateCreationInfo, + 
test_agency_id: int ): """ Test the happy path workflow for the all-annotations endpoint From 00e5095b5dcd32b36d77f00ddee42084b63d736f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 1 Dec 2025 17:46:59 -0500 Subject: [PATCH 81/84] Add sorting for suggestions --- ...98f9cd8a_create_anonymous_session_users.py | 39 +++++++++++++++++++ .../annotate/all/get/models/agency.py | 12 +----- .../annotate/all/get/models/location.py | 13 +------ .../annotate/all/get/models/suggestion.py | 17 ++++++++ .../all/get/queries/_shared/__init__.py | 0 .../annotate/all/get/queries/_shared/sort.py | 13 +++++++ .../annotate/all/get/queries/agency/core.py | 4 +- .../all/get/queries/agency/requester.py | 15 +++---- .../all/get/queries/location_/core.py | 4 +- .../all/get/queries/location_/requester.py | 16 ++++---- .../api/annotate/all/test_happy_path.py | 8 ++-- .../api/annotate/anonymous/test_core.py | 5 +++ 12 files changed, 102 insertions(+), 44 deletions(-) create mode 100644 alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py create mode 100644 src/api/endpoints/annotate/all/get/models/suggestion.py create mode 100644 src/api/endpoints/annotate/all/get/queries/_shared/__init__.py create mode 100644 src/api/endpoints/annotate/all/get/queries/_shared/sort.py diff --git a/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py new file mode 100644 index 00000000..af4553d2 --- /dev/null +++ b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py @@ -0,0 +1,39 @@ +"""Create anonymous_session_users + +Revision ID: 1d3398f9cd8a +Revises: 5d6412540aba +Create Date: 2025-12-01 16:32:27.842175 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import UUID + +# revision identifiers, used by Alembic. +revision: str = '1d3398f9cd8a' +down_revision: Union[str, None] = '5d6412540aba' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Create anonymous_sessions table + op.create_table( + "anonymous_sessions", + sa.Column( + "id", + UUID, + server_default=sa.text("gen_random_uuid()"), + primary_key=True + ), + ) + + # TODO: Update anonymous tables to link to anonymous sessions table + + ## TODO: Drop any unique IDs forbidding more than a single ID for these columns + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/annotate/all/get/models/agency.py b/src/api/endpoints/annotate/all/get/models/agency.py index fc568af3..593438ce 100644 --- a/src/api/endpoints/annotate/all/get/models/agency.py +++ b/src/api/endpoints/annotate/all/get/models/agency.py @@ -1,17 +1,9 @@ from pydantic import BaseModel, Field -class AgencyAnnotationSuggestion(BaseModel): - agency_id: int - agency_name: str - user_count: int - robo_confidence: int | None = Field( - description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", - ge=0, - le=100, - ) +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel class AgencyAnnotationResponseOuterInfo(BaseModel): - suggestions: list[AgencyAnnotationSuggestion] + suggestions: list[SuggestionModel] not_found_count: int = Field( description="How many users indicated the agency could not be found." 
) diff --git a/src/api/endpoints/annotate/all/get/models/location.py b/src/api/endpoints/annotate/all/get/models/location.py index 0100bbc4..be277c41 100644 --- a/src/api/endpoints/annotate/all/get/models/location.py +++ b/src/api/endpoints/annotate/all/get/models/location.py @@ -1,18 +1,9 @@ from pydantic import BaseModel, Field - -class LocationAnnotationSuggestion(BaseModel): - location_id: int - location_name: str - user_count: int - robo_confidence: int | None = Field( - description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", - ge=0, - le=100, - ) +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel class LocationAnnotationResponseOuterInfo(BaseModel): - suggestions: list[LocationAnnotationSuggestion] + suggestions: list[SuggestionModel] not_found_count: int = Field( description="How many users indicated the location could not be found." ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/models/suggestion.py b/src/api/endpoints/annotate/all/get/models/suggestion.py new file mode 100644 index 00000000..bed981fe --- /dev/null +++ b/src/api/endpoints/annotate/all/get/models/suggestion.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel, Field + + +class SuggestionModel(BaseModel): + id: int + display_name: str + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", + ge=0, + le=100, + ) + + @property + def score(self) -> float: + robo_score = (self.robo_confidence or 0) / 100 + return self.user_count + robo_score \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/_shared/__init__.py b/src/api/endpoints/annotate/all/get/queries/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/queries/_shared/sort.py b/src/api/endpoints/annotate/all/get/queries/_shared/sort.py new file mode 100644 index 00000000..0dae85b4 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/_shared/sort.py @@ -0,0 +1,13 @@ +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel + + +def sort_suggestions( + suggestions: list[SuggestionModel] +) -> list[SuggestionModel]: + """ + Sort according to the following criterion: + - Each user suggestion is a point + - The robo suggestion is a point * (confidence /100) + - Sort in descending order of points + """ + return sorted(suggestions, key=lambda s: s.score, reverse=True) \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/agency/core.py b/src/api/endpoints/annotate/all/get/queries/agency/core.py index d9a86717..f7cfaf42 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/core.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/core.py @@ -1,7 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.agency.requester import GetAgencySuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -24,7 +24,7 @@ async def run(self, session: AsyncSession) -> AgencyAnnotationResponseOuterInfo: location_id=self.location_id ) - suggestions: list[AgencyAnnotationSuggestion] = 
\ + suggestions: list[SuggestionModel] = \ await requester.get_agency_suggestions() not_found_count: int = \ await requester.get_not_found_count() diff --git a/src/api/endpoints/annotate/all/get/queries/agency/requester.py b/src/api/endpoints/annotate/all/get/queries/agency/requester.py index 136b5ee5..9d933ae2 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/requester.py @@ -3,7 +3,8 @@ from sqlalchemy import func, select, RowMapping, or_, and_ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel +from src.api.endpoints.annotate.all.get.queries._shared.sort import sort_suggestions from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.agency.sqlalchemy import Agency @@ -26,7 +27,7 @@ def __init__( self.url_id = url_id self.location_id = location_id - async def get_agency_suggestions(self) -> list[AgencyAnnotationSuggestion]: + async def get_agency_suggestions(self) -> list[SuggestionModel]: # All agencies with either a user or robo annotation valid_agencies_cte = ( select( @@ -83,8 +84,8 @@ async def get_agency_suggestions(self) -> list[AgencyAnnotationSuggestion]: # Join user and robo suggestions joined_suggestions_query = ( select( - valid_agencies_cte.c.id.label("agency_id"), - Agency.name.label("agency_name"), + valid_agencies_cte.c.id, + Agency.name.label("display_name"), func.coalesce(user_suggestions_cte.c.user_count, 0).label('user_count'), func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label('robo_confidence'), ) @@ -116,13 +117,13 @@ async def get_agency_suggestions(self) -> list[AgencyAnnotationSuggestion]: # Return suggestions mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) - suggestions: list[AgencyAnnotationSuggestion] = [ - AgencyAnnotationSuggestion( + suggestions: list[SuggestionModel] = [ + SuggestionModel( **mapping ) for mapping in mappings ] - return suggestions + return sort_suggestions(suggestions) async def get_not_found_count(self) -> int: query = ( diff --git a/src/api/endpoints/annotate/all/get/queries/location_/core.py b/src/api/endpoints/annotate/all/get/queries/location_/core.py index e1909b77..6081c5f7 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/core.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/core.py @@ -1,7 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.location_.requester import GetLocationSuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -19,7 +19,7 @@ def __init__( async def run(self, session: AsyncSession) -> LocationAnnotationResponseOuterInfo: requester = GetLocationSuggestionsRequester(session) - suggestions: list[LocationAnnotationSuggestion] = \ + suggestions: list[SuggestionModel] = \ await requester.get_location_suggestions(self.url_id) not_found_count: int = \ await requester.get_not_found_count(self.url_id) diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py 
b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index 66942661..fad8e834 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -2,7 +2,8 @@ from sqlalchemy import select, func, RowMapping, or_, and_ -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel +from src.api.endpoints.annotate.all.get.queries._shared.sort import sort_suggestions from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound @@ -15,7 +16,7 @@ class GetLocationSuggestionsRequester(RequesterBase): - async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotationSuggestion]: + async def get_location_suggestions(self, url_id: int) -> list[SuggestionModel]: # All locations with either a user or robo annotation valid_locations_cte = ( select( @@ -70,8 +71,8 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation # Join user and robo suggestions joined_suggestions_query = ( select( - valid_locations_cte.c.id.label("location_id"), - LocationExpandedView.full_display_name.label("location_name"), + valid_locations_cte.c.id, + LocationExpandedView.full_display_name.label("display_name"), func.coalesce(user_suggestions_cte.c.user_count, 0).label("user_count"), func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label("robo_confidence"), ) @@ -102,14 +103,13 @@ async def get_location_suggestions(self, url_id: int) -> list[LocationAnnotation ) mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) - suggestions: list[LocationAnnotationSuggestion] = [ - LocationAnnotationSuggestion( + suggestions: list[SuggestionModel] = [ + SuggestionModel( **mapping ) for mapping in mappings ] - return suggestions - + return sort_suggestions(suggestions) async def get_not_found_count(self, url_id: int) -> int: query = ( diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 7250de89..47db2a09 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -1,7 +1,7 @@ import pytest -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationSuggestion from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder from src.api.endpoints.annotate.all.post.models.agency import AnnotationPostAgencyInfo from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo @@ -142,11 +142,11 @@ async def test_annotate_all( user_id=99, ) ) - suggestions: list[LocationAnnotationSuggestion] = response.next_annotation.location_suggestions.suggestions + suggestions: list[SuggestionModel] = response.next_annotation.location_suggestions.suggestions assert len(suggestions) == 2 response_location_ids: list[int] = [ - location_suggestion.location_id + location_suggestion.id for location_suggestion in suggestions] assert set(response_location_ids) == { @@ -155,7 +155,7 @@ 
async def test_annotate_all( } response_location_names: list[str] = [ - location_suggestion.location_name + location_suggestion.display_name for location_suggestion in suggestions] assert set(response_location_names) == { "California", diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index d2b9f691..84781768 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -26,6 +26,11 @@ async def test_annotate_anonymous( api_test_helper, pennsylvania: USStateCreationInfo, ): + + # TODO: Update to include session ID + + # TODO: If session ID not included, user gets same annotation as before? + ath = api_test_helper ddc = ath.db_data_creator rv = ath.request_validator From a82fcf47dd72f5cafac1ad66450246e8ee08c8d0 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 2 Dec 2025 14:08:38 -0500 Subject: [PATCH 82/84] Add pagination for agency search --- src/api/endpoints/search/agency/query.py | 4 +++- src/api/endpoints/search/routes.py | 7 ++++++- .../integration/api/search/agency/test_search.py | 10 ++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/api/endpoints/search/agency/query.py b/src/api/endpoints/search/agency/query.py index 254d90f5..5e36e9a5 100644 --- a/src/api/endpoints/search/agency/query.py +++ b/src/api/endpoints/search/agency/query.py @@ -20,11 +20,13 @@ def __init__( location_id: int | None, query: str | None, jurisdiction_type: JurisdictionType | None, + page: int ): super().__init__() self.location_id = location_id self.query = query self.jurisdiction_type = jurisdiction_type + self.page = page async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: @@ -68,7 +70,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: ).desc() ) - query = query.limit(50) + query = query.limit(10).offset((self.page - 1) * 10) mappings: Sequence[RowMapping] = await sh.mappings(session, query) diff --git a/src/api/endpoints/search/routes.py b/src/api/endpoints/search/routes.py index dfbeeacd..58b661e8 100644 --- a/src/api/endpoints/search/routes.py +++ b/src/api/endpoints/search/routes.py @@ -40,6 +40,10 @@ async def search_agency( description="The jurisdiction type to search for", default=None ), + page: int = Query( + description="The page to search for", + default=1 + ), access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), ) -> list[AgencySearchResponse]: @@ -53,6 +57,7 @@ async def search_agency( SearchAgencyQueryBuilder( location_id=location_id, query=query, - jurisdiction_type=jurisdiction_type + jurisdiction_type=jurisdiction_type, + page=page ) ) \ No newline at end of file diff --git a/tests/automated/integration/api/search/agency/test_search.py b/tests/automated/integration/api/search/agency/test_search.py index cc3fee19..f207b3ae 100644 --- a/tests/automated/integration/api/search/agency/test_search.py +++ b/tests/automated/integration/api/search/agency/test_search.py @@ -61,3 +61,13 @@ async def test_search_agency( } ) assert len(responses) == 3 + + # Test pagination + responses = api_test_helper.request_validator.get_v2( + url="/search/agency", + params={ + "query": "A Agency", + "location_id": allegheny_county.location_id, + "page": 2 + } + ) From a09829178e326c27457cc84710047a3ccce33613 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 5 Dec 2025 17:11:27 -0500 Subject: [PATCH 83/84] Add 
sessions for anonymous annotations --- ...98f9cd8a_create_anonymous_session_users.py | 132 +++++++++++++++++- src/api/endpoints/annotate/_shared/extract.py | 2 +- .../annotate/anonymous/get/helpers.py | 27 ++++ .../endpoints/annotate/anonymous/get/query.py | 58 +++++++- .../annotate/anonymous/get/response.py | 10 ++ .../annotate/anonymous/post/query.py | 16 ++- src/api/endpoints/annotate/routes.py | 27 +++- .../submit/data_source/queries/core.py | 24 +++- .../validate/queries/ctes/counts/constants.py | 3 + .../queries/ctes/counts/impl/agency.py | 70 ++++++++-- .../queries/ctes/counts/impl/location.py | 71 ++++++++-- .../queries/ctes/counts/impl/record_type.py | 55 +++++++- .../queries/ctes/counts/impl/url_type.py | 54 ++++++- src/db/client/async_.py | 2 + .../impl/url/suggestion/anonymous/__init__.py | 1 + .../suggestion/anonymous/agency/sqlalchemy.py | 7 +- .../anonymous/location/sqlalchemy.py | 7 +- .../anonymous/record_type/sqlalchemy.py | 7 +- .../suggestion/anonymous/session/__init__.py | 0 .../anonymous/session/sqlalchemy.py | 17 +++ .../anonymous/url_type/sqlalchemy.py | 7 +- src/db/models/mixins.py | 11 ++ .../implementations/anonymous_session.py | 16 +++ .../api/annotate/anonymous/helper.py | 23 ++- .../api/annotate/anonymous/test_core.py | 30 +++- .../api/submit/data_source/test_core.py | 18 +++ .../api/url/by_id/delete/test_any_url.py | 18 ++- .../tasks/url/impl/validate/helper.py | 8 ++ .../url/impl/validate/test_data_source.py | 59 +++++++- 29 files changed, 680 insertions(+), 100 deletions(-) create mode 100644 src/api/endpoints/annotate/anonymous/get/helpers.py create mode 100644 src/api/endpoints/annotate/anonymous/get/response.py create mode 100644 src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py create mode 100644 src/db/models/impl/url/suggestion/anonymous/session/__init__.py create mode 100644 src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py create mode 100644 src/db/queries/implementations/anonymous_session.py diff --git a/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py index af4553d2..e3dafbbc 100644 --- a/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py +++ b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py @@ -11,15 +11,142 @@ import sqlalchemy as sa from sqlalchemy.dialects.postgresql import UUID +from src.util.alembic_helpers import created_at_column + # revision identifiers, used by Alembic. 
revision: str = '1d3398f9cd8a' down_revision: Union[str, None] = '5d6412540aba' branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None +def _alter_anonymous_annotation_agency(): + # Add new column + op.add_column( + "anonymous_annotation_agency", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_agency_pkey", + "anonymous_annotation_agency" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_agency_pkey", + "anonymous_annotation_agency", + ["session_id", "url_id", "agency_id"] + ) + +def _alter_anonymous_annotation_location(): + # Add new column + op.add_column( + "anonymous_annotation_location", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_location_pkey", + "anonymous_annotation_location" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_location_pkey", + "anonymous_annotation_location", + ["session_id", "url_id", "location_id"] + ) + +def _alter_anonymous_annotation_record_type(): + # Add new column + op.add_column( + "anonymous_annotation_record_type", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_record_type_pkey", + "anonymous_annotation_record_type" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_record_type_pkey", + "anonymous_annotation_record_type", + ["session_id", "url_id", "record_type"] + ) + +def _alter_anonymous_annotation_url_type(): + # Add new column + op.add_column( + "anonymous_annotation_url_type", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_url_type_pkey", + "anonymous_annotation_url_type" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_url_type_pkey", + "anonymous_annotation_url_type", + ["session_id", "url_id", "url_type"] + ) def upgrade() -> None: # Create anonymous_sessions table + _create_anonymous_sessions_table() + + # Remove all prior anonymous annotations + _remove_prior_sessions() + + _alter_anonymous_annotation_agency() + _alter_anonymous_annotation_location() + _alter_anonymous_annotation_record_type() + _alter_anonymous_annotation_url_type() + + +def _remove_prior_sessions(): + for table in [ + "anonymous_annotation_agency", + "anonymous_annotation_location", + "anonymous_annotation_record_type", + "anonymous_annotation_url_type" + ]: + op.execute( + f""" + DELETE FROM {table} + """ + ) + + +def _create_anonymous_sessions_table(): op.create_table( "anonymous_sessions", sa.Column( @@ -28,12 +155,9 @@ def upgrade() -> None: server_default=sa.text("gen_random_uuid()"), primary_key=True ), + created_at_column() ) - # TODO: Update anonymous tables to link to anonymous sessions table - - ## TODO: Drop any unique IDs forbidding more than a single ID for these columns - def downgrade() -> None: pass diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 1a0932d3..c0459e04 100644 
--- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -24,7 +24,7 @@ async def extract_and_format_get_annotation_result( session: AsyncSession, url: URL, batch_id: int | None = None -): +) -> GetNextURLForAllAnnotationResponse: html_response_info = DTOConverter.html_content_list_to_html_response_info( url.html_content ) diff --git a/src/api/endpoints/annotate/anonymous/get/helpers.py b/src/api/endpoints/annotate/anonymous/get/helpers.py new file mode 100644 index 00000000..83a10845 --- /dev/null +++ b/src/api/endpoints/annotate/anonymous/get/helpers.py @@ -0,0 +1,27 @@ +from typing import Protocol, TypeVar +from uuid import UUID + +from marshmallow.fields import Bool +from sqlalchemy import Exists, select, exists, ColumnElement, Boolean + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.mixins import AnonymousSessionMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class AnonymousURLModelProtocol( + Protocol, +): + session_id: ColumnElement[UUID] + url_id: ColumnElement[int] + +AnonModel = TypeVar("AnonModel", bound=AnonymousURLModelProtocol) + +def not_exists_anon_annotation(session_id: UUID, anon_model: AnonModel) -> ColumnElement[bool]: + return ~exists( + select(anon_model.url_id) + .where( + anon_model.url_id == URL.id, + anon_model.session_id == session_id, + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/anonymous/get/query.py b/src/api/endpoints/annotate/anonymous/get/query.py index 7e5f2e53..041d5cda 100644 --- a/src/api/endpoints/annotate/anonymous/get/query.py +++ b/src/api/endpoints/annotate/anonymous/get/query.py @@ -1,14 +1,21 @@ from typing import Any +from uuid import UUID -from sqlalchemy import Select, func +from sqlalchemy import Select, func, exists, select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload from src.api.endpoints.annotate._shared.extract import extract_and_format_get_annotation_result from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.anonymous.get.helpers import not_exists_anon_annotation +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from src.collectors.enums import URLStatus from src.db.helpers.query import not_exists_url +from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.views.unvalidated_url import UnvalidatedURL from src.db.models.views.url_anno_count import URLAnnotationCount @@ -18,7 +25,14 @@ class GetNextURLForAnonymousAnnotationQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse: + def __init__( + self, + session_id: UUID + ): + super().__init__() + self.session_id = session_id + + async def run(self, session: AsyncSession) -> GetNextURLForAnonymousAnnotationResponse: query = ( Select(URL) @@ -37,7 +51,31 @@ async def run(self, session: AsyncSession) -> 
GetNextURLForAllAnnotationResponse ) .where( URL.status == URLStatus.OK.value, - not_exists_url(AnonymousAnnotationURLType) + # Must not have been previously annotated by user + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationURLType + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationRecordType + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationLocation + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationAgency + ), + ~exists( + select( + FlagURLSuspended.url_id + ) + .where( + FlagURLSuspended.url_id == URL.id, + ) + ) ) .options( joinedload(URL.html_content), @@ -46,7 +84,8 @@ async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse joinedload(URL.name_suggestions), ) .order_by( - func.random() + URLAnnotationCount.total_anno_count.desc(), + URL.id.asc() ) .limit(1) ) @@ -54,8 +93,13 @@ async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse raw_results = (await session.execute(query)).unique() url: URL | None = raw_results.scalars().one_or_none() if url is None: - return GetNextURLForAllAnnotationResponse( - next_annotation=None + return GetNextURLForAnonymousAnnotationResponse( + next_annotation=None, + session_id=self.session_id ) - return await extract_and_format_get_annotation_result(session, url=url) + response: GetNextURLForAllAnnotationResponse = await extract_and_format_get_annotation_result(session, url=url) + return GetNextURLForAnonymousAnnotationResponse( + session_id=self.session_id, + next_annotation=response.next_annotation + ) diff --git a/src/api/endpoints/annotate/anonymous/get/response.py b/src/api/endpoints/annotate/anonymous/get/response.py new file mode 100644 index 00000000..e54403bc --- /dev/null +++ b/src/api/endpoints/annotate/anonymous/get/response.py @@ -0,0 +1,10 @@ +from uuid import UUID + +from pydantic import BaseModel + +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationInnerResponse + + +class GetNextURLForAnonymousAnnotationResponse(BaseModel): + next_annotation: GetNextURLForAllAnnotationInnerResponse | None + session_id: UUID \ No newline at end of file diff --git a/src/api/endpoints/annotate/anonymous/post/query.py b/src/api/endpoints/annotate/anonymous/post/query.py index faa7aa1d..593d79d9 100644 --- a/src/api/endpoints/annotate/anonymous/post/query.py +++ b/src/api/endpoints/annotate/anonymous/post/query.py @@ -1,3 +1,5 @@ +from uuid import UUID + from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo @@ -11,10 +13,12 @@ class AddAnonymousAnnotationsToURLQueryBuilder(QueryBuilderBase): def __init__( self, + session_id: UUID, url_id: int, post_info: AllAnnotationPostInfo ): super().__init__() + self.session_id = session_id self.url_id = url_id self.post_info = post_info @@ -22,14 +26,16 @@ async def run(self, session: AsyncSession) -> None: url_type_suggestion = AnonymousAnnotationURLType( url_id=self.url_id, - url_type=self.post_info.suggested_status + url_type=self.post_info.suggested_status, + session_id=self.session_id ) session.add(url_type_suggestion) if self.post_info.record_type is not None: record_type_suggestion = AnonymousAnnotationRecordType( url_id=self.url_id, - record_type=self.post_info.record_type + record_type=self.post_info.record_type, + session_id=self.session_id ) 
session.add(record_type_suggestion) @@ -37,7 +43,8 @@ async def run(self, session: AsyncSession) -> None: location_suggestions = [ AnonymousAnnotationLocation( url_id=self.url_id, - location_id=location_id + location_id=location_id, + session_id=self.session_id ) for location_id in self.post_info.location_info.location_ids ] @@ -47,7 +54,8 @@ async def run(self, session: AsyncSession) -> None: agency_suggestions = [ AnonymousAnnotationAgency( url_id=self.url_id, - agency_id=agency_id + agency_id=agency_id, + session_id=self.session_id ) for agency_id in self.post_info.agency_info.agency_ids ] diff --git a/src/api/endpoints/annotate/routes.py b/src/api/endpoints/annotate/routes.py index a09ee1ec..1633eb5a 100644 --- a/src/api/endpoints/annotate/routes.py +++ b/src/api/endpoints/annotate/routes.py @@ -1,3 +1,6 @@ +import uuid +from uuid import UUID + from fastapi import APIRouter, Depends, Query from src.api.dependencies import get_async_core @@ -7,8 +10,10 @@ from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.api.endpoints.annotate.all.post.query import AddAllAnnotationsToURLQueryBuilder from src.api.endpoints.annotate.anonymous.get.query import GetNextURLForAnonymousAnnotationQueryBuilder +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from src.api.endpoints.annotate.anonymous.post.query import AddAnonymousAnnotationsToURLQueryBuilder from src.core.core import AsyncCore +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info @@ -33,26 +38,38 @@ @annotate_router.get("/anonymous") async def get_next_url_for_all_annotations_anonymous( async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAllAnnotationResponse: + session_id: UUID | None = Query(description="The session id of the anonymous user.", default=None) +) -> GetNextURLForAnonymousAnnotationResponse: + # If session_id is not provided, generate new UUID + if session_id is None: + session_id: uuid.UUID = await async_core.adb_client.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) + return await async_core.adb_client.run_query_builder( - GetNextURLForAnonymousAnnotationQueryBuilder() + GetNextURLForAnonymousAnnotationQueryBuilder(session_id=session_id) ) + @annotate_router.post("/anonymous/{url_id}") async def annotate_url_for_all_annotations_and_get_next_url_anonymous( url_id: int, all_annotation_post_info: AllAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAllAnnotationResponse: + session_id: UUID = Query(description="The session id of the anonymous user") +) -> GetNextURLForAnonymousAnnotationResponse: await async_core.adb_client.run_query_builder( AddAnonymousAnnotationsToURLQueryBuilder( url_id=url_id, - post_info=all_annotation_post_info + post_info=all_annotation_post_info, + session_id=session_id ) ) return await async_core.adb_client.run_query_builder( - GetNextURLForAnonymousAnnotationQueryBuilder() + GetNextURLForAnonymousAnnotationQueryBuilder( + session_id=session_id + ) ) diff --git a/src/api/endpoints/submit/data_source/queries/core.py b/src/api/endpoints/submit/data_source/queries/core.py index b3d1ff46..1f97cd11 100644 --- a/src/api/endpoints/submit/data_source/queries/core.py +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -1,3 +1,4 @@ +import uuid from typing import Any from sqlalchemy.exc import 
IntegrityError @@ -8,6 +9,7 @@ from src.collectors.enums import URLStatus from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL @@ -15,9 +17,11 @@ from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from src.util.models.full_url import FullURL @@ -67,11 +71,23 @@ async def run( ) session.add(batch_url_link) + # Create single-use session id + session_id: uuid.UUID = await MakeAnonymousSessionQueryBuilder().run(session=session) + + # Add URL Type Suggestion + url_type_suggestion = AnonymousAnnotationURLType( + url_id=url_id, + url_type=URLType.DATA_SOURCE, + session_id=session_id + ) + session.add(url_type_suggestion) + # Optionally add Record Type as suggestion if self.request.record_type is not None: record_type_suggestion = AnonymousAnnotationRecordType( url_id=url_id, - record_type=self.request.record_type.value + record_type=self.request.record_type.value, + session_id=session_id ) session.add(record_type_suggestion) @@ -80,7 +96,8 @@ async def run( agency_id_suggestions = [ AnonymousAnnotationAgency( url_id=url_id, - agency_id=agency_id + agency_id=agency_id, + session_id=session_id ) for agency_id in self.request.agency_ids ] @@ -91,7 +108,8 @@ async def run( location_id_suggestions = [ AnonymousAnnotationLocation( url_id=url_id, - location_id=location_id + location_id=location_id, + session_id=session_id ) for location_id in self.request.location_ids ] diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py new file mode 100644 index 00000000..d09029a4 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py @@ -0,0 +1,3 @@ + + +ANONYMOUS_VOTE_RATIO = 0.5 \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py index 141393bd..36fe0a87 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py @@ -1,24 +1,66 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.views.unvalidated_url import 
UnvalidatedURL +_user_counts = ( + select( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id.label("entity"), + func.count().label("votes") + ) + .group_by( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationAgency.url_id, + AnonymousAnnotationAgency.agency_id.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationAgency.url_id, + AnonymousAnnotationAgency.agency_id + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_agency_union") +) + AGENCY_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( - select( - UserURLAgencySuggestion.url_id, - UserURLAgencySuggestion.agency_id.label("entity"), - func.count().label("votes") - ) - .join( - UnvalidatedURL, - UserURLAgencySuggestion.url_id == UnvalidatedURL.url_id - ) - .group_by( - UserURLAgencySuggestion.url_id, - UserURLAgencySuggestion.agency_id - ) - .cte("counts_agency") + select( + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") + ) + .join( + UnvalidatedURL, + _union_counts.c.url_id == UnvalidatedURL.url_id + ) + .group_by( + _union_counts.c.url_id, + _union_counts.c.entity, ) + .cte("counts_agency") + ) ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py index 2ef385cc..4e180e18 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py @@ -1,24 +1,67 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserLocationSuggestion.url_id, + UserLocationSuggestion.location_id.label("entity"), + func.count().label("votes") + ) + .group_by( + UserLocationSuggestion.url_id, + UserLocationSuggestion.location_id + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationLocation.url_id, + AnonymousAnnotationLocation.location_id.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationLocation.url_id, + AnonymousAnnotationLocation.location_id + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_location_union") +) + LOCATION_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( - select( - UserLocationSuggestion.url_id, - UserLocationSuggestion.location_id.label("entity"), - func.count().label("votes") - ) - .join( - UnvalidatedURL, - UserLocationSuggestion.url_id == UnvalidatedURL.url_id - ) - .group_by( - 
UserLocationSuggestion.url_id, - UserLocationSuggestion.location_id - ) - .cte("counts_location") + select( + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") + ) + .join( + UnvalidatedURL, + _union_counts.c.url_id == UnvalidatedURL.url_id + ) + .group_by( + _union_counts.c.url_id, + _union_counts.c.entity, ) + .cte("counts_location") + ) ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py index 6300ec92..65b1f9b0 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py @@ -1,23 +1,66 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserRecordTypeSuggestion.url_id, + UserRecordTypeSuggestion.record_type.label("entity"), + func.count().label("votes") + ) + .group_by( + UserRecordTypeSuggestion.url_id, + UserRecordTypeSuggestion.record_type + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationRecordType.url_id, + AnonymousAnnotationRecordType.record_type.label("entity"), + (func.count() * ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationRecordType.url_id, + AnonymousAnnotationRecordType.record_type + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_record_type_union") +) + + RECORD_TYPE_COUNTS_CTE = ValidatedCountsCTEContainer( ( select( - UserRecordTypeSuggestion.url_id, - UserRecordTypeSuggestion.record_type.label("entity"), - func.count().label("votes") + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") ) .join( UnvalidatedURL, - UserRecordTypeSuggestion.url_id == UnvalidatedURL.url_id + _union_counts.c.url_id == UnvalidatedURL.url_id ) .group_by( - UserRecordTypeSuggestion.url_id, - UserRecordTypeSuggestion.record_type + _union_counts.c.url_id, + _union_counts.c.entity, ) .cte("counts_record_type") ) diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py index f0d340e7..72638f19 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py @@ -1,23 +1,65 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.views.unvalidated_url import 
UnvalidatedURL +_user_counts = ( + select( + UserURLTypeSuggestion.url_id, + UserURLTypeSuggestion.type.label("entity"), + func.count().label("votes") + ) + .group_by( + UserURLTypeSuggestion.url_id, + UserURLTypeSuggestion.type + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationURLType.url_id, + AnonymousAnnotationURLType.url_type.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationURLType.url_id, + AnonymousAnnotationURLType.url_type + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_url_type_union") +) + URL_TYPES_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( select( - UserURLTypeSuggestion.url_id, - UserURLTypeSuggestion.type.label("entity"), - func.count().label("votes") + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") ) .join( UnvalidatedURL, - UserURLTypeSuggestion.url_id == UnvalidatedURL.url_id + _union_counts.c.url_id == UnvalidatedURL.url_id ) .group_by( - UserURLTypeSuggestion.url_id, - UserURLTypeSuggestion.type + _union_counts.c.url_id, + _union_counts.c.entity, ) .cte("counts_url_type") ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 5ec64ad7..125c594e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -1,6 +1,7 @@ from datetime import datetime from functools import wraps from typing import Optional, Any, List +from uuid import UUID, uuid4 from sqlalchemy import select, func, Select, and_, update, Row, text from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker, AsyncEngine @@ -77,6 +78,7 @@ from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous import AnonymousSession from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput diff --git a/src/db/models/impl/url/suggestion/anonymous/__init__.py b/src/db/models/impl/url/suggestion/anonymous/__init__.py index e69de29b..fddc715f 100644 --- a/src/db/models/impl/url/suggestion/anonymous/__init__.py +++ b/src/db/models/impl/url/suggestion/anonymous/__init__.py @@ -0,0 +1 @@ +from src.db.models.impl.url.suggestion.anonymous.session.sqlalchemy import AnonymousSession \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py index afea2f23..6f750289 100644 --- a/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py @@ -1,6 +1,6 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import URLDependentMixin, AgencyDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, AgencyDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base @@ -8,9 +8,10 @@ class AnonymousAnnotationAgency( Base, URLDependentMixin, 
AgencyDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_agency" __table_args__ = ( - PrimaryKeyConstraint("url_id", "agency_id"), + PrimaryKeyConstraint("session_id", "url_id", "agency_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py index f02cb7ba..3e39810b 100644 --- a/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py @@ -1,6 +1,6 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import LocationDependentMixin, URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import LocationDependentMixin, URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base @@ -8,10 +8,11 @@ class AnonymousAnnotationLocation( Base, URLDependentMixin, LocationDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_location" __table_args__ = ( - PrimaryKeyConstraint("url_id", "location_id"), + PrimaryKeyConstraint("session_id", "url_id", "location_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py index 25a9ddec..22f37839 100644 --- a/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py @@ -3,18 +3,19 @@ from src.core.enums import RecordType from src.db.models.helpers import enum_column -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base class AnonymousAnnotationRecordType( Base, URLDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_record_type" __table_args__ = ( - PrimaryKeyConstraint("url_id", "record_type"), + PrimaryKeyConstraint("session_id", "url_id", "record_type"), ) record_type: Mapped[RecordType] = enum_column( diff --git a/src/db/models/impl/url/suggestion/anonymous/session/__init__.py b/src/db/models/impl/url/suggestion/anonymous/session/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py new file mode 100644 index 00000000..cbb43448 --- /dev/null +++ b/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py @@ -0,0 +1,17 @@ +from sqlalchemy import text, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base +from sqlalchemy.dialects.postgresql import UUID + + +class AnonymousSession( + Base, + CreatedAtMixin +): + __tablename__ = "anonymous_sessions" + id = Column( + UUID(as_uuid=True), + primary_key=True, + server_default=text("gen_random_uuid()") + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py index f9033ffa..f0cbc6a7 100644 --- a/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py @@ -3,18 +3,19 @@ from 
src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base class AnonymousAnnotationURLType( Base, URLDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_url_type" __table_args__ = ( - PrimaryKeyConstraint("url_id", "url_type"), + PrimaryKeyConstraint("session_id", "url_id", "url_type"), ) url_type: Mapped[URLType] = enum_column( diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 7a7d6460..640ec955 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -5,6 +5,7 @@ from src.db.models.exceptions import WriteToViewError from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT, url_id_primary_key_constraint, \ VIEW_ARG +from sqlalchemy.dialects.postgresql import UUID class URLDependentMixin: @@ -96,4 +97,14 @@ class URLDependentViewMixin(URLDependentMixin, ViewMixin): __table_args__ = ( url_id_primary_key_constraint(), VIEW_ARG + ) + +class AnonymousSessionMixin: + session_id = Column( + UUID(as_uuid=True), + ForeignKey( + 'anonymous_sessions.id', + ondelete="CASCADE", + ), + nullable=False ) \ No newline at end of file diff --git a/src/db/queries/implementations/anonymous_session.py b/src/db/queries/implementations/anonymous_session.py new file mode 100644 index 00000000..0ff00ea3 --- /dev/null +++ b/src/db/queries/implementations/anonymous_session.py @@ -0,0 +1,16 @@ +from uuid import UUID + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.suggestion.anonymous import AnonymousSession +from src.db.queries.base.builder import QueryBuilderBase + + +class MakeAnonymousSessionQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UUID: + return await self.sh.add( + session=session, + model=AnonymousSession(), + return_id=True + ) diff --git a/tests/automated/integration/api/annotate/anonymous/helper.py b/tests/automated/integration/api/annotate/anonymous/helper.py index ccfe518f..cb892091 100644 --- a/tests/automated/integration/api/annotate/anonymous/helper.py +++ b/tests/automated/integration/api/annotate/anonymous/helper.py @@ -1,23 +1,32 @@ -from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from uuid import UUID + from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from tests.automated.integration.api._helpers.RequestValidator import RequestValidator async def get_next_url_for_anonymous_annotation( request_validator: RequestValidator, -): + session_id: UUID | None = None +) -> GetNextURLForAnonymousAnnotationResponse: + url = "/annotate/anonymous" + if session_id is not None: + url += f"?session_id={session_id}" + data = request_validator.get( - url=f"/annotate/anonymous" + url=url ) - return GetNextURLForAllAnnotationResponse(**data) + return GetNextURLForAnonymousAnnotationResponse(**data) async def post_and_get_next_url_for_anonymous_annotation( request_validator: RequestValidator, url_id: int, all_annotation_post_info: AllAnnotationPostInfo, -): + session_id: UUID +) -> GetNextURLForAnonymousAnnotationResponse: + url = f"/annotate/anonymous/{url_id}?session_id={session_id}" data 
= request_validator.post( - url=f"/annotate/anonymous/{url_id}", + url=url, json=all_annotation_post_info.model_dump(mode='json') ) - return GetNextURLForAllAnnotationResponse(**data) \ No newline at end of file + return GetNextURLForAnonymousAnnotationResponse(**data) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index 84781768..b6fb93fa 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -1,3 +1,5 @@ +from uuid import UUID + import pytest from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion @@ -6,6 +8,7 @@ from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from src.core.enums import RecordType from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType @@ -27,10 +30,6 @@ async def test_annotate_anonymous( pennsylvania: USStateCreationInfo, ): - # TODO: Update to include session ID - - # TODO: If session ID not included, user gets same annotation as before? - ath = api_test_helper ddc = ath.db_data_creator rv = ath.request_validator @@ -45,7 +44,9 @@ async def test_annotate_anonymous( ) url_mapping_2: SimpleURLMapping = setup_info_2.url_mapping - get_response_1: GetNextURLForAllAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + get_response_1: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + session_id: UUID = get_response_1.session_id + assert session_id is not None assert get_response_1.next_annotation is not None assert len(get_response_1.next_annotation.name_suggestions) == 1 name_suggestion: NameAnnotationSuggestion = get_response_1.next_annotation.name_suggestions[0] @@ -54,7 +55,7 @@ async def test_annotate_anonymous( agency_id: int = await ddc.agency() - post_response_1: GetNextURLForAllAnnotationResponse = await post_and_get_next_url_for_anonymous_annotation( + post_response_1: GetNextURLForAnonymousAnnotationResponse = await post_and_get_next_url_for_anonymous_annotation( rv, get_response_1.next_annotation.url_info.url_id, AllAnnotationPostInfo( @@ -69,8 +70,11 @@ async def test_annotate_anonymous( name_info=AnnotationPostNameInfo( new_name="New Name" ) - ) + ), + session_id=session_id ) + assert post_response_1.session_id == session_id + assert post_response_1.next_annotation is not None assert post_response_1.next_annotation.url_info.url_id != get_response_1.next_annotation.url_info.url_id @@ -86,3 +90,15 @@ async def test_annotate_anonymous( instance: model = instances[0] assert instance.url_id == get_response_1.next_annotation.url_info.url_id + # Run again without giving session ID, confirm original URL returned + get_response_2: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + assert get_response_2.session_id != session_id + assert get_response_2.next_annotation is not None + assert get_response_2.next_annotation.url_info.url_id == get_response_1.next_annotation.url_info.url_id + + # Run again while giving session ID, confirm second URL returned + 
get_response_3: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv, session_id) + assert get_response_3.session_id == session_id + assert get_response_3.next_annotation is not None + assert get_response_3.next_annotation.url_info.url_id == post_response_1.next_annotation.url_info.url_id + diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py index eed0cd00..558327c3 100644 --- a/tests/automated/integration/api/submit/data_source/test_core.py +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -1,4 +1,5 @@ from datetime import date +from uuid import UUID import pytest @@ -7,6 +8,7 @@ from src.core.enums import RecordType, BatchStatus from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL @@ -15,6 +17,8 @@ from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -91,21 +95,35 @@ async def test_submit_data_source( assert batch_url_link.batch_id == batch.id assert batch_url_link.url_id == url.id + # Check for anonymous annotations + url_type_suggestion: AnonymousAnnotationURLType = await adb_client.one_or_none_model(AnonymousAnnotationURLType) + assert url_type_suggestion is not None + assert url_type_suggestion.url_id == url.id + assert url_type_suggestion.url_type == URLType.DATA_SOURCE + session_id: UUID = url_type_suggestion.session_id + # Check for Location Suggestion location_suggestion: AnonymousAnnotationLocation = await adb_client.one_or_none_model(AnonymousAnnotationLocation) assert location_suggestion is not None assert location_suggestion.location_id == pittsburgh_locality.location_id + assert location_suggestion.session_id == session_id # Check for Agency Suggestion agency_suggestion: AnonymousAnnotationAgency = await adb_client.one_or_none_model(AnonymousAnnotationAgency) assert agency_suggestion is not None assert agency_suggestion.agency_id == test_agency_id + assert agency_suggestion.session_id == session_id # Check for Name Suggestion name_suggestion: URLNameSuggestion = await adb_client.one_or_none_model(URLNameSuggestion) assert name_suggestion is not None assert name_suggestion.suggestion == "Example name" + # Check for Record Type Suggestion + record_type_suggestion: AnonymousAnnotationRecordType = await adb_client.one_or_none_model(AnonymousAnnotationRecordType) + assert record_type_suggestion.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT + assert record_type_suggestion.session_id == session_id + # Check for URL DS Optional Metadata optional_ds: 
URLOptionalDataSourceMetadata = await adb_client.one_or_none_model(URLOptionalDataSourceMetadata) assert optional_ds is not None diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py index bd17141b..50b3ca0c 100644 --- a/tests/automated/integration/api/url/by_id/delete/test_any_url.py +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -1,3 +1,5 @@ +from uuid import UUID + import pytest from sqlalchemy import select @@ -44,6 +46,7 @@ from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -414,27 +417,34 @@ async def _setup( user_id=1, ) ) + session_id: UUID = await dbc.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) ## ANONYMOUS for model in [ ### Agency AnonymousAnnotationAgency( url_id=url.url_id, - agency_id=agency_id + agency_id=agency_id, + session_id=session_id, ), ### Record Type AnonymousAnnotationRecordType( url_id=url.url_id, - record_type=RecordType.BOOKING_REPORTS.value + record_type=RecordType.BOOKING_REPORTS.value, + session_id=session_id, ), ### URL Type AnonymousAnnotationURLType( url_id=url.url_id, - url_type=URLType.INDIVIDUAL_RECORD + url_type=URLType.INDIVIDUAL_RECORD, + session_id=session_id, ), ### Location AnonymousAnnotationLocation( url_id=url.url_id, - location_id=pittsburgh_id + location_id=pittsburgh_id, + session_id=session_id ) ]: await dbc.add(model) diff --git a/tests/automated/integration/tasks/url/impl/validate/helper.py b/tests/automated/integration/tasks/url/impl/validate/helper.py index 6ab44984..879fbc66 100644 --- a/tests/automated/integration/tasks/url/impl/validate/helper.py +++ b/tests/automated/integration/tasks/url/impl/validate/helper.py @@ -1,3 +1,5 @@ +from uuid import UUID + from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.enums import RecordType from src.db.client.async_ import AsyncDatabaseClient @@ -8,6 +10,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from tests.conftest import db_data_creator from tests.helpers.counter import next_int from tests.helpers.data_creator.core import DBDataCreator @@ -95,6 +98,11 @@ async def add_agency_suggestions( ) ) + async def get_anonymous_session_id(self) -> UUID: + return await self.adb_client.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) + async def add_location_suggestions( self, count: int = 1, diff --git a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py index 82bed288..4fe0d444 100644 --- a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py +++ b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py @@ -6,12 +6,18 @@ - URL Type (DATA SOURCE) And confirm it is 
validated as DATA SOURCE """ +from uuid import UUID + import pytest from src.core.enums import RecordType from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.models.impl.flag.url_validated.enums import URLType -from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper, DEFAULT_RECORD_TYPE from tests.helpers.run import run_task_and_confirm_success @@ -27,20 +33,55 @@ async def test_data_source( assert not await operator.meets_task_prerequisites() - await helper.add_agency_suggestions(count=2) + await helper.add_agency_suggestions(count=1) assert not await operator.meets_task_prerequisites() - await helper.add_location_suggestions(count=2) + await helper.add_location_suggestions(count=1) assert not await operator.meets_task_prerequisites() - await helper.add_record_type_suggestions(count=2) + await helper.add_record_type_suggestions(count=1) assert not await operator.meets_task_prerequisites() await helper.add_name_suggestion(count=2) + assert not await operator.meets_task_prerequisites() + + # Add anonymous annotations + session_id_1: UUID = await helper.get_anonymous_session_id() + session_id_2: UUID = await helper.get_anonymous_session_id() + + for session_id in [session_id_1, session_id_2]: + anon_url_type = AnonymousAnnotationURLType( + url_type=URLType.DATA_SOURCE, + session_id=session_id, + url_id=helper.url_id + ) + anon_record_type = AnonymousAnnotationRecordType( + record_type=DEFAULT_RECORD_TYPE, + session_id=session_id, + url_id=helper.url_id + ) + anon_location = AnonymousAnnotationLocation( + location_id=helper.location_id, + session_id=session_id, + url_id=helper.url_id + ) + anon_agency = AnonymousAnnotationAgency( + agency_id=helper.agency_id, + session_id=session_id, + url_id=helper.url_id + ) + for model in [ + anon_url_type, + anon_record_type, + anon_location, + anon_agency + ]: + await helper.adb_client.add(model) + assert await operator.meets_task_prerequisites() # Add different record type suggestion @@ -52,8 +93,14 @@ async def test_data_source( # Assert no longer meets task prerequisites assert not await operator.meets_task_prerequisites() - # Add tiebreaker - await helper.add_record_type_suggestions() + # Add tiebreaker -- a single anonymous vote + session_id_3: UUID = await helper.get_anonymous_session_id() + anon_record_type = AnonymousAnnotationRecordType( + record_type=DEFAULT_RECORD_TYPE, + session_id=session_id_3, + url_id=helper.url_id + ) + await helper.adb_client.add(anon_record_type) assert await operator.meets_task_prerequisites() From f41e095f2652f48f6e6bbeb4f7774b51d6b8f1a8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Mon, 8 Dec 2025 13:32:34 -0500 Subject: [PATCH 84/84] Update annotation endpoints --- src/api/endpoints/annotate/_shared/extract.py | 8 +++---- .../endpoints/annotate/all/get/models/name.py | 11 ++++++---- .../annotate/all/get/models/record_type.py | 16 +++++++++----- .../annotate/all/get/models/response.py | 13 ++++------- 
.../annotate/all/get/queries/convert.py | 19 +++++++++------- .../annotate/all/get/queries/name/core.py | 22 +++++++++++++------ .../api/annotate/all/test_happy_path.py | 8 +++---- .../api/annotate/anonymous/test_core.py | 8 +++---- 8 files changed, 60 insertions(+), 45 deletions(-) diff --git a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index c0459e04..3fb7770b 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -3,8 +3,8 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion @@ -32,7 +32,7 @@ async def extract_and_format_get_annotation_result( convert_user_url_type_suggestion_to_url_type_annotation_suggestion( url.user_relevant_suggestions ) - record_type_suggestions: list[RecordTypeAnnotationSuggestion] = \ + record_type_suggestions: RecordTypeAnnotationResponseOuterInfo = \ convert_user_record_type_suggestion_to_record_type_annotation_suggestion( url.user_record_type_suggestions ) @@ -40,7 +40,7 @@ async def extract_and_format_get_annotation_result( await GetAgencySuggestionsQueryBuilder(url_id=url.id).run(session) location_suggestions: LocationAnnotationResponseOuterInfo = \ await GetLocationSuggestionsQueryBuilder(url_id=url.id).run(session) - name_suggestions: list[NameAnnotationSuggestion] = \ + name_suggestions: NameAnnotationResponseOuterInfo = \ await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session) return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( diff --git a/src/api/endpoints/annotate/all/get/models/name.py b/src/api/endpoints/annotate/all/get/models/name.py index 80857305..386b11de 100644 --- a/src/api/endpoints/annotate/all/get/models/name.py +++ b/src/api/endpoints/annotate/all/get/models/name.py @@ -1,7 +1,10 @@ from pydantic import BaseModel - class NameAnnotationSuggestion(BaseModel): - name: str - suggestion_id: int - endorsement_count: int \ No newline at end of file + id: int + display_name: str + user_count: int + robo_count: int + +class NameAnnotationResponseOuterInfo(BaseModel): + suggestions: list[NameAnnotationSuggestion] \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/models/record_type.py b/src/api/endpoints/annotate/all/get/models/record_type.py index a1c24911..a99dfd7b 100644 --- a/src/api/endpoints/annotate/all/get/models/record_type.py +++ b/src/api/endpoints/annotate/all/get/models/record_type.py @@ -1,11 +1,17 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.core.enums import RecordType - 
- -class RecordTypeAnnotationSuggestion(BaseModel): +class RecordTypeSuggestionModel(BaseModel): record_type: RecordType - endorsement_count: int + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", + ge=0, + le=100, + ) +class RecordTypeAnnotationResponseOuterInfo(BaseModel): + suggestions: list[RecordTypeSuggestionModel] diff --git a/src/api/endpoints/annotate/all/get/models/response.py b/src/api/endpoints/annotate/all/get/models/response.py index 989dbf8d..7f924e3f 100644 --- a/src/api/endpoints/annotate/all/get/models/response.py +++ b/src/api/endpoints/annotate/all/get/models/response.py @@ -1,16 +1,11 @@ -from typing import Optional - from pydantic import Field, BaseModel -from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase -from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo -from src.core.enums import RecordType class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): @@ -23,10 +18,10 @@ class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): url_type_suggestions: list[URLTypeAnnotationSuggestion] = Field( title="Whether the auto-labeler identified the URL as relevant or not" ) - record_type_suggestions: list[RecordTypeAnnotationSuggestion] = Field( + record_type_suggestions: RecordTypeAnnotationResponseOuterInfo = Field( title="What record type, if any, user and the auto-labeler identified the URL as" ) - name_suggestions: list[NameAnnotationSuggestion] | None = Field( + name_suggestions: NameAnnotationResponseOuterInfo = Field( title="User and Auto-Suggestions for names" ) diff --git a/src/api/endpoints/annotate/all/get/queries/convert.py b/src/api/endpoints/annotate/all/get/queries/convert.py index 386389a5..fe9b0777 100644 --- a/src/api/endpoints/annotate/all/get/queries/convert.py +++ b/src/api/endpoints/annotate/all/get/queries/convert.py @@ -1,6 +1,7 @@ from collections import Counter -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo, \ + RecordTypeSuggestionModel from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType @@ -26,18 +27,20 @@ def convert_user_url_type_suggestion_to_url_type_annotation_suggestion( def convert_user_record_type_suggestion_to_record_type_annotation_suggestion( db_suggestions: list[UserRecordTypeSuggestion] -) -> list[RecordTypeAnnotationSuggestion]: +) -> RecordTypeAnnotationResponseOuterInfo: 
counter: Counter[RecordType] = Counter() for suggestion in db_suggestions: counter[suggestion.record_type] += 1 - anno_suggestions: list[RecordTypeAnnotationSuggestion] = [] + suggestions: list[RecordTypeSuggestionModel] = [] for record_type, endorsement_count in counter.most_common(3): - anno_suggestions.append( - RecordTypeAnnotationSuggestion( + suggestions.append( + RecordTypeSuggestionModel( record_type=record_type, - endorsement_count=endorsement_count, + user_count=endorsement_count, + robo_confidence=0, ) ) - - return anno_suggestions \ No newline at end of file + return RecordTypeAnnotationResponseOuterInfo( + suggestions=suggestions + ) diff --git a/src/api/endpoints/annotate/all/get/queries/name/core.py b/src/api/endpoints/annotate/all/get/queries/name/core.py index b048cb2c..9438f14e 100644 --- a/src/api/endpoints/annotate/all/get/queries/name/core.py +++ b/src/api/endpoints/annotate/all/get/queries/name/core.py @@ -1,11 +1,12 @@ from typing import Sequence -from sqlalchemy import select, func, RowMapping +from sqlalchemy import select, func, RowMapping, case from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion, NameAnnotationResponseOuterInfo from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -19,14 +20,18 @@ def __init__( super().__init__() self.url_id = url_id - async def run(self, session: AsyncSession) -> list[NameAnnotationSuggestion]: + async def run(self, session: AsyncSession) -> NameAnnotationResponseOuterInfo: query = ( select( - URLNameSuggestion.id.label('suggestion_id'), - URLNameSuggestion.suggestion.label('name'), + URLNameSuggestion.id.label('id'), + URLNameSuggestion.suggestion.label('display_name'), func.count( LinkUserNameSuggestion.user_id - ).label('endorsement_count'), + ).label('user_count'), + case( + (URLNameSuggestion.source == NameSuggestionSource.HTML_METADATA_TITLE, 1), + else_=0 + ).label("robo_count") ) .outerjoin( LinkUserNameSuggestion, @@ -47,12 +52,15 @@ async def run(self, session: AsyncSession) -> list[NameAnnotationSuggestion]: ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - return [ + suggestions = [ NameAnnotationSuggestion( **mapping ) for mapping in mappings ] + return NameAnnotationResponseOuterInfo( + suggestions=suggestions + ) diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 47db2a09..49d8bd97 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -48,10 +48,10 @@ async def test_annotate_all( # Get a valid URL to annotate get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() assert get_response_1.next_annotation is not None - assert len(get_response_1.next_annotation.name_suggestions) == 1 - name_suggestion = get_response_1.next_annotation.name_suggestions[0] - assert name_suggestion.name is not None - assert name_suggestion.endorsement_count == 0 + assert 
len(get_response_1.next_annotation.name_suggestions.suggestions) == 1 + name_suggestion = get_response_1.next_annotation.name_suggestions.suggestions[0] + assert name_suggestion.display_name is not None + assert name_suggestion.user_count == 0 # Apply the second batch id as a filter and see that a different URL is returned get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index b6fb93fa..26516b16 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -48,10 +48,10 @@ async def test_annotate_anonymous( session_id: UUID = get_response_1.session_id assert session_id is not None assert get_response_1.next_annotation is not None - assert len(get_response_1.next_annotation.name_suggestions) == 1 - name_suggestion: NameAnnotationSuggestion = get_response_1.next_annotation.name_suggestions[0] - assert name_suggestion.name is not None - assert name_suggestion.endorsement_count == 0 + assert len(get_response_1.next_annotation.name_suggestions.suggestions) == 1 + name_suggestion: NameAnnotationSuggestion = get_response_1.next_annotation.name_suggestions.suggestions[0] + assert name_suggestion.display_name is not None + assert name_suggestion.user_count == 0 agency_id: int = await ddc.agency()
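For orientation, the anonymous-session round trip exercised by this test works roughly as follows from a client's point of view. This is a hedged sketch rather than part of the patch: it assumes a locally running API at a placeholder BASE_URL and calls the endpoints directly with httpx instead of going through the test suite's RequestValidator.

import httpx

BASE_URL = "http://localhost:8000"  # placeholder; not defined anywhere in the patch

with httpx.Client(base_url=BASE_URL) as client:
    # First GET without a session_id: the endpoint mints a new anonymous
    # session and returns its id alongside the next URL to annotate.
    first = client.get("/annotate/anonymous").json()
    session_id = first["session_id"]

    # Annotations are then POSTed to /annotate/anonymous/{url_id} with the
    # same session_id passed as a query parameter, tying each suggestion
    # row to that session.

    # Later GETs that pass the session_id skip URLs this session has already
    # annotated; omitting it starts a fresh session and may return the same
    # URL again, which is what the updated test asserts.
    nxt = client.get(
        "/annotate/anonymous",
        params={"session_id": session_id},
    ).json()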