From 220c3196705aac9d9e47939b0391d86a98c7b4fa Mon Sep 17 00:00:00 2001 From: maxachis Date: Fri, 2 May 2025 21:22:27 -0400 Subject: [PATCH 1/4] DRAFT --- api/routes/collector.py | 15 ++++++ collector_db/AsyncDatabaseClient.py | 53 +++++++++++++++++++ collector_db/DatabaseClient.py | 13 ++--- collector_manager/enums.py | 1 + core/AsyncCore.py | 12 ++++- core/DTOs/ManualBatchInputDTO.py | 24 +++++++++ core/DTOs/ManualBatchOutputDTO.py | 6 +++ pyproject.toml | 3 ++ .../integration/api/test_manual_batch.py | 18 +++++++ 9 files changed, 138 insertions(+), 7 deletions(-) create mode 100644 core/DTOs/ManualBatchInputDTO.py create mode 100644 core/DTOs/ManualBatchOutputDTO.py create mode 100644 pyproject.toml create mode 100644 tests/test_automated/integration/api/test_manual_batch.py diff --git a/api/routes/collector.py b/api/routes/collector.py index e2789443..b7628c4f 100644 --- a/api/routes/collector.py +++ b/api/routes/collector.py @@ -6,6 +6,7 @@ from collector_manager.enums import CollectorType from core.AsyncCore import AsyncCore from core.DTOs.CollectorStartInfo import CollectorStartInfo +from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO from security_manager.SecurityManager import AccessInfo, get_access_info from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO from source_collectors.ckan.DTOs import CKANInputDTO @@ -122,4 +123,18 @@ async def start_muckrock_all_foia_collector( collector_type=CollectorType.MUCKROCK_ALL_SEARCH, dto=dto, user_id=access_info.user_id + ) + +@collector_router.post("/manual") +async def upload_manual_collector( + dto: ManualBatchInputDTO, + core: AsyncCore = Depends(get_async_core), + access_info: AccessInfo = Depends(get_access_info), +) -> CollectorStartInfo: + """ + Uploads a manual "collector" with existing data + """ + return await core.upload_manual_batch( + dto=dto, + user_id=access_info.user_id ) \ No newline at end of file diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 46cd89db..4bc60de7 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -41,6 +41,8 @@ from core.DTOs.GetTasksResponse import GetTasksResponse, GetTasksResponseTaskInfo from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo +from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO +from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo @@ -1719,6 +1721,57 @@ async def add_all_annotations_to_url( ) session.add(agency_suggestion) + @session_manager + async def upload_manual_batch( + self, + session: AsyncSession, + user_id: int, + dto: ManualBatchInputDTO + ) -> ManualBatchOutputDTO: + batch = Batch( + strategy=CollectorType.MANUAL.value, + status=BatchStatus.READY_TO_LABEL.value, + parameters={ + "name": dto.name + }, + user_id=user_id + ) + session.add(batch) + await session.flush() + + batch_id = batch.id + url_ids = [] + + for entry in dto.entries: + url = URL( + url=entry.url, + name=entry.name, + description=entry.description, + batch_id=batch_id, + collector_metadata=entry.collector_metadata, + outcome=URLStatus.PENDING.value, + record_type=entry.record_type.value if entry.record_type is not None else None, + ) + + session.add(url) + try: + await session.flush() + except IntegrityError: + await session.rollback() + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"URL already exists: {entry.url}" + ) + await session.flush() + optional_metadata = URLOptionalDataSourceMetadata( + url_id=url.id, + record_formats=entry.record_formats, + data_portal_type=entry.data_portal_type, + supplying_entity=entry.supplying_entity, + ) + session.add(optional_metadata) + url_ids.append(url.id) + return ManualBatchOutputDTO(batch_id=batch_id, urls=url_ids) diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 3999dbc9..43ec9628 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -1,21 +1,22 @@ -from datetime import datetime, timedelta from functools import wraps from typing import Optional, List -from sqlalchemy import create_engine, Row +from sqlalchemy import create_engine from sqlalchemy.exc import IntegrityError -from sqlalchemy.orm import sessionmaker, scoped_session, aliased +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import sessionmaker, scoped_session from collector_db.ConfigManager import ConfigManager from collector_db.DTOs.BatchInfo import BatchInfo -from collector_db.DTOs.DuplicateInfo import DuplicateInfo, DuplicateInsertInfo +from collector_db.DTOs.DuplicateInfo import DuplicateInsertInfo from collector_db.DTOs.InsertURLsInfo import InsertURLsInfo -from collector_db.DTOs.LogInfo import LogInfo, LogOutputInfo +from collector_db.DTOs.LogInfo import LogInfo from collector_db.DTOs.URLInfo import URLInfo from collector_db.DTOs.URLMapping import URLMapping -from collector_db.helper_functions import get_postgres_connection_string from collector_db.models import Base, Batch, URL, Log, Duplicate from collector_manager.enums import CollectorType +from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO +from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO from core.EnvVarManager import EnvVarManager from core.enums import BatchStatus diff --git a/collector_manager/enums.py b/collector_manager/enums.py index 692b97e5..5b89ffe2 100644 --- a/collector_manager/enums.py +++ b/collector_manager/enums.py @@ -8,6 +8,7 @@ class CollectorType(Enum): MUCKROCK_COUNTY_SEARCH = "muckrock_county_search" MUCKROCK_ALL_SEARCH = "muckrock_all_search" CKAN = "ckan" + MANUAL = "manual" class URLStatus(Enum): PENDING = "pending" diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 92f097db..2eba5d04 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -22,6 +22,8 @@ from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo +from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO +from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO from core.DTOs.MessageResponse import MessageResponse from core.TaskManager import TaskManager from core.enums import BatchStatus, RecordType @@ -270,5 +272,13 @@ async def reject_url( user_id=access_info.user_id ) - + async def upload_manual_batch( + self, + dto: ManualBatchInputDTO, + user_id: int + ) -> ManualBatchOutputDTO: + return await self.adb_client.upload_manual_batch( + user_id=user_id, + dto=dto + ) diff --git a/core/DTOs/ManualBatchInputDTO.py b/core/DTOs/ManualBatchInputDTO.py new file mode 100644 index 00000000..9bb98755 --- /dev/null +++ b/core/DTOs/ManualBatchInputDTO.py @@ -0,0 +1,24 @@ +from typing import Optional + +from pydantic import BaseModel, Field + +from core.enums import RecordType + + +class ManualBatchInnerInputDTO(BaseModel): + url: str + name: Optional[str] = None + description: Optional[str] = None + collector_metadata: Optional[dict] = None + record_type: Optional[RecordType] = None + record_formats: Optional[list[str]] = None + data_portal_type: Optional[str] = None + supplying_entity: Optional[str] = None + + +class ManualBatchInputDTO(BaseModel): + name: str + entries: list[ManualBatchInnerInputDTO] = Field( + min_length=1, + max_length=1000 + ) \ No newline at end of file diff --git a/core/DTOs/ManualBatchOutputDTO.py b/core/DTOs/ManualBatchOutputDTO.py new file mode 100644 index 00000000..119359a6 --- /dev/null +++ b/core/DTOs/ManualBatchOutputDTO.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class ManualBatchOutputDTO(BaseModel): + batch_id: int + urls: list[int] \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..161cc214 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[project] +name="source-collector" +version="0.1.0" \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py new file mode 100644 index 00000000..82fb4d91 --- /dev/null +++ b/tests/test_automated/integration/api/test_manual_batch.py @@ -0,0 +1,18 @@ +import pytest + + +@pytest.mark.asyncio +async def test_manual_batch(api_test_helper): + + manual_batch_name = "test_manual_batch" + + # Create 50 entries with just URL + + + # Create 50 entries with URL and all optional fields + + + # TODO: Continue later + + + raise NotImplementedError \ No newline at end of file From 25ced55b6af67688d1773f916e4c74c9f39db5ae Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 3 May 2025 10:18:05 -0400 Subject: [PATCH 2/4] feat(app): Add `/collector/manual` endpoint --- ..._add_manual_strategy_to_batch_strategy_.py | 54 +++++++ api/routes/collector.py | 3 +- collector_db/AsyncDatabaseClient.py | 6 +- collector_db/DatabaseClient.py | 2 +- collector_db/models.py | 1 + core/AsyncCore.py | 4 +- ...OutputDTO.py => ManualBatchResponseDTO.py} | 2 +- .../api/helpers/RequestValidator.py | 60 ++++++- .../integration/api/test_manual_batch.py | 148 +++++++++++++++++- 9 files changed, 267 insertions(+), 13 deletions(-) create mode 100644 alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py rename core/DTOs/{ManualBatchOutputDTO.py => ManualBatchResponseDTO.py} (63%) diff --git a/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py new file mode 100644 index 00000000..c5af4d33 --- /dev/null +++ b/alembic/versions/2025_05_03_0956-028565b77b9e_add_manual_strategy_to_batch_strategy_.py @@ -0,0 +1,54 @@ +"""Add manual strategy to Batch strategy enum + +Revision ID: 028565b77b9e +Revises: e285e6e7cf71 +Create Date: 2025-05-03 09:56:51.134406 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from util.alembic_helpers import switch_enum_type + +# revision identifiers, used by Alembic. +revision: str = '028565b77b9e' +down_revision: Union[str, None] = 'e285e6e7cf71' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + switch_enum_type( + table_name="batches", + column_name="strategy", + enum_name="batch_strategy", + new_enum_values=[ + "example", + "ckan", + "muckrock_county_search", + "auto_googler", + "muckrock_all_search", + "muckrock_simple_search", + "common_crawler", + "manual" + ], + ) + + +def downgrade() -> None: + switch_enum_type( + table_name="batches", + column_name="strategy", + enum_name="batch_strategy", + new_enum_values=[ + "example", + "ckan", + "muckrock_county_search", + "auto_googler", + "muckrock_all_search", + "muckrock_simple_search", + "common_crawler" + ], + ) diff --git a/api/routes/collector.py b/api/routes/collector.py index b7628c4f..16f5a900 100644 --- a/api/routes/collector.py +++ b/api/routes/collector.py @@ -7,6 +7,7 @@ from core.AsyncCore import AsyncCore from core.DTOs.CollectorStartInfo import CollectorStartInfo from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO +from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from security_manager.SecurityManager import AccessInfo, get_access_info from source_collectors.auto_googler.DTOs import AutoGooglerInputDTO from source_collectors.ckan.DTOs import CKANInputDTO @@ -130,7 +131,7 @@ async def upload_manual_collector( dto: ManualBatchInputDTO, core: AsyncCore = Depends(get_async_core), access_info: AccessInfo = Depends(get_access_info), -) -> CollectorStartInfo: +) -> ManualBatchResponseDTO: """ Uploads a manual "collector" with existing data """ diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index 4bc60de7..b110c614 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -42,7 +42,7 @@ from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo, GetURLsResponseErrorInfo, \ GetURLsResponseInnerInfo from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO -from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO +from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo from core.DTOs.task_data_objects.AgencyIdentificationTDO import AgencyIdentificationTDO from core.DTOs.task_data_objects.SubmitApprovedURLTDO import SubmitApprovedURLTDO, SubmittedURLInfo @@ -1727,7 +1727,7 @@ async def upload_manual_batch( session: AsyncSession, user_id: int, dto: ManualBatchInputDTO - ) -> ManualBatchOutputDTO: + ) -> ManualBatchResponseDTO: batch = Batch( strategy=CollectorType.MANUAL.value, status=BatchStatus.READY_TO_LABEL.value, @@ -1773,5 +1773,5 @@ async def upload_manual_batch( url_ids.append(url.id) - return ManualBatchOutputDTO(batch_id=batch_id, urls=url_ids) + return ManualBatchResponseDTO(batch_id=batch_id, urls=url_ids) diff --git a/collector_db/DatabaseClient.py b/collector_db/DatabaseClient.py index 43ec9628..94320fbc 100644 --- a/collector_db/DatabaseClient.py +++ b/collector_db/DatabaseClient.py @@ -16,7 +16,7 @@ from collector_db.models import Base, Batch, URL, Log, Duplicate from collector_manager.enums import CollectorType from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO -from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO +from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from core.EnvVarManager import EnvVarManager from core.enums import BatchStatus diff --git a/collector_db/models.py b/collector_db/models.py index 42b113c6..b5e70cdc 100644 --- a/collector_db/models.py +++ b/collector_db/models.py @@ -41,6 +41,7 @@ class Batch(Base): 'muckrock_all_search', 'muckrock_simple_search', 'common_crawler', + 'manual', name='batch_strategy'), nullable=False) user_id = Column(Integer, nullable=False) diff --git a/core/AsyncCore.py b/core/AsyncCore.py index 2eba5d04..59a892ef 100644 --- a/core/AsyncCore.py +++ b/core/AsyncCore.py @@ -23,7 +23,7 @@ from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO -from core.DTOs.ManualBatchOutputDTO import ManualBatchOutputDTO +from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from core.DTOs.MessageResponse import MessageResponse from core.TaskManager import TaskManager from core.enums import BatchStatus, RecordType @@ -276,7 +276,7 @@ async def upload_manual_batch( self, dto: ManualBatchInputDTO, user_id: int - ) -> ManualBatchOutputDTO: + ) -> ManualBatchResponseDTO: return await self.adb_client.upload_manual_batch( user_id=user_id, dto=dto diff --git a/core/DTOs/ManualBatchOutputDTO.py b/core/DTOs/ManualBatchResponseDTO.py similarity index 63% rename from core/DTOs/ManualBatchOutputDTO.py rename to core/DTOs/ManualBatchResponseDTO.py index 119359a6..f656fda0 100644 --- a/core/DTOs/ManualBatchOutputDTO.py +++ b/core/DTOs/ManualBatchResponseDTO.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -class ManualBatchOutputDTO(BaseModel): +class ManualBatchResponseDTO(BaseModel): batch_id: int urls: list[int] \ No newline at end of file diff --git a/tests/test_automated/integration/api/helpers/RequestValidator.py b/tests/test_automated/integration/api/helpers/RequestValidator.py index 28e4b4a3..07de3c95 100644 --- a/tests/test_automated/integration/api/helpers/RequestValidator.py +++ b/tests/test_automated/integration/api/helpers/RequestValidator.py @@ -1,6 +1,7 @@ from http import HTTPStatus from typing import Optional, Annotated +from fastapi import HTTPException from pydantic import BaseModel from starlette.testclient import TestClient @@ -24,6 +25,8 @@ from core.DTOs.GetTasksResponse import GetTasksResponse from core.DTOs.GetURLsByBatchResponse import GetURLsByBatchResponse from core.DTOs.GetURLsResponseInfo import GetURLsResponseInfo +from core.DTOs.ManualBatchInputDTO import ManualBatchInputDTO +from core.DTOs.ManualBatchResponseDTO import ManualBatchResponseDTO from core.DTOs.MessageResponse import MessageResponse from core.DTOs.RecordTypeAnnotationPostInfo import RecordTypeAnnotationPostInfo from core.DTOs.RelevanceAnnotationPostInfo import RelevanceAnnotationPostInfo @@ -32,7 +35,11 @@ class ExpectedResponseInfo(BaseModel): - status_code: Annotated[HTTPStatus, "The expected status code"] = HTTPStatus.OK + status_code: Annotated[ + HTTPStatus, + "The expected status code" + ] = HTTPStatus.OK + message: Optional[str] = None class RequestValidator: """ @@ -64,6 +71,31 @@ def open( assert response.status_code == expected_response.status_code, response.text return response.json() + def open_v2( + self, + method: str, + url: str, + params: Optional[dict] = None, + **kwargs + ) -> dict: + """ + Variation on open that raises an exception rather than check the status code + """ + if params: + kwargs["params"] = params + response = self.client.request( + method=method, + url=url, + headers={"Authorization": "Bearer token"}, # Fake authentication that is overridden during testing + **kwargs + ) + if response.status_code != HTTPStatus.OK: + raise HTTPException( + status_code=response.status_code, + detail=response.json() + ) + return response.json() + def get( self, url: str, @@ -94,6 +126,20 @@ def post( **kwargs ) + def post_v2( + self, + url: str, + params: Optional[dict] = None, + **kwargs + ) -> dict: + return self.open_v2( + method="POST", + url=url, + params=params, + **kwargs + ) + + def put( self, url: str, @@ -329,4 +375,14 @@ async def post_all_annotations_and_get_next( params=params, json=all_annotations_post_info.model_dump(mode='json') ) - return GetNextURLForAllAnnotationResponse(**data) \ No newline at end of file + return GetNextURLForAllAnnotationResponse(**data) + + async def submit_manual_batch( + self, + dto: ManualBatchInputDTO, + ) -> ManualBatchResponseDTO: + data = self.post_v2( + url="/collector/manual", + json=dto.model_dump(mode='json'), + ) + return ManualBatchResponseDTO(**data) \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py index 82fb4d91..e7c34af1 100644 --- a/tests/test_automated/integration/api/test_manual_batch.py +++ b/tests/test_automated/integration/api/test_manual_batch.py @@ -1,18 +1,160 @@ + import pytest +from fastapi import HTTPException + +from collector_db.models import Batch, URL, URLOptionalDataSourceMetadata +from collector_manager.enums import CollectorType +from core.DTOs.ManualBatchInputDTO import ManualBatchInnerInputDTO, ManualBatchInputDTO +from core.enums import RecordType @pytest.mark.asyncio async def test_manual_batch(api_test_helper): + ath = api_test_helper manual_batch_name = "test_manual_batch" # Create 50 entries with just URL - + dtos = [] + for i in range(50): + dto = ManualBatchInnerInputDTO( + url=f"https://example.com/{i}", + ) + dtos.append(dto) # Create 50 entries with URL and all optional fields + for i in range(50): + dto = ManualBatchInnerInputDTO( + url=f"https://example.com/{i+50}", + name=manual_batch_name, + description=f"Description {i}", + collector_metadata={ + "name": f"Name {i}", + }, + record_type=RecordType.ARREST_RECORDS, + record_formats=[f"Record Format {i}"], + data_portal_type=f"Data Portal Type {i}", + supplying_entity=f"Supplying Entity {i}" + ) + dtos.append(dto) + + input_dto = ManualBatchInputDTO( + name=manual_batch_name, + entries=dtos + ) + + # Submit batch successfully + response = await ath.request_validator.submit_manual_batch(input_dto) + + # Check 100 URLs in url attribute + assert len(response.urls) == 100 + + # Get batch from database + adb_client = ath.adb_client() + batches = await adb_client.get_all(Batch) + + # Confirm only one batch + assert len(batches) == 1 + + batch: Batch = batches[0] + # Assert batch id matches response's batch id + assert batch.id == response.batch_id + # Assert strategy of manual + assert batch.strategy == CollectorType.MANUAL.value + # Assert parameters has name value of `test_manual_batch` + assert batch.parameters["name"] == manual_batch_name + # Assert has expected user id + assert batch.user_id == 1 + + # Get URLs from database + urls: list[URL] = await adb_client.get_all(URL) + + # Confirm 100 URLs + assert len(urls) == 100 + + def check_attributes( + object: URL or URLOptionalDataSourceMetadata, + attributes: list[str], + attributes_are_none: bool + ): + for attr in attributes: + if attributes_are_none: + if getattr(object, attr) is not None: + return False + else: + if getattr(object, attr) is None: + return False + return True + + def check_url(url: URL, url_only: bool): + assert url.batch_id == batch.id + assert url.url is not None + other_attributes = ["name", "description", "collector_metadata", "record_type"] + return check_attributes(url, other_attributes, url_only) + + + # Confirm 50 have only name value + count_only_name = 0 + for url in urls: + if check_url(url, True): + count_only_name += 1 + assert count_only_name == 50 + # Confirm 50 have all optional fields + count_all = 0 + for url in urls: + if check_url(url, False): + count_all += 1 + assert count_all == 50 + + # Get Optional URL Metadata from Database + opt_metadata: list[URLOptionalDataSourceMetadata] = await adb_client.get_all(URLOptionalDataSourceMetadata) + + # Confirm 100 + assert len(opt_metadata) == 100 + + def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: bool): + assert metadata.url_id is not None + other_attributes = ["record_formats", "data_portal_type", "supplying_entity"] + return check_attributes(metadata, other_attributes, no_optional) + + # Confirm 50 have nothing but URL id + count_only_url_id = 0 + for metadata in opt_metadata: + if check_opt_metadata(metadata, True): + count_only_url_id += 1 + assert count_only_url_id == 50 + + # Confirm 50 have all optional fields + count_all = 0 + for metadata in opt_metadata: + if check_opt_metadata(metadata, False): + count_all += 1 + assert count_all == 50 + # Insert another batch including good urls and one duplicate + more_dtos = [] + for i in range(49): + dto = ManualBatchInnerInputDTO( + url=f"https://example.com/{i+100}", + ) + more_dtos.append(dto) - # TODO: Continue later + dto = ManualBatchInnerInputDTO( + url=f"https://example.com/1", + ) + more_dtos.append(dto) + duplicate_input_dto = ManualBatchInputDTO( + name=manual_batch_name, + entries=more_dtos + ) - raise NotImplementedError \ No newline at end of file + # Submit batch + try: + response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) + except HTTPException as e: + # Confirm got a BAD REQUEST error identifying the correct duplicate URL + assert e.status_code == 400 + assert e.detail == { + "detail": 'URL already exists: https://example.com/1' + } From addc5f57df3bdd6f361654bfc60d1e8c98e17914 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 3 May 2025 10:28:52 -0400 Subject: [PATCH 3/4] feat(app): Add `/collector/manual` endpoint --- collector_db/AsyncDatabaseClient.py | 23 ++++++++------- core/DTOs/ManualBatchResponseDTO.py | 3 +- .../integration/api/test_manual_batch.py | 28 ++++++++++--------- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index b110c614..d6d949ea 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1741,6 +1741,7 @@ async def upload_manual_batch( batch_id = batch.id url_ids = [] + duplicate_urls = [] for entry in dto.entries: url = URL( @@ -1753,15 +1754,13 @@ async def upload_manual_batch( record_type=entry.record_type.value if entry.record_type is not None else None, ) - session.add(url) - try: - await session.flush() - except IntegrityError: - await session.rollback() - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"URL already exists: {entry.url}" - ) + async with session.begin_nested(): + try: + session.add(url) + await session.flush() + except IntegrityError: + duplicate_urls.append(entry.url) + continue await session.flush() optional_metadata = URLOptionalDataSourceMetadata( url_id=url.id, @@ -1773,5 +1772,9 @@ async def upload_manual_batch( url_ids.append(url.id) - return ManualBatchResponseDTO(batch_id=batch_id, urls=url_ids) + return ManualBatchResponseDTO( + batch_id=batch_id, + urls=url_ids, + duplicate_urls=duplicate_urls + ) diff --git a/core/DTOs/ManualBatchResponseDTO.py b/core/DTOs/ManualBatchResponseDTO.py index f656fda0..b572fbb2 100644 --- a/core/DTOs/ManualBatchResponseDTO.py +++ b/core/DTOs/ManualBatchResponseDTO.py @@ -3,4 +3,5 @@ class ManualBatchResponseDTO(BaseModel): batch_id: int - urls: list[int] \ No newline at end of file + urls: list[int] + duplicate_urls: list[str] \ No newline at end of file diff --git a/tests/test_automated/integration/api/test_manual_batch.py b/tests/test_automated/integration/api/test_manual_batch.py index e7c34af1..e9a101eb 100644 --- a/tests/test_automated/integration/api/test_manual_batch.py +++ b/tests/test_automated/integration/api/test_manual_batch.py @@ -1,6 +1,5 @@ import pytest -from fastapi import HTTPException from collector_db.models import Batch, URL, URLOptionalDataSourceMetadata from collector_manager.enums import CollectorType @@ -139,10 +138,12 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo ) more_dtos.append(dto) - dto = ManualBatchInnerInputDTO( - url=f"https://example.com/1", - ) - more_dtos.append(dto) + for i in range(2): + dto = ManualBatchInnerInputDTO( + url=f"https://example.com/{i+1}", + ) + more_dtos.append(dto) + duplicate_input_dto = ManualBatchInputDTO( name=manual_batch_name, @@ -150,11 +151,12 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo ) # Submit batch - try: - response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) - except HTTPException as e: - # Confirm got a BAD REQUEST error identifying the correct duplicate URL - assert e.status_code == 400 - assert e.detail == { - "detail": 'URL already exists: https://example.com/1' - } + response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) + # Check duplicate URLs + assert len(response.duplicate_urls) == 2 + assert response.duplicate_urls == ['https://example.com/1', 'https://example.com/2'] + assert len(response.urls) == 49 + + # Check 149 URLs in database + urls: list[URL] = await adb_client.get_all(URL) + assert len(urls) == 149 From 060bc11cce5650d00eaf48cc76d9c770c6544ed8 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sat, 3 May 2025 10:32:19 -0400 Subject: [PATCH 4/4] feat(app): Add `/collector/manual` endpoint --- collector_db/AsyncDatabaseClient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index d6d949ea..52ab2c9c 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -563,7 +563,7 @@ async def add_to_root_url_cache(self, session: AsyncSession, url: str, page_titl async def get_urls(self, session: AsyncSession, page: int, errors: bool) -> GetURLsResponseInfo: statement = select(URL).options( selectinload(URL.error_info) - ) + ).order_by(URL.id) if errors: # Only return URLs with errors statement = statement.where(