diff --git a/src/api/endpoints/annotate/dtos/agency/response.py b/src/api/endpoints/annotate/dtos/agency/response.py index abd12877..f2dda0f5 100644 --- a/src/api/endpoints/annotate/dtos/agency/response.py +++ b/src/api/endpoints/annotate/dtos/agency/response.py @@ -2,8 +2,8 @@ from pydantic import BaseModel +from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase from src.core.enums import SuggestionType -from src.core.tasks.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo class GetNextURLForAgencyAgencyInfo(BaseModel): suggestion_type: SuggestionType @@ -13,13 +13,10 @@ class GetNextURLForAgencyAgencyInfo(BaseModel): county: Optional[str] = None locality: Optional[str] = None -class GetNextURLForAgencyAnnotationInnerResponse(BaseModel): - url_id: int - url: str +class GetNextURLForAgencyAnnotationInnerResponse(AnnotationInnerResponseInfoBase): agency_suggestions: list[ GetNextURLForAgencyAgencyInfo ] - html_info: ResponseHTMLInfo class GetNextURLForAgencyAnnotationResponse(BaseModel): next_annotation: Optional[GetNextURLForAgencyAnnotationInnerResponse] diff --git a/src/api/endpoints/annotate/dtos/all/response.py b/src/api/endpoints/annotate/dtos/all/response.py index 0f938337..6620624f 100644 --- a/src/api/endpoints/annotate/dtos/all/response.py +++ b/src/api/endpoints/annotate/dtos/all/response.py @@ -3,15 +3,14 @@ from pydantic import Field, BaseModel from src.api.endpoints.annotate.dtos.agency.response import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase from src.core.enums import RecordType -from src.core.tasks.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -class GetNextURLForAllAnnotationInnerResponse(BaseModel): - url_id: int - url: str - html_info: ResponseHTMLInfo - agency_suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] +class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): + agency_suggestions: Optional[list[GetNextURLForAgencyAgencyInfo]] = Field( + title="The auto-labeler's suggestions for agencies" + ) suggested_relevant: Optional[bool] = Field( title="Whether the auto-labeler identified the URL as relevant or not" ) diff --git a/src/api/endpoints/annotate/dtos/record_type/response.py b/src/api/endpoints/annotate/dtos/record_type/response.py index 0b21eea2..7862d477 100644 --- a/src/api/endpoints/annotate/dtos/record_type/response.py +++ b/src/api/endpoints/annotate/dtos/record_type/response.py @@ -2,21 +2,16 @@ from pydantic import Field, BaseModel +from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase from src.db.dtos.url_mapping import URLMapping from src.core.enums import RecordType from src.core.tasks.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo -class GetNextRecordTypeAnnotationResponseInfo(BaseModel): - url_info: URLMapping = Field( - title="Information about the URL" - ) +class GetNextRecordTypeAnnotationResponseInfo(AnnotationInnerResponseInfoBase): suggested_record_type: Optional[RecordType] = Field( title="What record type, if any, the auto-labeler identified the URL as" ) - html_info: ResponseHTMLInfo = Field( - title="HTML information about the URL" - ) class GetNextRecordTypeAnnotationResponseOuterInfo(BaseModel): next_annotation: Optional[GetNextRecordTypeAnnotationResponseInfo] diff --git a/src/api/endpoints/annotate/dtos/relevance/response.py b/src/api/endpoints/annotate/dtos/relevance/response.py index 188fcac7..e6030d61 100644 --- a/src/api/endpoints/annotate/dtos/relevance/response.py +++ b/src/api/endpoints/annotate/dtos/relevance/response.py @@ -2,20 +2,13 @@ from pydantic import BaseModel, Field -from src.db.dtos.url_mapping import URLMapping -from src.core.tasks.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase -class GetNextRelevanceAnnotationResponseInfo(BaseModel): - url_info: URLMapping = Field( - title="Information about the URL" - ) +class GetNextRelevanceAnnotationResponseInfo(AnnotationInnerResponseInfoBase): suggested_relevant: Optional[bool] = Field( title="Whether the auto-labeler identified the URL as relevant or not" ) - html_info: ResponseHTMLInfo = Field( - title="HTML information about the URL" - ) class GetNextRelevanceAnnotationResponseOuterInfo(BaseModel): next_annotation: Optional[GetNextRelevanceAnnotationResponseInfo] diff --git a/src/api/endpoints/annotate/dtos/shared/__init__.py b/src/api/endpoints/annotate/dtos/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/dtos/shared/base/__init__.py b/src/api/endpoints/annotate/dtos/shared/base/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py new file mode 100644 index 00000000..7cd0c35d --- /dev/null +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -0,0 +1,19 @@ +from typing import Optional + +from pydantic import BaseModel, Field + +from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo +from src.core.tasks.operators.url_html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.db.dtos.url_mapping import URLMapping + + +class AnnotationInnerResponseInfoBase(BaseModel): + url_info: URLMapping = Field( + title="Information about the URL" + ) + html_info: ResponseHTMLInfo = Field( + title="HTML information about the URL" + ) + batch_info: Optional[AnnotationBatchInfo] = Field( + title="Information about the annotation batch" + ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/dtos/shared/batch.py b/src/api/endpoints/annotate/dtos/shared/batch.py new file mode 100644 index 00000000..30be91ae --- /dev/null +++ b/src/api/endpoints/annotate/dtos/shared/batch.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel + + +class AnnotationBatchInfo(BaseModel): + count_annotated: int + total_urls: int + count_not_annotated: int \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index d6f7208d..05b394e7 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -19,6 +19,7 @@ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseInfo from src.api.endpoints.annotate.dtos.relevance.response import GetNextRelevanceAnnotationResponseInfo +from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO @@ -68,6 +69,7 @@ from src.core.tasks.operators.url_miscellaneous_metadata.tdo import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo from src.core.env_var_manager import EnvVarManager from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus +from src.db.types import UserSuggestionType # Type Hints @@ -257,7 +259,14 @@ async def get_next_url_for_relevance_annotation( url_id=url.id ), suggested_relevant=suggestion, - html_info=html_response_info + html_info=html_response_info, + batch_info=await self.get_annotation_batch_info( + session, + batch_id=batch_id, + models=[ + UserRelevantSuggestion + ] + ) ) #endregion relevant @@ -298,7 +307,14 @@ async def get_next_url_for_record_type_annotation( url_id=url.id ), suggested_record_type=suggestion, - html_info=html_response_info + html_info=html_response_info, + batch_info=await self.get_annotation_batch_info( + session, + batch_id=batch_id, + models=[ + UserRecordTypeSuggestion, + ] + ) ) @@ -916,10 +932,19 @@ async def get_next_url_agency_for_annotation( return GetNextURLForAgencyAnnotationResponse( next_annotation=GetNextURLForAgencyAnnotationInnerResponse( - url_id=url_id, - url=url, + url_info=URLMapping( + url=url, + url_id=url_id + ), html_info=response_html_info, - agency_suggestions=agency_suggestions + agency_suggestions=agency_suggestions, + batch_info=await self.get_annotation_batch_info( + session, + batch_id=batch_id, + models=[ + UserUrlAgencySuggestion, + ] + ) ) ) @@ -1747,12 +1772,23 @@ async def get_next_url_for_all_annotations(self, session, batch_id: Optional[int return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( - url_id=url.id, - url=url.url, + url_info=URLMapping( + url_id=url.id, + url=url.url + ), html_info=html_response_info, suggested_relevant=auto_relevant, suggested_record_type=auto_record_type, - agency_suggestions=agency_suggestions + agency_suggestions=agency_suggestions, + batch_info=await self.get_annotation_batch_info( + session, + batch_id=batch_id, + models=[ + UserUrlAgencySuggestion, + UserRecordTypeSuggestion, + UserRelevantSuggestion + ] + ) ) ) @@ -2380,4 +2416,58 @@ async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi raw_result = await session.execute(query) urls = raw_result.scalars().all() - return [URL404ProbeTDO(url=url.url, url_id=url.id) for url in urls] \ No newline at end of file + return [URL404ProbeTDO(url=url.url, url_id=url.id) for url in urls] + + @staticmethod + async def get_annotation_batch_info( + session: AsyncSession, + batch_id: Optional[int], + models: List[UserSuggestionType] + ) -> Optional[AnnotationBatchInfo]: + if batch_id is None: + return None + + sc = StatementComposer + include_queries = [ + sc.user_suggestion_exists(model) + for model in models + ] + + select_url = select(func.count(URL.id)) + + common_where_clause = [ + URL.outcome == URLStatus.PENDING.value, + URL.batch_id == batch_id, + ] + + annotated_query = ( + select_url + .where( + *common_where_clause, + *include_queries, + ) + ) + + exclude_queries = [ + sc.user_suggestion_not_exists(model) + for model in models + ] + + not_annotated_query = ( + select_url + .where( + *common_where_clause, + *exclude_queries, + ) + ) + + annotated_result_raw = await session.execute(annotated_query) + annotated_result = annotated_result_raw.scalars().one_or_none() + not_annotated_result_raw = await session.execute(not_annotated_query) + not_annotated_result = not_annotated_result_raw.scalars().one_or_none() + + return AnnotationBatchInfo( + count_annotated=annotated_result, + count_not_annotated=not_annotated_result, + total_urls=annotated_result + not_annotated_result + ) \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index c1ccd367..cd313b59 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -8,6 +8,7 @@ from src.db.models.core import URL, URLHTMLContent, AutomatedUrlAgencySuggestion, URLOptionalDataSourceMetadata, Batch, \ ConfirmedURLAgency, LinkTaskURL, Task, UserUrlAgencySuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion from src.core.enums import BatchStatus +from src.db.types import UserSuggestionType class StatementComposer: @@ -96,6 +97,18 @@ def pending_urls_missing_miscellaneous_metadata_query() -> Select: return query + @staticmethod + def user_suggestion_exists( + model_to_include: UserSuggestionType + ) -> ColumnElement[bool]: + subquery = exists( + select(model_to_include) + .where( + model_to_include.url_id == URL.id, + ) + ) + return subquery + @staticmethod def user_suggestion_not_exists( @@ -106,12 +119,7 @@ def user_suggestion_not_exists( # subquery = not_( - exists( - select(model_to_exclude) - .where( - model_to_exclude.url_id == URL.id, - ) - ) + StatementComposer.user_suggestion_exists(model_to_exclude) ) return subquery diff --git a/src/db/types.py b/src/db/types.py new file mode 100644 index 00000000..40dc9ef3 --- /dev/null +++ b/src/db/types.py @@ -0,0 +1,3 @@ +from src.db.models.core import UserUrlAgencySuggestion, UserRecordTypeSuggestion, UserRelevantSuggestion + +UserSuggestionType = UserUrlAgencySuggestion | UserRelevantSuggestion | UserRecordTypeSuggestion \ No newline at end of file diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py index e75e3360..d6bbfa30 100644 --- a/tests/automated/integration/api/test_annotate.py +++ b/tests/automated/integration/api/test_annotate.py @@ -406,7 +406,7 @@ async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): assert response.next_annotation next_annotation = response.next_annotation # Check that url_id matches the one we inserted - assert next_annotation.url_id == buci.url_ids[0] + assert next_annotation.url_info.url_id == buci.url_ids[0] # Check that html data is present assert next_annotation.html_info.description != "" @@ -448,7 +448,7 @@ async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper assert response.next_annotation next_annotation = response.next_annotation # Check that url_id matches the one we inserted - assert next_annotation.url_id == buci.url_ids[0] + assert next_annotation.url_info.url_id == buci.url_ids[0] # Check that html data is not present assert next_annotation.html_info.description == "" @@ -476,7 +476,7 @@ async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): assert response.next_annotation next_annotation = response.next_annotation # Check that url_id matches the one we inserted - assert next_annotation.url_id == buci.url_ids[0] + assert next_annotation.url_info.url_id == buci.url_ids[0] # Check that html data is present assert next_annotation.html_info.description != "" @@ -532,7 +532,7 @@ async def test_annotate_agency_other_user_annotation(api_test_helper): assert response.next_annotation next_annotation = response.next_annotation # Check that url_id matches the one we inserted - assert next_annotation.url_id == url_ids[0] + assert next_annotation.url_info.url_id == url_ids[0] # Check that html data is present assert next_annotation.html_info.description != "" @@ -645,7 +645,7 @@ async def test_annotate_all(api_test_helper): batch_id=setup_info_2.batch_id ) - assert get_response_1.next_annotation.url_id != get_response_2.next_annotation.url_id + assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id # Annotate the first and submit agency_id = await ath.db_data_creator.agency() @@ -663,7 +663,7 @@ async def test_annotate_all(api_test_helper): assert post_response_1.next_annotation is not None # Confirm the second is received - assert post_response_1.next_annotation.url_id == url_mapping_2.url_id + assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id # Upon submitting the second, confirm that no more URLs are returned through either POST or GET post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( @@ -729,7 +729,7 @@ async def test_annotate_all_post_batch_filtering(api_test_helper): ) ) - assert post_response_1.next_annotation.url_id == url_mapping_3.url_id + assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id @pytest.mark.asyncio diff --git a/tests/automated/integration/collector_db/test_db_client.py b/tests/automated/integration/collector_db/test_db_client.py index 7196af9f..47fa5598 100644 --- a/tests/automated/integration/collector_db/test_db_client.py +++ b/tests/automated/integration/collector_db/test_db_client.py @@ -485,9 +485,14 @@ async def test_get_next_url_for_annotation_batch_filtering( ) setup_info_2 = await setup_for_get_next_url_for_annotation( db_data_creator=db_data_creator, - url_count=1 + url_count=3 ) + def assert_batch_info(batch_info): + assert batch_info.total_urls == 3 + assert batch_info.count_annotated == 0 + assert batch_info.count_not_annotated == 3 + url_1 = setup_info_1.insert_urls_info.url_mappings[0] url_2 = setup_info_2.insert_urls_info.url_mappings[0] @@ -499,7 +504,7 @@ async def test_get_next_url_for_annotation_batch_filtering( ) assert result_with_batch_id.url_info.url == url_2.url - + assert_batch_info(result_with_batch_id.batch_info) # If no batch id is provided, return first valid URL result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_relevance_annotation( user_id=1, @@ -516,6 +521,7 @@ async def test_get_next_url_for_annotation_batch_filtering( ) assert result_with_batch_id.url_info.url == url_2.url + assert_batch_info(result_with_batch_id.batch_info) # If no batch id is provided, return first valid URL result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_record_type_annotation( @@ -539,7 +545,8 @@ async def test_get_next_url_for_annotation_batch_filtering( batch_id=setup_info_2.batch_id ) - assert result_with_batch_id.next_annotation.url == url_2.url + assert result_with_batch_id.next_annotation.url_info.url == url_2.url + assert_batch_info(result_with_batch_id.next_annotation.batch_info) # If no batch id is provided, return first valid URL result_no_batch_id = await db_data_creator.adb_client.get_next_url_agency_for_annotation( @@ -547,7 +554,24 @@ async def test_get_next_url_for_annotation_batch_filtering( batch_id=None ) - assert result_no_batch_id.next_annotation.url == url_1.url + assert result_no_batch_id.next_annotation.url_info.url == url_1.url + + + # All annotations + result_with_batch_id = await db_data_creator.adb_client.get_next_url_for_all_annotations( + batch_id=setup_info_2.batch_id + ) + + assert result_with_batch_id.next_annotation.url_info.url == url_2.url + assert_batch_info(result_with_batch_id.next_annotation.batch_info) + + # If no batch id is provided, return first valid URL + result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_all_annotations( + batch_id=None + ) + + assert result_no_batch_id.next_annotation.url_info.url == url_1.url + @pytest.mark.asyncio @@ -630,14 +654,14 @@ async def test_annotate_url_marked_not_relevant(db_data_creator: DBDataCreator): user_id=1, batch_id=None ) - assert agency_annotation_info_user_1.next_annotation.url_id != url_to_mark_not_relevant.url_id + assert agency_annotation_info_user_1.next_annotation.url_info.url_id != url_to_mark_not_relevant.url_id # Other users also should not receive the URL for agency annotation agency_annotation_info_user_2 = await adb_client.get_next_url_agency_for_annotation( user_id=2, batch_id=None ) - assert agency_annotation_info_user_1.next_annotation.url_id != url_to_mark_not_relevant.url_id + assert agency_annotation_info_user_1.next_annotation.url_info.url_id != url_to_mark_not_relevant.url_id @pytest.mark.asyncio async def test_annotate_url_agency_agency_not_in_db(db_data_creator: DBDataCreator):