From 6a2e2a0049d0296eddd20a5f8b1e846d33271d2a Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 27 May 2025 15:08:38 -0400 Subject: [PATCH 1/2] Adjust final review to not return URLs in the absence of user annotations --- collector_db/AsyncDatabaseClient.py | 75 ++++++++++--------- core/TaskManager.py | 2 - .../collector_db/test_db_client.py | 19 +---- 3 files changed, 42 insertions(+), 54 deletions(-) diff --git a/collector_db/AsyncDatabaseClient.py b/collector_db/AsyncDatabaseClient.py index b47f9ae4..81ab8de2 100644 --- a/collector_db/AsyncDatabaseClient.py +++ b/collector_db/AsyncDatabaseClient.py @@ -1018,27 +1018,44 @@ def annotations_exist_subquery(model: Type[Base]): ).subquery() ) - def count_subquery(model: Type[Base]): - return ( - select( - model.url_id, - func.count(model.url_id).label("count") - ).group_by(model.url_id).subquery() - ) + user_models = [ + UserRelevantSuggestion, + UserRecordTypeSuggestion, + UserUrlAgencySuggestion + ] models = [ AutoRelevantSuggestion, - UserRelevantSuggestion, AutoRecordTypeSuggestion, - UserRecordTypeSuggestion, AutomatedUrlAgencySuggestion, - UserUrlAgencySuggestion + *user_models + ] + + # The below relationships are joined directly to the URL + single_join_relationships = [ + URL.html_content, + URL.auto_record_type_suggestion, + URL.auto_relevant_suggestion, + URL.user_relevant_suggestion, + URL.user_record_type_suggestion, + URL.optional_data_source_metadata, + ] + + # The below relationships are joined to entities that are joined to the URL + double_join_relationships = [ + (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), + (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), + (URL.confirmed_agencies, ConfirmedURLAgency.agency) ] exist_subqueries = [ annotations_exist_subquery(model=model) for model in models ] + user_exist_subqueries = [ + annotations_exist_subquery(model=model) + for model in user_models + ] sum_of_exist_subqueries = ( sum( @@ -1064,39 +1081,27 @@ def count_subquery(model: Type[Base]): subquery, URL.id == subquery.c.url_id ) + where_subqueries = [ + subquery.c.exists == 1 + for subquery in user_exist_subqueries + ] + url_query = url_query.where( - URL.outcome == URLStatus.PENDING.value + and_( + URL.outcome == URLStatus.PENDING.value, + *where_subqueries ) + ) if batch_id is not None: url_query = url_query.where( URL.batch_id == batch_id ) - # The below relationships are joined directly to the URL - single_join_relationships = [ - URL.html_content, - URL.auto_record_type_suggestion, - URL.auto_relevant_suggestion, - URL.user_relevant_suggestion, - URL.user_record_type_suggestion, - URL.optional_data_source_metadata, - ] - - options = [ - joinedload(relationship) for relationship in single_join_relationships - ] - - # The below relationships are joined to entities that are joined to the URL - double_join_relationships = [ - (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), - (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), - (URL.confirmed_agencies, ConfirmedURLAgency.agency) - ] - for primary, secondary in double_join_relationships: - options.append(joinedload(primary).joinedload(secondary)) - # Apply options - url_query = url_query.options(*options) + url_query = url_query.options( + *[joinedload(relationship) for relationship in single_join_relationships], + *[joinedload(primary).joinedload(secondary) for primary, secondary in double_join_relationships] + ) # Apply order clause url_query = url_query.order_by( diff --git a/core/TaskManager.py b/core/TaskManager.py index c5f066e7..4dd92450 100644 --- a/core/TaskManager.py +++ b/core/TaskManager.py @@ -47,8 +47,6 @@ def __init__( self.task_trigger = FunctionTrigger(self.run_tasks) self.task_status: TaskType = TaskType.IDLE - - #region Task Operators async def get_url_html_task_operator(self): operator = URLHTMLTaskOperator( diff --git a/tests/test_automated/integration/collector_db/test_db_client.py b/tests/test_automated/integration/collector_db/test_db_client.py index e5454343..b0c5e91d 100644 --- a/tests/test_automated/integration/collector_db/test_db_client.py +++ b/tests/test_automated/integration/collector_db/test_db_client.py @@ -273,7 +273,7 @@ async def test_get_next_url_for_final_review_favor_more_components(db_data_creat async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBDataCreator): """ Test in the case of one URL with no annotations. - Should be returned if it is the only one available. + No annotations should be returned """ batch_id = db_data_creator.batch() url_mapping = db_data_creator.urls(batch_id=batch_id, url_count=1).url_mappings[0] @@ -282,22 +282,7 @@ async def test_get_next_url_for_final_review_no_annotations(db_data_creator: DBD batch_id=None ) - assert result.id == url_mapping.url_id - - annotations = result.annotations - - agency = annotations.agency - assert agency.confirmed == [] - assert agency.auto.unknown is True - assert agency.auto.suggestions == [] - - record_type = annotations.record_type - assert record_type.auto is None - assert record_type.user is None - - relevant = annotations.relevant - assert relevant.auto is None - assert relevant.user is None + assert result is None @pytest.mark.asyncio async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator: DBDataCreator): From b1878d50f400d9f2f24daa5927837163dff53815 Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 27 May 2025 15:16:09 -0400 Subject: [PATCH 2/2] Remove test_example_collector_lifecycle_multiple_batches --- .../core/test_example_collector_lifecycle.py | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/tests/test_automated/integration/core/test_example_collector_lifecycle.py b/tests/test_automated/integration/core/test_example_collector_lifecycle.py index 65ffc001..18411457 100644 --- a/tests/test_automated/integration/core/test_example_collector_lifecycle.py +++ b/tests/test_automated/integration/core/test_example_collector_lifecycle.py @@ -65,45 +65,3 @@ async def test_example_collector_lifecycle( assert url_infos[0].url == "https://example.com" assert url_infos[1].url == "https://example.com/2" - -@pytest.mark.asyncio -async def test_example_collector_lifecycle_multiple_batches( - test_core: SourceCollectorCore, - test_async_core: AsyncCore, - monkeypatch -): - """ - Test the flow of an example collector, which generates fake urls - and saves them to the database - """ - barrier = await block_sleep(monkeypatch) - acore = test_async_core - core = test_core - csis: list[CollectorStartInfo] = [] - - - for i in range(3): - dto = ExampleInputDTO( - example_field="example_value", - sleep_time=1 - ) - csi: CollectorStartInfo = await acore.initiate_collector( - collector_type=CollectorType.EXAMPLE, - dto=dto, - user_id=1 - ) - csis.append(csi) - - await asyncio.sleep(0) - - for csi in csis: - print("Batch ID:", csi.batch_id) - assert core.get_status(csi.batch_id) == BatchStatus.IN_PROCESS - - barrier.release() - - await asyncio.sleep(0.15) - - for csi in csis: - assert core.get_status(csi.batch_id) == BatchStatus.READY_TO_LABEL -