From a61ccd69bae64313811d69875f75a59de282585f Mon Sep 17 00:00:00 2001 From: Max Chis Date: Wed, 26 Nov 2025 20:29:36 -0500 Subject: [PATCH] Adjust CKAN/Muckrock Agency ID Logic --- .../agency_identification/subtasks/convert.py | 44 +++------------ .../subtasks/impl/ckan_/core.py | 15 +++--- .../subtasks/impl/muckrock_/core.py | 15 +++--- .../subtasks/queries/match_agency.py | 44 +++++++++++++++ src/external/pdap/client.py | 47 ---------------- .../pdap/dtos/match_agency/__init__.py | 0 src/external/pdap/dtos/match_agency/post.py | 11 ---- .../pdap/dtos/match_agency/response.py | 11 ---- src/external/pdap/enums.py | 6 --- .../subtasks/ckan/test_core.py | 38 +++---------- .../subtasks/muckrock/test_core.py | 53 +++---------------- .../manual/external/pdap/test_match_agency.py | 6 --- 12 files changed, 84 insertions(+), 206 deletions(-) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py delete mode 100644 src/external/pdap/dtos/match_agency/__init__.py delete mode 100644 src/external/pdap/dtos/match_agency/post.py delete mode 100644 src/external/pdap/dtos/match_agency/response.py delete mode 100644 tests/manual/external/pdap/test_match_agency.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py index 95c9e704..5cead5d3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -2,21 +2,15 @@ from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus -def convert_match_agency_response_to_subtask_data( + +def convert_agency_suggestions_to_subtask_data( url_id: int, - response: MatchAgencyResponse, + agency_suggestions: list[AgencySuggestion], subtask_type: AutoAgencyIDSubtaskType, - task_id: int -): - suggestions: list[AgencySuggestion] = \ - _convert_match_agency_response_to_suggestions( - response - ) - agencies_found: bool = len(suggestions) > 0 + task_id: int, +) -> AutoAgencyIDSubtaskData: + agencies_found: bool = len(agency_suggestions) > 0 subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( url_id=url_id, type=subtask_type, @@ -25,30 +19,6 @@ def convert_match_agency_response_to_subtask_data( ) return AutoAgencyIDSubtaskData( pydantic_model=subtask_pydantic, - suggestions=suggestions + suggestions=agency_suggestions ) -def _convert_match_agency_response_to_suggestions( - match_response: MatchAgencyResponse, -) -> list[AgencySuggestion]: - if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: - match_info: MatchAgencyInfo = match_response.matches[0] - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=100 - ) - ] - if match_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [] - if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}") - total_confidence: int = 100 - confidence_per_match: int = total_confidence // len(match_response.matches) - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=confidence_per_match - ) - for match_info in match_response.matches - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py index d1af5391..2603191a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -3,17 +3,18 @@ from typing_extensions import override from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ GetCKANAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -35,12 +36,14 @@ async def inner_logic(self) -> None: subtask_data_list: list[AutoAgencyIDSubtaskData] = [] for param in params: agency_name: str = param.collector_metadata["agency_name"] - response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.CKAN, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py index 4fa92c2e..030139ad 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -6,18 +6,19 @@ from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ MuckrockAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ GetMuckrockAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -52,12 +53,14 @@ async def inner_logic(self) -> None: ) subtask_data_list.append(data) continue - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_lookup_response.name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=match_agency_response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py new file mode 100644 index 00000000..4b5d6516 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py @@ -0,0 +1,44 @@ +from typing import Sequence + +from sqlalchemy import select, func, desc, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class MatchAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_name: str + ): + super().__init__() + self.agency_name = agency_name + + async def run(self, session: AsyncSession) -> list[AgencySuggestion]: + query = ( + select( + Agency.id, + func.similarity(Agency.name, self.agency_name).label("similarity") + ) + .where( + func.similarity(Agency.name, self.agency_name) > 0.5 + ) + .order_by( + desc("similarity") + ) + .limit(10) + ) + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query + ) + return [ + AgencySuggestion( + agency_id=mapping[Agency.id], + confidence=int(mapping["similarity"] * 100) + ) + for mapping in mappings + ] \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 0d6d9ec7..38c67e08 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -6,10 +6,7 @@ from pdap_access_manager.models.response import ResponseInfo from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo -from src.external.pdap.enums import MatchAgencyResponseStatus class PDAPClient: @@ -26,50 +23,6 @@ async def run_request_builder( ) -> Any: return await request_builder.run(self.access_manager) - async def match_agency( - self, - name: str, - state: str | None = None, - county: str | None = None, - locality: str | None = None - ) -> MatchAgencyResponse: - """ - Returns agencies, if any, that match or partially match the search criteria - """ - url: str = f"{self.access_manager.data_sources_url}/v2/match/agency" - - headers: dict[str, str] = await self.access_manager.jwt_header() - headers['Content-Type']: str = "application/json" - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - headers=headers, - json_={ - "name": name, - "state": state, - "county": county, - "locality": locality - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - matches: list[MatchAgencyInfo] = [] - for agency in response_info.data["agencies"]: - mai = MatchAgencyInfo( - id=agency['id'], - submitted_name=agency['name'] - ) - if len(agency['locations']) > 0: - first_location: dict[str, Any] = agency['locations'][0] - mai.state = first_location['state'] - mai.county = first_location['county'] - mai.locality = first_location['locality'] - matches.append(mai) - - return MatchAgencyResponse( - status=MatchAgencyResponseStatus(response_info.data["status"]), - matches=matches - ) - async def is_url_duplicate( self, url_to_check: str diff --git a/src/external/pdap/dtos/match_agency/__init__.py b/src/external/pdap/dtos/match_agency/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/external/pdap/dtos/match_agency/post.py b/src/external/pdap/dtos/match_agency/post.py deleted file mode 100644 index 2be0b90e..00000000 --- a/src/external/pdap/dtos/match_agency/post.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class MatchAgencyInfo(BaseModel): - id: int - submitted_name: str - state: str | None = None - county: str | None = None - locality: str | None = None diff --git a/src/external/pdap/dtos/match_agency/response.py b/src/external/pdap/dtos/match_agency/response.py deleted file mode 100644 index aa4d9ec3..00000000 --- a/src/external/pdap/dtos/match_agency/response.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import List - -from pydantic import BaseModel - -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.enums import MatchAgencyResponseStatus - - -class MatchAgencyResponse(BaseModel): - status: MatchAgencyResponseStatus - matches: List[MatchAgencyInfo] diff --git a/src/external/pdap/enums.py b/src/external/pdap/enums.py index c532f820..55819619 100644 --- a/src/external/pdap/enums.py +++ b/src/external/pdap/enums.py @@ -1,12 +1,6 @@ from enum import Enum -class MatchAgencyResponseStatus(Enum): - EXACT_MATCH = "Exact Match" - PARTIAL_MATCH = "Partial Matches" - NO_MATCH = "No Match" - - class ApprovalStatus(Enum): APPROVED = "approved" REJECTED = "rejected" diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py index 90aacfa5..4ec99967 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py @@ -1,5 +1,3 @@ -from unittest.mock import AsyncMock - import pytest from src.collectors.enums import CollectorType @@ -9,11 +7,6 @@ from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator -from src.core.enums import SuggestionType -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -21,7 +14,9 @@ @pytest.mark.asyncio async def test_ckan_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): # Test that ckan subtask correctly sends agency id to # CKANAPIInterface, sends resultant agency name to @@ -53,25 +48,6 @@ async def test_ckan_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.CKAN - pdap_client_mock = operator.loader._pdap_client - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) - - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() assert_task_run_success(run_info) @@ -92,9 +68,9 @@ async def test_ckan_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - # Assert methods called as expected - pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency") diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py index 7cf72c5e..af41354d 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py @@ -1,24 +1,14 @@ -from unittest.mock import MagicMock - import pytest from src.collectors.enums import CollectorType -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.enums import SuggestionType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -26,7 +16,9 @@ @pytest.mark.asyncio async def test_muckrock_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): adb_client: AsyncDatabaseClient = operator.adb_client @@ -81,38 +73,16 @@ async def test_muckrock_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.MUCKROCK - # Test that muckrock subtask correctly sends agency name to - # MatchAgenciesInterface and adds received suggestions to - # url_agency_suggestions - # Create mock instances for dependency injections muckrock_api_interface_mock = operator.loader._muckrock_api_interface - pdap_client_mock = operator.loader._pdap_client # Set up mock return values for method calls muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( type=AgencyLookupResponseType.FOUND, - name="Mock Agency Name", + name="Test Agency", error=None ) - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() @@ -134,15 +104,8 @@ async def test_muckrock_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - - - # # Assert methods called as expected - muckrock_api_interface_mock.lookup_agency.assert_called_once_with( - muckrock_agency_id=123 - ) - pdap_client_mock.match_agency.assert_called_once_with( - name="Mock Agency Name" - ) diff --git a/tests/manual/external/pdap/test_match_agency.py b/tests/manual/external/pdap/test_match_agency.py deleted file mode 100644 index a637dad0..00000000 --- a/tests/manual/external/pdap/test_match_agency.py +++ /dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_match_agency(pdap_client): - response = await pdap_client.match_agency(name="police")