Skip to content

Commit 2d32b57

Browse files
authored
Merge pull request #540 from Police-Data-Accessibility-Project/mc_405_ckan_muckrock_agency_identifiers
Adjust CKAN/Muckrock Agency ID Logic
2 parents 3ef9671 + a61ccd6 commit 2d32b57

File tree

12 files changed: 84 additions and 206 deletions.

src/core/tasks/url/operators/agency_identification/subtasks/convert.py

Lines changed: 7 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -2,21 +2,15 @@
22
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
33
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
44
from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
5-
from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo
6-
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse
7-
from src.external.pdap.enums import MatchAgencyResponseStatus
85

9-
def convert_match_agency_response_to_subtask_data(
6+
7+
def convert_agency_suggestions_to_subtask_data(
108
url_id: int,
11-
response: MatchAgencyResponse,
9+
agency_suggestions: list[AgencySuggestion],
1210
subtask_type: AutoAgencyIDSubtaskType,
13-
task_id: int
14-
):
15-
suggestions: list[AgencySuggestion] = \
16-
_convert_match_agency_response_to_suggestions(
17-
response
18-
)
19-
agencies_found: bool = len(suggestions) > 0
11+
task_id: int,
12+
) -> AutoAgencyIDSubtaskData:
13+
agencies_found: bool = len(agency_suggestions) > 0
2014
subtask_pydantic = URLAutoAgencyIDSubtaskPydantic(
2115
url_id=url_id,
2216
type=subtask_type,
@@ -25,30 +19,6 @@ def convert_match_agency_response_to_subtask_data(
2519
)
2620
return AutoAgencyIDSubtaskData(
2721
pydantic_model=subtask_pydantic,
28-
suggestions=suggestions
22+
suggestions=agency_suggestions
2923
)
3024

31-
def _convert_match_agency_response_to_suggestions(
32-
match_response: MatchAgencyResponse,
33-
) -> list[AgencySuggestion]:
34-
if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH:
35-
match_info: MatchAgencyInfo = match_response.matches[0]
36-
return [
37-
AgencySuggestion(
38-
agency_id=int(match_info.id),
39-
confidence=100
40-
)
41-
]
42-
if match_response.status == MatchAgencyResponseStatus.NO_MATCH:
43-
return []
44-
if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH:
45-
raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}")
46-
total_confidence: int = 100
47-
confidence_per_match: int = total_confidence // len(match_response.matches)
48-
return [
49-
AgencySuggestion(
50-
agency_id=int(match_info.id),
51-
confidence=confidence_per_match
52-
)
53-
for match_info in match_response.matches
54-
]

src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
from typing_extensions import override
44

55
from src.core.tasks.url.operators.agency_identification.subtasks.convert import \
6-
convert_match_agency_response_to_subtask_data
6+
convert_agency_suggestions_to_subtask_data
77
from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams
88
from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \
99
GetCKANAgencyIDSubtaskParamsQueryBuilder
1010
from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData
11+
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
12+
from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder
1113
from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \
1214
AgencyIDSubtaskOperatorBase
1315
from src.db.client.async_ import AsyncDatabaseClient
1416
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
1517
from src.external.pdap.client import PDAPClient
16-
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse
1718

1819

1920
@final
@@ -35,12 +36,14 @@ async def inner_logic(self) -> None:
3536
subtask_data_list: list[AutoAgencyIDSubtaskData] = []
3637
for param in params:
3738
agency_name: str = param.collector_metadata["agency_name"]
38-
response: MatchAgencyResponse = await self.pdap_client.match_agency(
39-
name=agency_name
39+
agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder(
40+
MatchAgencyQueryBuilder(
41+
agency_name=agency_name
42+
)
4043
)
41-
subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data(
44+
subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data(
4245
url_id=param.url_id,
43-
response=response,
46+
agency_suggestions=agency_suggestions,
4447
subtask_type=AutoAgencyIDSubtaskType.CKAN,
4548
task_id=self.task_id
4649
)

src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,19 @@
66
from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse
77
from src.collectors.impl.muckrock.enums import AgencyLookupResponseType
88
from src.core.tasks.url.operators.agency_identification.subtasks.convert import \
9-
convert_match_agency_response_to_subtask_data
9+
convert_agency_suggestions_to_subtask_data
1010
from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \
1111
MuckrockAgencyIDSubtaskParams
1212
from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \
1313
GetMuckrockAgencyIDSubtaskParamsQueryBuilder
1414
from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData
15+
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
16+
from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder
1517
from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase
1618
from src.db.client.async_ import AsyncDatabaseClient
1719
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode
1820
from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
1921
from src.external.pdap.client import PDAPClient
20-
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse
2122

2223

2324
@final
@@ -52,12 +53,14 @@ async def inner_logic(self) -> None:
5253
)
5354
subtask_data_list.append(data)
5455
continue
55-
match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency(
56-
name=agency_lookup_response.name
56+
agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder(
57+
MatchAgencyQueryBuilder(
58+
agency_name=agency_lookup_response.name
59+
)
5760
)
58-
subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data(
61+
subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data(
5962
url_id=param.url_id,
60-
response=match_agency_response,
63+
agency_suggestions=agency_suggestions,
6164
subtask_type=AutoAgencyIDSubtaskType.MUCKROCK,
6265
task_id=self.task_id
6366
)

src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from typing import Sequence
2+
3+
from sqlalchemy import select, func, desc, RowMapping
4+
from sqlalchemy.ext.asyncio import AsyncSession
5+
6+
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
7+
from src.db.models.impl.agency.sqlalchemy import Agency
8+
from src.db.queries.base.builder import QueryBuilderBase
9+
10+
11+
class MatchAgencyQueryBuilder(QueryBuilderBase):
12+
13+
def __init__(
14+
self,
15+
agency_name: str
16+
):
17+
super().__init__()
18+
self.agency_name = agency_name
19+
20+
async def run(self, session: AsyncSession) -> list[AgencySuggestion]:
21+
query = (
22+
select(
23+
Agency.id,
24+
func.similarity(Agency.name, self.agency_name).label("similarity")
25+
)
26+
.where(
27+
func.similarity(Agency.name, self.agency_name) > 0.5
28+
)
29+
.order_by(
30+
desc("similarity")
31+
)
32+
.limit(10)
33+
)
34+
mappings: Sequence[RowMapping] = await self.sh.mappings(
35+
session=session,
36+
query=query
37+
)
38+
return [
39+
AgencySuggestion(
40+
agency_id=mapping[Agency.id],
41+
confidence=int(mapping["similarity"] * 100)
42+
)
43+
for mapping in mappings
44+
]

src/external/pdap/client.py

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@
66
from pdap_access_manager.models.response import ResponseInfo
77

88
from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase
9-
from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo
10-
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse
119
from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo
12-
from src.external.pdap.enums import MatchAgencyResponseStatus
1310

1411

1512
class PDAPClient:
@@ -26,50 +23,6 @@ async def run_request_builder(
2623
) -> Any:
2724
return await request_builder.run(self.access_manager)
2825

29-
async def match_agency(
30-
self,
31-
name: str,
32-
state: str | None = None,
33-
county: str | None = None,
34-
locality: str | None = None
35-
) -> MatchAgencyResponse:
36-
"""
37-
Returns agencies, if any, that match or partially match the search criteria
38-
"""
39-
url: str = f"{self.access_manager.data_sources_url}/v2/match/agency"
40-
41-
headers: dict[str, str] = await self.access_manager.jwt_header()
42-
headers['Content-Type']: str = "application/json"
43-
request_info = RequestInfo(
44-
type_=RequestType.POST,
45-
url=url,
46-
headers=headers,
47-
json_={
48-
"name": name,
49-
"state": state,
50-
"county": county,
51-
"locality": locality
52-
}
53-
)
54-
response_info: ResponseInfo = await self.access_manager.make_request(request_info)
55-
matches: list[MatchAgencyInfo] = []
56-
for agency in response_info.data["agencies"]:
57-
mai = MatchAgencyInfo(
58-
id=agency['id'],
59-
submitted_name=agency['name']
60-
)
61-
if len(agency['locations']) > 0:
62-
first_location: dict[str, Any] = agency['locations'][0]
63-
mai.state = first_location['state']
64-
mai.county = first_location['county']
65-
mai.locality = first_location['locality']
66-
matches.append(mai)
67-
68-
return MatchAgencyResponse(
69-
status=MatchAgencyResponseStatus(response_info.data["status"]),
70-
matches=matches
71-
)
72-
7326
async def is_url_duplicate(
7427
self,
7528
url_to_check: str

src/external/pdap/dtos/match_agency/__init__.py

Whitespace-only changes.

src/external/pdap/dtos/match_agency/post.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

src/external/pdap/dtos/match_agency/response.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

src/external/pdap/enums.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
from enum import Enum
22

33

4-
class MatchAgencyResponseStatus(Enum):
5-
EXACT_MATCH = "Exact Match"
6-
PARTIAL_MATCH = "Partial Matches"
7-
NO_MATCH = "No Match"
8-
9-
104
class ApprovalStatus(Enum):
115
APPROVED = "approved"
126
REJECTED = "rejected"

tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from unittest.mock import AsyncMock
2-
31
import pytest
42

53
from src.collectors.enums import CollectorType
@@ -9,19 +7,16 @@
97
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
108
from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask
119
from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion
12-
from src.external.pdap.enums import MatchAgencyResponseStatus
13-
from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator
14-
from src.core.enums import SuggestionType
15-
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse
16-
from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo
1710
from tests.helpers.asserts import assert_task_run_success
1811
from tests.helpers.data_creator.core import DBDataCreator
1912

2013

2114
@pytest.mark.asyncio
2215
async def test_ckan_subtask(
2316
operator: AgencyIdentificationTaskOperator,
24-
db_data_creator: DBDataCreator
17+
db_data_creator: DBDataCreator,
18+
test_agency_id: int,
19+
test_agency_id_2: int
2520
):
2621
# Test that ckan subtask correctly sends agency id to
2722
# CKANAPIInterface, sends resultant agency name to
@@ -53,25 +48,6 @@ async def test_ckan_subtask(
5348
assert await operator.meets_task_prerequisites()
5449
assert operator._subtask == AutoAgencyIDSubtaskType.CKAN
5550

56-
pdap_client_mock = operator.loader._pdap_client
57-
pdap_client_mock.match_agency.return_value = MatchAgencyResponse(
58-
status=MatchAgencyResponseStatus.PARTIAL_MATCH,
59-
matches=[
60-
MatchAgencyInfo(
61-
id=1,
62-
submitted_name="Mock Agency Name",
63-
),
64-
MatchAgencyInfo(
65-
id=2,
66-
submitted_name="Another Mock Agency Name",
67-
)
68-
]
69-
)
70-
71-
# Create agencies
72-
await db_data_creator.create_agency(1)
73-
await db_data_creator.create_agency(2)
74-
7551
# Run the operator
7652
run_info: TaskOperatorRunInfo = await operator.run_task()
7753
assert_task_run_success(run_info)
@@ -92,9 +68,9 @@ async def test_ckan_subtask(
9268
AgencyIDSubtaskSuggestion
9369
)
9470
assert len(suggestions) == 2
95-
assert {suggestion.confidence for suggestion in suggestions} == {50}
96-
assert {suggestion.agency_id for suggestion in suggestions} == {1, 2}
71+
assert {suggestion.agency_id for suggestion in suggestions} == {
72+
test_agency_id,
73+
test_agency_id_2
74+
}
9775
assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id}
9876

99-
# Assert methods called as expected
100-
pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency")

0 commit comments

Comments
 (0)