Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/core/tasks/base/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@


@abstractmethod
async def inner_task_logic(self):
async def inner_task_logic(self) -> None:

Check warning on line 48 in src/core/tasks/base/operator.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/base/operator.py#L48 <102>

Missing docstring in public method
Raw output
./src/core/tasks/base/operator.py:48:1: D102 Missing docstring in public method
raise NotImplementedError

async def handle_task_error(self, e):
Expand Down
9 changes: 6 additions & 3 deletions src/core/tasks/url/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface
from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator
from src.core.tasks.url.operators.base import URLTaskOperatorBase
from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator
Expand Down Expand Up @@ -59,8 +60,10 @@ async def get_url_record_type_task_operator(self):
async def get_agency_identification_task_operator(self):
operator = AgencyIdentificationTaskOperator(
adb_client=self.adb_client,
pdap_client=self.pdap_client,
muckrock_api_interface=self.muckrock_api_interface
loader=AgencyIdentificationSubtaskLoader(
pdap_client=self.pdap_client,
muckrock_api_interface=self.muckrock_api_interface
)
)
return operator

Expand Down Expand Up @@ -104,7 +107,7 @@ async def get_task_operators(self) -> list[URLTaskOperatorBase]:
await self.get_url_duplicate_task_operator(),
await self.get_url_404_probe_task_operator(),
await self.get_url_record_type_task_operator(),
# await self.get_agency_identification_task_operator(),
await self.get_agency_identification_task_operator(),
await self.get_url_miscellaneous_metadata_task_operator(),
await self.get_submit_approved_url_task_operator(),
await self.get_url_auto_relevance_task_operator()
Expand Down
115 changes: 62 additions & 53 deletions src/core/tasks/url/operators/agency_identification/core.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,87 @@
from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface
from src.collectors.enums import CollectorType

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:1:1: D100 Missing docstring in public module
from src.core.enums import SuggestionType
from src.core.tasks.url.operators.agency_identification.dtos.output import GetAgencySuggestionsOutput
from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO
from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase
from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
from src.core.tasks.url.operators.base import URLTaskOperatorBase
from src.db.client.async_ import AsyncDatabaseClient
from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo
from src.db.enums import TaskType
from src.collectors.enums import CollectorType
from src.core.tasks.url.operators.base import URLTaskOperatorBase
from src.core.tasks.url.subtasks.agency_identification.auto_googler import AutoGooglerAgencyIdentificationSubtask
from src.core.tasks.url.subtasks.agency_identification.ckan import CKANAgencyIdentificationSubtask
from src.core.tasks.url.subtasks.agency_identification.common_crawler import CommonCrawlerAgencyIdentificationSubtask
from src.core.tasks.url.subtasks.agency_identification.muckrock import MuckrockAgencyIdentificationSubtask
from src.core.enums import SuggestionType
from src.external.pdap.client import PDAPClient

from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo

# TODO: Validate with Manual Tests

class AgencyIdentificationTaskOperator(URLTaskOperatorBase):

def __init__(
self,
adb_client: AsyncDatabaseClient,
pdap_client: PDAPClient,
muckrock_api_interface: MuckrockAPIInterface,
loader: AgencyIdentificationSubtaskLoader,
):
super().__init__(adb_client)
self.pdap_client = pdap_client
self.muckrock_api_interface = muckrock_api_interface
self.loader = loader

@property
def task_type(self):
def task_type(self) -> TaskType:

Check warning on line 25 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L25 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:25:1: D102 Missing docstring in public method
return TaskType.AGENCY_IDENTIFICATION

async def meets_task_prerequisites(self):
async def meets_task_prerequisites(self) -> bool:

Check warning on line 28 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L28 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:28:1: D102 Missing docstring in public method
has_urls_without_agency_suggestions = await self.adb_client.has_urls_without_agency_suggestions()
return has_urls_without_agency_suggestions

async def get_pending_urls_without_agency_identification(self):
async def get_pending_urls_without_agency_identification(self) -> list[AgencyIdentificationTDO]:

Check warning on line 32 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L32 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:32:1: D102 Missing docstring in public method
return await self.adb_client.get_urls_without_agency_suggestions()

async def get_muckrock_subtask(self):
return MuckrockAgencyIdentificationSubtask(
muckrock_api_interface=self.muckrock_api_interface,
pdap_client=self.pdap_client
)

async def get_subtask(self, collector_type: CollectorType):
match collector_type:
case CollectorType.MUCKROCK_SIMPLE_SEARCH:
return await self.get_muckrock_subtask()
case CollectorType.MUCKROCK_COUNTY_SEARCH:
return await self.get_muckrock_subtask()
case CollectorType.MUCKROCK_ALL_SEARCH:
return await self.get_muckrock_subtask()
case CollectorType.AUTO_GOOGLER:
return AutoGooglerAgencyIdentificationSubtask()
case CollectorType.COMMON_CRAWLER:
return CommonCrawlerAgencyIdentificationSubtask()
case CollectorType.CKAN:
return CKANAgencyIdentificationSubtask(
pdap_client=self.pdap_client
)
return None
async def get_subtask(
self,
collector_type: CollectorType
) -> AgencyIdentificationSubtaskBase:
"""Get subtask based on collector type."""
return await self.loader.load_subtask(collector_type)

@staticmethod
async def run_subtask(subtask, url_id, collector_metadata) -> list[URLAgencySuggestionInfo]:
return await subtask.run(url_id=url_id, collector_metadata=collector_metadata)
async def run_subtask(

Check warning on line 43 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L43 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:43:1: D102 Missing docstring in public method
subtask: AgencyIdentificationSubtaskBase,
url_id: int,
collector_metadata: dict | None
) -> list[URLAgencySuggestionInfo]:
return await subtask.run(
url_id=url_id,
collector_metadata=collector_metadata
)

async def inner_task_logic(self):
async def inner_task_logic(self) -> None:

Check warning on line 53 in src/core/tasks/url/operators/agency_identification/core.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/core.py#L53 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/core.py:53:1: D102 Missing docstring in public method
tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification()
await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos])
output = await self._get_agency_suggestions(tdos)

await self._process_agency_suggestions(output.agency_suggestions)
await self.adb_client.add_url_error_infos(output.error_infos)

async def _process_agency_suggestions(
self,
suggestions: list[URLAgencySuggestionInfo]
) -> None:
non_unknown_agency_suggestions = [
suggestion for suggestion in suggestions
if suggestion.suggestion_type != SuggestionType.UNKNOWN
]
await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions)
confirmed_suggestions = [
suggestion for suggestion in suggestions
if suggestion.suggestion_type == SuggestionType.CONFIRMED
]
await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions)
non_confirmed_suggestions = [
suggestion for suggestion in suggestions
if suggestion.suggestion_type != SuggestionType.CONFIRMED
]
await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions)

async def _get_agency_suggestions(
self,
tdos: list[AgencyIdentificationTDO]
) -> GetAgencySuggestionsOutput:
error_infos = []
all_agency_suggestions = []
for tdo in tdos:
Expand All @@ -88,13 +100,10 @@
error=str(e),
)
error_infos.append(error_info)

non_unknown_agency_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.UNKNOWN]
await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions)
confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type == SuggestionType.CONFIRMED]
await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions)
non_confirmed_suggestions = [suggestion for suggestion in all_agency_suggestions if suggestion.suggestion_type != SuggestionType.CONFIRMED]
await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions)
await self.adb_client.add_url_error_infos(error_infos)
output = GetAgencySuggestionsOutput(
agency_suggestions=all_agency_suggestions,
error_infos=error_infos
)
return output


Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pydantic import BaseModel

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/dtos/output.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/dtos/output.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/dtos/output.py:1:1: D100 Missing docstring in public module

from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
from src.db.models.instantiations.url.error_info.pydantic import URLErrorPydanticInfo


class GetAgencySuggestionsOutput(BaseModel):

Check warning on line 7 in src/core/tasks/url/operators/agency_identification/dtos/output.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/dtos/output.py#L7 <101>

Missing docstring in public class
Raw output
./src/core/tasks/url/operators/agency_identification/dtos/output.py:7:1: D101 Missing docstring in public class
error_infos: list[URLErrorPydanticInfo]
agency_suggestions: list[URLAgencySuggestionInfo]

Check warning on line 9 in src/core/tasks/url/operators/agency_identification/dtos/output.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/dtos/output.py#L9 <292>

no newline at end of file
Raw output
./src/core/tasks/url/operators/agency_identification/dtos/output.py:9:54: W292 no newline at end of file
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class URLAgencySuggestionInfo(BaseModel):
url_id: int
suggestion_type: SuggestionType
suggestion_type: SuggestionType = SuggestionType.UNKNOWN
pdap_agency_id: Optional[int] = None
agency_name: Optional[str] = None
state: Optional[str] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
class AgencyIdentificationTDO(BaseModel):
url_id: int
collector_metadata: Optional[dict] = None
collector_type: CollectorType
collector_type: CollectorType | None
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ class GetPendingURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase):
async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]:

statement = (
select(URL.id, URL.collector_metadata, Batch.strategy)
select(
URL.id,
URL.collector_metadata,
Batch.strategy
)
.select_from(URL)
.where(URL.outcome == URLStatus.PENDING.value)
.join(LinkBatchURL)
.join(Batch)
.outerjoin(LinkBatchURL)
.outerjoin(Batch)
)
statement = StatementComposer.exclude_urls_with_agency_suggestions(statement)
statement = statement.limit(100)
Expand All @@ -28,7 +32,7 @@ async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]:
AgencyIdentificationTDO(
url_id=raw_result[0],
collector_metadata=raw_result[1],
collector_type=CollectorType(raw_result[2])
collector_type=CollectorType(raw_result[2]) if raw_result[2] is not None else None
)
for raw_result in raw_results
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from sqlalchemy import select

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py:1:1: D100 Missing docstring in public module
from sqlalchemy.ext.asyncio import AsyncSession

from src.collectors.enums import URLStatus
from src.db.models.instantiations.url.core.sqlalchemy import URL
from src.db.queries.base.builder import QueryBuilderBase
from src.db.statement_composer import StatementComposer


class HasURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase):

Check warning on line 10 in src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py#L10 <101>

Missing docstring in public class
Raw output
./src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py:10:1: D101 Missing docstring in public class

async def run(

Check warning on line 12 in src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py#L12 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py:12:1: D102 Missing docstring in public method
self,
session: AsyncSession
) -> bool:
statement = (
select(
URL.id
).where(
URL.outcome == URLStatus.PENDING.value
)
)

statement = StatementComposer.exclude_urls_with_agency_suggestions(statement)
raw_result = await session.execute(statement)
result = raw_result.all()
return len(result) != 0

Check warning on line 27 in src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py#L27 <292>

no newline at end of file
Raw output
./src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py:27:32: W292 no newline at end of file
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
async def run(
self,
url_id: int,
collector_metadata: Optional[dict] = None
collector_metadata: dict | None = None

Check warning on line 14 in src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py#L14 <100>

Unused argument 'collector_metadata'
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py:14:13: U100 Unused argument 'collector_metadata'
) -> list[URLAgencySuggestionInfo]:
raise NotImplementedError
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
from typing import Optional
from typing import final

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py:1:1: D100 Missing docstring in public module

from typing_extensions import override

from src.core.helpers import process_match_agency_response_to_suggestions
from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase
from src.external.pdap.client import PDAPClient
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse


class CKANAgencyIdentificationSubtask:
@final
class CKANAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase):

Check warning on line 12 in src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py#L12 <101>

Missing docstring in public class
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py:12:1: D101 Missing docstring in public class

def __init__(
self,
pdap_client: PDAPClient
):
self.pdap_client = pdap_client

@override
async def run(
self,
url_id: int,
collector_metadata: Optional[dict]
collector_metadata: dict | None = None
) -> list[URLAgencySuggestionInfo]:
agency_name = collector_metadata["agency_name"]
match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from typing import Optional
from typing import final

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py:1:1: D100 Missing docstring in public module

from typing_extensions import override

from src.collectors.source_collectors.muckrock.api_interface.core import MuckrockAPIInterface
from src.collectors.source_collectors.muckrock.api_interface.lookup_response import AgencyLookupResponse
from src.collectors.source_collectors.muckrock.enums import AgencyLookupResponseType
from src.core.exceptions import MuckrockAPIError
from src.core.helpers import process_match_agency_response_to_suggestions
from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase
from src.external.pdap.client import PDAPClient
from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse


class MuckrockAgencyIdentificationSubtask:
@final
class MuckrockAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase):

Check warning on line 16 in src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py#L16 <101>

Missing docstring in public class
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py:16:1: D101 Missing docstring in public class

def __init__(
self,
Expand All @@ -20,10 +23,11 @@
self.muckrock_api_interface = muckrock_api_interface
self.pdap_client = pdap_client

@override
async def run(
self,
url_id: int,
collector_metadata: Optional[dict]
collector_metadata: dict | None = None
) -> list[URLAgencySuggestionInfo]:
muckrock_agency_id = collector_metadata["agency"]
agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
from typing import Optional
from typing_extensions import override, final

Check warning on line 1 in src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py#L1 <100>

Missing docstring in public module
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py:1:1: D100 Missing docstring in public module

from src.core.enums import SuggestionType
from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
from src.core.tasks.url.subtasks.agency_identification.base import AgencyIdentificationSubtaskBase
from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase

@final
class UnknownAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase):
"""A subtask that returns an unknown suggestion.

class AutoGooglerAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase):
Used in cases where the agency cannot be reliably inferred from the source.
"""

@override
async def run(
self,
url_id: int,
collector_metadata: Optional[dict] = None
collector_metadata: dict | None = None

Check warning on line 18 in src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py#L18 <100>

Unused argument 'collector_metadata'
Raw output
./src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py:18:13: U100 Unused argument 'collector_metadata'
) -> list[URLAgencySuggestionInfo]:
return [
URLAgencySuggestionInfo(
Expand Down
Loading