From 30560c2b6beb1a74a839342db77c35c22938fe66 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 16 Sep 2025 07:15:01 -0400 Subject: [PATCH 1/7] Add location annotation database components --- ...baa3b8e9b_add_location_annotation_logic.py | 180 ++++++++++++++++++ src/db/models/helpers.py | 7 + .../impl/url/suggestion/location/__init__.py | 0 .../url/suggestion/location/auto/__init__.py | 0 .../location/auto/subtask/__init__.py | 0 .../suggestion/location/auto/subtask/enums.py | 5 + .../location/auto/subtask/pydantic.py | 19 ++ .../location/auto/subtask/sqlalchemy.py | 22 +++ .../location/auto/suggestion/__init__.py | 0 .../location/auto/suggestion/pydantic.py | 15 ++ .../location/auto/suggestion/sqlalchemy.py | 19 ++ .../url/suggestion/location/user/__init__.py | 0 .../url/suggestion/location/user/pydantic.py | 16 ++ .../suggestion/location/user/sqlalchemy.py | 11 ++ src/util/alembic_helpers.py | 12 ++ 15 files changed, 306 insertions(+) create mode 100644 alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py create mode 100644 src/db/models/impl/url/suggestion/location/__init__.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/__init__.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/subtask/__init__.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/subtask/enums.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/subtask/pydantic.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/suggestion/__init__.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/suggestion/pydantic.py create mode 100644 src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py create mode 100644 src/db/models/impl/url/suggestion/location/user/__init__.py create mode 100644 src/db/models/impl/url/suggestion/location/user/pydantic.py create mode 100644 
src/db/models/impl/url/suggestion/location/user/sqlalchemy.py diff --git a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py new file mode 100644 index 00000000..2062701a --- /dev/null +++ b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py @@ -0,0 +1,180 @@ +"""Add location annotation logic + +Revision ID: 93cbaa3b8e9b +Revises: d5f92e6fedf4 +Create Date: 2025-09-15 19:05:27.872875 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import switch_enum_type, url_id_column, location_id_column, created_at_column, id_column, \ + task_id_column, agency_id_column + +# revision identifiers, used by Alembic. +revision: str = '93cbaa3b8e9b' +down_revision: Union[str, None] = 'd5f92e6fedf4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +USER_LOCATION_SUGGESTIONS_TABLE_NAME = 'user_location_suggestions' +AUTO_LOCATION_ID_SUBTASK_TABLE_NAME = 'auto_location_id_subtask' +LOCATION_ID_SUBTASK_SUGGESTIONS_TABLE_NAME = 'location_id_subtask_suggestions' +LOCATION_ID_TASK_TYPE = 'location_id' +LOCATION_ID_SUBTASK_TYPE_NAME = 'location_id_subtask_type' + +def upgrade() -> None: + _add_location_id_task_type() + _create_user_location_suggestions_table() + _create_auto_location_id_subtask_table() + _create_location_id_subtask_suggestions_table() + +def downgrade() -> None: + _drop_location_id_subtask_suggestions_table() + _drop_auto_location_id_subtask_table() + _drop_user_location_suggestions_table() + _drop_location_id_task_type() + _drop_location_id_subtask_type() + +def _add_location_id_task_type(): + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit 
Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face', + 'URL Probe', + 'Populate Backlog Snapshot', + 'Delete Old Logs', + 'Run URL Task Cycles', + 'Root URL', + 'Internet Archives Probe', + 'Internet Archives Archive', + 'Screenshot', + LOCATION_ID_TASK_TYPE + ] + ) + + +def _create_user_location_suggestions_table(): + op.create_table( + USER_LOCATION_SUGGESTIONS_TABLE_NAME, + url_id_column(), + location_id_column(), + created_at_column(), + sa.PrimaryKeyConstraint( + 'url_id', + 'location_id', + name='user_location_suggestions_url_id_location_id_pk' + ) + ) + + +def _create_auto_location_id_subtask_table(): + op.create_table( + AUTO_LOCATION_ID_SUBTASK_TABLE_NAME, + id_column(), + task_id_column(), + url_id_column(), + sa.Column( + 'locations_found', + sa.Boolean(), + nullable=False + ), + sa.Column( + 'type', + sa.Enum( + 'nlp_location_frequency', + name='auto_location_id_subtask_type' + ), + nullable=False + ), + created_at_column(), + sa.UniqueConstraint( + 'url_id', + 'type', + name='auto_location_id_subtask_url_id_type_unique' + ) + ) + + +def _create_location_id_subtask_suggestions_table(): + op.create_table( + LOCATION_ID_SUBTASK_SUGGESTIONS_TABLE_NAME, + sa.Column( + 'subtask_id', + sa.Integer(), + sa.ForeignKey( + 'auto_location_id_subtask.id', + ondelete='CASCADE' + ), + primary_key=True + ), + location_id_column(), + sa.Column( + 'confidence', + sa.Float(), + nullable=False + ), + created_at_column(), + ) + + + +def _drop_location_id_task_type(): + switch_enum_type( + table_name='tasks', + column_name='task_type', + enum_name='task_type', + new_enum_values=[ + 'HTML', + 'Relevancy', + 'Record Type', + 'Agency Identification', + 'Misc Metadata', + 'Submit Approved URLs', + 'Duplicate Detection', + '404 Probe', + 'Sync Agencies', + 'Sync Data Sources', + 'Push to Hugging Face', + 'URL Probe', + 'Populate Backlog Snapshot', + 'Delete Old Logs', + 'Run URL Task Cycles', + 'Root URL', + 
'Internet Archives Probe', + 'Internet Archives Archive', + 'Screenshot', + ] + ) + + +def _drop_auto_location_id_subtask_table(): + op.drop_table(AUTO_LOCATION_ID_SUBTASK_TABLE_NAME) + + +def _drop_user_location_suggestions_table(): + op.drop_table(USER_LOCATION_SUGGESTIONS_TABLE_NAME) + + +def _drop_location_id_subtask_suggestions_table(): + op.drop_table(LOCATION_ID_SUBTASK_SUGGESTIONS_TABLE_NAME) + +def _drop_location_id_subtask_type(): + op.execute(""" + DROP TYPE IF EXISTS auto_location_id_subtask_type; + """) + diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index 1782b1e9..f547e8d4 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -37,6 +37,13 @@ def url_id_column() -> Column[int]: nullable=False ) +def location_id_column() -> Column[int]: + return Column( + Integer(), + ForeignKey('locations.id', ondelete='CASCADE'), + nullable=False + ) + CURRENT_TIME_SERVER_DEFAULT = func.now() def url_id_primary_key_constraint() -> PrimaryKeyConstraint: diff --git a/src/db/models/impl/url/suggestion/location/__init__.py b/src/db/models/impl/url/suggestion/location/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/location/auto/__init__.py b/src/db/models/impl/url/suggestion/location/auto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/__init__.py b/src/db/models/impl/url/suggestion/location/auto/subtask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/enums.py b/src/db/models/impl/url/suggestion/location/auto/subtask/enums.py new file mode 100644 index 00000000..c42f53c2 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/enums.py @@ -0,0 +1,5 @@ +from enum import Enum + + +class LocationIDSubtaskType(Enum): + NLP_LOCATION_FREQUENCY = 'nlp_location_frequency' \ No newline at end of file diff --git 
a/src/db/models/impl/url/suggestion/location/auto/subtask/pydantic.py b/src/db/models/impl/url/suggestion/location/auto/subtask/pydantic.py new file mode 100644 index 00000000..091a00b9 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/pydantic.py @@ -0,0 +1,19 @@ +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class AutoLocationIDSubtaskPydantic( + BulkInsertableModel, +): + + url_id: int + task_id: int + locations_found: bool + type: LocationIDSubtaskType + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return AutoLocationIDSubtask \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py new file mode 100644 index 00000000..6df14bf7 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py @@ -0,0 +1,22 @@ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.mixins import CreatedAtMixin, TaskDependentMixin, URLDependentMixin +from src.db.models.templates_.with_id import WithIDBase + + +class AutoLocationIDSubtask( + WithIDBase, + CreatedAtMixin, + TaskDependentMixin, + URLDependentMixin, +): + + __tablename__ = 'auto_location_id_subtask' + + locations_found = Column(Boolean(), nullable=False) + type = enum_column( + LocationIDSubtaskType, + name='auto_location_id_subtask_type' + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/auto/suggestion/__init__.py 
b/src/db/models/impl/url/suggestion/location/auto/suggestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/location/auto/suggestion/pydantic.py b/src/db/models/impl/url/suggestion/location/auto/suggestion/pydantic.py new file mode 100644 index 00000000..1ddc53d7 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/suggestion/pydantic.py @@ -0,0 +1,15 @@ +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LocationIDSubtaskSuggestionPydantic(BulkInsertableModel): + + subtask_id: int + location_id: int + confidence: float + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return LocationIDSubtaskSuggestion \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py new file mode 100644 index 00000000..688d1c4d --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py @@ -0,0 +1,19 @@ +from sqlalchemy import Column, Integer, ForeignKey, Float + +from src.db.models.helpers import location_id_column +from src.db.models.templates_.base import Base + + +class LocationIDSubtaskSuggestion( + Base, +): + + __tablename__ = 'location_id_subtask_suggestions' + subtask_id = Column( + Integer, + ForeignKey('auto_location_id_subtask.id'), + nullable=False, + primary_key=True, + ) + location_id = location_id_column() + confidence = Column(Float, nullable=False) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/user/__init__.py b/src/db/models/impl/url/suggestion/location/user/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/db/models/impl/url/suggestion/location/user/pydantic.py b/src/db/models/impl/url/suggestion/location/user/pydantic.py new file mode 100644 index 00000000..11f2218b --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/user/pydantic.py @@ -0,0 +1,16 @@ +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class UserLocationSuggestionPydantic( + BulkInsertableModel, +): + + location_id: int + url_id: int + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return UserLocationSuggestion diff --git a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py new file mode 100644 index 00000000..3d6cd0c6 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py @@ -0,0 +1,11 @@ +from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LocationDependentMixin +from src.db.models.templates_.base import Base + + +class UserLocationSuggestion( + Base, + CreatedAtMixin, + LocationDependentMixin, + URLDependentMixin +): + __tablename__ = 'user_location_suggestions' \ No newline at end of file diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 9df2be52..2ee64885 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -127,6 +127,18 @@ def url_id_column(name: str = 'url_id') -> sa.Column: comment='A foreign key to the `urls` table.' ) +def location_id_column(name: str = 'location_id') -> sa.Column: + return sa.Column( + name, + sa.Integer(), + sa.ForeignKey( + 'locations.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `locations` table.' 
+ ) + def batch_id_column(nullable=False) -> sa.Column: return sa.Column( 'batch_id', From 489c12c9ca0cbe1396a6ad202f727137329e12f5 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Tue, 16 Sep 2025 08:24:22 -0400 Subject: [PATCH 2/7] Add location annotation database components --- ...baa3b8e9b_add_location_annotation_logic.py | 105 +++++++++++++++++- .../annotate/all/get/models/__init__.py | 0 .../annotate/all/get/models/location.py | 29 +++++ .../all/get/{dto.py => models/response.py} | 4 + src/api/endpoints/annotate/all/get/query.py | 56 ++++++---- .../annotate/all/post/models/__init__.py | 0 .../all/post/{dto.py => models/request.py} | 1 + src/api/endpoints/annotate/all/post/query.py | 62 +++++++++++ src/api/endpoints/annotate/routes.py | 6 +- src/core/core.py | 15 ++- src/db/client/async_.py | 40 +------ src/db/models/impl/url/core/sqlalchemy.py | 4 + .../location/auto/subtask/sqlalchemy.py | 6 + .../suggestion/location/user/sqlalchemy.py | 9 +- src/db/models/views/location_expanded.py | 59 ++++++++++ src/db/models/views/url_annotations_flags.py | 2 + src/db/statement_composer.py | 4 - src/util/alembic_helpers.py | 8 ++ .../api/_helpers/RequestValidator.py | 4 +- .../api/annotate/all/test_happy_path.py | 2 +- .../annotate/all/test_post_batch_filtering.py | 2 +- .../api/annotate/all/test_validation_error.py | 2 +- .../unit/dto/test_all_annotation_post_info.py | 2 +- 23 files changed, 336 insertions(+), 86 deletions(-) create mode 100644 src/api/endpoints/annotate/all/get/models/__init__.py create mode 100644 src/api/endpoints/annotate/all/get/models/location.py rename src/api/endpoints/annotate/all/get/{dto.py => models/response.py} (81%) create mode 100644 src/api/endpoints/annotate/all/post/models/__init__.py rename src/api/endpoints/annotate/all/post/{dto.py => models/request.py} (98%) create mode 100644 src/api/endpoints/annotate/all/post/query.py create mode 100644 src/db/models/views/location_expanded.py diff --git 
a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py index 2062701a..712861bc 100644 --- a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py +++ b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py @@ -11,7 +11,7 @@ import sqlalchemy as sa from src.util.alembic_helpers import switch_enum_type, url_id_column, location_id_column, created_at_column, id_column, \ - task_id_column, agency_id_column + task_id_column, agency_id_column, user_id_column # revision identifiers, used by Alembic. revision: str = '93cbaa3b8e9b' @@ -20,24 +20,119 @@ depends_on: Union[str, Sequence[str], None] = None USER_LOCATION_SUGGESTIONS_TABLE_NAME = 'user_location_suggestions' -AUTO_LOCATION_ID_SUBTASK_TABLE_NAME = 'auto_location_id_subtask' +AUTO_LOCATION_ID_SUBTASK_TABLE_NAME = 'auto_location_id_subtasks' LOCATION_ID_SUBTASK_SUGGESTIONS_TABLE_NAME = 'location_id_subtask_suggestions' LOCATION_ID_TASK_TYPE = 'location_id' LOCATION_ID_SUBTASK_TYPE_NAME = 'location_id_subtask_type' + +def _create_new_url_annotation_flags_view(): + op.execute("""DROP VIEW IF EXISTS url_annotation_flags;""") + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id as url_id, + EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, + EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, + EXISTS (SELECT 1 FROM public.url_auto_agency_id_subtasks a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, + EXISTS (SELECT 1 FROM public.auto_location_id_subtasks a WHERE a.url_id = u.id) AS has_auto_location_suggestion, + EXISTS (SELECT 1 FROM public.user_record_type_suggestions a WHERE a.url_id = u.id) AS has_user_record_type_suggestion, + EXISTS (SELECT 1 FROM public.user_relevant_suggestions a WHERE 
a.url_id = u.id) AS has_user_relevant_suggestion, + EXISTS (SELECT 1 FROM public.user_url_agency_suggestions a WHERE a.url_id = u.id) AS has_user_agency_suggestion, + EXISTS (SELECT 1 FROM public.user_location_suggestions a WHERE a.url_id = u.id) AS has_user_location_suggestion, + EXISTS (SELECT 1 FROM public.link_urls_agency a WHERE a.url_id = u.id) AS has_confirmed_agency, + EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed + FROM urls u + ) + """ + ) + +def _create_old_url_annotation_flags_view(): + op.execute("""DROP VIEW IF EXISTS url_annotation_flags;""") + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id as url_id, + EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, + EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, + EXISTS (SELECT 1 FROM public.url_auto_agency_id_subtasks a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, + EXISTS (SELECT 1 FROM public.user_record_type_suggestions a WHERE a.url_id = u.id) AS has_user_record_type_suggestion, + EXISTS (SELECT 1 FROM public.user_relevant_suggestions a WHERE a.url_id = u.id) AS has_user_relevant_suggestion, + EXISTS (SELECT 1 FROM public.user_url_agency_suggestions a WHERE a.url_id = u.id) AS has_user_agency_suggestion, + EXISTS (SELECT 1 FROM public.link_urls_agency a WHERE a.url_id = u.id) AS has_confirmed_agency, + EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed + FROM urls u + ) + """ + ) + + def upgrade() -> None: _add_location_id_task_type() _create_user_location_suggestions_table() _create_auto_location_id_subtask_table() _create_location_id_subtask_suggestions_table() + _create_new_url_annotation_flags_view() + _create_locations_expanded_view() + + + def downgrade() -> None: + _drop_locations_expanded_view() + _create_old_url_annotation_flags_view() 
_drop_location_id_subtask_suggestions_table() _drop_auto_location_id_subtask_table() _drop_user_location_suggestions_table() _drop_location_id_task_type() _drop_location_id_subtask_type() +def _drop_locations_expanded_view(): + op.execute(""" + drop view if exists public.locations_expanded; + """) + +def _create_locations_expanded_view(): + op.execute(""" + create or replace view public.locations_expanded + (id, type, state_name, state_iso, county_name, county_fips, locality_name, locality_id, state_id, county_id, + display_name, full_display_name) + as + SELECT + locations.id, + locations.type, + us_states.state_name, + us_states.state_iso, + counties.name AS county_name, + counties.fips AS county_fips, + localities.name AS locality_name, + localities.id AS locality_id, + us_states.id AS state_id, + counties.id AS county_id, + CASE + WHEN locations.type = 'Locality'::location_type THEN localities.name + WHEN locations.type = 'County'::location_type THEN counties.name::character varying + WHEN locations.type = 'State'::location_type THEN us_states.state_name::character varying + ELSE NULL::character varying + END AS display_name, + CASE + WHEN locations.type = 'Locality'::location_type THEN concat(localities.name, ', ', counties.name, ', ', + us_states.state_name)::character varying + WHEN locations.type = 'County'::location_type + THEN concat(counties.name, ', ', us_states.state_name)::character varying + WHEN locations.type = 'State'::location_type THEN us_states.state_name::character varying + ELSE NULL::character varying + END AS full_display_name + FROM + locations + LEFT JOIN us_states ON locations.state_id = us_states.id + LEFT JOIN counties ON locations.county_id = counties.id + LEFT JOIN localities ON locations.locality_id = localities.id; + + """) + def _add_location_id_task_type(): switch_enum_type( table_name='tasks', @@ -72,12 +167,14 @@ def _create_user_location_suggestions_table(): op.create_table( USER_LOCATION_SUGGESTIONS_TABLE_NAME, 
url_id_column(), + user_id_column(), location_id_column(), created_at_column(), sa.PrimaryKeyConstraint( 'url_id', + 'user_id', 'location_id', - name='user_location_suggestions_url_id_location_id_pk' + name='user_location_suggestions_pk' ) ) @@ -117,7 +214,7 @@ def _create_location_id_subtask_suggestions_table(): 'subtask_id', sa.Integer(), sa.ForeignKey( - 'auto_location_id_subtask.id', + f'{AUTO_LOCATION_ID_SUBTASK_TABLE_NAME}.id', ondelete='CASCADE' ), primary_key=True diff --git a/src/api/endpoints/annotate/all/get/models/__init__.py b/src/api/endpoints/annotate/all/get/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/models/location.py b/src/api/endpoints/annotate/all/get/models/location.py new file mode 100644 index 00000000..69090b32 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/models/location.py @@ -0,0 +1,29 @@ +from pydantic import BaseModel, Field + + +class LocationAnnotationAutoSuggestion(BaseModel): + location_id: int + location_name: str = Field( + title="The full name of the location" + ) + confidence: float = Field( + title="The confidence of the location", + ge=0, + le=1, + ) + + +class LocationAnnotationUserSuggestion(BaseModel): + location_id: int + location_name: str = Field( + title="The full name of the location" + ) + user_count: int = Field( + title="The number of users who suggested this location", + ge=1, + ) + + +class LocationAnnotationResponseOuterInfo(BaseModel): + user: list[LocationAnnotationUserSuggestion] + auto: list[LocationAnnotationAutoSuggestion] \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/dto.py b/src/api/endpoints/annotate/all/get/models/response.py similarity index 81% rename from src/api/endpoints/annotate/all/get/dto.py rename to src/api/endpoints/annotate/all/get/models/response.py index 26bb5e07..0c584495 100644 --- a/src/api/endpoints/annotate/all/get/dto.py +++ 
b/src/api/endpoints/annotate/all/get/models/response.py @@ -3,6 +3,7 @@ from pydantic import Field, BaseModel from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.core.enums import RecordType @@ -12,6 +13,9 @@ class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): agency_suggestions: list[GetNextURLForAgencyAgencyInfo] | None = Field( title="The auto-labeler's suggestions for agencies" ) + location_suggestions: LocationAnnotationResponseOuterInfo | None = Field( + title="User and Auto-Suggestions for locations" + ) suggested_relevant: RelevanceAnnotationResponseInfo | None = Field( title="Whether the auto-labeler identified the URL as relevant or not" ) diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index 05855578..9237fd42 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -1,10 +1,10 @@ -from sqlalchemy import Select, and_ +from sqlalchemy import Select, and_, or_ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder -from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from 
src.collectors.enums import URLStatus @@ -13,8 +13,12 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.views.unvalidated_url import UnvalidatedURL +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -32,7 +36,18 @@ async def run( self, session: AsyncSession ) -> GetNextURLForAllAnnotationResponse: - query = Select(URL) + query = ( + Select(URL) + # URL Must be unvalidated + .join( + UnvalidatedURL, + UnvalidatedURL.url_id == URL.id + ) + .join( + URLAnnotationFlagsView, + URLAnnotationFlagsView.url_id == URL.id + ) + ) if self.batch_id is not None: query = query.join(LinkBatchURL).where(LinkBatchURL.batch_id == self.batch_id) query = ( @@ -40,32 +55,29 @@ async def run( .where( and_( URL.status == URLStatus.OK.value, - StatementComposer.user_suggestion_not_exists(UserUrlAgencySuggestion), - StatementComposer.user_suggestion_not_exists(UserRecordTypeSuggestion), - StatementComposer.user_suggestion_not_exists(UserRelevantSuggestion), + # Must be missing at least some annotations + or_( + URLAnnotationFlagsView.has_user_agency_suggestion.is_(False), + URLAnnotationFlagsView.has_user_record_type_suggestion.is_(False), + URLAnnotationFlagsView.has_user_relevant_suggestion.is_(False), + URLAnnotationFlagsView.has_user_location_suggestion.is_(False), + ) + ) ) ) - - - load_options = [ + # Add load options + 
query = query.options( URL.html_content, URL.auto_agency_subtasks, URL.auto_relevant_suggestion, - URL.auto_record_type_suggestion - ] - select_in_loads = [ - selectinload(load_option) for load_option in load_options - ] - - # Add load options - query = query.options( - *select_in_loads + URL.auto_record_type_suggestion, + URL.auto_agency_subtasks.suggestions, ) query = query.order_by(URL.id.asc()).limit(1) raw_results = await session.execute(query) - url = raw_results.scalars().one_or_none() + url: URL | None = raw_results.scalars().one_or_none() if url is None: return GetNextURLForAllAnnotationResponse( next_annotation=None @@ -75,15 +87,13 @@ async def run( url.html_content ) + auto_relevant: AutoRelevantSuggestion | None = None if url.auto_relevant_suggestion is not None: auto_relevant = url.auto_relevant_suggestion - else: - auto_relevant = None + auto_record_type: AutoRecordTypeSuggestion | None = None if url.auto_record_type_suggestion is not None: auto_record_type = url.auto_record_type_suggestion.record_type - else: - auto_record_type = None agency_suggestions = await GetAgencySuggestionsQueryBuilder(url_id=url.id).run(session) diff --git a/src/api/endpoints/annotate/all/post/models/__init__.py b/src/api/endpoints/annotate/all/post/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/post/dto.py b/src/api/endpoints/annotate/all/post/models/request.py similarity index 98% rename from src/api/endpoints/annotate/all/post/dto.py rename to src/api/endpoints/annotate/all/post/models/request.py index 73c21606..f6d17749 100644 --- a/src/api/endpoints/annotate/all/post/dto.py +++ b/src/api/endpoints/annotate/all/post/models/request.py @@ -11,6 +11,7 @@ class AllAnnotationPostInfo(BaseModel): suggested_status: SuggestedStatus record_type: RecordType | None = None agency: URLAgencyAnnotationPostInfo | None = None + location_ids: list[int] @model_validator(mode="after") def 
allow_record_type_and_agency_only_if_relevant(self): diff --git a/src/api/endpoints/annotate/all/post/query.py b/src/api/endpoints/annotate/all/post/query.py new file mode 100644 index 00000000..a3ddb0c6 --- /dev/null +++ b/src/api/endpoints/annotate/all/post/query.py @@ -0,0 +1,62 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.queries.base.builder import QueryBuilderBase + + +class AddAllAnnotationsToURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + user_id: int, + url_id: int, + post_info: AllAnnotationPostInfo + ): + super().__init__() + self.user_id = user_id + self.url_id = url_id + self.post_info = post_info + + + async def run(self, session: AsyncSession) -> None: + # Add relevant annotation + relevant_suggestion = UserRelevantSuggestion( + url_id=self.url_id, + user_id=self.user_id, + suggested_status=self.post_info.suggested_status.value + ) + session.add(relevant_suggestion) + + # If not relevant, do nothing else + # TODO: 1: Update to account for change in SuggestedStatus + if not self.post_info.suggested_status == SuggestedStatus.RELEVANT: + return + + locations: list[UserLocationSuggestion] = [] + for location_id in self.post_info.location_ids: + locations.append(UserLocationSuggestion( + url_id=self.url_id, + user_id=self.user_id, + location_id=location_id + )) + session.add_all(locations) + + record_type_suggestion = UserRecordTypeSuggestion( + url_id=self.url_id, + user_id=self.user_id, + record_type=self.post_info.record_type.value + ) + 
session.add(record_type_suggestion) + + agency_suggestion = UserUrlAgencySuggestion( + url_id=self.url_id, + user_id=self.user_id, + agency_id=self.post_info.agency.suggested_agency, + is_new=self.post_info.agency.is_new + ) + session.add(agency_suggestion) diff --git a/src/api/endpoints/annotate/routes.py b/src/api/endpoints/annotate/routes.py index ddcc24ca..7cd4b76b 100644 --- a/src/api/endpoints/annotate/routes.py +++ b/src/api/endpoints/annotate/routes.py @@ -1,12 +1,10 @@ -from typing import Optional - from fastapi import APIRouter, Depends, Path, Query from src.api.dependencies import get_async_core from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo diff --git a/src/core/core.py b/src/core/core.py index 0938586a..68a94c6d 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -7,8 +7,9 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.get.models.response import 
GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.query import AddAllAnnotationsToURLQueryBuilder from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse @@ -283,10 +284,12 @@ async def submit_url_for_all_annotations( url_id: int, post_info: AllAnnotationPostInfo ): - await self.adb_client.add_all_annotations_to_url( - user_id=user_id, - url_id=url_id, - post_info=post_info + await self.adb_client.run_query_builder( + AddAllAnnotationsToURLQueryBuilder( + user_id=user_id, + url_id=url_id, + post_info=post_info + ) ) async def approve_url( diff --git a/src/db/client/async_.py b/src/db/client/async_.py index cd266b1d..969e5dc6 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -13,9 +13,9 @@ GetNextURLForUserAnnotationQueryBuilder from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse from src.api.endpoints.annotate.agency.get.queries.next_for_annotation import GetNextURLAgencyForAnnotationQueryBuilder -from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.api.endpoints.annotate.all.get.query import GetNextURLForAllAnnotationQueryBuilder -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseInfo from 
src.api.endpoints.annotate.relevance.get.query import GetNextUrlForRelevanceAnnotationQueryBuilder @@ -992,42 +992,6 @@ async def get_next_url_for_all_annotations( ) -> GetNextURLForAllAnnotationResponse: return await self.run_query_builder(GetNextURLForAllAnnotationQueryBuilder(batch_id)) - @session_manager - async def add_all_annotations_to_url( - self, - session, - user_id: int, - url_id: int, - post_info: AllAnnotationPostInfo - ): - - # Add relevant annotation - relevant_suggestion = UserRelevantSuggestion( - url_id=url_id, - user_id=user_id, - suggested_status=post_info.suggested_status.value - ) - session.add(relevant_suggestion) - - # If not relevant, do nothing else - if not post_info.suggested_status == SuggestedStatus.RELEVANT: - return - - record_type_suggestion = UserRecordTypeSuggestion( - url_id=url_id, - user_id=user_id, - record_type=post_info.record_type.value - ) - session.add(record_type_suggestion) - - agency_suggestion = UserUrlAgencySuggestion( - url_id=url_id, - user_id=user_id, - agency_id=post_info.agency.suggested_agency, - is_new=post_info.agency.is_new - ) - session.add(agency_suggestion) - async def upload_manual_batch( self, user_id: int, diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 0d775feb..ddb606b3 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -7,6 +7,7 @@ from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.probed_for_404 import URLProbedFor404 +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin from src.db.models.templates_.with_id import WithIDBase @@ -55,6 +56,9 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): auto_agency_subtasks = relationship( "URLAutoAgencyIDSubtask" ) + 
auto_location_subtasks = relationship( + AutoLocationIDSubtask + ) user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py index 6df14bf7..97df74b3 100644 --- a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py @@ -1,7 +1,9 @@ from sqlalchemy import Column, Boolean +from sqlalchemy.orm import relationship from src.db.models.helpers import enum_column from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion from src.db.models.mixins import CreatedAtMixin, TaskDependentMixin, URLDependentMixin from src.db.models.templates_.with_id import WithIDBase @@ -19,4 +21,8 @@ class AutoLocationIDSubtask( type = enum_column( LocationIDSubtaskType, name='auto_location_id_subtask_type' + ) + + suggestions = relationship( + LocationIDSubtaskSuggestion ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py index 3d6cd0c6..088ba3c3 100644 --- a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py @@ -1,3 +1,5 @@ +from sqlalchemy import Integer, Column + from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LocationDependentMixin from src.db.models.templates_.base import Base @@ -8,4 +10,9 @@ class UserLocationSuggestion( LocationDependentMixin, URLDependentMixin ): - __tablename__ = 'user_location_suggestions' \ No newline at end of file + __tablename__ = 'user_location_suggestions' + + user_id = 
Column( + Integer, + nullable=False, + ) \ No newline at end of file diff --git a/src/db/models/views/location_expanded.py b/src/db/models/views/location_expanded.py new file mode 100644 index 00000000..59df4f20 --- /dev/null +++ b/src/db/models/views/location_expanded.py @@ -0,0 +1,59 @@ +""" +create or replace view public.locations_expanded + (id, type, state_name, state_iso, county_name, county_fips, locality_name, locality_id, state_id, county_id, + display_name, full_display_name) +as +SELECT + locations.id, + locations.type, + us_states.state_name, + us_states.state_iso, + counties.name AS county_name, + counties.fips AS county_fips, + localities.name AS locality_name, + localities.id AS locality_id, + us_states.id AS state_id, + counties.id AS county_id, + CASE + WHEN locations.type = 'Locality'::location_type THEN localities.name + WHEN locations.type = 'County'::location_type THEN counties.name::character varying + WHEN locations.type = 'State'::location_type THEN us_states.state_name::character varying + ELSE NULL::character varying + END AS display_name, + CASE + WHEN locations.type = 'Locality'::location_type THEN concat(localities.name, ', ', counties.name, ', ', + us_states.state_name)::character varying + WHEN locations.type = 'County'::location_type + THEN concat(counties.name, ', ', us_states.state_name)::character varying + WHEN locations.type = 'State'::location_type THEN us_states.state_name::character varying + ELSE NULL::character varying + END AS full_display_name +FROM + locations + LEFT JOIN us_states ON locations.state_id = us_states.id + LEFT JOIN counties ON locations.county_id = counties.id + LEFT JOIN localities ON locations.locality_id = localities.id; +""" +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.helpers import enum_column +from src.db.models.impl.location.location.enums import LocationType +from src.db.models.mixins import ViewMixin, LocationDependentMixin +from src.db.models.templates_.base import Base + + 
+class LocationExpandedView( + Base, + ViewMixin, + LocationDependentMixin +): + + + __tablename__ = "locations_expanded" + __table_args__ = ( + PrimaryKeyConstraint("location_id"), + {"info": "view"} + ) + + type = enum_column(LocationType, name="location_type", nullable=False) + # TODO: Complete later \ No newline at end of file diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 7289020f..57d8e866 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -42,8 +42,10 @@ class URLAnnotationFlagsView( has_auto_record_type_suggestion = Column(Boolean, nullable=False) has_auto_relevant_suggestion = Column(Boolean, nullable=False) has_auto_agency_suggestion = Column(Boolean, nullable=False) + has_auto_location_suggestion = Column(Boolean, nullable=False) has_user_record_type_suggestion = Column(Boolean, nullable=False) has_user_relevant_suggestion = Column(Boolean, nullable=False) has_user_agency_suggestion = Column(Boolean, nullable=False) + has_user_location_suggestion = Column(Boolean, nullable=False) has_confirmed_agency = Column(Boolean, nullable=False) was_reviewed = Column(Boolean, nullable=False) \ No newline at end of file diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 8e172733..19b544a4 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -116,10 +116,6 @@ def user_suggestion_not_exists( def count_distinct(field, label): return func.count(func.distinct(field)).label(label) - @staticmethod - def sum_distinct(field, label): - return func.sum(func.distinct(field)).label(label) - @staticmethod def add_limit_and_page_offset(query: Select, page: int): zero_offset_page = page - 1 diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 2ee64885..6ac7367c 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -127,6 +127,14 @@ def url_id_column(name: str 
= 'url_id') -> sa.Column: comment='A foreign key to the `urls` table.' ) +def user_id_column(name: str = 'user_id') -> sa.Column: + return sa.Column( + name, + sa.Integer(), + nullable=False, + ) + + def location_id_column(name: str = 'location_id') -> sa.Column: return sa.Column( name, diff --git a/tests/automated/integration/api/_helpers/RequestValidator.py b/tests/automated/integration/api/_helpers/RequestValidator.py index c5ff4eaf..7d0dc641 100644 --- a/tests/automated/integration/api/_helpers/RequestValidator.py +++ b/tests/automated/integration/api/_helpers/RequestValidator.py @@ -7,8 +7,8 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 5003f08f..b4dac9af 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -1,7 +1,7 @@ import pytest from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.models.request import 
AllAnnotationPostInfo from src.core.enums import SuggestedStatus, RecordType from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion diff --git a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py index a11c43a3..a7579be2 100644 --- a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py +++ b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py @@ -1,7 +1,7 @@ import pytest from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import SuggestedStatus, RecordType from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/api/annotate/all/test_validation_error.py b/tests/automated/integration/api/annotate/all/test_validation_error.py index b805a435..c2aa6f1c 100644 --- a/tests/automated/integration/api/annotate/all/test_validation_error.py +++ b/tests/automated/integration/api/annotate/all/test_validation_error.py @@ -1,6 +1,6 @@ import pytest -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import SuggestedStatus, RecordType from src.core.exceptions import FailedValidationException from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review diff --git a/tests/automated/unit/dto/test_all_annotation_post_info.py b/tests/automated/unit/dto/test_all_annotation_post_info.py index 0778c089..afa4e5b6 100644 --- a/tests/automated/unit/dto/test_all_annotation_post_info.py +++ 
b/tests/automated/unit/dto/test_all_annotation_post_info.py @@ -1,6 +1,6 @@ import pytest -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import RecordType, SuggestedStatus from src.core.exceptions import FailedValidationException From e830566cb6ef85200344ffdfd9b5bd84dd74c5ad Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 16 Sep 2025 11:53:33 -0400 Subject: [PATCH 3/7] Update `annotate/all` `GET` logic and tests --- ...baa3b8e9b_add_location_annotation_logic.py | 177 ++++++++++++++++-- .../annotate/all/get/queries/__init__.py | 0 .../all/get/{query.py => queries/core.py} | 48 ++--- .../all/get/queries/location_/__init__.py | 0 .../all/get/queries/location_/convert.py | 81 ++++++++ .../all/get/queries/location_/core.py | 36 ++++ .../all/get/queries/location_/requester.py | 0 .../queries/previously_annotated/__init__.py | 0 .../get/queries/previously_annotated/build.py | 37 ++++ .../get/queries/previously_annotated/core.py | 22 +++ src/api/endpoints/annotate/routes.py | 6 +- src/core/core.py | 6 +- .../operators/submit_approved/queries/get.py | 6 +- src/db/__init__.py | 6 + src/db/client/async_.py | 12 +- .../location/auto/subtask/sqlalchemy.py | 2 +- .../location/auto/suggestion/sqlalchemy.py | 2 +- .../suggestion/location/user/sqlalchemy.py | 5 +- src/db/models/views/location_expanded.py | 19 +- .../api/annotate/all/test_happy_path.py | 6 +- .../annotate/all/test_post_batch_filtering.py | 3 +- .../api/annotate/all/test_validation_error.py | 5 +- ...next_url_for_annotation_batch_filtering.py | 6 +- 23 files changed, 415 insertions(+), 70 deletions(-) create mode 100644 src/api/endpoints/annotate/all/get/queries/__init__.py rename src/api/endpoints/annotate/all/get/{query.py => queries/core.py} (73%) create mode 100644 src/api/endpoints/annotate/all/get/queries/location_/__init__.py create mode 100644 
src/api/endpoints/annotate/all/get/queries/location_/convert.py create mode 100644 src/api/endpoints/annotate/all/get/queries/location_/core.py create mode 100644 src/api/endpoints/annotate/all/get/queries/location_/requester.py create mode 100644 src/api/endpoints/annotate/all/get/queries/previously_annotated/__init__.py create mode 100644 src/api/endpoints/annotate/all/get/queries/previously_annotated/build.py create mode 100644 src/api/endpoints/annotate/all/get/queries/previously_annotated/core.py diff --git a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py index 712861bc..844b28a9 100644 --- a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py +++ b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py @@ -26,6 +26,163 @@ LOCATION_ID_SUBTASK_TYPE_NAME = 'location_id_subtask_type' + +def upgrade() -> None: + _add_location_id_task_type() + _create_user_location_suggestions_table() + _create_auto_location_id_subtask_table() + _create_location_id_subtask_suggestions_table() + _create_new_url_annotation_flags_view() + _create_locations_expanded_view() + _create_state_location_trigger() + _create_county_location_trigger() + _create_locality_location_trigger() + + + + + + +def downgrade() -> None: + _drop_locations_expanded_view() + _create_old_url_annotation_flags_view() + _drop_location_id_subtask_suggestions_table() + _drop_auto_location_id_subtask_table() + _drop_user_location_suggestions_table() + _drop_location_id_task_type() + _drop_location_id_subtask_type() + _drop_state_location_trigger() + _drop_county_location_trigger() + _drop_locality_location_trigger() + + +def _create_state_location_trigger(): + # Function + op.execute(""" + create function insert_state_location() returns trigger + language plpgsql + as + $$ + BEGIN + -- Insert a new location of type 'State' when a new state is added + 
INSERT INTO locations (type, state_id) + VALUES ('State', NEW.id); + RETURN NEW; + END; + $$; + """) + + # Trigger + op.execute(""" + create trigger after_state_insert + after insert + on us_states + for each row + execute procedure insert_state_location(); + """) + + +def _create_county_location_trigger(): + # Function + op.execute(""" + create function insert_county_location() returns trigger + language plpgsql + as + $$ + BEGIN + -- Insert a new location of type 'County' when a new county is added + INSERT INTO locations (type, state_id, county_id) + VALUES ('County', NEW.state_id, NEW.id); + RETURN NEW; + END; + $$; + """) + + # Trigger + op.execute(""" + create trigger after_county_insert + after insert + on counties + for each row + execute procedure insert_county_location(); + """) + + +def _create_locality_location_trigger(): + # Function + op.execute(""" + create function insert_locality_location() returns trigger + language plpgsql + as + $$ + DECLARE + v_state_id BIGINT; + BEGIN + -- Get the state_id from the associated county + SELECT c.state_id INTO v_state_id + FROM counties c + WHERE c.id = NEW.county_id; + + -- Insert a new location of type 'Locality' when a new locality is added + INSERT INTO locations (type, state_id, county_id, locality_id) + VALUES ('Locality', v_state_id, NEW.county_id, NEW.id); + + RETURN NEW; + END; + $$; + """) + + # Trigger + op.execute(""" + create trigger after_locality_insert + after insert + on localities + for each row + execute procedure insert_locality_location(); + + """) + + +def _drop_state_location_trigger(): + # Trigger + op.execute(""" + drop trigger if exists after_state_insert on us_states; + """) + + # Function + op.execute(""" + drop function if exists insert_state_location; + """) + + + + +def _drop_locality_location_trigger(): + # Trigger + op.execute(""" + drop trigger if exists after_locality_insert on localities; + """) + + # Function + op.execute(""" + drop function if exists insert_locality_location; 
+ """) + + + +def _drop_county_location_trigger(): + # Trigger + op.execute(""" + drop trigger if exists after_county_insert on counties; + """) + + # Function + op.execute(""" + drop function if exists insert_county_location; + """) + + + def _create_new_url_annotation_flags_view(): op.execute("""DROP VIEW IF EXISTS url_annotation_flags;""") op.execute( @@ -69,26 +226,6 @@ def _create_old_url_annotation_flags_view(): ) -def upgrade() -> None: - _add_location_id_task_type() - _create_user_location_suggestions_table() - _create_auto_location_id_subtask_table() - _create_location_id_subtask_suggestions_table() - _create_new_url_annotation_flags_view() - _create_locations_expanded_view() - - - - -def downgrade() -> None: - _drop_locations_expanded_view() - _create_old_url_annotation_flags_view() - _drop_location_id_subtask_suggestions_table() - _drop_auto_location_id_subtask_table() - _drop_user_location_suggestions_table() - _drop_location_id_task_type() - _drop_location_id_subtask_type() - def _drop_locations_expanded_view(): op.execute(""" drop view if exists public.locations_expanded; diff --git a/src/api/endpoints/annotate/all/get/queries/__init__.py b/src/api/endpoints/annotate/all/get/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/queries/core.py similarity index 73% rename from src/api/endpoints/annotate/all/get/query.py rename to src/api/endpoints/annotate/all/get/queries/core.py index 9237fd42..adc41477 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -1,11 +1,16 @@ from sqlalchemy import Select, and_, or_ from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload +from sqlalchemy.orm import joinedload from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder +from 
src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse +from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder +from src.api.endpoints.annotate.all.get.queries.previously_annotated.core import \ + URLPreviouslyAnnotatedByUserCTEContainer from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter @@ -14,28 +19,28 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer class GetNextURLForAllAnnotationQueryBuilder(QueryBuilderBase): def __init__( self, - batch_id: int | None + batch_id: int | None, + user_id: int ): super().__init__() self.batch_id = batch_id + self.user_id = user_id async def run( self, session: AsyncSession ) -> GetNextURLForAllAnnotationResponse: + prev_annotated_cte = URLPreviouslyAnnotatedByUserCTEContainer(user_id=self.user_id) 
query = ( Select(URL) # URL Must be unvalidated @@ -43,6 +48,11 @@ async def run( UnvalidatedURL, UnvalidatedURL.url_id == URL.id ) + # Must not have been previously annotated by user + .join( + prev_annotated_cte.cte, + prev_annotated_cte.url_id == URL.id + ) .join( URLAnnotationFlagsView, URLAnnotationFlagsView.url_id == URL.id @@ -53,30 +63,18 @@ async def run( query = ( query .where( - and_( URL.status == URLStatus.OK.value, - # Must be missing at least some annotations - or_( - URLAnnotationFlagsView.has_user_agency_suggestion.is_(False), - URLAnnotationFlagsView.has_user_record_type_suggestion.is_(False), - URLAnnotationFlagsView.has_user_relevant_suggestion.is_(False), - URLAnnotationFlagsView.has_user_location_suggestion.is_(False), - ) - - ) ) ) # Add load options query = query.options( - URL.html_content, - URL.auto_agency_subtasks, - URL.auto_relevant_suggestion, - URL.auto_record_type_suggestion, - URL.auto_agency_subtasks.suggestions, + joinedload(URL.html_content), + joinedload(URL.auto_relevant_suggestion), + joinedload(URL.auto_record_type_suggestion), ) query = query.order_by(URL.id.asc()).limit(1) - raw_results = await session.execute(query) + raw_results = (await session.execute(query)).unique() url: URL | None = raw_results.scalars().one_or_none() if url is None: return GetNextURLForAllAnnotationResponse( @@ -95,7 +93,10 @@ async def run( if url.auto_record_type_suggestion is not None: auto_record_type = url.auto_record_type_suggestion.record_type - agency_suggestions = await GetAgencySuggestionsQueryBuilder(url_id=url.id).run(session) + agency_suggestions: list[GetNextURLForAgencyAgencyInfo] = \ + await GetAgencySuggestionsQueryBuilder(url_id=url.id).run(session) + location_suggestions: LocationAnnotationResponseOuterInfo = \ + await GetLocationSuggestionsQueryBuilder(url_id=url.id).run(session) return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( @@ -116,6 +117,7 @@ async def run( models=[ 
UserUrlAgencySuggestion, ] - ).run(session) + ).run(session), + location_suggestions=location_suggestions, ) ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/location_/__init__.py b/src/api/endpoints/annotate/all/get/queries/location_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/queries/location_/convert.py b/src/api/endpoints/annotate/all/get/queries/location_/convert.py new file mode 100644 index 00000000..6ed89186 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/location_/convert.py @@ -0,0 +1,81 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping + +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion, \ + LocationAnnotationAutoSuggestion +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.views.location_expanded import LocationExpandedView +from src.db.templates.requester import RequesterBase + +from src.db.helpers.session import session_helper as sh + +class GetLocationSuggestionsRequester(RequesterBase): + + + async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnotationUserSuggestion]: + query = ( + select( + UserLocationSuggestion.location_id, + LocationExpandedView.display_name.label("location_name"), + func.count(UserLocationSuggestion.user_id).label('user_count') + ) + .join( + LocationExpandedView, + LocationExpandedView.id == UserLocationSuggestion.location_id + ) + .where( + UserLocationSuggestion.url_id == url_id + ) + .group_by( + UserLocationSuggestion.location_id, + LocationExpandedView.display_name + ) + .order_by( + func.count(UserLocationSuggestion.user_id).desc() + 
) + ) + raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) + return [ + LocationAnnotationUserSuggestion( + **raw_result + ) + for raw_result in raw_results + ] + + + + async def get_auto_location_suggestions( + self, + url_id: int + ) -> list[LocationAnnotationAutoSuggestion]: + query = ( + select( + LocationExpandedView.display_name.label("location_name"), + LocationIDSubtaskSuggestion.location_id, + LocationIDSubtaskSuggestion.confidence, + ) + .join( + LocationExpandedView, + LocationExpandedView.id == LocationIDSubtaskSuggestion.location_id + ) + .join( + AutoLocationIDSubtask, + AutoLocationIDSubtask.id == LocationIDSubtaskSuggestion.subtask_id + ) + .where( + AutoLocationIDSubtask.url_id == url_id + ) + .order_by( + LocationIDSubtaskSuggestion.confidence.desc() + ) + ) + raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) + return [ + LocationAnnotationAutoSuggestion( + **raw_result + ) + for raw_result in raw_results + ] diff --git a/src/api/endpoints/annotate/all/get/queries/location_/core.py b/src/api/endpoints/annotate/all/get/queries/location_/core.py new file mode 100644 index 00000000..cee9f758 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/location_/core.py @@ -0,0 +1,36 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ + LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.queries.location_.convert import GetLocationSuggestionsRequester +from src.db.queries.base.builder import QueryBuilderBase +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ + LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.queries.location_.convert import GetLocationSuggestionsRequester +from 
src.db.queries.base.builder import QueryBuilderBase + + +class GetLocationSuggestionsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + + async def run(self, session: AsyncSession) -> LocationAnnotationResponseOuterInfo: + requester = GetLocationSuggestionsRequester(session) + user_suggestions: list[LocationAnnotationUserSuggestion] = \ + await requester.get_user_location_suggestions(self.url_id) + auto_suggestions: list[LocationAnnotationAutoSuggestion] = \ + await requester.get_auto_location_suggestions(self.url_id) + + return LocationAnnotationResponseOuterInfo( + user=user_suggestions, + auto=auto_suggestions + ) + diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/queries/previously_annotated/__init__.py b/src/api/endpoints/annotate/all/get/queries/previously_annotated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/annotate/all/get/queries/previously_annotated/build.py b/src/api/endpoints/annotate/all/get/queries/previously_annotated/build.py new file mode 100644 index 00000000..1d54df46 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/previously_annotated/build.py @@ -0,0 +1,37 @@ +from sqlalchemy import CTE, select, and_, or_ + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion + + +def build_cte(user_id: int) -> CTE: + query = ( + select( + URL.id + ) + ) + for model in [ + 
UserLocationSuggestion, + UserRelevantSuggestion, + UserRecordTypeSuggestion, + UserUrlAgencySuggestion + ]: + query = query.outerjoin( + model, + and_( + model.url_id == URL.id, + model.user_id == user_id + ) + ) + query = query.where( + and_( + UserLocationSuggestion.user_id.is_(None), + UserRelevantSuggestion.user_id.is_(None), + UserRecordTypeSuggestion.user_id.is_(None), + UserUrlAgencySuggestion.user_id.is_(None) + ) + ) + return query.cte() diff --git a/src/api/endpoints/annotate/all/get/queries/previously_annotated/core.py b/src/api/endpoints/annotate/all/get/queries/previously_annotated/core.py new file mode 100644 index 00000000..2c91076b --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/previously_annotated/core.py @@ -0,0 +1,22 @@ +from sqlalchemy import CTE +from sqlalchemy.orm import InstrumentedAttribute + +from src.api.endpoints.annotate.all.get.queries.previously_annotated.build import build_cte + + +class URLPreviouslyAnnotatedByUserCTEContainer: + + def __init__( + self, + user_id: int + ): + self.user_id = user_id + self._cte: CTE = build_cte(user_id=user_id) + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> InstrumentedAttribute[int]: + return self._cte.c.id \ No newline at end of file diff --git a/src/api/endpoints/annotate/routes.py b/src/api/endpoints/annotate/routes.py index 7cd4b76b..80c44cc8 100644 --- a/src/api/endpoints/annotate/routes.py +++ b/src/api/endpoints/annotate/routes.py @@ -132,7 +132,8 @@ async def get_next_url_for_all_annotations( batch_id: int | None = batch_query ) -> GetNextURLForAllAnnotationResponse: return await async_core.get_next_url_for_all_annotations( - batch_id=batch_id + batch_id=batch_id, + user_id=access_info.user_id ) @annotate_router.post("/all/{url_id}") @@ -152,5 +153,6 @@ async def annotate_url_for_all_annotations_and_get_next_url( post_info=all_annotation_post_info ) return await async_core.get_next_url_for_all_annotations( - batch_id=batch_id + 
batch_id=batch_id, + user_id=access_info.user_id ) \ No newline at end of file diff --git a/src/core/core.py b/src/core/core.py index 68a94c6d..4051b8f2 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -272,10 +272,12 @@ async def get_next_source_for_review( async def get_next_url_for_all_annotations( self, - batch_id: Optional[int] + user_id: int, + batch_id: int | None ) -> GetNextURLForAllAnnotationResponse: return await self.adb_client.get_next_url_for_all_annotations( - batch_id=batch_id + batch_id=batch_id, + user_id=user_id ) async def submit_url_for_all_annotations( diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index 16b38a82..2da731bd 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -2,14 +2,12 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import selectinload -from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.queries.cte import VALIDATED_URLS_WITHOUT_DS_ALIAS from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO -from src.db.models.impl.flag.url_validated.enums import URLValidatedType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh + class GetValidatedURLsQueryBuilder(QueryBuilderBase): diff --git a/src/db/__init__.py b/src/db/__init__.py index e69de29b..812e7e5b 100644 --- a/src/db/__init__.py +++ b/src/db/__init__.py @@ -0,0 +1,6 @@ + + +from src.db.models.impl.location.location.sqlalchemy import Location +from src.db.models.impl.location.us_state.sqlalchemy import USState +from 
src.db.models.impl.location.county.sqlalchemy import County +from src.db.models.impl.location.locality.sqlalchemy import Locality diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 969e5dc6..91995432 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -14,8 +14,7 @@ from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse from src.api.endpoints.annotate.agency.get.queries.next_for_annotation import GetNextURLAgencyForAnnotationQueryBuilder from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.get.query import GetNextURLForAllAnnotationQueryBuilder -from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseInfo from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseInfo from src.api.endpoints.annotate.relevance.get.query import GetNextUrlForRelevanceAnnotationQueryBuilder @@ -988,9 +987,14 @@ async def delete_old_logs(self): await self.execute(statement) async def get_next_url_for_all_annotations( - self, batch_id: int | None = None + self, + user_id: int, + batch_id: int | None = None ) -> GetNextURLForAllAnnotationResponse: - return await self.run_query_builder(GetNextURLForAllAnnotationQueryBuilder(batch_id)) + return await self.run_query_builder(GetNextURLForAllAnnotationQueryBuilder( + batch_id=batch_id, + user_id=user_id + )) async def upload_manual_batch( self, diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py index 97df74b3..86f04b4b 100644 --- a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py +++ 
b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py @@ -15,7 +15,7 @@ class AutoLocationIDSubtask( URLDependentMixin, ): - __tablename__ = 'auto_location_id_subtask' + __tablename__ = 'auto_location_id_subtasks' locations_found = Column(Boolean(), nullable=False) type = enum_column( diff --git a/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py index 688d1c4d..9b478c91 100644 --- a/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py @@ -11,7 +11,7 @@ class LocationIDSubtaskSuggestion( __tablename__ = 'location_id_subtask_suggestions' subtask_id = Column( Integer, - ForeignKey('auto_location_id_subtask.id'), + ForeignKey('auto_location_id_subtasks.id'), nullable=False, primary_key=True, ) diff --git a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py index 088ba3c3..a9d4ae8b 100644 --- a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy import Integer, Column +from sqlalchemy import Integer, Column, PrimaryKeyConstraint from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LocationDependentMixin from src.db.models.templates_.base import Base @@ -11,6 +11,9 @@ class UserLocationSuggestion( URLDependentMixin ): __tablename__ = 'user_location_suggestions' + __table_args__ = ( + PrimaryKeyConstraint('url_id', 'location_id', 'user_id'), + ) user_id = Column( Integer, diff --git a/src/db/models/views/location_expanded.py b/src/db/models/views/location_expanded.py index 59df4f20..1eb973aa 100644 --- a/src/db/models/views/location_expanded.py +++ b/src/db/models/views/location_expanded.py @@ -34,26 +34,33 @@ LEFT JOIN counties ON locations.county_id = 
counties.id LEFT JOIN localities ON locations.locality_id = localities.id; """ -from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy import Column, String, Integer from src.db.models.helpers import enum_column from src.db.models.impl.location.location.enums import LocationType from src.db.models.mixins import ViewMixin, LocationDependentMixin -from src.db.models.templates_.base import Base +from src.db.models.templates_.with_id import WithIDBase class LocationExpandedView( - Base, + WithIDBase, ViewMixin, LocationDependentMixin ): - __tablename__ = "locations_expanded" __table_args__ = ( - PrimaryKeyConstraint("location_id"), {"info": "view"} ) type = enum_column(LocationType, name="location_type", nullable=False) - # TODO: Complete later \ No newline at end of file + state_name = Column(String) + state_iso = Column(String) + county_name = Column(String) + county_fips = Column(String) + locality_name = Column(String) + locality_id = Column(Integer) + state_id = Column(Integer) + county_id = Column(Integer) + display_name = Column(String) + full_display_name = Column(String) diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index b4dac9af..86c0d843 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -29,12 +29,14 @@ async def test_annotate_all(api_test_helper): # First, get a valid URL to annotate get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() + assert get_response_1.next_annotation is not None # Apply the second batch id as a filter and see that a different URL is returned get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( batch_id=setup_info_2.batch_id ) + assert get_response_2.next_annotation is not None assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id # 
Annotate the first and submit @@ -47,7 +49,8 @@ async def test_annotate_all(api_test_helper): agency=URLAgencyAnnotationPostInfo( is_new=False, suggested_agency=agency_id - ) + ), + location_ids=[] ) ) assert post_response_1.next_annotation is not None @@ -60,6 +63,7 @@ async def test_annotate_all(api_test_helper): url_id=url_mapping_2.url_id, all_annotations_post_info=AllAnnotationPostInfo( suggested_status=SuggestedStatus.NOT_RELEVANT, + location_ids=[] ) ) assert post_response_2.next_annotation is None diff --git a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py index a7579be2..7a1d0578 100644 --- a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py +++ b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py @@ -34,7 +34,8 @@ async def test_annotate_all_post_batch_filtering(api_test_helper): record_type=RecordType.ACCIDENT_REPORTS, agency=URLAgencyAnnotationPostInfo( is_new=True - ) + ), + location_ids=[] ) ) diff --git a/tests/automated/integration/api/annotate/all/test_validation_error.py b/tests/automated/integration/api/annotate/all/test_validation_error.py index c2aa6f1c..e9f8702f 100644 --- a/tests/automated/integration/api/annotate/all/test_validation_error.py +++ b/tests/automated/integration/api/annotate/all/test_validation_error.py @@ -12,7 +12,7 @@ async def test_annotate_all_validation_error(api_test_helper): Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response """ ath = api_test_helper - setup_info_1 = await setup_for_get_next_url_for_final_review( + setup_info_1 = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=False ) url_mapping_1 = setup_info_1.url_mapping @@ -22,6 +22,7 @@ async def test_annotate_all_validation_error(api_test_helper): url_id=url_mapping_1.url_id, all_annotations_post_info=AllAnnotationPostInfo( 
suggested_status=SuggestedStatus.NOT_RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS + record_type=RecordType.ACCIDENT_REPORTS, + location_ids=[] ) ) diff --git a/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py b/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py index a1df2164..ab7e6cde 100644 --- a/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py +++ b/tests/automated/integration/db/client/test_get_next_url_for_annotation_batch_filtering.py @@ -92,7 +92,8 @@ def assert_batch_info(batch_info): # All annotations result_with_batch_id = await db_data_creator.adb_client.get_next_url_for_all_annotations( - batch_id=setup_info_2.batch_id + batch_id=setup_info_2.batch_id, + user_id=1 ) assert result_with_batch_id.next_annotation.url_info.url == url_2.url @@ -100,7 +101,8 @@ def assert_batch_info(batch_info): # If no batch id is provided, return first valid URL result_no_batch_id = await db_data_creator.adb_client.get_next_url_for_all_annotations( - batch_id=None + batch_id=None, + user_id=1 ) assert result_no_batch_id.next_annotation.url_info.url == url_1.url From ef84df35779e846293ead3dd6712d6bfc38a8c5a Mon Sep 17 00:00:00 2001 From: maxachis Date: Tue, 16 Sep 2025 12:21:15 -0400 Subject: [PATCH 4/7] Continue draft --- src/api/endpoints/annotate/all/post/query.py | 1 + src/db/client/async_.py | 15 ++++++ .../implementations/location/__init__.py | 0 .../queries/implementations/location/get.py | 49 +++++++++++++++++++ tests/automated/integration/conftest.py | 47 +++++++++++++++++- tests/helpers/data_creator/create.py | 25 ++++++++++ .../models/creation_info/county.py | 6 +++ .../models/creation_info/locality.py | 6 +++ .../models/creation_info/us_state.py | 6 +++ 9 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 src/db/queries/implementations/location/__init__.py create mode 100644 src/db/queries/implementations/location/get.py 
create mode 100644 tests/helpers/data_creator/models/creation_info/county.py create mode 100644 tests/helpers/data_creator/models/creation_info/locality.py create mode 100644 tests/helpers/data_creator/models/creation_info/us_state.py diff --git a/src/api/endpoints/annotate/all/post/query.py b/src/api/endpoints/annotate/all/post/query.py index a3ddb0c6..12374375 100644 --- a/src/api/endpoints/annotate/all/post/query.py +++ b/src/api/endpoints/annotate/all/post/query.py @@ -25,6 +25,7 @@ def __init__( async def run(self, session: AsyncSession) -> None: # Add relevant annotation + # TODO: Modify UserRelevantSuggestion to use `URLValidatedType` instead of `SuggestedStatus` relevant_suggestion = UserRelevantSuggestion( url_id=self.url_id, user_id=self.user_id, diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 91995432..fc5e013f 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -131,6 +131,7 @@ from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder +from src.db.queries.implementations.location.get import GetLocationQueryBuilder from src.db.statement_composer import StatementComposer from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -1265,3 +1266,17 @@ async def get_urls_without_probe(self) -> list[URLMapping]: return await self.run_query_builder( GetURLsWithoutProbeQueryBuilder() ) + + async def get_location_id( + self, + us_state_id: int, + county_id: int | None = None, + locality_id: int | None = None + ) -> int | None: + return await self.run_query_builder( + GetLocationQueryBuilder( + us_state_id=us_state_id, + county_id=county_id, + locality_id=locality_id + ) + ) diff --git a/src/db/queries/implementations/location/__init__.py 
b/src/db/queries/implementations/location/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/location/get.py b/src/db/queries/implementations/location/get.py new file mode 100644 index 00000000..7ab3c381 --- /dev/null +++ b/src/db/queries/implementations/location/get.py @@ -0,0 +1,49 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db import Location +from src.db.helpers.session import session_helper as sh +from src.db.queries.base.builder import QueryBuilderBase + + +class GetLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + us_state_id: int, + county_id: int | None = None, + locality_id: int | None = None, + ): + super().__init__() + self.us_state_id = us_state_id + self.county_id = county_id + self.locality_id = locality_id + + async def run(self, session: AsyncSession) -> int | None: + query = ( + select( + Location.id + ) + .where( + Location.state_id == self.us_state_id, + ) + ) + if self.county_id is not None: + query = query.where( + Location.county_id == self.county_id + ) + else: + query = query.where( + Location.county_id.is_(None) + ) + + if self.locality_id is not None: + query = query.where( + Location.locality_id == self.locality_id + ) + else: + query = query.where( + Location.locality_id.is_(None) + ) + + return await sh.one_or_none(session, query=query) diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 7e4fc535..732cb84c 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -1,11 +1,15 @@ from unittest.mock import MagicMock import pytest +import pytest_asyncio from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from 
tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo @pytest.fixture @@ -25,4 +29,45 @@ def test_async_core(adb_client_test): ) yield core core.shutdown() - logger.shutdown() \ No newline at end of file + logger.shutdown() + +@pytest_asyncio.fixture +def pennsylvania( + adb_client_test: AsyncDatabaseClient +) -> USStateCreationInfo: + """Creates Pennsylvania state and returns its state and location ID""" + raise NotImplementedError + +@pytest_asyncio.fixture +def allegheny_county( + adb_client_test: AsyncDatabaseClient, + pennsylvania: USStateCreationInfo +) -> CountyCreationInfo: + raise NotImplementedError + +@pytest_asyncio.fixture +def pittsburgh_locality( + adb_client_test: AsyncDatabaseClient, + allegheny_county: CountyCreationInfo +) -> LocalityCreationInfo: + raise NotImplementedError + +@pytest_asyncio.fixture +def california( + adb_client_test: AsyncDatabaseClient +) -> USStateCreationInfo: + raise NotImplementedError + +@pytest_asyncio.fixture +def los_angeles_county( + adb_client_test: AsyncDatabaseClient, + california: USStateCreationInfo +) -> CountyCreationInfo: + raise NotImplementedError + +@pytest_asyncio.fixture +def los_angeles_locality( + adb_client_test: AsyncDatabaseClient, + los_angeles_county: CountyCreationInfo +) -> LocalityCreationInfo: + raise NotImplementedError \ No newline at end of file diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 83b2e3f5..34f5187d 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -2,6 +2,8 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus, RecordType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState from src.db.client.async_ import AsyncDatabaseClient from 
src.db.dtos.url.mapping import URLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel @@ -13,6 +15,7 @@ from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo async def create_batch( @@ -73,3 +76,25 @@ async def create_batch_url_links( ) await adb_client.bulk_insert(batch_url_links) +async def create_state( + adb_client: AsyncDatabaseClient, + name: str, + iso: str +) -> USStateCreationInfo: + + us_state_insert_model = USState( + name=name, + iso=iso, + ) + us_state_id: int = await adb_client.add( + us_state_insert_model, + return_id=True + ) + location_id: int = await adb_client.get_location_id( + us_state_id=us_state_id, + ) + return USStateCreationInfo( + us_state_id=us_state_id, + location_id=location_id, + ) + diff --git a/tests/helpers/data_creator/models/creation_info/county.py b/tests/helpers/data_creator/models/creation_info/county.py new file mode 100644 index 00000000..4a9511ec --- /dev/null +++ b/tests/helpers/data_creator/models/creation_info/county.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CountyCreationInfo(BaseModel): + county_id: int + location_id: int \ No newline at end of file diff --git a/tests/helpers/data_creator/models/creation_info/locality.py b/tests/helpers/data_creator/models/creation_info/locality.py new file mode 100644 index 00000000..6e98899d --- /dev/null +++ b/tests/helpers/data_creator/models/creation_info/locality.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class LocalityCreationInfo(BaseModel): + locality_id: int + location_id: int \ No newline at end of file diff --git a/tests/helpers/data_creator/models/creation_info/us_state.py b/tests/helpers/data_creator/models/creation_info/us_state.py new file mode 
100644 index 00000000..2c8914d6 --- /dev/null +++ b/tests/helpers/data_creator/models/creation_info/us_state.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class USStateCreationInfo(BaseModel): + us_state_id: int + location_id: int \ No newline at end of file From 91f2ebd8c1ca93e9303c759589f7599b9a1db599 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 18 Sep 2025 06:32:06 -0400 Subject: [PATCH 5/7] Begin splitting up Location Tasks --- ENV.md | 19 +- ...baa3b8e9b_add_location_annotation_logic.py | 20 ++- src/api/main.py | 4 +- src/core/tasks/url/loader.py | 22 ++- .../models => _shared}/__init__.py | 0 .../container}/__init__.py | 0 .../container/subtask}/__init__.py | 0 .../container/subtask/eligible.py} | 2 +- .../container/subtask/exists.py} | 2 +- .../mappings => _shared/ctes}/__init__.py | 0 .../exists/impl => _shared/ctes}/validated.py | 6 +- .../exceptions.py | 0 .../operators/agency_identification/core.py | 2 +- .../impl/nlp_location_match_/convert.py | 2 + .../subtasks/impl/nlp_location_match_/core.py | 45 +++-- .../nlp_location_match_/processor/convert.py | 162 ------------------ .../nlp_location_match_/processor/extract.py | 12 -- .../models/mappings/url_id_search_response.py | 8 - .../models/subsets => query_}/__init__.py | 0 .../impl/nlp_location_match_/query_/query.py | 26 +++ .../nlp_location_match_/query_/response.py | 8 + .../agency_identification/subtasks/loader.py | 2 +- .../queries/survey/queries/ctes/eligible.py | 4 +- .../{impl => }/high_confidence_annotations.py | 6 +- .../survey/queries/ctes/subtask/impl/ckan.py | 2 +- .../queries/ctes/subtask/impl/homepage.py | 2 +- .../queries/ctes/subtask/impl/muckrock.py | 2 +- .../queries/ctes/subtask/impl/nlp_location.py | 14 +- .../subtasks/templates/output.py | 5 - .../subtasks/templates/postprocessor.py | 26 --- .../processor/nlp => location_id}/__init__.py | 0 .../tasks/url/operators/location_id/core.py | 44 +++++ .../nlp => location_id}/models/__init__.py | 0 .../impl => 
location_id/subtasks}/__init__.py | 0 .../location_id/subtasks/flags/__init__.py | 0 .../location_id/subtasks/flags/core.py | 25 +++ .../location_id/subtasks/flags/mappings.py | 5 + .../location_id/subtasks/impl/__init__.py | 0 .../impl/nlp_location_freq/__init__.py | 0 .../impl/nlp_location_freq/constants.py | 4 + .../subtasks/impl/nlp_location_freq/core.py | 56 ++++++ .../impl/nlp_location_freq/models/__init__.py | 0 .../impl/nlp_location_freq}/models/input.py | 0 .../models/mappings/__init__.py | 0 .../models/mappings/url_id_nlp_response.py | 2 +- .../models/mappings/url_id_search_response.py | 10 ++ .../models/subsets/__init__.py | 0 .../models/subsets/nlp_responses.py | 2 +- .../nlp_location_freq/processor/__init__.py | 0 .../nlp_location_freq}/processor/constants.py | 0 .../nlp_location_freq/processor/convert.py | 147 ++++++++++++++++ .../impl/nlp_location_freq}/processor/core.py | 127 +++++++------- .../nlp_location_freq}/processor/counter.py | 0 .../nlp_location_freq}/processor/filter.py | 30 ++-- .../nlp_location_freq}/processor/mapper.py | 0 .../processor/models/__init__.py | 0 .../processor/models}/url_id_search_params.py | 4 +- .../processor/nlp/__init__.py | 0 .../nlp_location_freq}/processor/nlp/check.py | 2 +- .../processor/nlp/constants.py | 0 .../processor/nlp/convert.py | 4 +- .../nlp_location_freq}/processor/nlp/core.py | 14 +- .../nlp_location_freq}/processor/nlp/enums.py | 0 .../processor/nlp/extract.py | 4 +- .../processor/nlp/mappings.py | 0 .../processor/nlp/models/__init__.py | 0 .../processor/nlp/models/params.py | 0 .../processor/nlp/models/response.py | 2 +- .../processor/nlp/models/us_state.py | 0 .../processor/nlp/preprocess.py | 0 .../processor/query_/__init__.py | 0 .../processor/query_/core.py | 105 ++++++++++++ .../processor/query_/models/__init__.py | 0 .../processor/query_/models/params.py | 10 ++ .../processor/query_/models/response.py | 13 ++ .../subtasks/impl/nlp_location_freq}/query.py | 2 +- 
.../operators/location_id/subtasks/loader.py | 35 ++++ .../location_id/subtasks/models/__init__.py | 0 .../location_id/subtasks/models/run_info.py | 14 ++ .../location_id/subtasks/models/subtask.py | 18 ++ .../location_id/subtasks/models/suggestion.py | 6 + .../location_id/subtasks/queries/__init__.py | 0 .../subtasks/queries/survey/__init__.py | 0 .../subtasks/queries/survey/constants.py | 11 ++ .../queries/survey/queries/__init__.py | 0 .../subtasks/queries/survey/queries/core.py | 73 ++++++++ .../queries/survey/queries/ctes/__init__.py | 0 .../queries/survey/queries/ctes/eligible.py | 38 ++++ .../survey/queries/ctes/exists/__init__.py | 0 .../exists/high_confidence_annotations.py | 29 ++++ .../survey/queries/ctes/subtask/__init__.py | 0 .../survey/queries/ctes/subtask/helpers.py | 18 ++ .../queries/ctes/subtask/impl/__init__.py | 0 .../ctes/subtask/impl/nlp_location_freq.py | 25 +++ .../queries/survey/queries/eligible_counts.py | 21 +++ .../subtasks/templates/__init__.py | 0 .../location_id/subtasks/templates/subtask.py | 84 +++++++++ src/db/enums.py | 1 + .../models/impl/location/county/sqlalchemy.py | 2 +- .../impl/location/locality/sqlalchemy.py | 3 +- src/external/pdap/client.py | 32 ---- .../dtos/search_agency_by_location/params.py | 1 - .../api/annotate/all/test_happy_path.py | 57 +++++- tests/automated/integration/conftest.py | 59 +++++-- .../impl/agency_identification/conftest.py | 2 +- .../end_to_end/test_core.py | 4 +- .../match_urls_to_search_params/conftest.py | 2 +- .../test_nlp_response_valid.py | 4 +- .../integration/tasks/url/loader/conftest.py | 2 +- tests/helpers/data_creator/core.py | 42 ++++- tests/helpers/data_creator/create.py | 54 +++++- .../agency_identifier/test_nlp_processor.py | 2 +- .../pdap/test_sc_agency_search_location.py | 34 ---- 113 files changed, 1228 insertions(+), 460 deletions(-) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/models => _shared}/__init__.py (100%) rename 
src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor => _shared/container}/__init__.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor/models => _shared/container/subtask}/__init__.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py => _shared/container/subtask/eligible.py} (96%) rename src/core/tasks/url/operators/{agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py => _shared/container/subtask/exists.py} (95%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings => _shared/ctes}/__init__.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/queries/survey/queries/ctes/exists/impl => _shared/ctes}/validated.py (52%) rename src/core/tasks/url/operators/{agency_identification => _shared}/exceptions.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py rename src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/{processor/models/subsets => query_}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py create mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/response.py rename 
src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/{impl => }/high_confidence_annotations.py (76%) delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor/nlp => location_id}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/core.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor/nlp => location_id}/models/__init__.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/queries/survey/queries/ctes/exists/impl => location_id/subtasks}/__init__.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/flags/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/flags/core.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/flags/mappings.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/constants.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/models/input.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/__init__.py rename 
src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor => location_id/subtasks/impl/nlp_location_freq}/models/mappings/url_id_nlp_response.py (55%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_search_response.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor => location_id/subtasks/impl/nlp_location_freq}/models/subsets/nlp_responses.py (55%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/constants.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/core.py (53%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/counter.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/filter.py (51%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/mapper.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings => 
location_id/subtasks/impl/nlp_location_freq/processor/models}/url_id_search_params.py (57%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/check.py (63%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/constants.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/convert.py (67%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/core.py (75%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/enums.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/extract.py (70%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/mappings.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/__init__.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/models/params.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/models/response.py (75%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => 
location_id/subtasks/impl/nlp_location_freq}/processor/nlp/models/us_state.py (100%) rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/processor/nlp/preprocess.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/params.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/response.py rename src/core/tasks/url/operators/{agency_identification/subtasks/impl/nlp_location_match_ => location_id/subtasks/impl/nlp_location_freq}/query.py (93%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/loader.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/models/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/models/run_info.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/models/subtask.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/models/suggestion.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/constants.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/core.py create mode 100644 
src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/eligible.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/exists/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/exists/high_confidence_annotations.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/helpers.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location_freq.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/eligible_counts.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/templates/__init__.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/templates/subtask.py delete mode 100644 tests/manual/external/pdap/test_sc_agency_search_location.py diff --git a/ENV.md b/ENV.md index 95d15551..01a7e7ca 100644 --- a/ENV.md +++ b/ENV.md @@ -91,12 +91,21 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag Agency ID Subtasks are collectively disabled by the `URL_AGENCY_IDENTIFICATION_TASK_FLAG` flag. -| Flag | Description | -|-------------------------------------|--------------------------------------------------------------------| -| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. 
| +| Flag | Description | +|-------------------------------------|-------------------------------------------------------------------| +| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. | | `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. | -| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. | -| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. | +| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. | +| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. | + + +### Location ID Subtasks + +Location ID Subtasks are collectively disabled by the `URL_LOCATION_IDENTIFICATION_TASK_FLAG` flag. + +| Flag | Description | +|---------------------------------------|---------------------------------------------------------------------| +| `LOCATION_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for location identification. 
| ## Foreign Data Wrapper (FDW) diff --git a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py index 844b28a9..06d49980 100644 --- a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py +++ b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py @@ -22,11 +22,12 @@ USER_LOCATION_SUGGESTIONS_TABLE_NAME = 'user_location_suggestions' AUTO_LOCATION_ID_SUBTASK_TABLE_NAME = 'auto_location_id_subtasks' LOCATION_ID_SUBTASK_SUGGESTIONS_TABLE_NAME = 'location_id_subtask_suggestions' -LOCATION_ID_TASK_TYPE = 'location_id' +LOCATION_ID_TASK_TYPE = 'Location ID' LOCATION_ID_SUBTASK_TYPE_NAME = 'location_id_subtask_type' + def upgrade() -> None: _add_location_id_task_type() _create_user_location_suggestions_table() @@ -37,11 +38,7 @@ def upgrade() -> None: _create_state_location_trigger() _create_county_location_trigger() _create_locality_location_trigger() - - - - - + _add_pg_trgm_extension() def downgrade() -> None: _drop_locations_expanded_view() @@ -54,6 +51,17 @@ def downgrade() -> None: _drop_state_location_trigger() _drop_county_location_trigger() _drop_locality_location_trigger() + _drop_pg_trgm_extension() + +def _drop_pg_trgm_extension(): + op.execute(""" + drop extension if exists pg_trgm; + """) + +def _add_pg_trgm_extension(): + op.execute(""" + create extension if not exists pg_trgm; + """) def _create_state_location_trigger(): diff --git a/src/api/main.py b/src/api/main.py index 95041e19..d169d1e3 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -27,9 +27,9 @@ from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.enums import \ SpacyModelType from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 8405a3bb..04ad1f23 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,12 +7,13 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.location_id.subtasks.loader import LocationIdentificationSubtaskLoader from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.core.tasks.url.operators.probe_404.core import URL404ProbeTaskOperator @@ -184,6 +185,22 @@ def _get_url_screenshot_task_operator(self) -> URLTaskEntry: ) ) + def 
_get_location_id_task_operator(self) -> URLTaskEntry: + operator = URLLocationIDTaskOperator( + adb_client=self.adb_client, + loader=LocationIdentificationSubtaskLoader( + adb_client=self.adb_client, + nlp_processor=self.nlp_processor + ) + ) + return URLTaskEntry( + operator=operator, + enabled=self.env.bool( + "URL_LOCATION_IDENTIFICATION_TASK_FLAG", + default=True + ) + ) + async def load_entries(self) -> list[URLTaskEntry]: return [ @@ -196,5 +213,6 @@ async def load_entries(self) -> list[URLTaskEntry]: self._get_url_miscellaneous_metadata_task_operator(), self._get_submit_approved_url_task_operator(), self._get_url_auto_relevance_task_operator(), - self._get_url_screenshot_task_operator() + self._get_url_screenshot_task_operator(), + self._get_location_id_task_operator() ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py b/src/core/tasks/url/operators/_shared/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py rename to src/core/tasks/url/operators/_shared/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py b/src/core/tasks/url/operators/_shared/container/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py rename to src/core/tasks/url/operators/_shared/container/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py b/src/core/tasks/url/operators/_shared/container/subtask/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py rename to src/core/tasks/url/operators/_shared/container/subtask/__init__.py diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py b/src/core/tasks/url/operators/_shared/container/subtask/eligible.py similarity index 96% rename from src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py rename to src/core/tasks/url/operators/_shared/container/subtask/eligible.py index 9782e4fd..4ad60124 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py +++ b/src/core/tasks/url/operators/_shared/container/subtask/eligible.py @@ -3,7 +3,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL -class SubtaskCTEContainer: +class URLsSubtaskEligibleCTEContainer: """ CTE for URLs eligible for a given subtask. A successful left join on this indicates the URL is eligible for the subtask. diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py b/src/core/tasks/url/operators/_shared/container/subtask/exists.py similarity index 95% rename from src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py rename to src/core/tasks/url/operators/_shared/container/subtask/exists.py index d59c508c..f10956d3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py +++ b/src/core/tasks/url/operators/_shared/container/subtask/exists.py @@ -3,7 +3,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL -class ExistsCTEContainer: +class URLsSubtaskExistsCTEContainer: """ Base class for CTEs that determine validity for each subtask. 
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py b/src/core/tasks/url/operators/_shared/ctes/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py rename to src/core/tasks/url/operators/_shared/ctes/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py b/src/core/tasks/url/operators/_shared/ctes/validated.py similarity index 52% rename from src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py rename to src/core/tasks/url/operators/_shared/ctes/validated.py index f515c1d1..43f6a6ba 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py +++ b/src/core/tasks/url/operators/_shared/ctes/validated.py @@ -1,7 +1,7 @@ from sqlalchemy import select -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \ - ExistsCTEContainer +from src.core.tasks.url.operators._shared.container.subtask.exists import \ + URLsSubtaskExistsCTEContainer from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated cte = ( @@ -11,6 +11,6 @@ .cte("validated_exists") ) -VALIDATED_EXISTS_CONTAINER = ExistsCTEContainer( +VALIDATED_EXISTS_CONTAINER = URLsSubtaskExistsCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/exceptions.py b/src/core/tasks/url/operators/_shared/exceptions.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/exceptions.py rename to src/core/tasks/url/operators/_shared/exceptions.py diff --git a/src/core/tasks/url/operators/agency_identification/core.py 
b/src/core/tasks/url/operators/agency_identification/core.py index 92ece84e..4de9dd57 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,5 +1,5 @@ from src.core.tasks.mixins.link_urls import LinkURLsMixin -from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError +from src.core.tasks.url.operators._shared.exceptions import SubtaskError from src.core.tasks.url.operators.agency_identification.subtasks.flags.core import SubtaskFlagger from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py new file mode 100644 index 00000000..139597f9 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py @@ -0,0 +1,2 @@ + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py index 0c172e5d..b595c93c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -1,17 +1,14 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ ITERATIONS_PER_SUBTASK -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ - NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core 
import \ AgencyIDSubtaskInternalProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ - NLPProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query import \ - GetNLPLocationMatchSubtaskInputQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.query import \ + GetNLPLocationMatchSubtaskInputQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.external.pdap.client import PDAPClient class NLPLocationMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): @@ -20,15 +17,8 @@ def __init__( self, adb_client: AsyncDatabaseClient, task_id: int, - pdap_client: PDAPClient, - processor: NLPProcessor ) -> None: super().__init__(adb_client, task_id=task_id) - self.processor = AgencyIDSubtaskInternalProcessor( - nlp_processor=processor, - pdap_client=pdap_client, - task_id=task_id, - ) async def inner_logic(self) -> None: for iteration in range(ITERATIONS_PER_SUBTASK): @@ -39,7 +29,32 @@ async def inner_logic(self) -> None: async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: self.linked_urls.extend([input_.url_id for input_ in inputs]) - subtask_data_list: list[AutoAgencyIDSubtaskData] = await self._process_inputs(inputs) + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + + # TODO: Get NLP Annotations + + # TODO: Process and Convert NLP Annotations + + # TODO: Resubmit NLP Annotations + + # TODO: For locations with no associated agencies, convert to subtask data with empty agencies + 
subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_empty_location_agency_mappings_to_subtask_data_list( + mappings=nlp_response_subsets.invalid, + task_id=self._task_id, + ) + subtask_data_list.extend(subtask_data_no_agency_list) + + # For locations with agency mappings, convert to data with suggestions + subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_location_agency_mappings_to_subtask_data_list( + mappings=response_mappings, + task_id=self._task_id, + ) + + subtask_data_list.extend(subtask_data_list_agency_list) + + return subtask_data_list await self._upload_subtask_data(subtask_data_list) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py deleted file mode 100644 index 103580da..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py +++ /dev/null @@ -1,162 +0,0 @@ -from math import ceil - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.constants import \ - MAX_NLP_CONFIDENCE -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import \ - RequestCounter -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ - URLToNLPResponseMapping -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ - URLToSearchParamsMapping -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ - URLToSearchResponseMapping -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ - NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData -from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse - - -def convert_nlp_response_to_search_agency_by_location_params( - nlp_response: NLPLocationMatchResponse, - counter: RequestCounter -) -> list[SearchAgencyByLocationParams]: - params: list[SearchAgencyByLocationParams] = [] - for location in nlp_response.locations: - if nlp_response.us_state is None: - raise ValueError("US State is None; cannot convert NLP response to search agency by location params") - request_id: int = counter.next() - param = SearchAgencyByLocationParams( - request_id=request_id, - query=location, - iso=nlp_response.us_state.iso, - ) - params.append(param) - - return params - - - -def convert_search_agency_responses_to_subtask_data_list( - mappings: list[URLToSearchResponseMapping], - task_id: int -) -> list[AutoAgencyIDSubtaskData]: - subtask_data_list: list[AutoAgencyIDSubtaskData] = [] - - # First, extract agency suggestions for URL - for mapping in mappings: - url_id: int = mapping.url_id - search_responses: list[SearchAgencyByLocationResponse] = mapping.search_responses - suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions( - search_responses - ) - pydantic_model: URLAutoAgencyIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( 
- url_id=url_id, - task_id=task_id - ) - subtask_data = AutoAgencyIDSubtaskData( - pydantic_model=pydantic_model, - suggestions=suggestions - ) - subtask_data_list.append(subtask_data) - - return subtask_data_list - - -def _convert_search_agency_response_to_agency_suggestions( - responses: list[SearchAgencyByLocationResponse], -) -> list[AgencySuggestion]: - suggestions: list[AgencySuggestion] = [] - for response in responses: - for result in response.results: - agency_id: int = result.agency_id - similarity: float = result.similarity - confidence: int = min(ceil(similarity * 100), MAX_NLP_CONFIDENCE) - suggestion: AgencySuggestion = AgencySuggestion( - agency_id=agency_id, - confidence=confidence, - ) - suggestions.append(suggestion) - return suggestions - -def convert_url_ids_to_empty_subtask_data_list( - url_ids: list[int], - task_id: int -) -> list[AutoAgencyIDSubtaskData]: - results: list[AutoAgencyIDSubtaskData] = [] - for url_id in url_ids: - subtask_data = AutoAgencyIDSubtaskData( - pydantic_model=URLAutoAgencyIDSubtaskPydantic( - task_id=task_id, - url_id=url_id, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=False - ), - suggestions=[] - ) - results.append(subtask_data) - - return results - - - -def convert_empty_url_search_param_mappings_to_subtask_data_list( - mappings: list[URLToSearchParamsMapping], - task_id: int -) -> list[AutoAgencyIDSubtaskData]: - url_ids: list[int] = [] - for mapping in mappings: - url_ids.append(mapping.url_id) - - return convert_url_ids_to_empty_subtask_data_list( - url_ids=url_ids, - task_id=task_id - ) - -def convert_invalid_url_nlp_mappings_to_subtask_data_list( - mappings: list[URLToNLPResponseMapping], - task_id: int -) -> list[AutoAgencyIDSubtaskData]: - url_ids: list[int] = [] - for mapping in mappings: - url_ids.append(mapping.url_id) - - return convert_url_ids_to_empty_subtask_data_list( - url_ids=url_ids, - task_id=task_id - ) - - -def convert_search_agency_response_to_subtask_pydantic( - url_id: 
int, - task_id: int -) -> URLAutoAgencyIDSubtaskPydantic: - - return URLAutoAgencyIDSubtaskPydantic( - task_id=task_id, - url_id=url_id, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=True - ) - - -def convert_urls_to_search_params( - url_to_nlp_mappings: list[URLToNLPResponseMapping] -) -> list[URLToSearchParamsMapping]: - url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] - counter = RequestCounter() - for mapping in url_to_nlp_mappings: - search_params: list[SearchAgencyByLocationParams] = \ - convert_nlp_response_to_search_agency_by_location_params( - counter=counter, - nlp_response=mapping.nlp_response, - ) - mapping = URLToSearchParamsMapping( - url_id=mapping.url_id, - search_params=search_params, - ) - url_to_search_params_mappings.append(mapping) - return url_to_search_params_mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py deleted file mode 100644 index 053f4fb5..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py +++ /dev/null @@ -1,12 +0,0 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ - URLToSearchParamsMapping -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams - - -def _extract_all_search_params( - url_to_search_params_mappings: list[URLToSearchParamsMapping] -) -> list[SearchAgencyByLocationParams]: - all_search_params: list[SearchAgencyByLocationParams] = [] - for mapping in url_to_search_params_mappings: - all_search_params.extend(mapping.search_params) - return all_search_params diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py deleted file mode 100644 index 9a88b89d..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py +++ /dev/null @@ -1,8 +0,0 @@ -from pydantic import BaseModel - -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse - - -class URLToSearchResponseMapping(BaseModel): - url_id: int - search_responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/__init__.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py new file mode 100644 index 00000000..9ddc32e1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py @@ -0,0 +1,26 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query_.response import \ + GetAgenciesLinkedToAnnotatedLocationsResponse +from src.db.models.impl.agency.sqlalchemy import 
Agency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgenciesLinkedToAnnotatedLocationsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[GetAgenciesLinkedToAnnotatedLocationsResponse]: + + query = ( + select( + URL.id, + LocationIDSubtaskSuggestion.location_id, + LocationIDSubtaskSuggestion.confidence, + Agency.id + ) + .outerjoin( + + ) + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/response.py new file mode 100644 index 00000000..6205de78 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + + +class GetAgenciesLinkedToAnnotatedLocationsResponse(BaseModel): + url_id: int + location_id: int + location_confidence: int + agency_ids: list[int] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 5dab9608..ff136a66 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -6,7 +6,7 @@ MuckrockAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ NLPLocationMatchSubtaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor from 
src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py index 5be64fbc..31d4e63c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py @@ -1,8 +1,8 @@ from sqlalchemy import select, CTE, Column -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.high_confidence_annotations import \ +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.high_confidence_annotations import \ HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.validated import \ +from src.core.tasks.url.operators._shared.ctes.validated import \ VALIDATED_EXISTS_CONTAINER from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.ckan import \ CKAN_SUBTASK_CONTAINER diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/high_confidence_annotations.py similarity index 76% rename from src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py rename to src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/high_confidence_annotations.py index 3ac0ced7..cfb92327 100644 --- 
a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/high_confidence_annotations.py @@ -1,7 +1,7 @@ from sqlalchemy import select -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \ - ExistsCTEContainer +from src.core.tasks.url.operators._shared.container.subtask.exists import \ + URLsSubtaskExistsCTEContainer from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion @@ -24,6 +24,6 @@ .cte("high_confidence_annotations_exists") ) -HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER = ExistsCTEContainer( +HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER = URLsSubtaskExistsCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py index b1b70cdb..39114acd 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ +from src.core.tasks.url.operators._shared.subtask.container import \ SubtaskCTEContainer from 
src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py index 4d75b4e0..5c0a613f 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py @@ -2,7 +2,7 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ CONSOLIDATED_CTE -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ +from src.core.tasks.url.operators._shared.subtask.container import \ SubtaskCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py index 1f059e86..1eeb4bd8 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py @@ -1,7 +1,7 @@ from sqlalchemy import select from src.collectors.enums import CollectorType -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ +from src.core.tasks.url.operators._shared.subtask.container import \ SubtaskCTEContainer from 
src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index 40533809..21871785 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -1,12 +1,14 @@ +from operator import and_ + from sqlalchemy import select +from src.core.tasks.url.operators._shared.subtask.container import \ + SubtaskCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ - SubtaskCTEContainer from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask cte = ( select( @@ -16,7 +18,11 @@ ) ) .join( - URLCompressedHTML + AutoLocationIDSubtask, + and_( + AutoLocationIDSubtask.url_id == URL.id, + AutoLocationIDSubtask.locations_found + ) ) .cte("nlp_location_eligible") ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py deleted file mode 100644 index 02ae76a4..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py +++ /dev/null @@ -1,5 
+0,0 @@ -from pydantic import BaseModel - - -class AgencyIDSubtaskOutputBase(BaseModel): - pass \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py deleted file mode 100644 index b366747f..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py +++ /dev/null @@ -1,26 +0,0 @@ -from abc import ABC, abstractmethod - -from src.core.tasks.url.operators.agency_identification.subtasks.templates.output import AgencyIDSubtaskOutputBase -from src.db.client.async_ import AsyncDatabaseClient - - -class SubtaskPostprocessorBase(ABC): - """ - An optional class which takes - the output of the subtask along with the subtask id - and adds additional information to the database. - """ - - def __init__( - self, - subtask_id: int, - subtask_output: AgencyIDSubtaskOutputBase, - adb_client: AsyncDatabaseClient - ): - self.subtask_id = subtask_id - self.subtask_output = subtask_output - self.adb_client = adb_client - - @abstractmethod - async def run(self) -> None: - raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py b/src/core/tasks/url/operators/location_id/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py rename to src/core/tasks/url/operators/location_id/__init__.py diff --git a/src/core/tasks/url/operators/location_id/core.py b/src/core/tasks/url/operators/location_id/core.py new file mode 100644 index 00000000..01f14a02 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/core.py @@ -0,0 +1,44 @@ +from src.core.tasks.mixins.link_urls import LinkURLsMixin +from src.core.tasks.url.operators.base import URLTaskOperatorBase +from 
class LocationIdentificationTaskOperator(
    URLTaskOperatorBase,
    LinkURLsMixin,
):
    """
    URL task operator for the location identification task.

    Its prerequisite check asks the database for the next location ID
    subtask to run, restricted to the subtasks currently allowed by
    SubtaskFlagger.
    """

    def __init__(
        self,
        adb_client: AsyncDatabaseClient,
        loader: LocationIdentificationSubtaskLoader,
    ):
        """
        Args:
            adb_client: Async database client, passed on to the base operator.
            loader: Subtask loader, stored on the instance as `loader`.
        """
        super().__init__(adb_client)
        self.loader = loader
        # Set by meets_task_prerequisites(); defined here so that reading it
        # before the prerequisite check yields None instead of AttributeError.
        self._subtask: LocationIDSubtaskType | None = None

    @property
    def task_type(self) -> TaskType:
        """The task type handled by this operator."""
        return TaskType.LOCATION_ID

    async def meets_task_prerequisites(self) -> bool:
        """
        Determine whether a subtask is available to run.

        Modifies:
        - self._subtask

        Returns:
            True if the survey query returned a subtask, False if it returned None.
        """
        # Function-scope import: SubtaskFlagger is not among this module's
        # top-level imports, so referencing the bare name raised NameError.
        from src.core.tasks.url.operators.location_id.subtasks.flags.core import SubtaskFlagger

        flagger = SubtaskFlagger()
        allowed_subtasks: list[LocationIDSubtaskType] = flagger.get_allowed_subtasks()

        next_subtask: LocationIDSubtaskType | None = \
            await self.adb_client.run_query_builder(
                LocationIDSurveyQueryBuilder(
                    allowed_subtasks=allowed_subtasks
                )
            )
        self._subtask = next_subtask
        return next_subtask is not None
class SubtaskFlagger:
    """
    Decides which location ID subtasks are allowed to run,
    based on one boolean environment flag per subtask type.
    """

    def __init__(self):
        self.env = Env()

    def _get_subtask_flag(self, subtask_type: LocationIDSubtaskType) -> bool:
        """Return the boolean value of the env flag mapped to `subtask_type`."""
        env_flag_name: str = SUBTASK_TO_ENV_FLAG[subtask_type]
        # default=True: a flag that is not set leaves the subtask allowed
        return self.env.bool(env_flag_name, default=True)

    def get_allowed_subtasks(self) -> list[LocationIDSubtaskType]:
        """Return every mapped subtask type whose flag evaluates to True."""
        allowed: list[LocationIDSubtaskType] = []
        for candidate in SUBTASK_TO_ENV_FLAG:
            if self._get_subtask_flag(candidate):
                allowed.append(candidate)
        return allowed
b/src/core/tasks/url/operators/location_id/subtasks/impl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/constants.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/constants.py new file mode 100644 index 00000000..31890aaa --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/constants.py @@ -0,0 +1,4 @@ + + +ITERATIONS_PER_SUBTASK = 4 +NUMBER_OF_ENTRIES_PER_ITERATION = 10 \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py new file mode 100644 index 00000000..af096953 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py @@ -0,0 +1,56 @@ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.constants import ITERATIONS_PER_SUBTASK +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.core import \ + NLPLocationFrequencySubtaskInternalProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.query import \ + GetNLPLocationMatchSubtaskInputQueryBuilder +from src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData +from src.core.tasks.url.operators.location_id.subtasks.templates.subtask import 
class NLPLocationFrequencySubtaskOperator(LocationIDSubtaskOperatorBase):
    """
    Location ID subtask operator that repeatedly fetches inputs from the
    database, runs them through the internal NLP location frequency
    processor, and uploads the resulting subtask data.
    """

    def __init__(
        self,
        task_id: int,
        adb_client: AsyncDatabaseClient,
        nlp_processor: NLPProcessor,
    ):
        super().__init__(adb_client=adb_client, task_id=task_id)
        self._nlp_processor: NLPProcessor = nlp_processor
        # The internal processor receives the same collaborators as this operator
        processor_kwargs = {
            "nlp_processor": nlp_processor,
            "adb_client": adb_client,
            "task_id": task_id,
        }
        self.processor = NLPLocationFrequencySubtaskInternalProcessor(**processor_kwargs)

    async def inner_logic(self) -> None:
        """Run at most ITERATIONS_PER_SUBTASK iterations, stopping early once no inputs remain."""
        for _ in range(ITERATIONS_PER_SUBTASK):
            batch: list[NLPLocationMatchSubtaskInput] = await self._get_from_db()
            if not batch:
                break
            await self.run_subtask_iteration(batch)

    async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None:
        """Record the inputs' URL ids as linked, then process the inputs and upload the results."""
        url_ids: list[int] = [entry.url_id for entry in inputs]
        self.linked_urls.extend(url_ids)

        processed: list[AutoLocationIDSubtaskData] = await self._process_inputs(inputs)
        await self._upload_subtask_data(processed)

    async def _process_inputs(
        self,
        inputs: list[NLPLocationMatchSubtaskInput]
    ) -> list[AutoLocationIDSubtaskData]:
        """Delegate the inputs to the internal processor."""
        return await self.processor.process(inputs=inputs)

    async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]:
        """Fetch the next set of subtask inputs through the input query builder."""
        query_builder = GetNLPLocationMatchSubtaskInputQueryBuilder()
        return await self.adb_client.run_query_builder(query_builder)
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_nlp_response.py similarity index 55% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_nlp_response.py index 7bb7e701..1f611ad7 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_nlp_response.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_search_response.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_search_response.py new file mode 100644 index 00000000..807b38d0 --- /dev/null +++ 
b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/mappings/url_id_search_response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.response import \ + SearchSimilarLocationsResponse +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class URLToSearchResponseMapping(BaseModel): + url_id: int + search_responses: list[SearchSimilarLocationsResponse] \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/nlp_responses.py similarity index 55% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/nlp_responses.py index 22fdcf98..304c7e01 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/nlp_responses.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping diff --git 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/constants.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/constants.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py new file mode 100644 index 00000000..d6d6c83c --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py @@ -0,0 +1,147 @@ +from math import ceil + +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.constants import \ + MAX_NLP_CONFIDENCE +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.counter import RequestCounter +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.models.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ + NLPLocationMatchResponse +from 
def convert_invalid_url_nlp_mappings_to_subtask_data_list(
    mappings: list[URLToNLPResponseMapping],
    task_id: int
) -> list[AutoLocationIDSubtaskData]:
    """Build empty (no locations found) subtask data for the URL of each given mapping."""
    return convert_url_ids_to_empty_subtask_data_list(
        url_ids=[nlp_mapping.url_id for nlp_mapping in mappings],
        task_id=task_id
    )


def convert_url_ids_to_empty_subtask_data_list(
    url_ids: list[int],
    task_id: int
) -> list[AutoLocationIDSubtaskData]:
    """Build one subtask data entry per URL id, flagged locations_found=False and without suggestions."""
    return [
        AutoLocationIDSubtaskData(
            pydantic_model=AutoLocationIDSubtaskPydantic(
                task_id=task_id,
                url_id=url_id,
                type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY,
                locations_found=False
            ),
            suggestions=[]
        )
        for url_id in url_ids
    ]


def convert_search_location_responses_to_subtask_data_list(
    mappings: list[URLToSearchResponseMapping],
    task_id: int
) -> list[AutoLocationIDSubtaskData]:
    """Build one subtask data entry per mapping, carrying the suggestions derived from its search responses."""
    converted: list[AutoLocationIDSubtaskData] = []

    for mapping in mappings:
        # Suggestions come from the search responses recorded for this URL
        location_suggestions: list[LocationSuggestion] = \
            _convert_search_agency_response_to_agency_suggestions(mapping.search_responses)
        subtask_model: AutoLocationIDSubtaskPydantic = \
            convert_search_agency_response_to_subtask_pydantic(
                url_id=mapping.url_id,
                task_id=task_id
            )
        converted.append(
            AutoLocationIDSubtaskData(
                pydantic_model=subtask_model,
                suggestions=location_suggestions
            )
        )

    return converted


def convert_search_agency_response_to_subtask_pydantic(
    url_id: int,
    task_id: int
) -> AutoLocationIDSubtaskPydantic:
    """Build the subtask record for a URL, flagged locations_found=True."""
    fields = {
        "task_id": task_id,
        "url_id": url_id,
        "type": LocationIDSubtaskType.NLP_LOCATION_FREQUENCY,
        "locations_found": True,
    }
    return AutoLocationIDSubtaskPydantic(**fields)


def _convert_search_agency_response_to_agency_suggestions(
    responses: list[SearchSimilarLocationsResponse],
) -> list[LocationSuggestion]:
    """
    Flatten the results of all responses into location suggestions.

    Confidence is the similarity scaled by 100, rounded up,
    and capped at MAX_NLP_CONFIDENCE.
    """
    collected: list[LocationSuggestion] = []
    for search_response in responses:
        collected.extend(
            LocationSuggestion(
                location_id=hit.location_id,
                confidence=min(ceil(hit.similarity * 100), MAX_NLP_CONFIDENCE),
            )
            for hit in search_response.results
        )
    return collected


def convert_urls_to_search_params(
    url_to_nlp_mappings: list[URLToNLPResponseMapping]
) -> list[URLToSearchParamsMapping]:
    """Convert each URL's NLP response into search params, drawing request ids from one shared counter."""
    # A single counter spans all URLs
    counter = RequestCounter()
    converted: list[URLToSearchParamsMapping] = []
    for nlp_mapping in url_to_nlp_mappings:
        params_for_url: list[SearchSimilarLocationsParams] = \
            convert_nlp_response_to_search_similar_location_params(
                nlp_response=nlp_mapping.nlp_response,
                counter=counter,
            )
        converted.append(
            URLToSearchParamsMapping(
                url_id=nlp_mapping.url_id,
                search_params=params_for_url,
            )
        )
    return converted


def convert_nlp_response_to_search_similar_location_params(
    nlp_response: NLPLocationMatchResponse,
    counter: RequestCounter
) -> list[SearchSimilarLocationsParams]:
    """
    Build one search param per location in the NLP response.

    Raises:
        ValueError: If the response has at least one location but no US state.
    """
    search_params: list[SearchSimilarLocationsParams] = []
    for location_name in nlp_response.locations:
        state = nlp_response.us_state
        # Checked per location, so a response without locations never raises
        if state is None:
            raise ValueError("US State is None; cannot convert NLP response to search agency by location params")
        search_params.append(
            SearchSimilarLocationsParams(
                request_id=counter.next(),
                query=location_name,
                iso=state.iso,
            )
        )

    return search_params
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.mapper import \ URLRequestIDMapper -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ - URLToSearchParamsMapping -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ - URLToSearchResponseMapping -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets.nlp_responses import \ NLPResponseSubsets -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ - NLPProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.convert import \ + convert_invalid_url_nlp_mappings_to_subtask_data_list, convert_search_location_responses_to_subtask_data_list, \ + convert_urls_to_search_params +from 
src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.models.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.preprocess import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.preprocess import \ preprocess_html -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.core import \ + SearchSimilarLocationsQueryBuilder +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.params import \ + SearchSimilarLocationsParams +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.response import \ + SearchSimilarLocationsResponse +from src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData +from src.db.client.async_ import AsyncDatabaseClient -class AgencyIDSubtaskInternalProcessor: +class NLPLocationFrequencySubtaskInternalProcessor: def __init__( self, nlp_processor: NLPProcessor, - pdap_client: PDAPClient, + adb_client: AsyncDatabaseClient, task_id: int, ): self._nlp_processor = nlp_processor - self._pdap_client = pdap_client + self._adb_client = adb_client self._task_id = task_id async def process( 
- self, + self, inputs: list[NLPLocationMatchSubtaskInput] - ) -> list[AutoAgencyIDSubtaskData]: - subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + ) -> list[AutoLocationIDSubtaskData]: + subtask_data_list: list[AutoLocationIDSubtaskData] = [] url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ self._match_urls_to_nlp_responses(inputs) @@ -54,62 +57,41 @@ async def process( nlp_response_subsets: NLPResponseSubsets = \ filter_valid_and_invalid_nlp_responses(url_to_nlp_mappings) - # For invalid responses, convert to subtask data with empty agencies - subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ + + # For invalid responses, convert to subtask data with empty locations + subtask_data_no_location_list: list[AutoLocationIDSubtaskData] = \ convert_invalid_url_nlp_mappings_to_subtask_data_list( mappings=nlp_response_subsets.invalid, task_id=self._task_id, ) - subtask_data_list.extend(subtask_data_no_agency_list) + subtask_data_list.extend(subtask_data_no_location_list) # For valid responses, convert to search param mappings url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ convert_urls_to_search_params(nlp_response_subsets.valid) - response_mappings: list[URLToSearchResponseMapping] = \ - await self._get_pdap_info(url_to_search_params_mappings) + await self._get_db_location_info(url_to_search_params_mappings) - subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ - convert_search_agency_responses_to_subtask_data_list( + subtask_data_list_location_list: list[AutoLocationIDSubtaskData] = \ + convert_search_location_responses_to_subtask_data_list( mappings=response_mappings, task_id=self._task_id, ) - filter_top_n_suggestions(subtask_data_list_agency_list) + filter_top_n_suggestions(subtask_data_list_location_list) - subtask_data_list.extend(subtask_data_list_agency_list) + subtask_data_list.extend(subtask_data_list_location_list) return subtask_data_list - def _match_urls_to_nlp_responses( - self, - inputs: 
list[NLPLocationMatchSubtaskInput] - ) -> list[URLToNLPResponseMapping]: - url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] - for input_ in inputs: - nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) - mapping = URLToNLPResponseMapping( - url_id=input_.url_id, - nlp_response=nlp_response, - ) - url_to_nlp_mappings.append(mapping) - return url_to_nlp_mappings - - def _get_location_match( - self, - html: str - ) -> NLPLocationMatchResponse: - preprocessed_html: str = preprocess_html(html) - return self._nlp_processor.parse_for_locations(preprocessed_html) - - async def _get_pdap_info( + async def _get_db_location_info( self, mappings: list[URLToSearchParamsMapping] ) -> list[URLToSearchResponseMapping]: if len(mappings) == 0: return [] - params: list[SearchAgencyByLocationParams] = [] + params: list[SearchSimilarLocationsParams] = [] # Map request IDs to URL IDs for later use mapper = URLRequestIDMapper() for mapping in mappings: @@ -120,9 +102,13 @@ async def _get_pdap_info( ) params.append(search_param) - url_id_to_search_responses: dict[int, list[SearchAgencyByLocationResponse]] = defaultdict(list) + url_id_to_search_responses: dict[int, list[SearchSimilarLocationsResponse]] = defaultdict(list) - responses: list[SearchAgencyByLocationResponse] = await self._pdap_client.search_agency_by_location(params) + responses: list[SearchSimilarLocationsResponse] = await self._adb_client.run_query_builder( + SearchSimilarLocationsQueryBuilder( + params=params, + ) + ) # Map responses to URL IDs via request IDs for response in responses: request_id: int = response.request_id @@ -140,4 +126,23 @@ async def _get_pdap_info( return response_mappings + def _match_urls_to_nlp_responses( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[URLToNLPResponseMapping]: + url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] + for input_ in inputs: + nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) + mapping = 
URLToNLPResponseMapping( + url_id=input_.url_id, + nlp_response=nlp_response, + ) + url_to_nlp_mappings.append(mapping) + return url_to_nlp_mappings + def _get_location_match( + self, + html: str + ) -> NLPLocationMatchResponse: + preprocessed_html: str = preprocess_html(html) + return self._nlp_processor.parse_for_locations(preprocessed_html) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/counter.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/counter.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py similarity index 51% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py index ff8b2de5..23c643b6 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py @@ -1,13 +1,13 @@ from collections import defaultdict -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets.nlp_responses import \ NLPResponseSubsets -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData -from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData +from src.core.tasks.url.operators.location_id.subtasks.models.suggestion import LocationSuggestion def filter_valid_and_invalid_nlp_responses( @@ -27,31 +27,31 @@ def filter_valid_and_invalid_nlp_responses( ) def filter_top_n_suggestions( - subtask_data_list: list[AutoAgencyIDSubtaskData], + subtask_data_list: list[AutoLocationIDSubtaskData], n: int = 5 ) -> None: """Filters out all but the top N suggestions for each URL. 
Modifies: - - AutoAgencyIDSubtaskData.suggestions + - AutoLocationIDSubtaskData.suggestions """ for subtask_data in subtask_data_list: - # Eliminate agency ID duplicates; - agency_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list) + # Eliminate location ID duplicates; + location_to_suggestions: dict[int, list[LocationSuggestion]] = defaultdict(list) for suggestion in subtask_data.suggestions: - agency_to_suggestions[suggestion.agency_id].append(suggestion) + location_to_suggestions[suggestion.location_id].append(suggestion) # in the case of a tie, keep the suggestion with the highest confidence - deduped_suggestions: list[AgencySuggestion] = [] - for agency_suggestions in agency_to_suggestions.values(): - agency_suggestions.sort( + deduped_suggestions: list[LocationSuggestion] = [] + for location_suggestions in location_to_suggestions.values(): + location_suggestions.sort( key=lambda x: x.confidence, reverse=True # Descending order ) - deduped_suggestions.append(agency_suggestions[0]) + deduped_suggestions.append(location_suggestions[0]) # Sort suggestions by confidence and keep top N - suggestions_sorted: list[AgencySuggestion] = sorted( + suggestions_sorted: list[LocationSuggestion] = sorted( deduped_suggestions, key=lambda x: x.confidence, reverse=True # Descending order diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/mapper.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/mapper.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/__init__.py new file 
mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/url_id_search_params.py similarity index 57% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/url_id_search_params.py index 5ab9deac..d47992ee 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/models/url_id_search_params.py @@ -1,11 +1,13 @@ from pydantic import BaseModel +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.params import \ + SearchSimilarLocationsParams from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams class URLToSearchParamsMapping(BaseModel): url_id: int - search_params: list[SearchAgencyByLocationParams] + search_params: list[SearchSimilarLocationsParams] @property def is_empty(self) -> bool: diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py similarity index 63% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py 
rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py index ef60e038..2f3044b8 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py @@ -1,4 +1,4 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.mappings import \ US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/convert.py similarity index 67% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/convert.py index 040bc466..a0796b4c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/convert.py @@ -1,6 +1,6 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ 
+from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.mappings import \ US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py similarity index 75% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py index 8e723aa6..615684e5 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py @@ -4,19 +4,19 @@ from spacy import Language from spacy.tokens import Doc -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.check import \ is_name_us_state, is_iso_us_state -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.constants import \ INVALID_LOCATION_CHARACTERS, INVALID_SCAN_ISOS -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.convert import \ 
convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.enums import \ SpacyModelType -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.extract import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.extract import \ extract_most_common_us_state, extract_top_n_locations -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/enums.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/enums.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/extract.py similarity index 70% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py rename to 
src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/extract.py index ea732ef0..4b84ecc4 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/extract.py @@ -1,8 +1,8 @@ from collections import Counter -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.constants import \ TOP_N_LOCATIONS_COUNT -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/mappings.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/mappings.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/params.py similarity index 100% rename from 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/params.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/response.py similarity index 75% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/response.py index 387e32de..11fc66e5 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/response.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/us_state.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/models/us_state.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py 
b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/preprocess.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/preprocess.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py new file mode 100644 index 00000000..6a245d94 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py @@ -0,0 +1,105 @@ +from collections import defaultdict +from typing import Any, Sequence + +from sqlalchemy import values, column, String, Integer, func, select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.params import \ + SearchSimilarLocationsParams +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.response import \ + SearchSimilarLocationsOuterResponse, SearchSimilarLocationsLocationInfo, SearchSimilarLocationsResponse +from src.db.models.views.location_expanded import LocationExpandedView +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class SearchSimilarLocationsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + params: list[SearchSimilarLocationsParams] + ): + super().__init__() + self.params = params + + async def run(self, 
session: AsyncSession) -> SearchSimilarLocationsOuterResponse: + queries_as_tups: list[tuple[int, str, str]] = [ + ( + param.request_id, + param.query, + param.iso, + ) + for param in self.params + ] + + vals = ( + values( + column("request_id", Integer), + column("query", String), + column("iso", String), + name="input_queries", + ) + .data(queries_as_tups) + .alias("input_queries_alias") + ) + + similarity = func.similarity( + vals.c.query, + LocationExpandedView.display_name, + ) + + lateral_top_5 = ( + select( + vals.c.request_id, + LocationExpandedView.location_id, + similarity.label("similarity"), + ) + .join( + LocationExpandedView, + LocationExpandedView.state_iso == vals.c.iso, + ) + .order_by( + similarity.desc(), + ) + .limit(5) + .lateral("lateral_top_5") + ) + + final = select( + vals.c.request_id, + lateral_top_5.c.location_id, + lateral_top_5.c.similarity, + ).join( + lateral_top_5, + vals.c.request_id == lateral_top_5.c.request_id, + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=final) + request_id_to_locations: dict[int, list[SearchSimilarLocationsLocationInfo]] = ( + defaultdict(list) + ) + for mapping in mappings: + inner_response = SearchSimilarLocationsLocationInfo( + location_id=mapping["location_id"], + similarity=mapping["similarity"], + ) + request_id: int = mapping["request_id"] + request_id_to_locations[request_id].append(inner_response) + + responses: list[SearchSimilarLocationsResponse] = [] + for request_id, inner_responses in request_id_to_locations.items(): + sorted_responses: list[SearchSimilarLocationsLocationInfo] = sorted( + inner_responses, + key=lambda x: x.similarity, + reverse=True, + ) + request_level_response = SearchSimilarLocationsResponse( + request_id=request_id, + results=sorted_responses, + ) + responses.append(request_level_response) + + return SearchSimilarLocationsOuterResponse( + responses=responses, + ) + diff --git 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/__init__.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/params.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/params.py new file mode 100644 index 00000000..180d27b4 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/params.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + + +class SearchSimilarLocationsParams(BaseModel): + request_id: int + query: str + iso: str = Field( + description="US State ISO Code", + max_length=2, + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/response.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/response.py new file mode 100644 index 00000000..95bf9e93 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/models/response.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel, Field + + +class SearchSimilarLocationsLocationInfo(BaseModel): + location_id: int + similarity: float = Field(ge=0, le=1) + +class SearchSimilarLocationsResponse(BaseModel): + request_id: int + results: list[SearchSimilarLocationsLocationInfo] + +class SearchSimilarLocationsOuterResponse(BaseModel): + responses: list[SearchSimilarLocationsResponse] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py similarity index 93% rename from 
src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py rename to src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py index 32311bd1..9890db93 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py @@ -5,7 +5,7 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ NUMBER_OF_ENTRIES_PER_ITERATION -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ EligibleContainer diff --git a/src/core/tasks/url/operators/location_id/subtasks/loader.py b/src/core/tasks/url/operators/location_id/subtasks/loader.py new file mode 100644 index 00000000..88d3aa82 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/loader.py @@ -0,0 +1,35 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.core import \ + NLPLocationFrequencySubtaskOperator +from src.core.tasks.url.operators.location_id.subtasks.templates.subtask import LocationIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType + + +class LocationIdentificationSubtaskLoader: + """Loads subtasks and associated dependencies.""" + + def __init__( + self, + adb_client: AsyncDatabaseClient, + nlp_processor: NLPProcessor, + ): + self.adb_client = adb_client + self._nlp_processor = nlp_processor + + 
class LocationIDSubtaskRunInfo(BaseModel):
    """Result summary of a single location-identification subtask run."""

    # Error description for a failed run; ``None`` means the run succeeded.
    error: str | None = None
    # IDs of the URLs linked to the run; ``None`` when none were recorded.
    linked_url_ids: list[int] | None = None

    @property
    def is_success(self) -> bool:
        """Return True when no error was recorded."""
        return self.error is None

    @property
    def has_linked_urls(self) -> bool:
        """Return True when at least one linked URL ID is present.

        ``linked_url_ids`` defaults to ``None``; truthiness is used instead of
        ``len()`` so that a missing list reads as "no linked URLs" rather than
        raising ``TypeError``.
        """
        return bool(self.linked_url_ids)
class LocationSuggestion(BaseModel):
    """A suggested location paired with a confidence score."""

    # ID of the suggested location.
    location_id: int
    # Confidence score; validated to lie within 0..100 inclusive.
    confidence: int = Field(ge=0, le=100)
class LocationIDSurveyQueryBuilder(QueryBuilderBase):
    """
    Survey eligible URLs to decide which location ID subtask to run next.

    Executes ``ELIGIBLE_COUNTS_QUERY``, which yields one eligible-URL count
    per subtask type, keeps only the subtasks in ``allowed_subtasks``, and
    picks the subtask with the highest count. When several subtasks share
    the highest count, the one with the largest value in
    ``SUBTASK_HIERARCHY_MAPPING`` is chosen.

    Returns a single subtask type, or ``None`` when there are no allowed
    subtasks or the highest count is zero.
    """

    def __init__(
        self,
        allowed_subtasks: list[LocationIDSubtaskType]
    ):
        super().__init__()
        self._allowed_subtasks = allowed_subtasks

    async def run(self, session: AsyncSession) -> LocationIDSubtaskType | None:
        row: RowMapping = await sh.mapping(session, ELIGIBLE_COUNTS_QUERY)
        eligible: Counter[str] = await self._filter_allowed_counts(Counter(row))

        # Nothing allowed at all -> nothing to run.
        if not eligible:
            return None

        top_count: int = max(eligible.values())
        # Allowed subtasks exist, but none has any eligible URL.
        if top_count == 0:
            return None

        candidates: list[LocationIDSubtaskType] = [
            LocationIDSubtaskType(name)
            for name, total in eligible.items()
            if total == top_count
        ]
        # Break ties using the hierarchy mapping (largest mapped value wins).
        return max(candidates, key=SUBTASK_HIERARCHY_MAPPING.__getitem__)

    async def _filter_allowed_counts(self, counts: Counter[str]) -> Counter[str]:
        """Return only the entries of ``counts`` whose subtask type is allowed."""
        kept: Counter[str] = Counter()
        for name, total in counts.items():
            if LocationIDSubtaskType(name) in self._allowed_subtasks:
                kept[name] = total
        return kept
class EligibleContainer:
    """
    Holds the ``eligible`` CTE and exposes its columns.

    The CTE selects each URL's id together with the NLP-location eligibility
    expression (labeled ``nlp_location``), filtered by the ``not_exists_query``
    conditions of the high-confidence-annotations and validated containers.
    """

    def __init__(self):
        selected_columns = (
            URL.id,
            NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"),
        )
        exclusion_filters = (
            HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query,
            VALIDATED_EXISTS_CONTAINER.not_exists_query,
        )
        statement = select(*selected_columns).where(*exclusion_filters)
        self._cte = statement.cte("eligible")

    @property
    def cte(self) -> CTE:
        """The underlying ``eligible`` CTE."""
        return self._cte

    @property
    def url_id(self) -> Column[int]:
        """The CTE's ``id`` column."""
        columns = self._cte.c
        return columns["id"]

    @property
    def nlp_location(self) -> Column[bool]:
        """The CTE's ``nlp_location`` column."""
        columns = self._cte.c
        return columns["nlp_location"]
def get_exists_subtask_query(
    subtask_type: LocationIDSubtaskType,
) -> ColumnElement[bool]:
    """
    Build a labeled EXISTS expression for a subtask type.

    The expression is true when an ``AutoLocationIDSubtask`` row exists whose
    ``url_id`` equals ``URL.id`` and whose ``type`` equals ``subtask_type``.
    It is labeled ``subtask_entry_exists``.
    """
    same_url = AutoLocationIDSubtask.url_id == URL.id
    same_type = AutoLocationIDSubtask.type == subtask_type
    subtask_exists = exists().where(same_url, same_type)
    return subtask_exists.label("subtask_entry_exists")
# CTE "nlp_location_eligible": selects each URL's id alongside the labeled
# EXISTS expression produced by get_exists_subtask_query for the
# NLP_LOCATION_FREQUENCY subtask type, restricted to URLs that join to
# URLCompressedHTML.
# NOTE(review): the join has no explicit ON clause -- presumably SQLAlchemy
# derives it from a foreign key between the two models; confirm.
cte = (
    select(
        URL.id,
        get_exists_subtask_query(
            LocationIDSubtaskType.NLP_LOCATION_FREQUENCY
        )
    )
    .join(
        URLCompressedHTML,
    )
    .cte("nlp_location_eligible")
)

# Wraps the CTE for use by the survey queries.
# NOTE(review): how SubtaskCTEContainer turns the "subtask_entry_exists"
# column into an eligibility expression is not visible here -- verify.
NLP_LOCATION_CONTAINER = SubtaskCTEContainer(
    cte,
)
class LocationIDSubtaskOperatorBase(ABC):
    """
    Template for location-identification subtask operators.

    Subclasses implement :meth:`inner_logic`. :meth:`run` executes it, turns
    any raised exception into an error string, and reports the URL IDs held
    in ``self.linked_urls``.
    """

    def __init__(
        self,
        adb_client: AsyncDatabaseClient,
        task_id: int
    ) -> None:
        self.adb_client: AsyncDatabaseClient = adb_client
        self.task_id: int = task_id
        # Reported through LocationIDSubtaskRunInfo.linked_url_ids by run().
        # Nothing in this base class appends to it; presumably subclasses do.
        self.linked_urls: list[int] = []

    async def run(self) -> LocationIDSubtaskRunInfo:
        """
        Execute the subtask and summarize the outcome.

        Never raises for failures inside :meth:`inner_logic`: the exception
        type, message and stack trace are captured in the returned run info.
        """
        try:
            await self.inner_logic()
        except Exception as e:
            # Keep the full stack trace so the failure can be diagnosed later.
            stack_trace: str = traceback.format_exc()
            return LocationIDSubtaskRunInfo(
                error=f"{type(e).__name__}: {str(e)}: {stack_trace}",
                linked_url_ids=self.linked_urls
            )
        return LocationIDSubtaskRunInfo(
            linked_url_ids=self.linked_urls
        )

    @abc.abstractmethod
    async def inner_logic(self) -> LocationIDSubtaskRunInfo:
        """
        Perform the subtask-specific work.

        NOTE(review): :meth:`run` discards this method's return value, so the
        declared return type is not relied upon here.
        """
        raise NotImplementedError

    async def _upload_subtask_data(
        self,
        subtask_data_list: list[AutoLocationIDSubtaskData]
    ) -> None:
        """
        Persist subtask rows, their suggestions, and any per-URL errors.

        Inserts the subtask models first to obtain their IDs, then inserts
        one suggestion row per suggestion (keyed by the new subtask ID), and
        finally inserts a URL error row for every entry that carries an error.

        Raises:
            ValueError: if the number of returned subtask IDs differs from the
                number of subtask entries.
        """
        subtask_models: list[AutoLocationIDSubtaskPydantic] = [
            subtask_data.pydantic_model
            for subtask_data in subtask_data_list
        ]
        subtask_ids: list[int] = await self.adb_client.bulk_insert(
            models=subtask_models,
            return_ids=True
        )

        # Assumes bulk_insert returns IDs in the same order as the models
        # passed in -- TODO confirm. ``strict=True`` fails loudly on a count
        # mismatch instead of silently dropping or mispairing suggestions.
        suggestions: list[LocationIDSubtaskSuggestionPydantic] = [
            LocationIDSubtaskSuggestionPydantic(
                subtask_id=subtask_id,
                location_id=suggestion.location_id,
                confidence=suggestion.confidence,
            )
            for subtask_id, subtask_info in zip(
                subtask_ids, subtask_data_list, strict=True
            )
            for suggestion in subtask_info.suggestions
        ]
        await self.adb_client.bulk_insert(
            models=suggestions,
        )

        error_infos: list[URLErrorInfoPydantic] = [
            URLErrorInfoPydantic(
                url_id=subtask_info.url_id,
                error=subtask_info.error,
                task_id=self.task_id,
            )
            for subtask_info in subtask_data_list
            if subtask_info.has_error
        ]
        await self.adb_client.bulk_insert(
            models=error_infos,
        )
ID" # Scheduled Tasks PUSH_TO_HUGGINGFACE = "Push to Hugging Face" diff --git a/src/db/models/impl/location/county/sqlalchemy.py b/src/db/models/impl/location/county/sqlalchemy.py index b3428449..99d82bdc 100644 --- a/src/db/models/impl/location/county/sqlalchemy.py +++ b/src/db/models/impl/location/county/sqlalchemy.py @@ -11,7 +11,7 @@ class County( __tablename__ = "counties" name: Mapped[str] - state_id = us_state_column() + state_id: Mapped[int] = us_state_column() fips: Mapped[str] = Column(String(5), nullable=True) lat: Mapped[float] = Column(Float, nullable=True) lng: Mapped[float] = Column(Float, nullable=True) diff --git a/src/db/models/impl/location/locality/sqlalchemy.py b/src/db/models/impl/location/locality/sqlalchemy.py index 216706fd..c462a8c1 100644 --- a/src/db/models/impl/location/locality/sqlalchemy.py +++ b/src/db/models/impl/location/locality/sqlalchemy.py @@ -1,4 +1,5 @@ from sqlalchemy import String, Column +from sqlalchemy.orm import Mapped from src.db.models.helpers import county_column from src.db.models.templates_.with_id import WithIDBase @@ -11,4 +12,4 @@ class Locality( __tablename__ = "localities" name = Column(String(255), nullable=False) - county_id = county_column(nullable=False) + county_id: Mapped[int] = county_column(nullable=False) diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 24cda6f9..1e997079 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -25,38 +25,6 @@ def __init__( ): self.access_manager = access_manager - async def search_agency_by_location( - self, - params: list[SearchAgencyByLocationParams] - ) -> list[SearchAgencyByLocationResponse]: - request_url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=["agencies", "search", "location"] - ) - headers: dict[str, str] = await self.access_manager.jwt_header() - headers['Content-Type']: str = "application/json" - - json_params: list[dict[str, Any]] = [ - 
param.model_dump(mode='json') - for param in params - ] - - request_info = RequestInfo( - type_=RequestType.POST, - url=request_url, - headers=headers, - json_={ - "requests": json_params - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - - outer_response = SearchAgencyByLocationOuterResponse( - **response_info.data - ) - - return outer_response.responses - async def match_agency( self, name: str, diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py index ca5a6213..96ebd2fa 100644 --- a/src/external/pdap/dtos/search_agency_by_location/params.py +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -7,5 +7,4 @@ class SearchAgencyByLocationParams(BaseModel): iso: str = Field( description="US State ISO Code", max_length=2, - ) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 86c0d843..c50127a3 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -1,16 +1,27 @@ +from collections import Counter + import pytest from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.core.enums import SuggestedStatus, RecordType from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @pytest.mark.asyncio -async def test_annotate_all(api_test_helper): +async def test_annotate_all( + api_test_helper, + pennsylvania: USStateCreationInfo, + california: USStateCreationInfo, +): """ Test the happy path workflow for the all-annotations endpoint The user should be able to get a valid URL (filtering on batch id if needed), @@ -18,6 +29,8 @@ async def test_annotate_all(api_test_helper): """ ath = api_test_helper adb_client = ath.adb_client() + + # Set up URLs setup_info_1 = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=False ) @@ -27,7 +40,7 @@ async def test_annotate_all(api_test_helper): ) url_mapping_2 = setup_info_2.url_mapping - # First, get a valid URL to annotate + # Get a valid URL to annotate get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() assert get_response_1.next_annotation is not None @@ -50,7 +63,10 @@ async def test_annotate_all(api_test_helper): is_new=False, suggested_agency=agency_id ), - location_ids=[] + location_ids=[ + california.location_id, + pennsylvania.location_id, + ] ) ) assert post_response_1.next_annotation is not None @@ -90,3 +106,38 @@ async def test_annotate_all(api_test_helper): all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) assert len(all_record_type_suggestions) == 1 assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value + + # Confirm 3 Location Suggestions, with two belonging to California and one to Pennsylvania + all_location_suggestions = await adb_client.get_all(UserLocationSuggestion) + assert 
len(all_location_suggestions) == 2 + location_ids: list[int] = [location_suggestion.location_id for location_suggestion in all_location_suggestions] + assert set(location_ids) == {california.location_id, pennsylvania.location_id} + # Confirm that all location suggestions are for the correct URL + for location_suggestion in all_location_suggestions: + assert location_suggestion.url_id == url_mapping_1.url_id + + # Retrieve the same URL (directly from the database, leveraging a different User) + # And confirm the presence of the user annotations + response: GetNextURLForAllAnnotationResponse = await adb_client.run_query_builder( + GetNextURLForAllAnnotationQueryBuilder( + batch_id=None, + user_id=99 + ) + ) + user_suggestions: list[LocationAnnotationUserSuggestion] = \ + response.next_annotation.location_suggestions.user + assert len(user_suggestions) == 2 + + response_location_ids: list[int] = [location_suggestion.location_id for location_suggestion in user_suggestions] + assert set(response_location_ids) == {california.location_id, pennsylvania.location_id} + + response_location_names: list[str] = [location_suggestion.location_name for location_suggestion in user_suggestions] + assert set(response_location_names) == { + "California", + "Pennsylvania" + } + + for user_suggestion in user_suggestions: + assert user_suggestion.user_count == 1 + + diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 732cb84c..574f35f4 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -7,6 +7,7 @@ from src.core.core import AsyncCore from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo from 
tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo @@ -32,42 +33,64 @@ def test_async_core(adb_client_test): logger.shutdown() @pytest_asyncio.fixture -def pennsylvania( - adb_client_test: AsyncDatabaseClient +async def pennsylvania( + db_data_creator: DBDataCreator ) -> USStateCreationInfo: """Creates Pennsylvania state and returns its state and location ID""" - raise NotImplementedError + return await db_data_creator.create_us_state( + name="Pennsylvania", + iso="PA" + ) @pytest_asyncio.fixture -def allegheny_county( - adb_client_test: AsyncDatabaseClient, +async def allegheny_county( + db_data_creator: DBDataCreator, pennsylvania: USStateCreationInfo ) -> CountyCreationInfo: - raise NotImplementedError + return await db_data_creator.create_county( + state_id=pennsylvania.us_state_id, + name="Allegheny" + ) @pytest_asyncio.fixture -def pittsburgh_locality( - adb_client_test: AsyncDatabaseClient, +async def pittsburgh_locality( + db_data_creator: DBDataCreator, + pennsylvania: USStateCreationInfo, allegheny_county: CountyCreationInfo ) -> LocalityCreationInfo: - raise NotImplementedError + return await db_data_creator.create_locality( + state_id=pennsylvania.us_state_id, + county_id=allegheny_county.county_id, + name="Pittsburgh" + ) @pytest_asyncio.fixture -def california( - adb_client_test: AsyncDatabaseClient +async def california( + db_data_creator: DBDataCreator, ) -> USStateCreationInfo: - raise NotImplementedError + return await db_data_creator.create_us_state( + name="California", + iso="CA" + ) @pytest_asyncio.fixture -def los_angeles_county( - adb_client_test: AsyncDatabaseClient, +async def los_angeles_county( + db_data_creator: DBDataCreator, california: USStateCreationInfo ) -> CountyCreationInfo: - raise NotImplementedError + return await db_data_creator.create_county( + state_id=california.us_state_id, + name="Los Angeles" + ) @pytest_asyncio.fixture -def los_angeles_locality( - adb_client_test: AsyncDatabaseClient, 
+async def los_angeles_locality( + db_data_creator: DBDataCreator, + california: USStateCreationInfo, los_angeles_county: CountyCreationInfo ) -> LocalityCreationInfo: - raise NotImplementedError \ No newline at end of file + return await db_data_creator.create_locality( + state_id=california.us_state_id, + county_id=los_angeles_county.county_id, + name="Los Angeles" + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py index 7feb6d61..975a14bd 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py @@ -4,7 +4,7 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.db.client.async_ import AsyncDatabaseClient diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py index 2c3ed419..d4a65ed3 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py @@ -1,10 +1,8 @@ -from unittest.mock import AsyncMock, MagicMock - import pytest from 
src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ AgencyIDSubtaskInternalProcessor diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py index 2abee544..1e411037 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py @@ -4,7 +4,7 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ AgencyIDSubtaskInternalProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor from src.external.pdap.client import PDAPClient diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py index ea81341c..1853f689 100644 --- 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py @@ -1,8 +1,8 @@ import pytest -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models import \ NLPLocationMatchResponse -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState US_STATE = USState( diff --git a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py index 52a17b5e..8d6d105d 100644 --- a/tests/automated/integration/tasks/url/loader/conftest.py +++ b/tests/automated/integration/tasks/url/loader/conftest.py @@ -4,7 +4,7 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.loader import URLTaskOperatorLoader -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 439f0459..6f5862f8 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -42,10 +42,13 @@ from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command from 
tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response from tests.helpers.data_creator.create import create_urls, create_batch, create_batch_url_links, create_validated_flags, \ - create_url_data_sources + create_url_data_sources, create_state, create_county, create_locality from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.simple_test_data_functions import generate_test_name @@ -561,4 +564,39 @@ async def create_web_metadata( ) for url_id in url_ids ] - await self.adb_client.add_all(web_metadata) \ No newline at end of file + await self.adb_client.add_all(web_metadata) + + async def create_us_state( + self, + name: str, + iso:str + ) -> USStateCreationInfo: + return await create_state( + adb_client=self.adb_client, + name=name, + iso=iso, + ) + + async def create_county( + self, + state_id: int, + name: str, + ) -> CountyCreationInfo: + return await create_county( + adb_client=self.adb_client, + state_id=state_id, + name=name, + ) + + async def create_locality( + self, + state_id: int, + county_id: int, + name: str, + ) -> LocalityCreationInfo: + return await create_locality( + adb_client=self.adb_client, + state_id=state_id, + county_id=county_id, + name=name, + ) \ No newline at end of file diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 34f5187d..ae9814c2 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -2,8 +2,7 @@ from src.collectors.enums import 
CollectorType, URLStatus from src.core.enums import BatchStatus, RecordType -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ - USState +from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel @@ -15,6 +14,8 @@ from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo @@ -83,8 +84,8 @@ async def create_state( ) -> USStateCreationInfo: us_state_insert_model = USState( - name=name, - iso=iso, + state_name=name, + state_iso=iso, ) us_state_id: int = await adb_client.add( us_state_insert_model, @@ -98,3 +99,48 @@ async def create_state( location_id=location_id, ) +async def create_county( + adb_client: AsyncDatabaseClient, + state_id: int, + name: str +) -> CountyCreationInfo: + county_insert_model = County( + name=name, + state_id=state_id, + ) + county_id: int = await adb_client.add( + county_insert_model, + return_id=True + ) + location_id: int = await adb_client.get_location_id( + us_state_id=state_id, + county_id=county_id + ) + return CountyCreationInfo( + county_id=county_id, + location_id=location_id, + ) + +async def create_locality( + adb_client: AsyncDatabaseClient, + state_id: int, + county_id: int, + name: str +) -> LocalityCreationInfo: + locality_insert_model = Locality( + name=name, + county_id=county_id, + ) + locality_id: int = await adb_client.add( + locality_insert_model, + 
return_id=True + ) + location_id: int = await adb_client.get_location_id( + us_state_id=state_id, + county_id=county_id, + locality_id=locality_id + ) + return LocalityCreationInfo( + locality_id=locality_id, + location_id=location_id, + ) \ No newline at end of file diff --git a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py index c38a52b1..30978a56 100644 --- a/tests/manual/agency_identifier/test_nlp_processor.py +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -1,6 +1,6 @@ import pytest -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ NLPProcessor SAMPLE_HTML: str = """ diff --git a/tests/manual/external/pdap/test_sc_agency_search_location.py b/tests/manual/external/pdap/test_sc_agency_search_location.py deleted file mode 100644 index 9b0aac28..00000000 --- a/tests/manual/external/pdap/test_sc_agency_search_location.py +++ /dev/null @@ -1,34 +0,0 @@ -""" - -Location ID, Agency ID -10464,9873, "Boonsboro, Washington, Maryland" -15648,9878, "Smithsburg, Washington, Maryland" -15656,9879, "Williamsport, Washington, Maryland" - -""" -import pytest - -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams -from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse - - -@pytest.mark.asyncio -async def test_sc_agency_search_location(pdap_client_dev: PDAPClient): - params: list[SearchAgencyByLocationParams] = [ - SearchAgencyByLocationParams( - request_id=1, - query="Boonsboro, Washington, Maryland" - ), - SearchAgencyByLocationParams( - request_id=0, - query="Smithsburg, Washington, Maryland" - ), - SearchAgencyByLocationParams( - request_id=-99, - query="Williamsport, Washington, 
Maryland" - ) - ] - response: list[SearchAgencyByLocationResponse] = await pdap_client_dev.search_agency_by_location(params) - print(response) - From 3a62dfd6b5bf13e0417ecdbdcf0674085e02b366 Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 18 Sep 2025 09:17:55 -0400 Subject: [PATCH 6/7] Continue draft --- .../operators/agency_identification/core.py | 8 +--- .../subtasks/impl/nlp_location_match_/core.py | 4 -- .../agency_identification/subtasks/loader.py | 6 --- .../survey/queries/ctes/subtask/impl/ckan.py | 5 +-- .../queries/ctes/subtask/impl/homepage.py | 5 +-- .../queries/ctes/subtask/impl/muckrock.py | 5 +-- .../queries/ctes/subtask/impl/nlp_location.py | 5 +-- .../tasks/url/operators/location_id/core.py | 19 ++++++++ .../impl/nlp_location_freq/models/input.py | 13 +++++- .../subtasks/impl/nlp_location_freq/query.py | 11 ++++- .../impl/agency_identification/conftest.py | 3 -- .../impl/location_identification/__init__.py | 0 .../impl/location_identification/conftest.py | 23 ++++++++++ .../subtasks/__init__.py | 0 .../nlp_location_frequency/__init__.py | 0 .../survey/__init__.py | 0 .../survey/test_survey_flag.py | 44 +++++++++++++++++++ tests/helpers/data_creator/core.py | 16 ++++++- 18 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/conftest.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/subtasks/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/survey/__init__.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/survey/test_survey_flag.py diff --git a/src/core/tasks/url/operators/agency_identification/core.py 
b/src/core/tasks/url/operators/agency_identification/core.py index 4de9dd57..7657ea0e 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -57,16 +57,10 @@ async def load_subtask( """Get subtask based on collector type.""" return await self.loader.load_subtask(subtask_type, task_id=self.task_id) - @staticmethod - async def run_subtask( - subtask_operator: AgencyIDSubtaskOperatorBase, - ) -> AgencyIDSubtaskRunInfo: - return await subtask_operator.run() - async def inner_task_logic(self) -> None: subtask_operator: AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask) print(f"Running Subtask: {self._subtask.value}") - run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator) + run_info: AgencyIDSubtaskRunInfo = await subtask_operator.run() await self.link_urls_to_task(run_info.linked_url_ids) if not run_info.is_success: raise SubtaskError(run_info.error) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py index b595c93c..2894446d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -1,7 +1,5 @@ from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ ITERATIONS_PER_SUBTASK -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from 
src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ @@ -31,8 +29,6 @@ async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput] self.linked_urls.extend([input_.url_id for input_ in inputs]) subtask_data_list: list[AutoAgencyIDSubtaskData] = [] - # TODO: Get NLP Annotations - # TODO: Process and Convert NLP Annotations # TODO: Resubmit NLP Annotations diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index ff136a66..50bbe255 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -6,8 +6,6 @@ MuckrockAgencyIDSubtaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ NLPLocationMatchSubtaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType @@ -22,11 +20,9 @@ def __init__( pdap_client: PDAPClient, muckrock_api_interface: MuckrockAPIInterface, adb_client: AsyncDatabaseClient, - nlp_processor: NLPProcessor ): self._pdap_client = pdap_client self._muckrock_api_interface = muckrock_api_interface - self._nlp_processor = nlp_processor self.adb_client = adb_client def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: @@ -54,8 +50,6 @@ def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubt return NLPLocationMatchSubtaskOperator( task_id=task_id, adb_client=self.adb_client, - pdap_client=self._pdap_client, - processor=self._nlp_processor ) diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py index 39114acd..6b8ed9e8 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py @@ -1,10 +1,9 @@ from sqlalchemy import select from src.collectors.enums import CollectorType +from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query -from src.core.tasks.url.operators._shared.subtask.container import \ - SubtaskCTEContainer from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL @@ -32,6 +31,6 @@ .cte("ckan_eligible") ) -CKAN_SUBTASK_CONTAINER = SubtaskCTEContainer( +CKAN_SUBTASK_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py index 5c0a613f..7daba916 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py @@ -1,9 +1,8 @@ from sqlalchemy import select, exists +from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ CONSOLIDATED_CTE -from src.core.tasks.url.operators._shared.subtask.container import \ - SubtaskCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query from src.db.models.impl.url.core.sqlalchemy import URL @@ -29,6 +28,6 @@ .cte("homepage_eligible") ) -HOMEPAGE_SUBTASK_CONTAINER = SubtaskCTEContainer( +HOMEPAGE_SUBTASK_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py index 1eeb4bd8..9e267f66 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py @@ -1,8 +1,7 @@ from sqlalchemy import select from src.collectors.enums import CollectorType -from src.core.tasks.url.operators._shared.subtask.container import \ - SubtaskCTEContainer +from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query from src.db.models.impl.batch.sqlalchemy import Batch @@ -35,6 +34,6 @@ .cte("muckrock_eligible") ) -MUCKROCK_SUBTASK_CONTAINER = SubtaskCTEContainer( +MUCKROCK_SUBTASK_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index 21871785..d4d02b18 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -2,8 +2,7 @@ from sqlalchemy import select -from src.core.tasks.url.operators._shared.subtask.container import \ - SubtaskCTEContainer +from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query from src.db.models.impl.url.core.sqlalchemy import URL @@ -27,6 +26,6 @@ .cte("nlp_location_eligible") ) -NLP_LOCATION_CONTAINER = SubtaskCTEContainer( +NLP_LOCATION_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/core.py b/src/core/tasks/url/operators/location_id/core.py index 01f14a02..3833a80c 100644 --- a/src/core/tasks/url/operators/location_id/core.py +++ b/src/core/tasks/url/operators/location_id/core.py @@ -1,7 +1,11 @@ from src.core.tasks.mixins.link_urls import LinkURLsMixin +from src.core.tasks.url.operators._shared.exceptions import SubtaskError from src.core.tasks.url.operators.base import URLTaskOperatorBase +from src.core.tasks.url.operators.location_id.subtasks.flags.core import SubtaskFlagger from src.core.tasks.url.operators.location_id.subtasks.loader import LocationIdentificationSubtaskLoader +from src.core.tasks.url.operators.location_id.subtasks.models.run_info import LocationIDSubtaskRunInfo from src.core.tasks.url.operators.location_id.subtasks.queries.survey.queries.core import LocationIDSurveyQueryBuilder +from src.core.tasks.url.operators.location_id.subtasks.templates.subtask 
import LocationIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType @@ -24,6 +28,12 @@ def __init__( def task_type(self) -> TaskType: return TaskType.LOCATION_ID + async def load_subtask( + self, + subtask_type: LocationIDSubtaskType + ) -> LocationIDSubtaskOperatorBase: + return await self.loader.load_subtask(subtask_type, task_id=self.task_id) + async def meets_task_prerequisites(self) -> bool: """ Modifies: @@ -42,3 +52,12 @@ async def meets_task_prerequisites(self) -> bool: if next_subtask is None: return False return True + + + async def inner_task_logic(self) -> None: + subtask_operator: LocationIDSubtaskOperatorBase = await self.load_subtask(self._subtask) + print(f"Running Subtask: {self._subtask.value}") + run_info: LocationIDSubtaskRunInfo = await subtask_operator.run() + await self.link_urls_to_task(run_info.linked_url_ids) + if not run_info.is_success: + raise SubtaskError(run_info.error) \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py index 398c1504..74fb49d1 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py @@ -1,6 +1,17 @@ from pydantic import BaseModel +class LocationAnnotation(BaseModel): + location_id: int + confidence: int + +class LocationAnnotationToAgencyIDMapping(BaseModel): + location_annotation: LocationAnnotation + agency_ids: list[int] class NLPLocationMatchSubtaskInput(BaseModel): url_id: int - html: str \ No newline at end of file + mappings: list[LocationAnnotationToAgencyIDMapping] + + @property + def has_locations_with_agencies(self) -> bool: + return len(self.mappings) 
> 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py index 9890db93..7f2e00b8 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py @@ -11,12 +11,15 @@ EligibleContainer from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): + # TODO: Change async def run( self, session: AsyncSession @@ -28,8 +31,12 @@ async def run( URLCompressedHTML.compressed_html ) .join( - URLCompressedHTML, - URLCompressedHTML.url_id == container.url_id, + AutoLocationIDSubtask, + AutoLocationIDSubtask.url_id == container.url_id, + ) + .join( + LocationIDSubtaskSuggestion, + LocationIDSubtaskSuggestion.subtask_id == AutoLocationIDSubtask.id ) .where( container.nlp_location, diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py index 975a14bd..b029c0e9 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py @@ -4,8 +4,6 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.db.client.async_ import AsyncDatabaseClient from src.external.pdap.client import PDAPClient @@ -22,7 +20,6 @@ def operator( pdap_client=create_autospec(PDAPClient), muckrock_api_interface=create_autospec(MuckrockAPIInterface), adb_client=adb_client_test, - nlp_processor=create_autospec(NLPProcessor) ), ) diff --git a/tests/automated/integration/tasks/url/impl/location_identification/__init__.py b/tests/automated/integration/tasks/url/impl/location_identification/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/location_identification/conftest.py b/tests/automated/integration/tasks/url/impl/location_identification/conftest.py new file mode 100644 index 00000000..cbfa1c57 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/location_identification/conftest.py @@ -0,0 +1,23 @@ +from unittest.mock import create_autospec + +import pytest + +from src.core.tasks.url.operators.location_id.core import LocationIdentificationTaskOperator +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.loader import LocationIdentificationSubtaskLoader +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> LocationIdentificationTaskOperator: + + operator = LocationIdentificationTaskOperator( + adb_client=adb_client_test, + loader=LocationIdentificationSubtaskLoader( + adb_client=adb_client_test, + nlp_processor=create_autospec(NLPProcessor) + ) + ) + return operator \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/__init__.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/__init__.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/location_identification/survey/__init__.py b/tests/automated/integration/tasks/url/impl/location_identification/survey/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/location_identification/survey/test_survey_flag.py b/tests/automated/integration/tasks/url/impl/location_identification/survey/test_survey_flag.py new file mode 100644 index 00000000..338c604b --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/location_identification/survey/test_survey_flag.py @@ -0,0 +1,44 @@ +import pytest + +from src.core.tasks.url.operators.location_id.core import LocationIdentificationTaskOperator +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_survey_flag( + operator: LocationIdentificationTaskOperator, + db_data_creator: DBDataCreator, + monkeypatch +): + """ + Test that survey correctly disables Subtask flags + when the environment variable is set to disable that subtask + """ + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency_name": "Test Agency" + } + ) + )[0].url_id + + await 
db_data_creator.add_compressed_html([applicable_url_id]) + + # Confirm prerequisite met and subtask if Agency Location Frequency + assert await operator.meets_task_prerequisites() + assert operator._subtask == LocationIDSubtaskType.NLP_LOCATION_FREQUENCY + + # Set flag to disable NLP Location Frequency Subtask + monkeypatch.setenv( + "LOCATION_ID_NLP_LOCATION_MATCH_FLAG", "0" + ) + + # Confirm prerequisite no longer met. + assert not await operator.meets_task_prerequisites() diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 6f5862f8..75aa798f 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -20,6 +20,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -599,4 +600,17 @@ async def create_locality( state_id=state_id, county_id=county_id, name=name, - ) \ No newline at end of file + ) + + async def add_compressed_html( + self, + url_ids: list[int], + ): + compressed_html_inserts: list[URLCompressedHTML] = [ + URLCompressedHTML( + url_id=url_id, + compressed_html=b"Test HTML" + ) + for url_id in url_ids + ] + await self.adb_client.add_all(compressed_html_inserts) \ No newline at end of file From c99c221c93305b5314ed14e1aad649ce0d4a6ada Mon Sep 17 00:00:00 2001 From: Max Chis Date: Sun, 21 Sep 2025 08:49:21 -0400 Subject: [PATCH 7/7] Finish Location Annotation Draft --- ...baa3b8e9b_add_location_annotation_logic.py | 10 +- local_database/DataDumper/dump.sh | 3 +- .../annotate/all/get/queries/core.py 
| 2 + .../annotate/all/post/models/request.py | 1 + src/api/endpoints/annotate/all/post/query.py | 2 +- src/api/main.py | 3 +- src/core/enums.py | 2 + src/core/tasks/url/loader.py | 7 +- .../_shared/container/subtask/eligible.py | 2 +- .../impl/nlp_location_match_/constants.py | 4 - .../impl/nlp_location_match_/convert.py | 47 +++++++ .../subtasks/impl/nlp_location_match_/core.py | 56 ++------ .../nlp_location_match_}/models/__init__.py | 0 .../impl/nlp_location_match_}/models/input.py | 0 .../models/subsets/__init__.py | 0 .../models/subsets/nlp_responses.py | 0 .../impl/nlp_location_match_/query_/query.py | 88 ++++++++++--- .../queries/ctes/subtask/impl/nlp_location.py | 21 ++- .../subtasks/impl/nlp_location_freq/core.py | 16 +-- .../impl/nlp_location_freq/models/input_.py | 6 + .../impl/nlp_location_freq/models/subsets.py | 9 ++ .../nlp_location_freq/processor/convert.py | 8 +- .../impl/nlp_location_freq/processor/core.py | 29 +++-- .../nlp_location_freq/processor/filter.py | 10 +- .../nlp_location_freq/processor/nlp/check.py | 7 +- .../processor/nlp/constants.py | 8 ++ .../nlp_location_freq/processor/nlp/core.py | 4 +- .../processor/query_/core.py | 27 ++-- .../subtasks/impl/nlp_location_freq/query.py | 28 ++-- .../operators/location_id/subtasks/loader.py | 2 +- .../ctes/subtask/impl/nlp_location_freq.py | 4 +- .../models/impl/flag/url_validated/enums.py | 1 + src/db/models/impl/url/core/sqlalchemy.py | 3 + .../location/auto/subtask/sqlalchemy.py | 4 +- .../location/auto/suggestion/sqlalchemy.py | 12 +- .../nlp_location_match/end_to_end/conftest.py | 11 +- .../end_to_end/test_core.py | 116 ----------------- .../end_to_end/test_multi_agency_location.py | 70 ++++++++++ .../end_to_end/test_single_agency_location.py | 76 +++++++++++ .../match_urls_to_search_params/__init__.py | 0 .../match_urls_to_search_params/conftest.py | 18 --- .../match_urls_to_search_params/test_empty.py | 14 -- .../test_no_state_any_locations.py | 14 -- .../test_state_multiple_locations.py | 
14 -- .../test_state_no_locations.py | 14 -- .../test_state_one_location.py | 14 -- .../end_to_end}/__init__.py | 0 .../end_to_end/conftest.py | 15 +++ .../end_to_end/test_core.py | 120 ++++++++++++++++++ .../test_nlp_response_valid.py | 2 +- .../integration/tasks/url/loader/conftest.py | 3 +- .../tasks/url/loader/test_happy_path.py | 2 +- tests/automated/unit/dto/__init__.py | 0 .../unit/dto/test_all_annotation_post_info.py | 36 ------ tests/helpers/data_creator/core.py | 50 +++++++- tests/helpers/data_creator/create.py | 2 + 56 files changed, 625 insertions(+), 392 deletions(-) delete mode 100644 src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py rename src/core/tasks/url/operators/{location_id => agency_identification/subtasks/impl/nlp_location_match_}/models/__init__.py (100%) rename src/core/tasks/url/operators/{location_id/subtasks/impl/nlp_location_freq => agency_identification/subtasks/impl/nlp_location_match_}/models/input.py (100%) rename src/core/tasks/url/operators/{location_id/subtasks/impl/nlp_location_freq => agency_identification/subtasks/impl/nlp_location_match_}/models/subsets/__init__.py (100%) rename src/core/tasks/url/operators/{location_id/subtasks/impl/nlp_location_freq => agency_identification/subtasks/impl/nlp_location_match_}/models/subsets/nlp_responses.py (100%) create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input_.py create mode 100644 src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py create mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_multi_agency_location.py create mode 100644 
tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_single_agency_location.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py delete mode 100644 tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py rename tests/automated/integration/tasks/url/impl/{agency_identification/subtasks/nlp_location_match/internal_processor => location_identification/subtasks/nlp_location_frequency/end_to_end}/__init__.py (100%) create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py create mode 100644 tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/test_core.py rename tests/automated/integration/tasks/url/impl/{agency_identification/subtasks/nlp_location_match 
=> location_identification/subtasks/nlp_location_frequency}/test_nlp_response_valid.py (96%) delete mode 100644 tests/automated/unit/dto/__init__.py delete mode 100644 tests/automated/unit/dto/test_all_annotation_post_info.py diff --git a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py index 06d49980..55bb5ea5 100644 --- a/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py +++ b/alembic/versions/2025_09_15_1905-93cbaa3b8e9b_add_location_annotation_logic.py @@ -7,11 +7,11 @@ """ from typing import Sequence, Union -from alembic import op import sqlalchemy as sa +from alembic import op from src.util.alembic_helpers import switch_enum_type, url_id_column, location_id_column, created_at_column, id_column, \ - task_id_column, agency_id_column, user_id_column + task_id_column, user_id_column # revision identifiers, used by Alembic. revision: str = '93cbaa3b8e9b' @@ -362,7 +362,6 @@ def _create_location_id_subtask_suggestions_table(): f'{AUTO_LOCATION_ID_SUBTASK_TABLE_NAME}.id', ondelete='CASCADE' ), - primary_key=True ), location_id_column(), sa.Column( @@ -371,6 +370,11 @@ def _create_location_id_subtask_suggestions_table(): nullable=False ), created_at_column(), + sa.PrimaryKeyConstraint( + 'subtask_id', + 'location_id', + name='location_id_subtask_suggestions_pk' + ) ) diff --git a/local_database/DataDumper/dump.sh b/local_database/DataDumper/dump.sh index 482a3ca1..6d7fa669 100644 --- a/local_database/DataDumper/dump.sh +++ b/local_database/DataDumper/dump.sh @@ -23,6 +23,7 @@ else fi # Run pg_dump -pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME $PG_DUMP_FLAGS -f $DUMP_FILE +echo "(Excluding url_screenshot table data)" +pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME $PG_DUMP_FLAGS -f $DUMP_FILE --exclude-table-data=url_screenshot echo "Dump completed. File saved to $DUMP_FILE." 
diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index adc41477..615beab2 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -49,6 +49,7 @@ async def run( UnvalidatedURL.url_id == URL.id ) # Must not have been previously annotated by user + # TODO (SM422): Remove where conditional on whether it already has user suggestions .join( prev_annotated_cte.cte, prev_annotated_cte.url_id == URL.id @@ -73,6 +74,7 @@ async def run( joinedload(URL.auto_record_type_suggestion), ) + # TODO (SM422): Add order by highest number of suggestions (auto or user), desc query = query.order_by(URL.id.asc()).limit(1) raw_results = (await session.execute(query)).unique() url: URL | None = raw_results.scalars().one_or_none() diff --git a/src/api/endpoints/annotate/all/post/models/request.py b/src/api/endpoints/annotate/all/post/models/request.py index f6d17749..bd5c0121 100644 --- a/src/api/endpoints/annotate/all/post/models/request.py +++ b/src/api/endpoints/annotate/all/post/models/request.py @@ -13,6 +13,7 @@ class AllAnnotationPostInfo(BaseModel): agency: URLAgencyAnnotationPostInfo | None = None location_ids: list[int] + # TODO (SM422): Break up into multiple validation types @model_validator(mode="after") def allow_record_type_and_agency_only_if_relevant(self): suggested_status = self.suggested_status diff --git a/src/api/endpoints/annotate/all/post/query.py b/src/api/endpoints/annotate/all/post/query.py index 12374375..2203b368 100644 --- a/src/api/endpoints/annotate/all/post/query.py +++ b/src/api/endpoints/annotate/all/post/query.py @@ -34,7 +34,7 @@ async def run(self, session: AsyncSession) -> None: session.add(relevant_suggestion) # If not relevant, do nothing else - # TODO: 1: Update to account for change in SuggestedStatus + # TODO (SM422): Update to account for change in SuggestedStatus if not self.post_info.suggested_status == 
SuggestedStatus.RELEVANT: return diff --git a/src/api/main.py b/src/api/main.py index d169d1e3..ddf44a5b 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -27,8 +27,7 @@ from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.enums import \ SpacyModelType from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser diff --git a/src/core/enums.py b/src/core/enums.py index edc18425..4fa903c1 100644 --- a/src/core/enums.py +++ b/src/core/enums.py @@ -42,6 +42,7 @@ class RecordType(Enum): # Info About Agencies ANNUAL_AND_MONTHLY_REPORTS = "Annual & Monthly Reports" BUDGETS_AND_FINANCES = "Budgets & Finances" + # TODO SM422: Remove below CONTACT_INFO_AND_AGENCY_META = "Contact Info & Agency Meta" GEOGRAPHIC = "Geographic" LIST_OF_DATA_SOURCES = "List of Data Sources" @@ -83,6 +84,7 @@ class SubmitResponseStatus(Enum): FAILURE = "FAILURE" ALREADY_EXISTS = "already_exists" +# TODO (SM422): Replace use of SuggestedStatus with URLValidationType class SuggestedStatus(Enum): """ Possible values for user_relevant_suggestions:suggested_status diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 04ad1f23..b81d641a 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,12 +7,12 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.location_id.core import LocationIdentificationTaskOperator +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor from src.core.tasks.url.operators.location_id.subtasks.loader import LocationIdentificationSubtaskLoader from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator @@ -86,7 +86,6 @@ def _get_agency_identification_task_operator(self) -> URLTaskEntry: pdap_client=self.pdap_client, muckrock_api_interface=self.muckrock_api_interface, adb_client=self.adb_client, - nlp_processor=self.nlp_processor ) ) return URLTaskEntry( @@ -186,7 +185,7 @@ def _get_url_screenshot_task_operator(self) -> URLTaskEntry: ) def _get_location_id_task_operator(self) -> URLTaskEntry: - operator = URLLocationIDTaskOperator( + operator = LocationIdentificationTaskOperator( adb_client=self.adb_client, loader=LocationIdentificationSubtaskLoader( adb_client=self.adb_client, diff --git a/src/core/tasks/url/operators/_shared/container/subtask/eligible.py b/src/core/tasks/url/operators/_shared/container/subtask/eligible.py index 4ad60124..989b509f 100644 --- a/src/core/tasks/url/operators/_shared/container/subtask/eligible.py +++ b/src/core/tasks/url/operators/_shared/container/subtask/eligible.py @@ -30,7 +30,7 @@ def url_id(self) -> Column[int]: return self.cte.c['id'] @property - def eligible_query(self) -> 
ColumnElement[int]: + def eligible_query(self) -> ColumnElement[bool]: return ( exists() .where( diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py deleted file mode 100644 index 31890aaa..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py +++ /dev/null @@ -1,4 +0,0 @@ - - -ITERATIONS_PER_SUBTASK = 4 -NUMBER_OF_ENTRIES_PER_ITERATION = 10 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py index 139597f9..2766bff0 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/convert.py @@ -1,2 +1,49 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +def convert_location_agency_mappings_to_subtask_data_list( + task_id: int, + inputs: list[NLPLocationMatchSubtaskInput] +) -> list[AutoAgencyIDSubtaskData]: + results: list[AutoAgencyIDSubtaskData] = [] + for input_ in inputs: + suggestions: list[AgencySuggestion] = [] + if not input_.has_locations_with_agencies: + agencies_found: bool = False + else: + agencies_found: bool = True + for 
mapping in input_.mappings: + agency_ids: list[int] = mapping.agency_ids + confidence_per_agency: int = _calculate_confidence_per_agency( + agency_ids, + confidence=mapping.location_annotation.confidence + ) + for agency_id in agency_ids: + suggestion = AgencySuggestion( + agency_id=agency_id, + confidence=confidence_per_agency, + ) + suggestions.append(suggestion) + data = AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + url_id=input_.url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=agencies_found, + task_id=task_id, + ), + suggestions=suggestions, + ) + results.append(data) + return results + + +def _calculate_confidence_per_agency(agency_ids: list[int], confidence: int): + num_agencies: int = len(agency_ids) + confidence_per_agency: int = confidence // num_agencies + return confidence_per_agency + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py index 2894446d..4463ff0d 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -1,11 +1,11 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ - ITERATIONS_PER_SUBTASK +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.convert import \ + convert_location_agency_mappings_to_subtask_data_list +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query_.query import \ + GetAgenciesLinkedToAnnotatedLocationsQueryBuilder from 
src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ - NLPLocationMatchSubtaskInput -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.query import \ - GetNLPLocationMatchSubtaskInputQueryBuilder from src.db.client.async_ import AsyncDatabaseClient @@ -19,50 +19,18 @@ def __init__( super().__init__(adb_client, task_id=task_id) async def inner_logic(self) -> None: - for iteration in range(ITERATIONS_PER_SUBTASK): - inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() - if len(inputs) == 0: - break - await self.run_subtask_iteration(inputs) + inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() + await self.run_subtask_iteration(inputs) async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: self.linked_urls.extend([input_.url_id for input_ in inputs]) - subtask_data_list: list[AutoAgencyIDSubtaskData] = [] - - # TODO: Process and Convert NLP Annotations - - # TODO: Resubmit NLP Annotations - - # TODO: For locations with no associated agencies, convert to subtask data with empty agencies - subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ - convert_empty_location_agency_mappings_to_subtask_data_list( - mappings=nlp_response_subsets.invalid, - task_id=self._task_id, - ) - subtask_data_list.extend(subtask_data_no_agency_list) - - # For locations with agency mappings, convert to data with suggestions - subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ - convert_location_agency_mappings_to_subtask_data_list( - mappings=response_mappings, - task_id=self._task_id, - ) - - subtask_data_list.extend(subtask_data_list_agency_list) - - return subtask_data_list - - await 
self._upload_subtask_data(subtask_data_list) - - async def _process_inputs( - self, - inputs: list[NLPLocationMatchSubtaskInput] - ) -> list[AutoAgencyIDSubtaskData]: - return await self.processor.process( + subtask_data_list: list[AutoAgencyIDSubtaskData] = convert_location_agency_mappings_to_subtask_data_list( + task_id=self.task_id, inputs=inputs, ) + await self._upload_subtask_data(subtask_data_list) async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: return await self.adb_client.run_query_builder( - GetNLPLocationMatchSubtaskInputQueryBuilder(), + GetAgenciesLinkedToAnnotatedLocationsQueryBuilder(), ) diff --git a/src/core/tasks/url/operators/location_id/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py similarity index 100% rename from src/core/tasks/url/operators/location_id/models/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py similarity index 100% rename from src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/subsets/__init__.py similarity index 100% rename from src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/__init__.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/subsets/__init__.py diff --git 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/nlp_responses.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/subsets/nlp_responses.py similarity index 100% rename from src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets/nlp_responses.py rename to src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/subsets/nlp_responses.py diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py index 9ddc32e1..f0dcac94 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query_/query.py @@ -1,26 +1,84 @@ -from sqlalchemy import select +from collections import defaultdict +from typing import Sequence + +from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query_.response import \ - GetAgenciesLinkedToAnnotatedLocationsResponse -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.core.sqlalchemy import URL +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput, LocationAnnotationToAgencyIDMapping, LocationAnnotation +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.nlp_location import \ + NLP_LOCATION_CONTAINER +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from 
src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh class GetAgenciesLinkedToAnnotatedLocationsQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[GetAgenciesLinkedToAnnotatedLocationsResponse]: - - query = ( - select( - URL.id, - LocationIDSubtaskSuggestion.location_id, - LocationIDSubtaskSuggestion.confidence, - Agency.id + async def run(self, session: AsyncSession) -> list[NLPLocationMatchSubtaskInput]: + query = ( + select( + NLP_LOCATION_CONTAINER.url_id, + LocationIDSubtaskSuggestion.location_id, + LocationIDSubtaskSuggestion.confidence, + LinkAgencyLocation.agency_id, + ) + .join( + AutoLocationIDSubtask, + AutoLocationIDSubtask.url_id == NLP_LOCATION_CONTAINER.url_id + ) + .join( + LocationIDSubtaskSuggestion, + LocationIDSubtaskSuggestion.subtask_id == AutoLocationIDSubtask.id + ) + .join( + LinkAgencyLocation, + LinkAgencyLocation.location_id == LocationIDSubtaskSuggestion.location_id + ) + .where( + ~NLP_LOCATION_CONTAINER.entry_exists + ) ) - .outerjoin( + url_id_to_location_id_to_agency_ids: dict[int, dict[int, list[int]]] = defaultdict( + lambda: defaultdict(list) ) - ) \ No newline at end of file + url_id_to_location_id_to_annotations: dict[int, dict[int, LocationAnnotation]] = defaultdict(dict) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + for mapping in mappings: + url_id: int = mapping["id"] + location_id: int = mapping["location_id"] + confidence: int = mapping["confidence"] + agency_id: int = mapping["agency_id"] + + if agency_id is None: + continue + url_id_to_location_id_to_agency_ids[url_id][location_id].append(agency_id) + if location_id not in url_id_to_location_id_to_annotations[url_id]: + location_annotation = LocationAnnotation( + location_id=location_id, + confidence=confidence, + ) + 
url_id_to_location_id_to_annotations[url_id][location_id] = location_annotation + + results: list[NLPLocationMatchSubtaskInput] = [] + for url_id in url_id_to_location_id_to_agency_ids: + anno_mappings: list[LocationAnnotationToAgencyIDMapping] = [] + for location_id in url_id_to_location_id_to_agency_ids[url_id]: + location_annotation: LocationAnnotation = url_id_to_location_id_to_annotations[url_id][location_id] + agency_ids: list[int] = url_id_to_location_id_to_agency_ids[url_id][location_id] + anno_mapping: LocationAnnotationToAgencyIDMapping = LocationAnnotationToAgencyIDMapping( + location_annotation=location_annotation, + agency_ids=agency_ids, + ) + anno_mappings.append(anno_mapping) + input_ = NLPLocationMatchSubtaskInput( + url_id=url_id, + mappings=anno_mappings, + ) + results.append(input_) + return results + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index d4d02b18..17055d1a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -1,13 +1,15 @@ from operator import and_ -from sqlalchemy import select +from sqlalchemy import select, exists from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ get_exists_subtask_query +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from 
src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion cte = ( select( @@ -23,6 +25,23 @@ AutoLocationIDSubtask.locations_found ) ) + .where( + # One of the locations must be linked to an agency + exists( + select( + LinkAgencyLocation.id + ) + .join( + LocationIDSubtaskSuggestion, + LocationIDSubtaskSuggestion.location_id == LinkAgencyLocation.location_id, + ) + .join( + AutoLocationIDSubtask, + AutoLocationIDSubtask.id == LocationIDSubtaskSuggestion.subtask_id, + ) + ) + + ) .cte("nlp_location_eligible") ) diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py index af096953..1f9c8d62 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/core.py @@ -1,11 +1,11 @@ from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.constants import ITERATIONS_PER_SUBTASK -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ - NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input_ import \ + NLPLocationFrequencySubtaskInput from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.core import \ NLPLocationFrequencySubtaskInternalProcessor from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.query import \ - GetNLPLocationMatchSubtaskInputQueryBuilder + GetNLPLocationFrequencySubtaskInputQueryBuilder from src.core.tasks.url.operators.location_id.subtasks.models.subtask import 
AutoLocationIDSubtaskData from src.core.tasks.url.operators.location_id.subtasks.templates.subtask import LocationIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient @@ -30,12 +30,12 @@ def __init__( async def inner_logic(self) -> None: for iteration in range(ITERATIONS_PER_SUBTASK): - inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() + inputs: list[NLPLocationFrequencySubtaskInput] = await self._get_from_db() if len(inputs) == 0: break await self.run_subtask_iteration(inputs) - async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: + async def run_subtask_iteration(self, inputs: list[NLPLocationFrequencySubtaskInput]) -> None: self.linked_urls.extend([input_.url_id for input_ in inputs]) subtask_data_list: list[AutoLocationIDSubtaskData] = await self._process_inputs(inputs) @@ -43,14 +43,14 @@ async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput] async def _process_inputs( self, - inputs: list[NLPLocationMatchSubtaskInput] + inputs: list[NLPLocationFrequencySubtaskInput] ) -> list[AutoLocationIDSubtaskData]: return await self.processor.process( inputs=inputs, ) - async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: + async def _get_from_db(self) -> list[NLPLocationFrequencySubtaskInput]: return await self.adb_client.run_query_builder( - GetNLPLocationMatchSubtaskInputQueryBuilder(), + GetNLPLocationFrequencySubtaskInputQueryBuilder(), ) diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input_.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input_.py new file mode 100644 index 00000000..0ba1647e --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/input_.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationFrequencySubtaskInput(BaseModel): + url_id: int + html: str \ No newline at end of file diff 
--git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets.py new file mode 100644 index 00000000..304c7e01 --- /dev/null +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/models/subsets.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping + + +class NLPResponseSubsets(BaseModel): + valid: list[URLToNLPResponseMapping] + invalid: list[URLToNLPResponseMapping] \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py index d6d6c83c..8ec60b35 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/convert.py @@ -68,7 +68,8 @@ def convert_search_location_responses_to_subtask_data_list( ) pydantic_model: AutoLocationIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( url_id=url_id, - task_id=task_id + task_id=task_id, + suggestions=suggestions ) subtask_data = AutoLocationIDSubtaskData( pydantic_model=pydantic_model, @@ -80,14 +81,15 @@ def convert_search_location_responses_to_subtask_data_list( def convert_search_agency_response_to_subtask_pydantic( url_id: int, - task_id: int + task_id: int, + suggestions: list[LocationSuggestion] ) -> AutoLocationIDSubtaskPydantic: return AutoLocationIDSubtaskPydantic( task_id=task_id, url_id=url_id, type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY, - locations_found=True + locations_found=len(suggestions) > 0, ) def _convert_search_agency_response_to_agency_suggestions( diff --git 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/core.py index 4cbd4ab7..bfacd67e 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/core.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/core.py @@ -1,17 +1,16 @@ from collections import defaultdict +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input_ import \ + NLPLocationFrequencySubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets import NLPResponseSubsets from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.filter import \ - filter_valid_and_invalid_nlp_responses, filter_top_n_suggestions + filter_valid_and_invalid_nlp_responses, filter_top_n_suggestions, filter_out_responses_with_zero_similarity from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_search_response import \ URLToSearchResponseMapping from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.mapper import \ URLRequestIDMapper -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ - NLPLocationMatchSubtaskInput from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets.nlp_responses import \ - NLPResponseSubsets from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.convert import \ convert_invalid_url_nlp_mappings_to_subtask_data_list, convert_search_location_responses_to_subtask_data_list, \ convert_urls_to_search_params @@ -27,7 +26,7 @@ from 
src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.params import \ SearchSimilarLocationsParams from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.response import \ - SearchSimilarLocationsResponse + SearchSimilarLocationsResponse, SearchSimilarLocationsOuterResponse from src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData from src.db.client.async_ import AsyncDatabaseClient @@ -46,12 +45,12 @@ def __init__( async def process( self, - inputs: list[NLPLocationMatchSubtaskInput] + inputs: list[NLPLocationFrequencySubtaskInput] ) -> list[AutoLocationIDSubtaskData]: subtask_data_list: list[AutoLocationIDSubtaskData] = [] url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ - self._match_urls_to_nlp_responses(inputs) + self._parse_all_url_htmls_for_locations(inputs) # Filter out valid and invalid NLP responses nlp_response_subsets: NLPResponseSubsets = \ @@ -104,11 +103,12 @@ async def _get_db_location_info( url_id_to_search_responses: dict[int, list[SearchSimilarLocationsResponse]] = defaultdict(list) - responses: list[SearchSimilarLocationsResponse] = await self._adb_client.run_query_builder( + outer_response: SearchSimilarLocationsOuterResponse = await self._adb_client.run_query_builder( SearchSimilarLocationsQueryBuilder( params=params, ) ) + responses: list[SearchSimilarLocationsResponse] = outer_response.responses # Map responses to URL IDs via request IDs for response in responses: request_id: int = response.request_id @@ -118,6 +118,9 @@ async def _get_db_location_info( # Reconcile URL IDs to search responses response_mappings: list[URLToSearchResponseMapping] = [] for url_id, responses in url_id_to_search_responses.items(): + for response in responses: + response.results = filter_out_responses_with_zero_similarity(response.results) + mapping = URLToSearchResponseMapping( url_id=url_id, search_responses=responses, @@ 
-126,13 +129,13 @@ async def _get_db_location_info( return response_mappings - def _match_urls_to_nlp_responses( + def _parse_all_url_htmls_for_locations( self, - inputs: list[NLPLocationMatchSubtaskInput] + inputs: list[NLPLocationFrequencySubtaskInput] ) -> list[URLToNLPResponseMapping]: url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] for input_ in inputs: - nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) + nlp_response: NLPLocationMatchResponse = self._parse_for_locations(input_.html) mapping = URLToNLPResponseMapping( url_id=input_.url_id, nlp_response=nlp_response, @@ -140,7 +143,7 @@ def _match_urls_to_nlp_responses( url_to_nlp_mappings.append(mapping) return url_to_nlp_mappings - def _get_location_match( + def _parse_for_locations( self, html: str ) -> NLPLocationMatchResponse: diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py index 23c643b6..474279b0 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/filter.py @@ -2,10 +2,11 @@ from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.mappings.url_id_nlp_response import \ URLToNLPResponseMapping -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets.nlp_responses import \ - NLPResponseSubsets +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.subsets import NLPResponseSubsets from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.query_.models.response import \ + SearchSimilarLocationsLocationInfo from 
src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData from src.core.tasks.url.operators.location_id.subtasks.models.suggestion import LocationSuggestion @@ -57,3 +58,8 @@ def filter_top_n_suggestions( reverse=True # Descending order ) subtask_data.suggestions = suggestions_sorted[:n] + +def filter_out_responses_with_zero_similarity( + entries: list[SearchSimilarLocationsLocationInfo] +) -> list[SearchSimilarLocationsLocationInfo]: + return [entry for entry in entries if entry.similarity > 0] \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py index 2f3044b8..502014f0 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/check.py @@ -1,3 +1,5 @@ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.constants import \ + BLACKLISTED_WORDS from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.mappings import \ US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO @@ -6,4 +8,7 @@ def is_iso_us_state(iso: str) -> bool: return iso in US_STATE_ISO_TO_NAME def is_name_us_state(name: str) -> bool: - return name in US_NAME_TO_STATE_ISO \ No newline at end of file + return name in US_NAME_TO_STATE_ISO + +def is_blacklisted_word(word: str) -> bool: + return word.lower() in BLACKLISTED_WORDS \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py index 8b9076fe..01c13edb 100644 --- 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/constants.py @@ -15,4 +15,12 @@ INVALID_SCAN_ISOS: set[str] = { "IN", "OR", + "ME", + "ID" +} + +BLACKLISTED_WORDS: set[str] = { + "the united states", + "download", + "geoplatform" } \ No newline at end of file diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py index 615684e5..275e2946 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/nlp/core.py @@ -5,7 +5,7 @@ from spacy.tokens import Doc from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.check import \ - is_name_us_state, is_iso_us_state + is_name_us_state, is_iso_us_state, is_blacklisted_word from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.constants import \ INVALID_LOCATION_CHARACTERS, INVALID_SCAN_ISOS from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.convert import \ @@ -62,6 +62,8 @@ def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: text: str = ent.text if any(char in text for char in INVALID_LOCATION_CHARACTERS): continue + if is_blacklisted_word(text): + continue if is_name_us_state(text): us_state: USState | None = convert_us_state_name_to_us_state(text) if us_state is not None: diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py index 6a245d94..f6011f49 100644 --- 
a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/processor/query_/core.py @@ -51,7 +51,11 @@ async def run(self, session: AsyncSession) -> SearchSimilarLocationsOuterRespons lateral_top_5 = ( select( vals.c.request_id, - LocationExpandedView.location_id, + LocationExpandedView.id.label("location_id"), + func.row_number().over( + partition_by=vals.c.request_id, + order_by=similarity.desc(), + ).label("rank"), similarity.label("similarity"), ) .join( @@ -61,19 +65,24 @@ async def run(self, session: AsyncSession) -> SearchSimilarLocationsOuterRespons .order_by( similarity.desc(), ) - .limit(5) .lateral("lateral_top_5") ) - final = select( - vals.c.request_id, - lateral_top_5.c.location_id, - lateral_top_5.c.similarity, - ).join( - lateral_top_5, - vals.c.request_id == lateral_top_5.c.request_id, + final = ( + select( + vals.c.request_id, + lateral_top_5.c.location_id, + lateral_top_5.c.similarity, + ).join( + lateral_top_5, + vals.c.request_id == lateral_top_5.c.request_id, + ) + .where( + lateral_top_5.c.rank <= 5, + ) ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=final) request_id_to_locations: dict[int, list[SearchSimilarLocationsLocationInfo]] = ( defaultdict(list) diff --git a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py index 7f2e00b8..96b63bb1 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py +++ b/src/core/tasks/url/operators/location_id/subtasks/impl/nlp_location_freq/query.py @@ -3,27 +3,23 @@ from sqlalchemy import select, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ +from 
src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.constants import \ NUMBER_OF_ENTRIES_PER_ITERATION -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ - NLPLocationMatchSubtaskInput -from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ - EligibleContainer +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input_ import \ + NLPLocationFrequencySubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.queries.survey.queries.ctes.eligible import EligibleContainer from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask -from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html -class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): +class GetNLPLocationFrequencySubtaskInputQueryBuilder(QueryBuilderBase): - # TODO: Change async def run( self, session: AsyncSession - ) -> list[NLPLocationMatchSubtaskInput]: + ) -> list[NLPLocationFrequencySubtaskInput]: container = EligibleContainer() query = ( select( @@ -31,12 +27,8 @@ async def run( URLCompressedHTML.compressed_html ) .join( - AutoLocationIDSubtask, - AutoLocationIDSubtask.url_id == container.url_id, - ) - .join( - LocationIDSubtaskSuggestion, - LocationIDSubtaskSuggestion.subtask_id == AutoLocationIDSubtask.id + URLCompressedHTML, + URLCompressedHTML.url_id == container.url_id, ) .where( container.nlp_location, @@ -45,8 +37,8 @@ async def run( ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - inputs: list[NLPLocationMatchSubtaskInput] = [ - 
NLPLocationMatchSubtaskInput( + inputs: list[NLPLocationFrequencySubtaskInput] = [ + NLPLocationFrequencySubtaskInput( url_id=mapping["id"], html=decompress_html(mapping["compressed_html"]), ) diff --git a/src/core/tasks/url/operators/location_id/subtasks/loader.py b/src/core/tasks/url/operators/location_id/subtasks/loader.py index 88d3aa82..b8267cdb 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/loader.py +++ b/src/core/tasks/url/operators/location_id/subtasks/loader.py @@ -1,6 +1,6 @@ -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import NLPProcessor from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.core import \ NLPLocationFrequencySubtaskOperator +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor from src.core.tasks.url.operators.location_id.subtasks.templates.subtask import LocationIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType diff --git a/src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location_freq.py b/src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location_freq.py index 4998f4fe..7ab2e0eb 100644 --- a/src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location_freq.py +++ b/src/core/tasks/url/operators/location_id/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location_freq.py @@ -1,6 +1,6 @@ from sqlalchemy import select -from src.core.tasks.url.operators._shared.subtask.container import SubtaskCTEContainer +from src.core.tasks.url.operators._shared.container.subtask.eligible import URLsSubtaskEligibleCTEContainer from src.core.tasks.url.operators.location_id.subtasks.queries.survey.queries.ctes.subtask.helpers import \ 
get_exists_subtask_query from src.db.models.impl.url.core.sqlalchemy import URL @@ -20,6 +20,6 @@ .cte("nlp_location_eligible") ) -NLP_LOCATION_CONTAINER = SubtaskCTEContainer( +NLP_LOCATION_CONTAINER = URLsSubtaskEligibleCTEContainer( cte, ) \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/enums.py b/src/db/models/impl/flag/url_validated/enums.py index fe74b84c..1dda4a69 100644 --- a/src/db/models/impl/flag/url_validated/enums.py +++ b/src/db/models/impl/flag/url_validated/enums.py @@ -1,6 +1,7 @@ from enum import Enum +# TODO (SM422): Rename to URLType class URLValidatedType(Enum): DATA_SOURCE = "data source" META_URL = "meta url" diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index ddb606b3..66bb3547 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -59,14 +59,17 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): auto_location_subtasks = relationship( AutoLocationIDSubtask ) + # TODO (SM422): Remove uselist=False, pluralize user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( "AutoRecordTypeSuggestion", uselist=False, back_populates="url") + # TODO (SM422): Remove uselist=False, pluralize user_record_type_suggestion = relationship( "UserRecordTypeSuggestion", uselist=False, back_populates="url") auto_relevant_suggestion = relationship( "AutoRelevantSuggestion", uselist=False, back_populates="url") + # TODO (SM422): Remove uselist=False, pluralize user_relevant_suggestion = relationship( "UserRelevantSuggestion", uselist=False, back_populates="url") reviewing_user = relationship( diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py index 86f04b4b..b7412d1e 100644 --- 
a/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/sqlalchemy.py @@ -1,5 +1,5 @@ from sqlalchemy import Column, Boolean -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType @@ -18,7 +18,7 @@ class AutoLocationIDSubtask( __tablename__ = 'auto_location_id_subtasks' locations_found = Column(Boolean(), nullable=False) - type = enum_column( + type: Mapped[LocationIDSubtaskType] = enum_column( LocationIDSubtaskType, name='auto_location_id_subtask_type' ) diff --git a/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py index 9b478c91..0d5ea926 100644 --- a/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/auto/suggestion/sqlalchemy.py @@ -1,4 +1,5 @@ -from sqlalchemy import Column, Integer, ForeignKey, Float +from sqlalchemy import Column, Integer, ForeignKey, Float, PrimaryKeyConstraint +from sqlalchemy.orm import Mapped from src.db.models.helpers import location_id_column from src.db.models.templates_.base import Base @@ -9,11 +10,18 @@ class LocationIDSubtaskSuggestion( ): __tablename__ = 'location_id_subtask_suggestions' + __table_args__ = ( + PrimaryKeyConstraint( + 'subtask_id', + 'location_id', + name='location_id_subtask_suggestions_pk' + ), + ) subtask_id = Column( Integer, ForeignKey('auto_location_id_subtasks.id'), nullable=False, primary_key=True, ) - location_id = location_id_column() + location_id: Mapped[int] = location_id_column() confidence = Column(Float, nullable=False) \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py index 766a7ca5..d73de0a2 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py @@ -1,15 +1,10 @@ import pytest_asyncio -from src.db.dtos.url.mapping import URLMapping from tests.helpers.data_creator.core import DBDataCreator @pytest_asyncio.fixture -async def url_ids( +async def url_id( db_data_creator: DBDataCreator, -) -> list[int]: - # Create 2 URLs with compressed HTML - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) - url_ids: list[int] = [url.url_id for url in url_mappings] - await db_data_creator.html_data(url_ids=url_ids) - return url_ids +) -> int: + return (await db_data_creator.create_urls(count=1))[0].url_id diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py deleted file mode 100644 index d4a65ed3..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py +++ /dev/null @@ -1,116 +0,0 @@ -import pytest - -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input import \ - NLPLocationMatchSubtaskInput -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor -from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import 
AutoAgencyIDSubtaskData -from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.link.task_url import LinkTaskURL -from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo -from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from tests.helpers.asserts import assert_task_run_success -from tests.helpers.data_creator.core import DBDataCreator - -PATCH_ROOT = ( - "src.core.tasks.url.operators.agency_identification.subtasks." + - "impl.nlp_location_match_.core.AgencyIDSubtaskInternalProcessor.process" -) - - - -@pytest.mark.asyncio -async def test_nlp_location_match( - operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator, - url_ids: list[int], - monkeypatch -): - # Confirm operator meets prerequisites - assert await operator.meets_task_prerequisites() - assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH - - happy_path_url_id: int = url_ids[0] - error_url_id: int = url_ids[1] - - agency_ids: list[int] = await db_data_creator.create_agencies(count=2) - agency_id_25: int = agency_ids[0] - agency_id_75: int = agency_ids[1] - - async def mock_process_response( - self: AgencyIDSubtaskInternalProcessor, - inputs: list[NLPLocationMatchSubtaskInput], - ) -> list[AutoAgencyIDSubtaskData]: - response = [ - AutoAgencyIDSubtaskData( - pydantic_model=URLAutoAgencyIDSubtaskPydantic( - task_id=self._task_id, - url_id=happy_path_url_id, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=True, - ), - suggestions=[ - AgencySuggestion( - 
agency_id=agency_id_25, - confidence=25 - ), - AgencySuggestion( - agency_id=agency_id_75, - confidence=75 - ) - ] - ), - AutoAgencyIDSubtaskData( - pydantic_model=URLAutoAgencyIDSubtaskPydantic( - task_id=self._task_id, - url_id=error_url_id, - type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, - agencies_found=False, - ), - suggestions=[], - error="Test error" - ) - ] - return response - - monkeypatch.setattr(AgencyIDSubtaskInternalProcessor, "process", mock_process_response) - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - adb_client: AsyncDatabaseClient = operator.adb_client - # Confirm two URLs linked to the task - task_links: list[LinkTaskURL] = await adb_client.get_all(LinkTaskURL) - assert len(task_links) == 2 - assert {task_link.url_id for task_link in task_links} == set(url_ids) - assert {task_link.task_id for task_link in task_links} == {operator._task_id} - - # Confirm two subtasks were created - subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) - assert len(subtasks) == 2 - assert {subtask.url_id for subtask in subtasks} == set(url_ids) - assert {subtask.task_id for subtask in subtasks} == {operator._task_id} - assert {subtask.type for subtask in subtasks} == {AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH} - assert {subtask.agencies_found for subtask in subtasks} == {True, False} - - - # Confirm one URL error info - error_infos: list[URLErrorInfo] = await adb_client.get_all(URLErrorInfo) - assert len(error_infos) == 1 - assert error_infos[0].task_id == operator._task_id - assert error_infos[0].url_id == error_url_id - assert error_infos[0].error == "Test error" - - # Confirm two suggestions for happy path URL id - suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) - assert len(suggestions) == 2 - # Confirm expected agency ids - assert {suggestion.agency_id for suggestion in suggestions} == set(agency_ids) - # Confirm 
both have the expected confidence values - assert {suggestion.confidence for suggestion in suggestions} == {25, 75} - diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_multi_agency_location.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_multi_agency_location.py new file mode 100644 index 00000000..3da841a1 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_multi_agency_location.py @@ -0,0 +1,70 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_multi_agency_location( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo, + url_id: int +): + adb_client: AsyncDatabaseClient = operator.adb_client + + # Confirm operator does not meet prerequisites yet + assert not await operator.meets_task_prerequisites() + + # Add a location suggestion that has multiple agencies linked to it + # Create multiple agencies + agency_ids: list[int] = [ + await db_data_creator.agency() + for _ in range(2) + ] + # Link agencies to pittsburgh + await db_data_creator.link_agencies_to_location( + agency_ids=agency_ids, + 
location_id=pittsburgh_locality.location_id + ) + # Add location suggestion + await db_data_creator.add_location_suggestion( + url_id=url_id, + location_ids=[pittsburgh_locality.location_id], + confidence=80, + ) + + # Confirm operator now meets prerequisites + assert await operator.meets_task_prerequisites() + + # Confirm next task is nlp location match + assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + # Run operator and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm subtask no longer meets prerequisites + assert not await operator.meets_task_prerequisites() + + # Check for presence of subtask + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + # Confirm subtask lists agencies found + assert subtask.agencies_found + + # Confirm multiple agency suggestions in database + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 2 + + # Confirm confidence of location suggestion is distributed evenly among agency suggestions + for suggestion in suggestions: + assert suggestion.confidence == 40 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_single_agency_location.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_single_agency_location.py new file mode 100644 index 00000000..ecec3071 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_single_agency_location.py @@ -0,0 +1,76 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import 
AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_single_agency_location( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo, + allegheny_county: CountyCreationInfo, + url_id: int +): + adb_client: AsyncDatabaseClient = operator.adb_client + + # Confirm operator does not meet prerequisites yet + assert not await operator.meets_task_prerequisites() + + # Add a location suggestion that has one agency linked to it + + # Add location suggestion for two locations + await db_data_creator.add_location_suggestion( + url_id=url_id, + location_ids=[ + allegheny_county.location_id, + pittsburgh_locality.location_id + ], + confidence=68, + ) + # Confirm operator does not yet meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Create agency + agency_id: int = await db_data_creator.agency() + # Link agency to pittsburgh + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pittsburgh_locality.location_id + ) + + # Confirm operator now meets prerequisites + assert await operator.meets_task_prerequisites() + + # Confirm next task is nlp location match + assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + # Run operator and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm subtask no longer meets 
prerequisites + assert not await operator.meets_task_prerequisites() + + # Check for presence of subtask + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + # Confirm subtask lists agencies found + assert subtask.agencies_found + + # Confirm single agency suggestion in database + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 1 + + # Confirm confidence of agency suggestion equal to location suggestion + suggestion: AgencyIDSubtaskSuggestion = suggestions[0] + assert suggestion.confidence == 68 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py deleted file mode 100644 index 1e411037..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py +++ /dev/null @@ -1,18 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor -from src.external.pdap.client import PDAPClient - - -@pytest.fixture -def internal_processor() -> AgencyIDSubtaskInternalProcessor: - return AgencyIDSubtaskInternalProcessor( - nlp_processor=AsyncMock(spec=NLPProcessor), - pdap_client=AsyncMock(PDAPClient), - task_id=1 - ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py deleted file mode 100644 index 01899f30..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - -@pytest.mark.asyncio() -async def test_empty( - internal_processor: AgencyIDSubtaskInternalProcessor, -): - """ - Test that when an input has no US State or locations, - that result is not returned - """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py deleted file mode 100644 index 5fbbc6b5..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - -@pytest.mark.asyncio() -async def test_no_state_any_locations( - internal_processor: AgencyIDSubtaskInternalProcessor, -): - """ - Test that when an input has no US State and any locations - that the result is not returned - """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py deleted file mode 100644 index 6e7aef6a..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - -@pytest.mark.asyncio() -async def test_state_multiple_locations( - internal_processor: AgencyIDSubtaskInternalProcessor, -): - """ - Test that when an input has a US State and multiple locations - then multiple results are returned with separate request ids - """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py deleted file mode 100644 index c0b1cef4..00000000 --- 
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - -@pytest.mark.asyncio() -async def test_state_no_locations( - internal_processor: AgencyIDSubtaskInternalProcessor, -): - """ - Test that when an input has a US State and no locations - then no result is returned - """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py deleted file mode 100644 index 7b4ef303..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ - AgencyIDSubtaskInternalProcessor - - -@pytest.mark.asyncio() -async def test_state_one_location( - internal_processor: AgencyIDSubtaskInternalProcessor, -): - """ - Test that when an input has a US State and one locatio - then one result is returned - """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/__init__.py similarity index 100% rename from 
tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py rename to tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/__init__.py diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py new file mode 100644 index 00000000..766a7ca5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py @@ -0,0 +1,15 @@ +import pytest_asyncio + +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest_asyncio.fixture +async def url_ids( + db_data_creator: DBDataCreator, +) -> list[int]: + # Create 2 URLs with compressed HTML + url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_ids: list[int] = [url.url_id for url in url_mappings] + await db_data_creator.html_data(url_ids=url_ids) + return url_ids diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/test_core.py new file mode 100644 index 00000000..2042a588 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/test_core.py @@ -0,0 +1,120 @@ +import pytest + +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.location_id.core import LocationIdentificationTaskOperator +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.core import \ + NLPLocationFrequencySubtaskOperator +from 
src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.models.input_ import \ + NLPLocationFrequencySubtaskInput +from src.core.tasks.url.operators.location_id.subtasks.models.subtask import AutoLocationIDSubtaskData +from src.core.tasks.url.operators.location_id.subtasks.models.suggestion import LocationSuggestion +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.pydantic import AutoLocationIDSubtaskPydantic +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_nlp_location_match( + operator: LocationIdentificationTaskOperator, + db_data_creator: DBDataCreator, + url_ids: list[int], + pittsburgh_locality: LocalityCreationInfo, + allegheny_county: CountyCreationInfo, + monkeypatch +): + # Confirm operator meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == LocationIDSubtaskType.NLP_LOCATION_FREQUENCY + + happy_path_url_id: int = url_ids[0] + error_url_id: int = url_ids[1] + + async def mock_process_inputs( + self: NLPLocationFrequencySubtaskOperator, + inputs: list[NLPLocationFrequencySubtaskInput], + ) -> list[AutoLocationIDSubtaskData]: + response = [ + AutoLocationIDSubtaskData( + 
pydantic_model=AutoLocationIDSubtaskPydantic( + task_id=self.task_id, + url_id=happy_path_url_id, + type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY, + locations_found=True, + ), + suggestions=[ + LocationSuggestion( + location_id=pittsburgh_locality.location_id, + confidence=25 + ), + LocationSuggestion( + location_id=allegheny_county.location_id, + confidence=75 + ) + ] + ), + AutoLocationIDSubtaskData( + pydantic_model=AutoLocationIDSubtaskPydantic( + task_id=self.task_id, + url_id=error_url_id, + type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY, + locations_found=False, + ), + suggestions=[], + error="Test error" + ) + ] + return response + + # Remove internal processor reference - mock NLP processor instead + monkeypatch.setattr( + NLPLocationFrequencySubtaskOperator, + "_process_inputs", + mock_process_inputs + ) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + adb_client: AsyncDatabaseClient = operator.adb_client + # Confirm two URLs linked to the task + task_links: list[LinkTaskURL] = await adb_client.get_all(LinkTaskURL) + assert len(task_links) == 2 + assert {task_link.url_id for task_link in task_links} == set(url_ids) + assert {task_link.task_id for task_link in task_links} == {operator._task_id} + + # Confirm two subtasks were created + subtasks: list[AutoLocationIDSubtask] = await adb_client.get_all(AutoLocationIDSubtask) + assert len(subtasks) == 2 + assert {subtask.url_id for subtask in subtasks} == set(url_ids) + assert {subtask.task_id for subtask in subtasks} == {operator._task_id} + assert {subtask.type for subtask in subtasks} == { + LocationIDSubtaskType.NLP_LOCATION_FREQUENCY + } + assert {subtask.locations_found for subtask in subtasks} == {True, False} + + + # Confirm one URL error info + error_infos: list[URLErrorInfo] = await adb_client.get_all(URLErrorInfo) + assert len(error_infos) == 1 + assert error_infos[0].task_id == operator._task_id + assert error_infos[0].url_id == error_url_id 
+ assert error_infos[0].error == "Test error" + + # Confirm two suggestions for happy path URL id + suggestions: list[LocationIDSubtaskSuggestion] = await adb_client.get_all(LocationIDSubtaskSuggestion) + assert len(suggestions) == 2 + # Confirm expected location ids + assert {suggestion.location_id for suggestion in suggestions} == { + pittsburgh_locality.location_id, + allegheny_county.location_id, + } + # Confirm both have the expected confidence values + assert {suggestion.confidence for suggestion in suggestions} == {25, 75} + diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/test_nlp_response_valid.py similarity index 96% rename from tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py rename to tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/test_nlp_response_valid.py index 1853f689..4ad6ec3c 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/test_nlp_response_valid.py @@ -1,6 +1,6 @@ import pytest -from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models import \ +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.response import \ NLPLocationMatchResponse from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.models.us_state import \ USState diff --git a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py index 8d6d105d..a5d39643 100644 ---
a/tests/automated/integration/tasks/url/loader/conftest.py +++ b/tests/automated/integration/tasks/url/loader/conftest.py @@ -4,9 +4,8 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.loader import URLTaskOperatorLoader -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient from src.external.pdap.client import PDAPClient diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index cee1bb86..2ff92e69 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS = 10 +NUMBER_OF_TASK_OPERATORS = 11 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/automated/unit/dto/__init__.py b/tests/automated/unit/dto/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/automated/unit/dto/test_all_annotation_post_info.py b/tests/automated/unit/dto/test_all_annotation_post_info.py deleted file mode 100644 index afa4e5b6..00000000 --- a/tests/automated/unit/dto/test_all_annotation_post_info.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest - -from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo -from src.core.enums import RecordType, SuggestedStatus -from src.core.exceptions import FailedValidationException - -# Mock values to pass -mock_record_type = RecordType.ARREST_RECORDS.value 
# replace with valid RecordType if Enum -mock_agency = {"is_new": False, "suggested_agency": 1} # replace with a valid dict for the URLAgencyAnnotationPostInfo model - -@pytest.mark.parametrize( - "suggested_status, record_type, agency, should_raise", - [ - (SuggestedStatus.RELEVANT, mock_record_type, mock_agency, False), # valid - (SuggestedStatus.RELEVANT, None, mock_agency, True), # missing record_type - (SuggestedStatus.RELEVANT, mock_record_type, None, True), # missing agency - (SuggestedStatus.RELEVANT, None, None, True), # missing both - (SuggestedStatus.NOT_RELEVANT, None, None, False), # valid - (SuggestedStatus.NOT_RELEVANT, mock_record_type, None, True), # record_type present - (SuggestedStatus.NOT_RELEVANT, None, mock_agency, True), # agency present - (SuggestedStatus.NOT_RELEVANT, mock_record_type, mock_agency, True), # both present - ] -) -def test_all_annotation_post_info_validation(suggested_status, record_type, agency, should_raise): - data = { - "suggested_status": suggested_status.value, - "record_type": record_type, - "agency": agency - } - - if should_raise: - with pytest.raises(FailedValidationException): - AllAnnotationPostInfo(**data) - else: - model = AllAnnotationPostInfo(**data) - assert model.suggested_status == suggested_status diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 75aa798f..bacddfd6 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -11,6 +11,7 @@ from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL from src.db.models.impl.url.core.enums import URLSource @@ -21,6 +22,9 @@ from 
src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -605,7 +609,7 @@ async def create_locality( async def add_compressed_html( self, url_ids: list[int], - ): + ) -> None: compressed_html_inserts: list[URLCompressedHTML] = [ URLCompressedHTML( url_id=url_id, @@ -613,4 +617,46 @@ async def add_compressed_html( ) for url_id in url_ids ] - await self.adb_client.add_all(compressed_html_inserts) \ No newline at end of file + await self.adb_client.add_all(compressed_html_inserts) + + async def add_location_suggestion( + self, + url_id: int, + location_ids: list[int], + confidence: float, + type_: LocationIDSubtaskType = LocationIDSubtaskType.NLP_LOCATION_FREQUENCY + ) -> None: + locations_found: bool = len(location_ids) > 0 + task_id: int = await self.task(url_ids=[url_id]) + subtask = AutoLocationIDSubtask( + url_id=url_id, + type=type_, + task_id=task_id, + locations_found=len(location_ids) > 0 + ) + subtask_id: int = await self.adb_client.add(subtask, return_id=True) + if not locations_found: + return + suggestions: list[LocationIDSubtaskSuggestion] = [] + for location_id in location_ids: + suggestion = LocationIDSubtaskSuggestion( + subtask_id=subtask_id, + location_id=location_id, + confidence=confidence + ) + suggestions.append(suggestion) 
+ await self.adb_client.add_all(suggestions) + + async def link_agencies_to_location( + self, + agency_ids: list[int], + location_id: int + ) -> None: + links: list[LinkAgencyLocation] = [ + LinkAgencyLocation( + agency_id=agency_id, + location_id=location_id + ) + for agency_id in agency_ids + ] + await self.adb_client.add_all(links) \ No newline at end of file diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index ae9814c2..31c5c316 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -12,6 +12,7 @@ from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from tests.helpers.counter import COUNTER, next_int from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo @@ -107,6 +108,7 @@ async def create_county( county_insert_model = County( name=name, state_id=state_id, + fips=str(next_int()), ) county_id: int = await adb_client.add( county_insert_model,