diff --git a/Dockerfile b/Dockerfile index 85931528..e96272b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev # Must call from the root directory because uv does not add playwright to path RUN playwright install-deps chromium RUN playwright install chromium +# Download Spacy Model +RUN python -m spacy download en_core_web_sm # Copy project files COPY src ./src diff --git a/ENV.md b/ENV.md index 4085fcd6..c0df0c2d 100644 --- a/ENV.md +++ b/ENV.md @@ -53,7 +53,10 @@ The following flags are available: | `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | | `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | | `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | - +| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. | +| `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. | +| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. | +| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. | ## Foreign Data Wrapper (FDW) diff --git a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py index 846329ca..201d2448 100644 --- a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py +++ b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py @@ -52,7 +52,7 @@ def downgrade() -> None: _remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME) - _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) + # _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) def _delete_duplicate_urls() -> None: op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)') diff --git a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py new file mode 100644 index 00000000..de3069e2 --- /dev/null +++ b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -0,0 +1,254 @@ +"""Augment auto_agency_suggestions + +Revision ID: b741b65a1431 +Revises: 8a70ee509a74 +Create Date: 2025-08-19 08:03:12.106575 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import created_at_column, updated_at_column, id_column, url_id_column, switch_enum_type + +# revision identifiers, used by Alembic. 
+revision: str = 'b741b65a1431'
+down_revision: Union[str, None] = '8a70ee509a74'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "automated_url_agency_suggestions"
+NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "url_auto_agency_suggestions"
+
+OLD_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agencies"
+NEW_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agency"
+
+AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.Enum(
+    "homepage_match",
+    "nlp_location_match",
+    "muckrock_match",
+    "ckan_match",
+    name="agency_auto_suggestion_method",
+)
+
+FLAG_URL_VALIDATED_TABLE_NAME = "flag_url_validated"
+
+VALIDATED_URL_TYPE_ENUM = sa.Enum(
+    "data source",
+    "meta url",
+    "not relevant",
+    "individual record",
+    name="validated_url_type"
+)
+
+
+
+def upgrade() -> None:
+    op.rename_table(OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME)
+    op.rename_table(OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME)
+    _alter_auto_agency_suggestions_table()
+    _create_flag_url_validated_table()
+    _add_urls_to_flag_url_validated_table()
+    _remove_validated_and_submitted_url_statuses()
+    _reset_agencies_sync_state()
+
+
+def downgrade() -> None:
+    op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME)
+    _revert_auto_agency_suggestions_table()
+    op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME)
+    _revert_url_statuses()
+    _update_validated_and_submitted_url_statuses()
+    op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME)
+    _drop_validated_url_type_enum()
+
+def _reset_agencies_sync_state():
+    op.execute(
+        """
+        UPDATE agencies_sync_state
+        set
+            last_full_sync_at = null,
+            current_cutoff_date = null,
+            current_page = null
+        """
+    )
+
+def _remove_validated_and_submitted_url_statuses():
+    switch_enum_type(
+        table_name="urls",
+        column_name="status",
+        enum_name="url_status",
+        new_enum_values=[
+            'ok',
+            'duplicate',
+            'error',
+            '404 not found',
+        ],
+        check_constraints_to_drop=['url_name_not_null_when_validated'],
+        conversion_mappings={
+            'validated': 'ok',
+            'submitted': 'ok',
+            'pending': 'ok',
+            'not relevant': 'ok',
+            'individual record': 'ok'
+        }
+    )
+
+def _add_urls_to_flag_url_validated_table():
+    op.execute("""
+        INSERT INTO flag_url_validated (url_id, type)
+        SELECT
+            urls.id,
+            CASE urls.status::text
+                WHEN 'validated' THEN 'data source'
+                WHEN 'submitted' THEN 'data source'
+                ELSE urls.status::text
+            END::validated_url_type
+        FROM urls
+        WHERE urls.status in ('validated', 'submitted', 'individual record', 'not relevant')""")
+
+def _revert_url_statuses():
+    switch_enum_type(
+        table_name="urls",
+        column_name="status",
+        enum_name="url_status",
+        new_enum_values=[
+            'pending',
+            'validated',
+            'submitted',
+            'duplicate',
+            'not relevant',
+            'error',
+            '404 not found',
+            'individual record'
+        ],
+        conversion_mappings={
+            'ok': 'pending',
+        }
+    )
+    op.create_check_constraint(
+        "url_name_not_null_when_validated",
+        "urls",
+        "(name IS NOT NULL) OR (status <> 'validated'::url_status)"
+    )
+
+def _update_validated_and_submitted_url_statuses():
+    op.execute("""
+        UPDATE urls
+        SET status = 'not relevant'
+        FROM flag_url_validated
+        WHERE urls.id = flag_url_validated.url_id
+        AND flag_url_validated.type = 'not relevant'
+    """)
+
+    op.execute("""
+        UPDATE urls
+        SET status = 'individual record'
+        FROM flag_url_validated
+        WHERE urls.id = flag_url_validated.url_id
+        AND flag_url_validated.type = 'individual record'
+    """)
+
+    op.execute("""
+        UPDATE urls
+        SET status = 'validated'
+        FROM flag_url_validated
+        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
+        WHERE urls.id = flag_url_validated.url_id
+        AND flag_url_validated.type = 'data source'
+        AND url_data_source.url_id is NULL
+    """)
+
+    op.execute("""
+        UPDATE urls
+        SET status = 'submitted'
+        FROM flag_url_validated
+        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
+        WHERE urls.id = flag_url_validated.url_id
+        AND flag_url_validated.type = 'data source'
+        AND url_data_source.url_id is not NULL
+    """)
+
+
+def _create_flag_url_validated_table():
+    op.create_table(
+        FLAG_URL_VALIDATED_TABLE_NAME,
+        id_column(),
+        url_id_column(),
+        sa.Column(
+            'type',
+            VALIDATED_URL_TYPE_ENUM,
+            nullable=False,
+        ),
+        created_at_column(),
+        updated_at_column(),
+        sa.UniqueConstraint('url_id', name='uq_flag_url_validated_url_id')
+    )
+
+def _drop_validated_url_type_enum():
+    VALIDATED_URL_TYPE_ENUM.drop(op.get_bind())
+
+def _alter_auto_agency_suggestions_table():
+    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.create(op.get_bind())
+    # Created At
+    op.add_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        created_at_column()
+    )
+    # Updated At
+    op.add_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        updated_at_column()
+    )
+    # Method
+    op.add_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        sa.Column(
+            'method',
+            AGENCY_AUTO_SUGGESTION_METHOD_ENUM,
+            nullable=True
+        )
+    )
+    # Confidence
+    op.add_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        sa.Column(
+            'confidence',
+            sa.Float(),
+            server_default=sa.text('0.0'),
+            nullable=False
+        )
+    )
+    # Check constraint that confidence is between 0 and 1
+    op.create_check_constraint(
+        "auto_url_agency_suggestions_check_confidence_between_0_and_1",
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        "confidence BETWEEN 0 AND 1"
+    )
+
+
+def _revert_auto_agency_suggestions_table():
+    # Created At
+    op.drop_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        'created_at'
+    )
+    # Updated At
+    op.drop_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        'updated_at'
+    )
+    # Method
+    op.drop_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        'method'
+    )
+    # Confidence
+    op.drop_column(
+        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
+        'confidence'
+    )
+    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.drop(op.get_bind())
+
diff --git a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py
new file mode 100644
index 00000000..39703fde
--- /dev/null
+++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py
@@ -0,0 +1,264 @@
+"""Overhaul agency identification
+
+Revision ID: 70baaee0dd79
+Revises: b741b65a1431
+Create Date: 2025-08-31 19:30:20.690369
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from src.util.alembic_helpers import id_column, url_id_column, created_at_column, agency_id_column, updated_at_column, \
+    task_id_column
+
+# revision identifiers, used by Alembic.
+revision: str = '70baaee0dd79' +down_revision: Union[str, None] = 'b741b65a1431' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME: str = "url_has_agency_auto_suggestions_view" +URL_UNKNOWN_AGENCIES_VIEW_NAME: str = "url_unknown_agencies_view" + +URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_id_subtasks" +LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "agency_id_subtask_suggestions" + +META_URL_VIEW_NAME: str = "meta_url_view" +UNVALIDATED_URL_VIEW_NAME: str = "unvalidated_url_view" + +URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" + +AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.dialects.postgresql.ENUM( + name="agency_auto_suggestion_method", + create_type=False +) + +SUBTASK_DETAIL_CODE_ENUM = sa.Enum( + 'no details', + 'retrieval error', + 'homepage-single agency', + 'homepage-multi agency', + name="agency_id_subtask_detail_code", +) + + + + + +def upgrade() -> None: + _create_url_auto_agency_subtask_table() + _create_url_unknown_agencies_view() + _create_meta_url_view() + _create_link_agency_id_subtask_agencies_table() + _drop_url_annotation_flags_view() + _create_new_url_annotation_flags_view() + _drop_url_auto_agency_suggestions_table() + _create_unvalidated_urls_view() + + +def downgrade() -> None: + _drop_url_unknown_agencies_view() + _create_url_auto_agency_suggestions_table() + _drop_url_annotation_flags_view() + _create_old_url_annotation_flags_view() + _drop_link_agency_id_subtask_agencies_table() + _drop_url_auto_agency_subtask_table() + _drop_meta_url_view() + SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) + _drop_unvalidated_urls_view() + +def _create_unvalidated_urls_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {UNVALIDATED_URL_VIEW_NAME} as + select + u.id as url_id + from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id + where + fuv.type is null + """) + +def _drop_unvalidated_urls_view(): + op.execute(f"DROP VIEW IF EXISTS {UNVALIDATED_URL_VIEW_NAME}") + + +def _drop_url_annotation_flags_view(): + op.execute(f"DROP VIEW IF EXISTS url_annotation_flags") + + +def _drop_meta_url_view(): + op.execute(f"DROP VIEW IF EXISTS {META_URL_VIEW_NAME}") + + +def _create_meta_url_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {META_URL_VIEW_NAME} AS + SELECT + urls.id as url_id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' + """) + +def _drop_url_auto_agency_suggestions_table(): + op.drop_table(URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME) + + +def _create_new_url_annotation_flags_view(): + + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id as url_id, + EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, + EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, + EXISTS (SELECT 1 FROM public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, + EXISTS (SELECT 1 FROM public.user_record_type_suggestions a WHERE a.url_id = u.id) AS has_user_record_type_suggestion, + EXISTS (SELECT 1 FROM public.user_relevant_suggestions a WHERE a.url_id = u.id) AS has_user_relevant_suggestion, + EXISTS (SELECT 1 FROM public.user_url_agency_suggestions a WHERE a.url_id = u.id) AS has_user_agency_suggestion, + EXISTS (SELECT 1 FROM public.link_urls_agency a WHERE a.url_id = u.id) AS 
has_confirmed_agency,
+            EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed
+        FROM urls u
+        )
+        """
+    )
+
+
+def _create_url_unknown_agencies_view():
+    op.execute(
+        f"""
+        CREATE OR REPLACE VIEW {URL_UNKNOWN_AGENCIES_VIEW_NAME} AS
+        SELECT
+            u.id
+        FROM urls u
+        LEFT JOIN {URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} uas ON u.id = uas.url_id
+        GROUP BY u.id
+        HAVING bool_or(uas.agencies_found) = false
+        """
+    )
+
+
+def _create_url_auto_agency_subtask_table():
+    op.create_table(
+        URL_AUTO_AGENCY_SUBTASK_TABLE_NAME,
+        id_column(),
+        task_id_column(),
+        url_id_column(),
+        sa.Column(
+            "type",
+            AGENCY_AUTO_SUGGESTION_METHOD_ENUM,
+            nullable=False
+        ),
+        sa.Column(
+            "agencies_found",
+            sa.Boolean(),
+            nullable=False
+        ),
+        sa.Column(
+            "detail",
+            SUBTASK_DETAIL_CODE_ENUM,
+            server_default=sa.text("'no details'"),
+            nullable=False
+        ),
+        created_at_column()
+    )
+
+
+def _create_link_agency_id_subtask_agencies_table():
+    op.create_table(
+        LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME,
+        id_column(),
+        sa.Column(
+            "subtask_id",
+            sa.Integer(),
+            sa.ForeignKey(
+                f'{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME}.id',
+                ondelete='CASCADE'
+            ),
+            nullable=False,
+            comment='A foreign key to the `url_auto_agency_subtask` table.'
+        ),
+        sa.Column(
+            "confidence",
+            sa.Integer,
+            sa.CheckConstraint(
+                "confidence BETWEEN 0 and 100"
+            ),
+            nullable=False,
+        ),
+        agency_id_column(),
+        created_at_column()
+    )
+
+
+def _drop_link_agency_id_subtask_agencies_table():
+    op.drop_table(LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME)
+
+
+def _drop_url_auto_agency_subtask_table():
+    op.drop_table(URL_AUTO_AGENCY_SUBTASK_TABLE_NAME)
+
+
+def _create_url_auto_agency_suggestions_table():
+    op.create_table(
+        URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME,
+        id_column(),
+        agency_id_column(),
+        url_id_column(),
+        sa.Column(
+            "is_unknown",
+            sa.Boolean(),
+            nullable=False
+        ),
+        created_at_column(),
+        updated_at_column(),
+        sa.Column(
+            'method',
+            AGENCY_AUTO_SUGGESTION_METHOD_ENUM,
+            nullable=True
+        ),
+        sa.Column(
+            'confidence',
+            sa.Float(),
+            server_default=sa.text('0.0'),
+            nullable=False
+        ),
+        sa.UniqueConstraint("agency_id", "url_id")
+    )
+
+
+def _drop_url_unknown_agencies_view():
+    op.execute(f"DROP VIEW IF EXISTS {URL_UNKNOWN_AGENCIES_VIEW_NAME}")
+
+
+def _create_old_url_annotation_flags_view():
+    op.execute(
+        f"""
+        CREATE OR REPLACE VIEW url_annotation_flags AS
+        (
+        SELECT u.id,
+            CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion,
+            CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion,
+            CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion,
+            CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion,
+            CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion,
+            CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion,
+            CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency,
+            CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed
+        FROM urls u
+        LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id
+        LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id
+        LEFT JOIN public.{URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME} auas ON u.id = auas.url_id
+        LEFT JOIN public.user_record_type_suggestions urts ON u.id = 
urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + ) + """ + ) diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py index cd68a4b5..6ba6f7c9 100644 --- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -118,7 +118,7 @@ def upgrade(): def downgrade(): # Drop constraints first op.drop_constraint("uq_confirmed_url_agency", "confirmed_url_agency", type_="unique") - op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") + # op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") op.drop_constraint("uq_user_url_agency_suggestions", "user_url_agency_suggestions", type_="unique") # Drop tables diff --git a/pyproject.toml b/pyproject.toml index 3eb1446d..afe4a89a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "marshmallow~=3.23.2", "openai~=1.60.1", "pdap-access-manager==0.3.6", + "pip>=25.2", "playwright~=1.49.1", "psycopg2-binary~=2.9.6", "psycopg[binary]~=3.1.20", @@ -31,6 +32,8 @@ dependencies = [ "pyjwt~=2.10.1", "python-dotenv~=1.0.1", "requests~=2.32.3", + "side-effects>=1.6.dev0", + "spacy>=3.8.7", "sqlalchemy~=2.0.36", "starlette~=0.45.3", "tqdm>=4.64.1", diff --git a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 9b3ffdeb..5a56cf32 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -42,7 +42,7 @@ async def run( ) common_where_clause = [ - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, LinkBatchURL.batch_id == self.batch_id, ] diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index a6a5b69d..6eed4b07 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,7 +5,8 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -32,6 +33,10 
@@ async def run(self, session: AsyncSession): select( URL, ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) ) if self.batch_id is not None: @@ -43,7 +48,7 @@ async def run(self, session: AsyncSession): query = ( query - .where(URL.status == URLStatus.PENDING.value) + .where(FlagURLValidated.url_id.is_(None)) # URL must not have user suggestion .where( StatementComposer.user_suggestion_not_exists(self.user_suggestion_model_to_exclude) diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py deleted file mode 100644 index 1f202263..00000000 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ /dev/null @@ -1,55 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo -from src.core.enums import SuggestionType -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.queries.base.builder import QueryBuilderBase - - -class GetAgencySuggestionsQueryBuilder(QueryBuilderBase): - - def __init__( - self, - url_id: int - ): - super().__init__() - self.url_id = url_id - - async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]: - # Get relevant autosuggestions and agency info, if an associated agency exists - - statement = ( - select( - AutomatedUrlAgencySuggestion.agency_id, - AutomatedUrlAgencySuggestion.is_unknown, - Agency.name, - Agency.state, - Agency.county, - Agency.locality - ) - .join(Agency, isouter=True) - .where(AutomatedUrlAgencySuggestion.url_id == self.url_id) - ) - raw_autosuggestions = await session.execute(statement) - autosuggestions = raw_autosuggestions.all() - agency_suggestions = [] - for autosuggestion in autosuggestions: - agency_id = autosuggestion[0] - is_unknown = autosuggestion[1] - name = autosuggestion[2] - state = autosuggestion[3] - county = autosuggestion[4] - locality = autosuggestion[5] - agency_suggestions.append( - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, - pdap_agency_id=agency_id, - agency_name=name, - state=state, - county=county, - locality=locality - ) - ) - return agency_suggestions \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/__init__.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/queries/__init__.py rename to src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py new file mode 100644 index 00000000..a9a33e84 --- /dev/null +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py @@ -0,0 +1,73 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.suggestions_with_highest_confidence import \ + SuggestionsWithHighestConfidenceCTE +from src.core.enums import SuggestionType +from 
src.db.models.impl.agency.sqlalchemy import Agency
+from src.db.queries.base.builder import QueryBuilderBase
+
+from src.db.helpers.session import session_helper as sh
+
+class GetAgencySuggestionsQueryBuilder(QueryBuilderBase):
+
+    def __init__(
+        self,
+        url_id: int
+    ):
+        super().__init__()
+        self.url_id = url_id
+
+    async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]:
+        # Get relevant autosuggestions and agency info, if an associated agency exists
+
+        cte = SuggestionsWithHighestConfidenceCTE()
+
+        query = (
+            select(
+                cte.agency_id,
+                cte.confidence,
+                Agency.name,
+                Agency.state,
+                Agency.county,
+                Agency.locality
+            )
+            .outerjoin(
+                Agency,
+                Agency.agency_id == cte.agency_id
+            )
+            .where(
+                cte.url_id == self.url_id
+            )
+        )
+
+        raw_autosuggestions: Sequence[RowMapping] = await sh.mappings(session, query=query)
+        if len(raw_autosuggestions) == 0:
+            # Unknown agency
+            return [
+                GetNextURLForAgencyAgencyInfo(
+                    suggestion_type=SuggestionType.UNKNOWN,
+                )
+            ]
+
+        agency_suggestions: list[GetNextURLForAgencyAgencyInfo] = []
+        for autosuggestion in raw_autosuggestions:
+            agency_id: int = autosuggestion["agency_id"]
+            name: str = autosuggestion["name"]
+            state: str | None = autosuggestion["state"]
+            county: str | None = autosuggestion["county"]
+            locality: str | None = autosuggestion["locality"]
+            agency_suggestions.append(
+                GetNextURLForAgencyAgencyInfo(
+                    suggestion_type=SuggestionType.AUTO_SUGGESTION,
+                    pdap_agency_id=agency_id,
+                    agency_name=name,
+                    state=state,
+                    county=county,
+                    locality=locality
+                )
+            )
+        return agency_suggestions
\ No newline at end of file
diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py
new file mode 100644
index 00000000..6d389b11
--- /dev/null
+++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py
@@ -0,0 +1,44 @@
+from sqlalchemy import CTE, select, func, Column
+
+from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask
+from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion
+
+class SuggestionsWithHighestConfidenceCTE:
+
+    def __init__(self):
+        self._cte = (
+            select(
+                URLAutoAgencyIDSubtask.url_id,
+                AgencyIDSubtaskSuggestion.agency_id,
+                func.max(AgencyIDSubtaskSuggestion.confidence).label("confidence")
+            )
+            .select_from(URLAutoAgencyIDSubtask)
+            .join(
+                AgencyIDSubtaskSuggestion,
+                URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id
+            )
+            .where(
+                AgencyIDSubtaskSuggestion.agency_id.isnot(None)
+            )
+            .group_by(
+                URLAutoAgencyIDSubtask.url_id,
+                AgencyIDSubtaskSuggestion.agency_id
+            )
+            .cte("suggestions_with_highest_confidence")
+        )
+
+    @property
+    def cte(self) -> CTE:
+        return self._cte
+
+    @property
+    def url_id(self) -> Column[int]:
+        return self._cte.columns.url_id
+
+    @property
+    def agency_id(self) -> Column[int]:
+        return 
self._cte.columns.agency_id + + @property + def confidence(self) -> Column[float]: + return self._cte.columns.confidence \ No newline at end of file diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 70ae112a..e8fdc6b2 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -4,17 +4,17 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAnnotationInnerResponse -from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder @@ -48,30 +48,20 @@ async def run( # Must not have confirmed agencies query = query.where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ) - - # Must not have been annotated by a user query = ( - query.join(UserUrlAgencySuggestion, isouter=True) - .where( - ~exists( - select(UserUrlAgencySuggestion). - where(UserUrlAgencySuggestion.url_id == URL.id). - correlate(URL) - ) + query.join( + URLAnnotationFlagsView, + URLAnnotationFlagsView.url_id == URL.id ) - # Must have extant autosuggestions - .join(AutomatedUrlAgencySuggestion, isouter=True) + # Must not have been annotated by a user .where( - exists( - select(AutomatedUrlAgencySuggestion). - where(AutomatedUrlAgencySuggestion.url_id == URL.id). 
- correlate(URL) - ) + URLAnnotationFlagsView.has_user_agency_suggestion.is_(False), + # Must have extant autosuggestions + URLAnnotationFlagsView.has_auto_agency_suggestion.is_(True) ) - # Must not have confirmed agencies .join(LinkURLAgency, isouter=True) .where( ~exists( diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index a2afafd9..05855578 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import selectinload from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder -from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -39,7 +39,7 @@ async def run( query .where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, StatementComposer.user_suggestion_not_exists(UserUrlAgencySuggestion), StatementComposer.user_suggestion_not_exists(UserRecordTypeSuggestion), StatementComposer.user_suggestion_not_exists(UserRelevantSuggestion), @@ -50,7 +50,7 @@ async def run( load_options = [ URL.html_content, - URL.automated_agency_suggestions, + URL.auto_agency_subtasks, URL.auto_relevant_suggestion, URL.auto_record_type_suggestion ] diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 2d8edff9..b09b6e5d 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -5,7 +5,7 @@ from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.sqlalchemy import Duplicate -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 6a88448f..391a265f 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,7 +1,7 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py 
index 12b17ad3..73e3edb8 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -6,7 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -47,7 +47,7 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, - status=URLStatus.PENDING.value, + status=URLStatus.OK.value, record_type=entry.record_type.value if entry.record_type is not None else None, source=URLSource.MANUAL ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py b/src/api/endpoints/metrics/backlog/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py rename to src/api/endpoints/metrics/backlog/__init__.py diff --git a/src/api/endpoints/metrics/backlog/query.py b/src/api/endpoints/metrics/backlog/query.py new file mode 100644 index 00000000..788ef424 --- /dev/null +++ b/src/api/endpoints/metrics/backlog/query.py @@ -0,0 +1,53 @@ +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO +from src.db.models.impl.backlog_snapshot import BacklogSnapshot +from src.db.queries.base.builder import QueryBuilderBase + + +class GetBacklogMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsBacklogResponseDTO: + month = func.date_trunc('month', BacklogSnapshot.created_at) + + # 1. Create a subquery that assigns row_number() partitioned by month + monthly_snapshot_subq = ( + select( + BacklogSnapshot.id, + BacklogSnapshot.created_at, + BacklogSnapshot.count_pending_total, + month.label("month_start"), + func.row_number() + .over( + partition_by=month, + order_by=BacklogSnapshot.created_at.desc() + ) + .label("row_number") + ) + .subquery() + ) + + # 2. 
Filter for the top (most recent) row in each month + stmt = ( + select( + monthly_snapshot_subq.c.month_start, + monthly_snapshot_subq.c.created_at, + monthly_snapshot_subq.c.count_pending_total + ) + .where(monthly_snapshot_subq.c.row_number == 1) + .order_by(monthly_snapshot_subq.c.month_start) + ) + + raw_result = await session.execute(stmt) + results = raw_result.all() + final_results = [] + for result in results: + final_results.append( + GetMetricsBacklogResponseInnerDTO( + month=result.month_start.strftime("%B %Y"), + count_pending_total=result.count_pending_total, + ) + ) + + return GetMetricsBacklogResponseDTO(entries=final_results) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py deleted file mode 100644 index e7de65fb..00000000 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ /dev/null @@ -1,117 +0,0 @@ -from sqlalchemy import case, select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.functions import coalesce - -from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ - GetMetricsBatchesAggregatedInnerResponseDTO -from src.collectors.enums import URLStatus, CollectorType -from src.core.enums import BatchStatus -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> GetMetricsBatchesAggregatedResponseDTO: - sc = StatementComposer - - # First, get all batches broken down by collector type and status - def batch_column(status: BatchStatus, label): - return sc.count_distinct( - case( - ( - Batch.status == status.value, - Batch.id - ) - ), - label=label - ) - - batch_count_subquery = select( - batch_column(BatchStatus.READY_TO_LABEL, label="done_count"), - batch_column(BatchStatus.ERROR, label="error_count"), - Batch.strategy, - ).group_by(Batch.strategy).subquery("batch_count") - - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) - - # Next, count urls - url_count_subquery = select( - Batch.strategy, - url_column(URLStatus.PENDING, label="pending_count"), - url_column(URLStatus.ERROR, label="error_count"), - url_column(URLStatus.VALIDATED, label="validated_count"), - url_column(URLStatus.SUBMITTED, label="submitted_count"), - url_column(URLStatus.NOT_RELEVANT, label="rejected_count"), - - ).join( - LinkBatchURL, - LinkBatchURL.url_id == URL.id - ).outerjoin( - Batch, Batch.id == LinkBatchURL.batch_id - ).group_by( - Batch.strategy - ).subquery("url_count") - - # Combine - query = select( - Batch.strategy, - batch_count_subquery.c.done_count.label("batch_done_count"), - batch_count_subquery.c.error_count.label("batch_error_count"), - coalesce(url_count_subquery.c.pending_count, 0).label("pending_count"), - coalesce(url_count_subquery.c.error_count, 0).label("error_count"), - coalesce(url_count_subquery.c.submitted_count, 0).label("submitted_count"), - coalesce(url_count_subquery.c.rejected_count, 0).label("rejected_count"), - coalesce(url_count_subquery.c.validated_count, 0).label("validated_count") - ).join( - batch_count_subquery, - Batch.strategy 
== batch_count_subquery.c.strategy - ).outerjoin( - url_count_subquery, - Batch.strategy == url_count_subquery.c.strategy - ) - raw_results = await session.execute(query) - results = raw_results.all() - d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} - for result in results: - d[CollectorType(result.strategy)] = GetMetricsBatchesAggregatedInnerResponseDTO( - count_successful_batches=result.batch_done_count, - count_failed_batches=result.batch_error_count, - count_urls=result.pending_count + result.submitted_count + - result.rejected_count + result.error_count + - result.validated_count, - count_urls_pending=result.pending_count, - count_urls_validated=result.validated_count, - count_urls_submitted=result.submitted_count, - count_urls_rejected=result.rejected_count, - count_urls_errors=result.error_count - ) - - total_batch_query = await session.execute( - select( - sc.count_distinct(Batch.id, label="count") - ) - ) - total_batch_count = total_batch_query.scalars().one_or_none() - if total_batch_count is None: - total_batch_count = 0 - - return GetMetricsBatchesAggregatedResponseDTO( - total_batches=total_batch_count, - by_strategy=d - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/__init__.py diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py new file mode 100644 index 00000000..7eed215a --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py @@ -0,0 +1,28 @@ +from typing import Sequence + +from sqlalchemy import func, select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class CountAllURLsByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join(LinkBatchURL) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py similarity index 100% rename from 
tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py new file mode 100644 index 00000000..f8587b68 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import CTE, select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class BatchStatusByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[BatchStatusCountByBatchStrategyResponseDTO]: + query = ( + select( + Batch.strategy, + Batch.status, + func.count(Batch.id).label("count") + ) + .group_by(Batch.strategy, Batch.status) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + results: list[BatchStatusCountByBatchStrategyResponseDTO] = [] + for mapping in mappings: + results.append( + BatchStatusCountByBatchStrategyResponseDTO( + strategy=CollectorType(mapping["strategy"]), + status=BatchStatus(mapping["status"]), + count=mapping["count"] + ) + ) + return results \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py new file mode 100644 index 00000000..79c1b2dd --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus + + +class BatchStatusCountByBatchStrategyResponseDTO(BaseModel): + strategy: CollectorType + status: BatchStatus + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py new file mode 100644 index 00000000..2642f002 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -0,0 +1,79 @@ +from sqlalchemy import case, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.sql.functions import coalesce, func + +from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ + GetMetricsBatchesAggregatedInnerResponseDTO +from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.requester import \ + GetBatchesAggregatedMetricsQueryRequester +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from 
src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \
+    ValidatedURLCountByBatchStrategyQueryBuilder
+from src.collectors.enums import URLStatus, CollectorType
+from src.core.enums import BatchStatus
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource
+from src.db.queries.base.builder import QueryBuilderBase
+from src.db.statement_composer import StatementComposer
+
+
+class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase):
+
+    async def run(
+        self,
+        session: AsyncSession
+    ) -> GetMetricsBatchesAggregatedResponseDTO:
+
+        requester = GetBatchesAggregatedMetricsQueryRequester(session=session)
+
+        url_error_count_dict: dict[CollectorType, int] = await requester.url_error_by_collector_strategy()
+        url_pending_count_dict: dict[CollectorType, int] = await requester.pending_url_count_by_collector_strategy()
+        url_submitted_count_dict: dict[CollectorType, int] = await requester.submitted_url_count_by_collector_strategy()
+        url_validated_count_dict: dict[CollectorType, int] = await requester.validated_url_count_by_collector_strategy()
+        url_rejected_count_dict: dict[CollectorType, int] = await requester.rejected_url_count_by_collector_strategy()
+        url_total_count_dict: dict[CollectorType, int] = await requester.url_count_by_collector_strategy()
+        batch_status_count_dict: dict[
+            CollectorType,
+            dict[BatchStatus, int]
+        ] = await requester.batch_status_by_collector_strategy()
+
+
+
+
+
+        d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {}
+        for collector_type in CollectorType:
+            inner_response = GetMetricsBatchesAggregatedInnerResponseDTO(
+                count_successful_batches=batch_status_count_dict[collector_type][BatchStatus.READY_TO_LABEL],
+                count_failed_batches=batch_status_count_dict[collector_type][BatchStatus.ERROR],
+                count_urls=url_total_count_dict[collector_type],
+                count_urls_pending=url_pending_count_dict[collector_type],
+                count_urls_validated=url_validated_count_dict[collector_type],
+                count_urls_submitted=url_submitted_count_dict[collector_type],
+                count_urls_rejected=url_rejected_count_dict[collector_type],
+                count_urls_errors=url_error_count_dict[collector_type],
+            )
+            d[collector_type] = inner_response
+
+        total_batch_query = await session.execute(
+            select(
+                func.count(Batch.id).label("count")
+            )
+        )
+        total_batch_count = total_batch_query.scalars().one_or_none()
+        if total_batch_count is None:
+            total_batch_count = 0
+
+        return GetMetricsBatchesAggregatedResponseDTO(
+            total_batches=total_batch_count,
+            by_strategy=d
+        )
\ No newline at end of file
diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py
similarity index 100%
rename from tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py
rename to src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py
diff --git a/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py
new file mode 100644
index 00000000..9ceb7781
--- /dev/null
+++ 
b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType + + +class CountByBatchStrategyResponse(BaseModel): + strategy: CollectorType + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py new file mode 100644 index 00000000..224d3bad --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class PendingURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where(FlagURLValidated.url_id.is_(None)) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py new file mode 100644 index 00000000..6c1d9e0f --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py @@ -0,0 +1,39 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class RejectedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(FlagURLValidated.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id 
== LinkBatchURL.url_id + ) + .where(FlagURLValidated.type == URLValidatedType.NOT_RELEVANT) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py new file mode 100644 index 00000000..4a129dfb --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py @@ -0,0 +1,11 @@ +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType + + +def convert_strategy_counts_to_strategy_count_dict( + responses: list[CountByBatchStrategyResponse] +) -> dict[CollectorType, int]: + result: dict[CollectorType, int] = {collector_type: 0 for collector_type in CollectorType} + for response in responses: + result[response.strategy] = response.count + return result \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py new file mode 100644 index 00000000..ac4c6dfa --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py @@ -0,0 +1,75 @@ + +from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.api.endpoints.metrics.batches.aggregated.query.pending.query import PendingURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.rejected.query import \ + RejectedURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.convert import \ + convert_strategy_counts_to_strategy_count_dict +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ + ValidatedURLCountByBatchStrategyQueryBuilder +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.queries.base.builder import QueryBuilderBase +from src.db.templates.requester import RequesterBase + + +class GetBatchesAggregatedMetricsQueryRequester(RequesterBase): + + async def _run_strategy_count_query_builder( + self, query_builder: type[QueryBuilderBase]) -> dict[CollectorType, int]: + responses: list[CountByBatchStrategyResponse] = \ + await query_builder().run(self.session) + + return convert_strategy_counts_to_strategy_count_dict(responses) + + async def url_error_by_collector_strategy(self) -> dict[CollectorType, int]: + return 
await self._run_strategy_count_query_builder(URLErrorByBatchStrategyQueryBuilder) + + async def url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(CountAllURLsByBatchStrategyQueryBuilder) + + async def submitted_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(CountSubmittedByBatchStrategyQueryBuilder) + + async def validated_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(ValidatedURLCountByBatchStrategyQueryBuilder) + + async def rejected_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(RejectedURLCountByBatchStrategyQueryBuilder) + + async def pending_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(PendingURLCountByBatchStrategyQueryBuilder) + + async def batch_status_by_collector_strategy(self) -> dict[ + CollectorType, + dict[BatchStatus, int] + ]: + + responses: list[BatchStatusCountByBatchStrategyResponseDTO] = \ + await BatchStatusByBatchStrategyQueryBuilder().run(self.session) + + result: dict[CollectorType, dict[BatchStatus, int]] = { + collector_type: { + BatchStatus.ERROR: 0, + BatchStatus.READY_TO_LABEL: 0, + } + for collector_type in CollectorType + } + for response in responses: + if response.status not in ( + BatchStatus.ERROR, + BatchStatus.READY_TO_LABEL + ): + continue + result[response.strategy][response.status] = response.count + + return result + diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py new file mode 100644 index 00000000..ee8f8065 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -0,0 +1,45 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class CountSubmittedByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[ + CountByBatchStrategyResponse + ]: + query = ( + select( + Batch.strategy, + func.count(URLDataSource.id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + URLDataSource, + URLDataSource.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results: list[CountByBatchStrategyResponse] = [] + for mapping in mappings: + results.append( + CountByBatchStrategyResponse( + strategy=CollectorType(mapping["strategy"]), + count=mapping["count"] + ) + ) + return results diff --git 
a/src/api/endpoints/metrics/batches/aggregated/query/url_error/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py new file mode 100644 index 00000000..9bcc3a57 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -0,0 +1,34 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import URLStatus +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class URLErrorByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: + query = ( + select( + Batch.strategy, + func.count(URL.id).label("count") + ) + .select_from(Batch) + .join(LinkBatchURL) + .join(URL) + .where(URL.status == URLStatus.ERROR) + .group_by(Batch.strategy, URL.status) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results + + diff --git a/src/api/endpoints/metrics/batches/aggregated/query/validated_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/validated_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py new file mode 100644 index 00000000..155cbcb0 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py @@ -0,0 +1,38 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class ValidatedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(FlagURLValidated.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/breakdown/error/__init__.py b/src/api/endpoints/metrics/batches/breakdown/error/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py
new file mode 100644
index 00000000..ed2ff44f
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py
@@ -0,0 +1,25 @@
+from sqlalchemy import select, func
+
+from src.collectors.enums import URLStatus
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
+from src.db.models.impl.url.core.sqlalchemy import URL
+
+URL_ERROR_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(LinkBatchURL.url_id).label("count_error")
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.batch_id == Batch.id
+    )
+    .join(
+        URL,
+        URL.id == LinkBatchURL.url_id
+    )
+    .where(URL.status == URLStatus.ERROR)
+    .group_by(Batch.id)
+    .cte("error")
+)
diff --git a/src/api/endpoints/metrics/batches/breakdown/not_relevant/__init__.py b/src/api/endpoints/metrics/batches/breakdown/not_relevant/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py
new file mode 100644
index 00000000..14403e86
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py
@@ -0,0 +1,27 @@
+from sqlalchemy import select, func
+
+from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+
+NOT_RELEVANT_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(FlagURLValidated.url_id).label("count_rejected")
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.batch_id == Batch.id
+    )
+    .join(
+        FlagURLValidated,
+        FlagURLValidated.url_id == LinkBatchURL.url_id
+    )
+    .where(
+        FlagURLValidated.type == URLValidatedType.NOT_RELEVANT
+    )
+    .group_by(Batch.id)
+    .cte("not_relevant")
+)
\ No newline at end of file
diff --git a/src/api/endpoints/metrics/batches/breakdown/pending/__init__.py b/src/api/endpoints/metrics/batches/breakdown/pending/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py b/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py
new file mode 100644
index 00000000..bf09f345
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py
@@ -0,0 +1,26 @@
+from sqlalchemy import select, func
+
+from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+
+PENDING_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(LinkBatchURL.url_id).label("count_pending")
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.batch_id == Batch.id
+    )
+    .outerjoin(
+        FlagURLValidated,
+        FlagURLValidated.url_id == LinkBatchURL.url_id
+    )
+    .where(
+        FlagURLValidated.url_id.is_(None)
+    )
+    .group_by(Batch.id)
+    .cte("pending")
+)
\ No newline at end of file
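(The PENDING_CTE above counts a batch's URLs as pending via an anti-join: outer join to flag_url_validated, then keep rows where no flag matched. For comparison, a minimal sketch of the equivalent NOT EXISTS formulation follows; it is illustrative only, not part of the changeset, and PENDING_CTE_NOT_EXISTS is a hypothetical name.)

from sqlalchemy import select, func

from src.db.models.impl.batch.sqlalchemy import Batch
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL

# Hypothetical alternative: NOT EXISTS instead of outer join + IS NULL.
PENDING_CTE_NOT_EXISTS = (
    select(
        Batch.id,
        func.count(LinkBatchURL.url_id).label("count_pending")
    )
    .join(LinkBatchURL, LinkBatchURL.batch_id == Batch.id)
    .where(
        # A URL counts as pending when no validation flag row exists for it.
        ~select(FlagURLValidated.url_id)
        .where(FlagURLValidated.url_id == LinkBatchURL.url_id)
        .exists()
    )
    .group_by(Batch.id)
    .cte("pending_not_exists")
)

Both shapes yield the same counts; the changeset's outer-join form lets each breakdown CTE share the BatchesBreakdownURLCTE wrapper defined below.

diff --git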
a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 6fe0eb71..5847e309 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -1,13 +1,20 @@ -from sqlalchemy import select, case +from sqlalchemy import select, case, Column from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.functions import coalesce from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO, \ GetMetricsBatchesBreakdownInnerResponseDTO +from src.api.endpoints.metrics.batches.breakdown.error.cte_ import URL_ERROR_CTE +from src.api.endpoints.metrics.batches.breakdown.not_relevant.cte_ import NOT_RELEVANT_CTE +from src.api.endpoints.metrics.batches.breakdown.pending.cte_ import PENDING_CTE +from src.api.endpoints.metrics.batches.breakdown.submitted.cte_ import SUBMITTED_CTE +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.api.endpoints.metrics.batches.breakdown.total.cte_ import TOTAL_CTE +from src.api.endpoints.metrics.batches.breakdown.validated.cte_ import VALIDATED_CTE from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -32,28 +39,32 @@ async def run(self, session: AsyncSession) -> GetMetricsBatchesBreakdownResponse Batch.date_generated.label("created_at"), ) - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) + all_ctes: list[BatchesBreakdownURLCTE] = [ + URL_ERROR_CTE, + NOT_RELEVANT_CTE, + PENDING_CTE, + SUBMITTED_CTE, + TOTAL_CTE, + VALIDATED_CTE + ] + + count_columns: list[Column] = [ + cte.count for cte in all_ctes + ] + count_query = select( - LinkBatchURL.batch_id, - sc.count_distinct(URL.id, label="count_total"), - url_column(URLStatus.PENDING, label="count_pending"), - url_column(URLStatus.SUBMITTED, label="count_submitted"), - url_column(URLStatus.NOT_RELEVANT, label="count_rejected"), - url_column(URLStatus.ERROR, label="count_error"), - url_column(URLStatus.VALIDATED, label="count_validated"), - ).join(URL, LinkBatchURL.url_id == URL.id).group_by( - LinkBatchURL.batch_id - ).subquery("url_count") + Batch.id.label("batch_id"), + *count_columns + ) + for cte in all_ctes: + count_query = count_query.outerjoin( + cte.query, + Batch.id == cte.batch_id + ) + + count_query = count_query.cte("url_count") + query = (select( main_query.c.strategy, diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/__init__.py b/src/api/endpoints/metrics/batches/breakdown/submitted/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py new file mode 100644 index 00000000..face1891 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py @@ -0,0 +1,23 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from 
src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource
+
+SUBMITTED_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(URLDataSource.id).label("count_submitted")
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.batch_id == Batch.id
+    )
+    .join(
+        URLDataSource,
+        URLDataSource.url_id == LinkBatchURL.url_id
+    )
+    .group_by(Batch.id)
+    .cte("submitted")
+)
\ No newline at end of file
diff --git a/src/api/endpoints/metrics/batches/breakdown/templates/__init__.py b/src/api/endpoints/metrics/batches/breakdown/templates/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py b/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py
new file mode 100644
index 00000000..3fdd7521
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py
@@ -0,0 +1,19 @@
+from sqlalchemy import CTE, Column
+
+
+class BatchesBreakdownURLCTE:
+
+    def __init__(self, query: CTE):
+        self._query = query
+
+    @property
+    def query(self) -> CTE:
+        return self._query
+
+    @property
+    def batch_id(self) -> Column:
+        return self._query.columns[0]
+
+    @property
+    def count(self) -> Column:
+        return self._query.columns[1]
\ No newline at end of file
diff --git a/src/api/endpoints/metrics/batches/breakdown/total/__init__.py b/src/api/endpoints/metrics/batches/breakdown/total/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/metrics/batches/breakdown/total/cte_.py b/src/api/endpoints/metrics/batches/breakdown/total/cte_.py
new file mode 100644
index 00000000..33cf0c84
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/total/cte_.py
@@ -0,0 +1,15 @@
+from sqlalchemy import select, func
+
+from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+
+TOTAL_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(LinkBatchURL.url_id).label("count_total")
+    )
+    .join(LinkBatchURL)
+    .group_by(Batch.id)
+    .cte("total")
+)
\ No newline at end of file
diff --git a/src/api/endpoints/metrics/batches/breakdown/validated/__init__.py b/src/api/endpoints/metrics/batches/breakdown/validated/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py b/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py
new file mode 100644
index 00000000..b6ff5ef1
--- /dev/null
+++ b/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py
@@ -0,0 +1,23 @@
+from sqlalchemy import select, func
+
+from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+
+VALIDATED_CTE = BatchesBreakdownURLCTE(
+    select(
+        Batch.id,
+        func.count(FlagURLValidated.url_id).label("count_validated")
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.batch_id == Batch.id
+    )
+    .join(
+        FlagURLValidated,
+        FlagURLValidated.url_id == LinkBatchURL.url_id
+    )
+    .group_by(Batch.id)
+    .cte("validated")
+)
\ No newline at end of file
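(The wrapper class above imposes a positional contract: column 0 of each CTE is the batch id and column 1 is the count. A minimal sketch of how the rewritten breakdown query consumes that contract follows, abridged to two CTEs; it is illustrative only, and the real list is assembled in breakdown/query.py as shown earlier in this diff.)

from sqlalchemy import select

from src.db.models.impl.batch.sqlalchemy import Batch

all_ctes = [TOTAL_CTE, VALIDATED_CTE]  # abridged; the diff wires up all six CTEs
count_query = select(
    Batch.id.label("batch_id"),
    *[cte.count for cte in all_ctes]
)
for cte in all_ctes:
    # Outer joins keep batches that are absent from a given CTE; their NULL
    # counts are expected to be coalesced to 0 downstream in the outer query.
    count_query = count_query.outerjoin(cte.query, Batch.id == cte.batch_id)

diff --git a/src/api/endpoints/metrics/urls/__init__.py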
b/src/api/endpoints/metrics/urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/__init__.py b/src/api/endpoints/metrics/urls/aggregated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/query/__init__.py b/src/api/endpoints/metrics/urls/aggregated/query/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/query/core.py b/src/api/endpoints/metrics/urls/aggregated/query/core.py new file mode 100644 index 00000000..57bc4211 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/core.py @@ -0,0 +1,48 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.all import ALL_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.error import ERROR_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.pending import PENDING_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.rejected import REJECTED_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.submitted import SUBMITTED_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.validated import VALIDATED_SUBQUERY +from src.collectors.enums import URLStatus +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetURLsAggregatedMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsURLsAggregatedResponseDTO: + + oldest_pending_url_query = select( + URL.id, + URL.created_at + ).where( + URL.status == URLStatus.OK.value + ).order_by( + URL.created_at.asc() + ).limit(1) + + oldest_pending_url = await session.execute(oldest_pending_url_query) + oldest_pending_url = oldest_pending_url.one_or_none() + if oldest_pending_url is None: + oldest_pending_url_id = None + oldest_pending_created_at = None + else: + oldest_pending_url_id = oldest_pending_url.id + oldest_pending_created_at = oldest_pending_url.created_at + + return GetMetricsURLsAggregatedResponseDTO( + count_urls_total=await sh.scalar(session, query=ALL_SUBQUERY), + count_urls_pending=await sh.scalar(session, query=PENDING_SUBQUERY), + count_urls_submitted=await sh.scalar(session, query=SUBMITTED_SUBQUERY), + count_urls_validated=await sh.scalar(session, query=VALIDATED_SUBQUERY), + count_urls_rejected=await sh.scalar(session, query=REJECTED_SUBQUERY), + count_urls_errors=await sh.scalar(session, query=ERROR_SUBQUERY), + oldest_pending_url_id=oldest_pending_url_id, + oldest_pending_url_created_at=oldest_pending_created_at, + ) diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/__init__.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py new file mode 100644 index 00000000..a2d09217 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py @@ -0,0 +1,9 @@ +from sqlalchemy import select, func + +from src.db.models.impl.url.core.sqlalchemy import URL + +ALL_SUBQUERY = ( + 
select( + func.count(URL.id).label("count") + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py new file mode 100644 index 00000000..407b0e4b --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py @@ -0,0 +1,11 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL + +ERROR_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .where(URL.status == URLStatus.ERROR) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py new file mode 100644 index 00000000..31d8e2b6 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py @@ -0,0 +1,19 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +PENDING_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) + .where( + URL.status == URLStatus.OK, + FlagURLValidated.url_id.is_(None), + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py new file mode 100644 index 00000000..983554ab --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, func + +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +REJECTED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) + .where( + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT, + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py new file mode 100644 index 00000000..34be5e26 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py @@ -0,0 +1,14 @@ +from sqlalchemy import func, select + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource + +SUBMITTED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + URLDataSource, + URL.id == URLDataSource.url_id, + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py new file mode 100644 index 00000000..fb771db6 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py @@ -0,0 +1,14 @@ +from sqlalchemy import select, func + +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +VALIDATED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) +) \ No 
newline at end of file diff --git a/src/api/endpoints/metrics/urls/breakdown/__init__.py b/src/api/endpoints/metrics/urls/breakdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/breakdown/query/__init__.py b/src/api/endpoints/metrics/urls/breakdown/query/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py new file mode 100644 index 00000000..3fc52c3f --- /dev/null +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -0,0 +1,91 @@ +from typing import Any + +from sqlalchemy import select, case, literal, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseInnerDTO, \ + GetMetricsURLsBreakdownPendingResponseDTO +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.queries.base.builder import QueryBuilderBase + + +class GetURLsBreakdownPendingMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResponseDTO: + + flags = ( + select( + URL.id.label("url_id"), + case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_record_type_annotation" + ), + case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_relevant_annotation" + ), + case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_agency_annotation" + ), + ) + .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) + .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) + .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + ).cte("flags") + + month = func.date_trunc('month', URL.created_at) + + # Build the query + query = ( + select( + month.label('month'), + func.count(URL.id).label('count_total'), + func.count( + case( + (flags.c.has_user_record_type_annotation == True, 1) + ) + ).label('user_record_type_count'), + func.count( + case( + (flags.c.has_user_relevant_annotation == True, 1) + ) + ).label('user_relevant_count'), + func.count( + case( + (flags.c.has_user_agency_annotation == True, 1) + ) + ).label('user_agency_count'), + ) + .outerjoin(flags, flags.c.url_id == URL.id) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + FlagURLValidated.url_id.is_(None), + URL.status == URLStatus.OK + ) + .group_by(month) + .order_by(month.asc()) + ) + + # Execute the query and return the results + results = await session.execute(query) + all_results = results.all() + final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] + + for result in all_results: + dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( + month=result.month.strftime("%B %Y"), + count_pending_total=result.count_total, + count_pending_relevant_user=result.user_relevant_count, + count_pending_record_type_user=result.user_record_type_count, + 
count_pending_agency_user=result.user_agency_count, + ) + final_results.append(dto) + return GetMetricsURLsBreakdownPendingResponseDTO( + entries=final_results, + ) \ No newline at end of file diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index af810a2b..86c0212c 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -9,6 +9,8 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -30,76 +32,38 @@ def __init__( async def run(self, session: AsyncSession) -> None: # Get URL + url = await self._get_url(session) - query = ( - Select(URL) - .where(URL.id == self.approval_info.url_id) - .options( - joinedload(URL.optional_data_source_metadata), - joinedload(URL.confirmed_agencies), - ) - ) - - url = await session.execute(query) - url = url.scalars().first() - - update_if_not_none( - url, - "record_type", - self.approval_info.record_type.value - if self.approval_info.record_type is not None else None, - required=True - ) + await self._optionally_update_record_type(url) # Get existing agency ids existing_agencies = url.confirmed_agencies or [] existing_agency_ids = [agency.agency_id for agency in existing_agencies] new_agency_ids = self.approval_info.agency_ids or [] - if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail="Must specify agency_id if URL does not already have a confirmed agency" - ) + await self._check_for_unspecified_agency_ids(existing_agency_ids, new_agency_ids) - # Get any existing agency ids that are not in the new agency ids - # If new agency ids are specified, overwrite existing - if len(new_agency_ids) != 0: - for existing_agency in existing_agencies: - if existing_agency.id not in new_agency_ids: - # If the existing agency id is not in the new agency ids, delete it - await session.delete(existing_agency) + await self._overwrite_existing_agencies(existing_agencies, new_agency_ids, session) # Add any new agency ids that are not in the existing agency ids - for new_agency_id in new_agency_ids: - if new_agency_id not in existing_agency_ids: - # Check if the new agency exists in the database - query = ( - select(Agency) - .where(Agency.agency_id == new_agency_id) - ) - existing_agency = await session.execute(query) - existing_agency = existing_agency.scalars().first() - if existing_agency is None: - # If not, create it - agency = Agency( - agency_id=new_agency_id, - name=PLACEHOLDER_AGENCY_NAME, - ) - session.add(agency) - - # If the new agency id is not in the existing agency ids, add it - confirmed_url_agency = LinkURLAgency( - url_id=self.approval_info.url_id, - agency_id=new_agency_id - ) - session.add(confirmed_url_agency) + await self._add_new_agencies(existing_agency_ids, new_agency_ids, session) - # If it does, do nothing + await self._add_validated_flag(session, url=url) - url.status = URLStatus.VALIDATED.value + await self._optionally_update_required_metadata(url) + await 
self._optionally_update_optional_metadata(url) + await self._add_approving_user(session) + async def _optionally_update_required_metadata(self, url: URL) -> None: update_if_not_none(url, "name", self.approval_info.name, required=True) update_if_not_none(url, "description", self.approval_info.description, required=False) + async def _add_approving_user(self, session: AsyncSession) -> None: + approving_user_url = ReviewingUserURL( + user_id=self.user_id, + url_id=self.approval_info.url_id + ) + session.add(approving_user_url) + + async def _optionally_update_optional_metadata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( @@ -124,10 +88,85 @@ async def run(self, session: AsyncSession) -> None: self.approval_info.supplying_entity ) - # Add approving user - approving_user_url = ReviewingUserURL( - user_id=self.user_id, - url_id=self.approval_info.url_id + async def _optionally_update_record_type(self, url: URL) -> None: + update_if_not_none( + url, + "record_type", + self.approval_info.record_type.value + if self.approval_info.record_type is not None else None, + required=True ) - session.add(approving_user_url) \ No newline at end of file + async def _get_url(self, session: AsyncSession) -> URL: + query = ( + Select(URL) + .where(URL.id == self.approval_info.url_id) + .options( + joinedload(URL.optional_data_source_metadata), + joinedload(URL.confirmed_agencies), + ) + ) + url = await session.execute(query) + url = url.scalars().first() + return url + + async def _check_for_unspecified_agency_ids( + self, + existing_agency_ids: list[int], + new_agency_ids: list[int] + ) -> None: + """ + Raises: + HTTPException: If no agency ids are specified and no existing agency ids are found + """ + if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail="Must specify agency_id if URL does not already have a confirmed agency" + ) + + async def _overwrite_existing_agencies(self, existing_agencies, new_agency_ids, session): + # Get any existing agency ids that are not in the new agency ids + # If new agency ids are specified, overwrite existing + if len(new_agency_ids) != 0: + for existing_agency in existing_agencies: + if existing_agency.id not in new_agency_ids: + # If the existing agency id is not in the new agency ids, delete it + await session.delete(existing_agency) + + async def _add_new_agencies(self, existing_agency_ids, new_agency_ids, session): + for new_agency_id in new_agency_ids: + if new_agency_id in existing_agency_ids: + continue + # Check if the new agency exists in the database + query = ( + select(Agency) + .where(Agency.agency_id == new_agency_id) + ) + existing_agency = await session.execute(query) + existing_agency = existing_agency.scalars().first() + if existing_agency is None: + # If not, create it + agency = Agency( + agency_id=new_agency_id, + name=PLACEHOLDER_AGENCY_NAME, + ) + session.add(agency) + + # If the new agency id is not in the existing agency ids, add it + confirmed_url_agency = LinkURLAgency( + url_id=self.approval_info.url_id, + agency_id=new_agency_id + ) + session.add(confirmed_url_agency) + + async def _add_validated_flag( + self, + session: AsyncSession, + url: URL + ) -> None: + flag = FlagURLValidated( + url_id=url.id, + type=URLValidatedType.DATA_SOURCE + ) + session.add(flag)
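(With this refactor, approval and rejection converge on one persistence pattern: the urls row keeps status 'ok' while the review outcome is recorded as a flag_url_validated row, as _add_validated_flag above and the reject query later in this diff both do. A minimal sketch of that shared pattern follows; record_review_outcome is a hypothetical helper, not part of the changeset.)

from sqlalchemy.ext.asyncio import AsyncSession

from src.db.models.impl.flag.url_validated.enums import URLValidatedType
from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated


async def record_review_outcome(
    session: AsyncSession,
    url_id: int,
    validated_type: URLValidatedType
) -> None:
    # Hypothetical helper: approval passes DATA_SOURCE; rejection passes
    # NOT_RELEVANT or INDIVIDUAL_RECORD. The URL's status column is untouched.
    session.add(FlagURLValidated(url_id=url_id, type=validated_type))

diff --git a/src/api/endpoints/review/next/convert.py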
b/src/api/endpoints/review/next/convert.py new file mode 100644 index 00000000..ca087895 --- /dev/null +++ b/src/api/endpoints/review/next/convert.py @@ -0,0 +1,108 @@ +from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.review.next.dto import FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyAutoInfo +from src.core.enums import SuggestionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion + + +def convert_agency_info_to_final_review_annotation_agency_info( + subtasks: list[URLAutoAgencyIDSubtask], + confirmed_agencies: list[LinkURLAgency], + user_agency_suggestion: UserUrlAgencySuggestion +) -> FinalReviewAnnotationAgencyInfo: + + confirmed_agency_info: list[GetNextURLForAgencyAgencyInfo] = ( + _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies + ) + ) + + agency_auto_info: FinalReviewAnnotationAgencyAutoInfo = ( + _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks + ) + ) + + agency_user_info: GetNextURLForAgencyAgencyInfo | None = ( + _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestion + ) + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + user=agency_user_info, + auto=agency_auto_info + ) + +def _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies: list[LinkURLAgency] +) -> list[GetNextURLForAgencyAgencyInfo]: + results: list[GetNextURLForAgencyAgencyInfo] = [] + for confirmed_agency in confirmed_agencies: + agency = confirmed_agency.agency + agency_info = _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.CONFIRMED, + agency=agency + ) + results.append(agency_info) + return results + +def _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_url_agency_suggestion: UserUrlAgencySuggestion +) -> GetNextURLForAgencyAgencyInfo | None: + suggestion = user_url_agency_suggestion + if suggestion is None: + return None + if suggestion.is_new: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.NEW_AGENCY, + ) + return _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.USER_SUGGESTION, + agency=suggestion.agency + ) + +def _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type: SuggestionType, + agency: Agency | None +) -> GetNextURLForAgencyAgencyInfo: + if agency is None: + if suggestion_type == SuggestionType.UNKNOWN: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + ) + raise ValueError("agency cannot be None for suggestion type other than unknown") + + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + pdap_agency_id=agency.agency_id, + agency_name=agency.name, + state=agency.state, + county=agency.county, + locality=agency.locality + ) + +def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks: list[URLAutoAgencyIDSubtask] +) -> FinalReviewAnnotationAgencyAutoInfo: + results: list[GetNextURLForAgencyAgencyInfo] = [] + 
count_agencies_not_found: int = 0 + for subtask in subtasks: + if not subtask.agencies_found: + count_agencies_not_found += 1 + continue + suggestions: list[AgencyIDSubtaskSuggestion] = subtask.suggestions + for suggestion in suggestions: + info: GetNextURLForAgencyAgencyInfo = _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + agency=suggestion.agency + ) + results.append(info) + return FinalReviewAnnotationAgencyAutoInfo( + unknown=count_agencies_not_found == len(subtasks), + suggestions=results + ) diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/core.py similarity index 60% rename from src/api/endpoints/review/next/query.py rename to src/api/endpoints/review/next/core.py index 7cb4670b..1736a970 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/core.py @@ -1,26 +1,28 @@ -from typing import Optional, Type - -from sqlalchemy import FromClause, select, and_, Select, desc, asc, func +from sqlalchemy import FromClause, select, Select, desc, asc, func, CTE from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload +from src.api.endpoints.review.next.convert import convert_agency_info_to_final_review_annotation_agency_info from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo +from src.api.endpoints.review.next.extract import extract_html_content_infos, extract_optional_metadata +from src.api.endpoints.review.next.queries.count_reviewed import COUNT_REVIEWED_CTE +from src.api.endpoints.review.next.queries.eligible_urls import build_eligible_urls_cte +from src.api.endpoints.review.next.templates.count_cte import CountCTE from src.collectors.enums import URLStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL = "total_distinct_annotation_count" @@ -42,7 +44,6 @@ def __init__(self, batch_id: int | None = None): ] # The below relationships are joined to entities that are joined to the URL self.double_join_relationships = [ - (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), 
(URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, LinkURLAgency.agency) ] @@ -60,58 +61,26 @@ def _get_where_exist_clauses( where_clauses.append(where_clause) return where_clauses - def _build_base_query( - self, - anno_exists_query: FromClause, - ) -> Select: - builder = self.anno_exists_builder - where_exist_clauses = self._get_where_exist_clauses( - builder.query - ) + def _build_base_query(self) -> Select: + eligible_urls: CTE = build_eligible_urls_cte(batch_id=self.batch_id) query = ( select( URL, - self._sum_exists_query(anno_exists_query, USER_ANNOTATION_MODELS) ) - .select_from(anno_exists_query) + .select_from( + eligible_urls + ) .join( URL, - URL.id == builder.url_id - ) - ) - if self.batch_id is not None: - query = ( - query.join( - LinkBatchURL - ) - .where( - LinkBatchURL.batch_id == self.batch_id - ) + URL.id == eligible_urls.c.url_id ) - - query = ( - query.where( - and_( - URL.status == URLStatus.PENDING.value, - *where_exist_clauses - ) + .where( + URL.status == URLStatus.OK.value ) ) return query - - def _sum_exists_query(self, query, models: list[Type[URLDependentMixin]]): - return sum( - [getattr(query.c, self.anno_exists_builder.get_exists_label(model)) for model in models] - ).label(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL) - - - async def _apply_batch_id_filter(self, url_query: Select, batch_id: int | None): - if batch_id is None: - return url_query - return url_query.where(URL.batch_id == batch_id) - async def _apply_options( self, url_query: Select @@ -124,49 +93,30 @@ async def _apply_options( *[ joinedload(primary).joinedload(secondary) for primary, secondary in self.double_join_relationships - ] - ) - - async def _apply_order_clause(self, url_query: Select): - return url_query.order_by( - desc(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL), - asc(URL.id) + ], + joinedload(URL.auto_agency_subtasks) + .joinedload(URLAutoAgencyIDSubtask.suggestions) + .contains_eager(AgencyIDSubtaskSuggestion.agency) ) - async def _extract_html_content_infos(self, url: URL) -> list[URLHTMLContentInfo]: - html_content = url.html_content - html_content_infos = [ - URLHTMLContentInfo(**html_info.__dict__) - for html_info in html_content - ] - return html_content_infos - - async def _extract_optional_metadata(self, url: URL) -> FinalReviewOptionalMetadata: - if url.optional_data_source_metadata is None: - return FinalReviewOptionalMetadata() - return FinalReviewOptionalMetadata( - record_formats=url.optional_data_source_metadata.record_formats, - data_portal_type=url.optional_data_source_metadata.data_portal_type, - supplying_entity=url.optional_data_source_metadata.supplying_entity - ) async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | None: if self.batch_id is None: return None - count_reviewed_query = await self.get_count_reviewed_query() + count_reviewed_query: CountCTE = COUNT_REVIEWED_CTE count_ready_query = await self.get_count_ready_query() full_query = ( select( - func.coalesce(count_reviewed_query.c[self.count_label], 0).label("count_reviewed"), + func.coalesce(count_reviewed_query.count, 0).label("count_reviewed"), func.coalesce(count_ready_query.c[self.count_label], 0).label("count_ready_for_review") ) .select_from( count_ready_query.outerjoin( - count_reviewed_query, - count_reviewed_query.c.batch_id == count_ready_query.c.batch_id + count_reviewed_query.cte, + count_reviewed_query.batch_id == count_ready_query.c.batch_id ) ) ) @@ -175,6 +125,7 @@ async def get_batch_info(self, session: AsyncSession) -> 
FinalReviewBatchInfo | return FinalReviewBatchInfo(**raw_result.mappings().one()) async def get_count_ready_query(self): + # TODO: Migrate to separate query builder builder = self.anno_exists_builder count_ready_query = ( select( @@ -189,7 +140,7 @@ async def get_count_ready_query(self): ) .where( LinkBatchURL.batch_id == self.batch_id, - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, *self._get_where_exist_clauses( builder.query ) @@ -199,31 +150,6 @@ async def get_count_ready_query(self): ) return count_ready_query - async def get_count_reviewed_query(self): - count_reviewed_query = ( - select( - Batch.id.label("batch_id"), - func.count(URL.id).label(self.count_label) - ) - .select_from(Batch) - .join(LinkBatchURL) - .outerjoin(URL, URL.id == LinkBatchURL.url_id) - .where( - URL.status.in_( - [ - URLStatus.VALIDATED.value, - URLStatus.NOT_RELEVANT.value, - URLStatus.SUBMITTED.value, - URLStatus.INDIVIDUAL_RECORD.value - ] - ), - LinkBatchURL.batch_id == self.batch_id - ) - .group_by(Batch.id) - .subquery("count_reviewed") - ) - return count_reviewed_query - async def run( self, session: AsyncSession @@ -251,8 +177,8 @@ async def run( result: URL = row[0] - html_content_infos = await self._extract_html_content_infos(result) - optional_metadata = await self._extract_optional_metadata(result) + html_content_infos: list[URLHTMLContentInfo] = await extract_html_content_infos(result) + optional_metadata: FinalReviewOptionalMetadata = await extract_optional_metadata(result) batch_info = await self.get_batch_info(session) try: @@ -272,8 +198,8 @@ async def run( user_suggestion=result.user_record_type_suggestion, auto_suggestion=result.auto_record_type_suggestion ), - agency=DTOConverter.final_review_annotation_agency_info( - automated_agency_suggestions=result.automated_agency_suggestions, + agency=convert_agency_info_to_final_review_annotation_agency_info( + subtasks=result.auto_agency_subtasks, user_agency_suggestion=result.user_agency_suggestion, confirmed_agencies=result.confirmed_agencies ) @@ -289,9 +215,7 @@ async def run( raise FailedQueryException(f"Failed to convert result for url id {result.id} to response") from e async def build_url_query(self): - anno_exists_query = self.anno_exists_builder.query - url_query = self._build_base_query(anno_exists_query) + url_query = self._build_base_query() url_query = await self._apply_options(url_query) - url_query = await self._apply_order_clause(url_query) return url_query diff --git a/src/api/endpoints/review/next/extract.py b/src/api/endpoints/review/next/extract.py new file mode 100644 index 00000000..aca642e0 --- /dev/null +++ b/src/api/endpoints/review/next/extract.py @@ -0,0 +1,23 @@ +from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.core.sqlalchemy import URL + + +async def extract_html_content_infos( + url: URL +)-> list[URLHTMLContentInfo]: + html_content = url.html_content + html_content_infos = [ + URLHTMLContentInfo(**html_info.__dict__) + for html_info in html_content + ] + return html_content_infos + +async def extract_optional_metadata(url: URL) -> FinalReviewOptionalMetadata: + if url.optional_data_source_metadata is None: + return FinalReviewOptionalMetadata() + return FinalReviewOptionalMetadata( + record_formats=url.optional_data_source_metadata.record_formats, + data_portal_type=url.optional_data_source_metadata.data_portal_type, + 
supplying_entity=url.optional_data_source_metadata.supplying_entity + ) \ No newline at end of file diff --git a/src/api/endpoints/review/next/queries/__init__.py b/src/api/endpoints/review/next/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/queries/count_reviewed.py b/src/api/endpoints/review/next/queries/count_reviewed.py new file mode 100644 index 00000000..91349cb5 --- /dev/null +++ b/src/api/endpoints/review/next/queries/count_reviewed.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, func + +from src.api.endpoints.review.next.templates.count_cte import CountCTE +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +COUNT_REVIEWED_CTE: CountCTE = CountCTE( + select( + Batch.id.label("batch_id"), + func.count(FlagURLValidated.url_id).label("count") + ) + .select_from(Batch) + .join(LinkBatchURL) + .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id) + .group_by(Batch.id) + .cte("count_reviewed") +) \ No newline at end of file diff --git a/src/api/endpoints/review/next/queries/eligible_urls.py b/src/api/endpoints/review/next/queries/eligible_urls.py new file mode 100644 index 00000000..bee5cea2 --- /dev/null +++ b/src/api/endpoints/review/next/queries/eligible_urls.py @@ -0,0 +1,35 @@ +from sqlalchemy import CTE, select, Select + +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView + +uafw = URLAnnotationFlagsView + +def build_eligible_urls_cte(batch_id: int | None = None) -> CTE: + query: Select = ( + select( + uafw.url_id, + ) + .where( + # uafw.has_auto_agency_suggestion.is_(True), + # uafw.has_auto_record_type_suggestion.is_(True), + # uafw.has_auto_relevant_suggestion.is_(True), + uafw.has_user_relevant_suggestion.is_(True), + uafw.has_user_agency_suggestion.is_(True), + uafw.has_user_record_type_suggestion.is_(True), + uafw.was_reviewed.is_(False) + ) + ) + + if batch_id is not None: + query = ( + query.join( + LinkBatchURL, + LinkBatchURL.url_id == uafw.url_id + ) + .where( + LinkBatchURL.batch_id == batch_id + ) + ) + + return query.cte("eligible_urls") diff --git a/src/api/endpoints/review/next/templates/__init__.py b/src/api/endpoints/review/next/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/templates/count_cte.py b/src/api/endpoints/review/next/templates/count_cte.py new file mode 100644 index 00000000..0abbbab4 --- /dev/null +++ b/src/api/endpoints/review/next/templates/count_cte.py @@ -0,0 +1,15 @@ +from sqlalchemy import CTE, Column + + +class CountCTE: + + def __init__(self, cte: CTE): + self.cte = cte + + @property + def batch_id(self) -> Column[int]: + return self.cte.c['batch_id'] + + @property + def count(self) -> Column[int]: + return self.cte.c['count'] \ No newline at end of file diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 7d603fe1..c187a2a8 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,6 +5,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from 
src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -33,19 +35,27 @@ async def run(self, session) -> None: url = await session.execute(query) url = url.scalars().first() + validation_type: URLValidatedType | None = None match self.rejection_reason: case RejectionReason.INDIVIDUAL_RECORD: - url.status = URLStatus.INDIVIDUAL_RECORD.value + validation_type = URLValidatedType.INDIVIDUAL_RECORD case RejectionReason.BROKEN_PAGE_404: url.status = URLStatus.NOT_FOUND.value case RejectionReason.NOT_RELEVANT: - url.status = URLStatus.NOT_RELEVANT.value + validation_type = URLValidatedType.NOT_RELEVANT case _: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, detail="Invalid rejection reason" ) + if validation_type is not None: + flag_url_validated = FlagURLValidated( + url_id=self.url_id, + type=validation_type + ) + session.add(flag_url_validated) + # Add rejecting user rejecting_user_url = ReviewingUserURL( user_id=self.user_id, diff --git a/src/api/main.py b/src/api/main.py index b6679827..f17c147f 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -27,6 +27,10 @@ from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ + SpacyModelType from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient @@ -83,6 +87,9 @@ async def lifespan(app: FastAPI): session=session, token=env_var_manager.hf_inference_api_key ), + nlp_processor=NLPProcessor( + model_type=SpacyModelType.EN_CORE_WEB_SM + ) ), ) async_collector_manager = AsyncCollectorManager( diff --git a/src/collectors/enums.py b/src/collectors/enums.py index 1732bd19..c357d6bf 100644 --- a/src/collectors/enums.py +++ b/src/collectors/enums.py @@ -11,11 +11,7 @@ class CollectorType(Enum): MANUAL = "manual" class URLStatus(Enum): - PENDING = "pending" - SUBMITTED = "submitted" - VALIDATED = "validated" + OK = "ok" ERROR = "error" DUPLICATE = "duplicate" - NOT_RELEVANT = "not relevant" NOT_FOUND = "404 not found" - INDIVIDUAL_RECORD = "individual record" diff --git a/src/collectors/impl/muckrock/api_interface/lookup_response.py b/src/collectors/impl/muckrock/api_interface/lookup_response.py index 47ea855b..d1fd9635 100644 --- a/src/collectors/impl/muckrock/api_interface/lookup_response.py +++ b/src/collectors/impl/muckrock/api_interface/lookup_response.py @@ -6,6 +6,6 @@ class AgencyLookupResponse(BaseModel): - name: Optional[str] + name: str | None type: AgencyLookupResponseType - error: Optional[str] = None + error: str | None = None diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index 96365107..af72a3aa 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from 
src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/core.py b/src/core/core.py index c597a591..0938586a 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -35,7 +35,7 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager diff --git a/src/core/enums.py b/src/core/enums.py index c6f90c80..edc18425 100644 --- a/src/core/enums.py +++ b/src/core/enums.py @@ -16,6 +16,7 @@ class RecordType(Enum): """ All available URL record types """ + # Police and Public ACCIDENT_REPORTS = "Accident Reports" ARREST_RECORDS = "Arrest Records" CALLS_FOR_SERVICE = "Calls for Service" @@ -31,16 +32,22 @@ class RecordType(Enum): SURVEYS = "Surveys" USE_OF_FORCE_REPORTS = "Use of Force Reports" VEHICLE_PURSUITS = "Vehicle Pursuits" + + # Info About Officers COMPLAINTS_AND_MISCONDUCT = "Complaints & Misconduct" DAILY_ACTIVITY_LOGS = "Daily Activity Logs" TRAINING_AND_HIRING_INFO = "Training & Hiring Info" PERSONNEL_RECORDS = "Personnel Records" + + # Info About Agencies ANNUAL_AND_MONTHLY_REPORTS = "Annual & Monthly Reports" BUDGETS_AND_FINANCES = "Budgets & Finances" CONTACT_INFO_AND_AGENCY_META = "Contact Info & Agency Meta" GEOGRAPHIC = "Geographic" LIST_OF_DATA_SOURCES = "List of Data Sources" POLICIES_AND_CONTRACTS = "Policies & Contracts" + + # Agency-Published Resources CRIME_MAPS_AND_REPORTS = "Crime Maps & Reports" CRIME_STATISTICS = "Crime Statistics" MEDIA_BULLETINS = "Media Bulletins" @@ -48,9 +55,13 @@ class RecordType(Enum): RESOURCES = "Resources" SEX_OFFENDER_REGISTRY = "Sex Offender Registry" WANTED_PERSONS = "Wanted Persons" + + # Jails and Courts Specific BOOKING_REPORTS = "Booking Reports" COURT_CASES = "Court Cases" INCARCERATION_RECORDS = "Incarceration Records" + + # Other OTHER = "Other" diff --git a/src/core/exceptions.py b/src/core/exceptions.py index d4f9c4a8..a361a24d 100644 --- a/src/core/exceptions.py +++ b/src/core/exceptions.py @@ -14,3 +14,4 @@ class MatchAgencyError(Exception): class FailedValidationException(HTTPException): def __init__(self, detail: str): super().__init__(status_code=HTTPStatus.BAD_REQUEST, detail=detail) + diff --git a/src/core/helpers.py b/src/core/helpers.py deleted file mode 100644 index eeb951fe..00000000 --- a/src/core/helpers.py +++ /dev/null @@ -1,48 +0,0 @@ -from src.core.enums import SuggestionType -from src.core.exceptions import MatchAgencyError -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus - - -def process_match_agency_response_to_suggestions( - url_id: int, - match_agency_response: MatchAgencyResponse -) -> list[URLAgencySuggestionInfo]: - if match_agency_response.status == MatchAgencyResponseStatus.EXACT_MATCH: - match = match_agency_response.matches[0] - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=int(match.id), - agency_name=match.submitted_name, - state=match.state, - county=match.county, 
- ) - ] - if match_agency_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - ) - ] - - if match_agency_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise MatchAgencyError( - f"Unknown Match Agency Response Status: {match_agency_response.status}" - ) - - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=match.id, - agency_name=match.submitted_name, - state=match.state, - county=match.county, - locality=match.locality - ) - for match in match_agency_response.matches - ] diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 7f488594..6ddca6eb 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -50,8 +50,11 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): # task_id=run_info.task_id, error=run_info.message ) + msg: str = f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error: {run_info.message}" + print(msg) self.discord_poster.post_to_discord( - message=f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error.") + message=msg + ) async def link_urls_to_task(self, task_id: int, url_ids: list[int]): await self.adb_client.link_urls_to_task( diff --git a/src/core/tasks/scheduled/impl/huggingface/operator.py b/src/core/tasks/scheduled/impl/huggingface/operator.py index 7d5324f5..9bb7a85e 100644 --- a/src/core/tasks/scheduled/impl/huggingface/operator.py +++ b/src/core/tasks/scheduled/impl/huggingface/operator.py @@ -1,12 +1,19 @@ from itertools import count +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.huggingface.hub.client import HuggingFaceHubClient -class PushToHuggingFaceTaskOperator(ScheduledTaskOperatorBase): +class PushToHuggingFaceTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): @property def task_type(self) -> TaskType: @@ -20,21 +27,23 @@ def __init__( super().__init__(adb_client) self.hf_client = hf_client - async def inner_task_logic(self): - # Check if any valid urls have been updated - valid_urls_updated = await self.adb_client.check_valid_urls_updated() - print(f"Valid urls updated: {valid_urls_updated}") - if not valid_urls_updated: - print("No valid urls updated, skipping.") - return - + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + CheckValidURLsUpdatedQueryBuilder() + ) - # Otherwise, push to huggingface + async def inner_task_logic(self): + """Push raw data sources to huggingface.""" run_dt = await self.adb_client.get_current_database_time() for idx in count(start=1): - outputs = await self.adb_client.get_data_sources_raw_for_huggingface(page=idx) + outputs: list[GetForLoadingToHuggingFaceOutput] = await self._get_data_sources_raw_for_huggingface(page=idx) if len(outputs) == 0: break self.hf_client.push_data_sources_raw_to_hub(outputs, idx=idx) await 
self.adb_client.set_hugging_face_upload_state(run_dt.replace(tzinfo=None)) + + async def _get_data_sources_raw_for_huggingface(self, page: int) -> list[GetForLoadingToHuggingFaceOutput]: + return await self.adb_client.run_query_builder( + GetForLoadingToHuggingFaceQueryBuilder(page) + ) diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py index 23e0b0b6..25124c95 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py @@ -1,4 +1,5 @@ from datetime import datetime +from operator import or_ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -6,6 +7,7 @@ from src.collectors.enums import URLStatus from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.state.huggingface import HuggingFaceUploadState from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL @@ -34,14 +36,12 @@ async def has_valid_urls(self, last_upload_at: datetime | None) -> bool: URLCompressedHTML, URL.id == URLCompressedHTML.url_id ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ) .where( - URL.status.in_( - [ - URLStatus.VALIDATED, - URLStatus.NOT_RELEVANT.value, - URLStatus.SUBMITTED.value, - ] - ), + FlagURLValidated.url_id.isnot(None) ) ) if last_upload_at is not None: diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py index 9d5c4135..5ad96115 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py @@ -1,8 +1,7 @@ -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse -from src.core.tasks.scheduled.impl.huggingface.queries.get.mappings import FINE_COARSE_RECORD_TYPE_MAPPING, \ - OUTCOME_RELEVANCY_MAPPING +from src.core.tasks.scheduled.impl.huggingface.queries.get.mappings import FINE_COARSE_RECORD_TYPE_MAPPING +from src.db.models.impl.flag.url_validated.enums import URLValidatedType def convert_fine_to_coarse_record_type( @@ -10,7 +9,14 @@ def convert_fine_to_coarse_record_type( ) -> RecordTypeCoarse: return FINE_COARSE_RECORD_TYPE_MAPPING[fine_record_type] -def convert_url_status_to_relevant( - url_status: URLStatus + +def convert_validated_type_to_relevant( + validated_type: URLValidatedType ) -> bool: - return OUTCOME_RELEVANCY_MAPPING[url_status] \ No newline at end of file + match validated_type: + case URLValidatedType.NOT_RELEVANT: + return False + case URLValidatedType.DATA_SOURCE: + return True + case _: + raise ValueError(f"Disallowed validated type: {validated_type}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 662f7fbb..d58cbdf7 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -1,16 +1,18 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from src.collectors.enums import URLStatus -from 
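# A minimal sketch of the prerequisite-then-paginate pattern used by
# PushToHuggingFaceTaskOperator above. The mixin contract and client
# methods here are stand-ins, not the repo's real signatures.
import asyncio
from itertools import count

class FakeClient:
    """Stand-in for the database client; serves canned pages."""
    def __init__(self, pages: list[list[dict]]):
        self._pages = pages
    async def has_work(self) -> bool:
        return bool(self._pages)
    async def get_page(self, page: int) -> list[dict]:
        return self._pages[page - 1] if page <= len(self._pages) else []

async def run_task(client: FakeClient) -> None:
    # Prerequisite gate: bail out early when nothing changed upstream.
    if not await client.has_work():
        return
    # Page until exhaustion; an empty page terminates the loop.
    for idx in count(start=1):
        outputs = await client.get_page(idx)
        if not outputs:
            break
        print(f"pushing page {idx}: {outputs}")

asyncio.run(run_task(FakeClient([[{"url_id": 1}], [{"url_id": 2}]])))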
src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_url_status_to_relevant, \ - convert_fine_to_coarse_record_type +from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_fine_to_coarse_record_type, \ + convert_validated_type_to_relevant from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html -from src.db.helpers.session import session_helper as sh + class GetForLoadingToHuggingFaceQueryBuilder(QueryBuilderBase): @@ -22,29 +24,32 @@ def __init__(self, page: int): async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]: label_url_id = 'url_id' label_url = 'url' - label_url_status = 'url_status' label_record_type_fine = 'record_type_fine' label_html = 'html' + label_type = 'type' query = ( select( URL.id.label(label_url_id), URL.url.label(label_url), - URL.status.label(label_url_status), URL.record_type.label(label_record_type_fine), - URLCompressedHTML.compressed_html.label(label_html) + URLCompressedHTML.compressed_html.label(label_html), + FlagURLValidated.type.label(label_type) ) .join( URLCompressedHTML, URL.id == URLCompressedHTML.url_id ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ) .where( - URL.status.in_([ - URLStatus.VALIDATED, - URLStatus.NOT_RELEVANT, - URLStatus.SUBMITTED - ]) + FlagURLValidated.type.in_( + (URLValidatedType.DATA_SOURCE, + URLValidatedType.NOT_RELEVANT) + ) ) ) query = add_standard_limit_and_offset(page=self.page, statement=query) @@ -57,7 +62,9 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut output = GetForLoadingToHuggingFaceOutput( url_id=result[label_url_id], url=result[label_url], - relevant=convert_url_status_to_relevant(result[label_url_status]), + relevant=convert_validated_type_to_relevant( + URLValidatedType(result[label_type]) + ), record_type_fine=result[label_record_type_fine], record_type_coarse=convert_fine_to_coarse_record_type( result[label_record_type_fine] diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py index ed4a7da2..0fd12b28 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py @@ -47,9 +47,3 @@ RecordType.OTHER: RecordTypeCoarse.OTHER, None: RecordTypeCoarse.NOT_RELEVANT } - -OUTCOME_RELEVANCY_MAPPING = { - URLStatus.SUBMITTED: True, - URLStatus.VALIDATED: True, - URLStatus.NOT_RELEVANT: False -} \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index db20acf1..6adff30b 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -1,9 +1,11 @@ +from 
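# The builder above selects labeled columns and rehydrates each row into
# a DTO, converting the validated flag into a boolean relevancy label. A
# dry sketch of that row-to-output step (enum and row shapes are
# simplified stand-ins for the repo types):
from enum import Enum

class URLValidatedType(Enum):
    DATA_SOURCE = "data source"
    NOT_RELEVANT = "not relevant"

def to_relevant(t: URLValidatedType) -> bool:
    # Mirrors convert_validated_type_to_relevant: only the two exported
    # flag types are legal here; the query's IN-filter guarantees that.
    if t is URLValidatedType.DATA_SOURCE:
        return True
    if t is URLValidatedType.NOT_RELEVANT:
        return False
    raise ValueError(f"Disallowed validated type: {t}")

row = {"url_id": 7, "url": "https://example.gov/logs", "type": "data source"}
output = {
    "url_id": row["url_id"],
    "url": row["url"],
    "relevant": to_relevant(URLValidatedType(row["type"])),
}
print(output)  # {'url_id': 7, 'url': 'https://example.gov/logs', 'relevant': True}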
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.core import UpsertAgenciesQueryBuilder from src.core.tasks.scheduled.impl.sync.check import check_max_sync_requests_not_exceeded from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo class SyncAgenciesTaskOperator(ScheduledTaskOperatorBase): @@ -21,17 +23,19 @@ def task_type(self) -> TaskType: # return TaskType.SYNC_AGENCIES async def inner_task_logic(self): - count_agencies_synced = 0 params = await self.adb_client.get_agencies_sync_parameters() if params.page is None: params.page = 1 response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count = 1 + count_agencies_synced = 0 + request_count = 0 while len(response.agencies) > 0: + await self.update_data(response.agencies) + count_agencies_synced += len(response.agencies) + request_count += 1 + check_max_sync_requests_not_exceeded(request_count) - await self.adb_client.upsert_agencies(response.agencies) params = AgencySyncParameters( page=params.page + 1, @@ -40,9 +44,13 @@ async def inner_task_logic(self): await self.adb_client.update_agencies_sync_progress(params.page) response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count += 1 + await self.adb_client.mark_full_agencies_sync() print(f"Sync complete. Synced {count_agencies_synced} agencies") + async def update_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): + # First, add new agencies + await self.adb_client.run_query_builder( + UpsertAgenciesQueryBuilder(agencies) + ) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py similarity index 97% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py index 61a0b104..4b944464 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py @@ -17,4 +17,4 @@ def convert_agencies_sync_response_to_agencies_upsert( ds_last_updated_at=agency.updated_at ) ) - return results \ No newline at end of file + return results diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py new file mode 100644 index 00000000..fc909e48 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py @@ -0,0 +1,30 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.core import UpdateAgencyURLLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.core import UpsertMetaUrlsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ + convert_agencies_sync_response_to_agencies_upsert +from src.db.models.impl.agency.pydantic.upsert import 
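# The rewritten sync loop above persists a page before advancing the
# cursor, so progress survives a crash, and a request cap guards against
# runaway pagination. A stand-in sketch (the real cap lives in
# check_max_sync_requests_not_exceeded; 100 here is hypothetical):
import asyncio

MAX_SYNC_REQUESTS = 100

async def fetch_page(page: int) -> list[str]:
    """Stand-in for pdap_client.sync_agencies(...)."""
    data = {1: ["agency-a", "agency-b"], 2: ["agency-c"]}
    return data.get(page, [])

async def sync_all() -> int:
    page, synced, requests = 1, 0, 0
    batch = await fetch_page(page)
    while batch:
        # 1. persist the batch, 2. count it, 3. enforce the request cap
        synced += len(batch)
        requests += 1
        if requests > MAX_SYNC_REQUESTS:
            raise RuntimeError("sync exceeded the maximum request budget")
        page += 1  # checkpointing the page lets a restart resume mid-sync
        batch = await fetch_page(page)
    return synced

print(asyncio.run(sync_all()))  # 3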
AgencyUpsertModel +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + +from src.db.helpers.session import session_helper as sh + +class UpsertAgenciesQueryBuilder(QueryBuilderBase): + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + # Upsert Agencies + agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.sync_responses) + await sh.bulk_upsert(session=session, models=agency_upserts) + + # Add and update Meta URLs + meta_urls_query_builder = UpsertMetaUrlsQueryBuilder(self.sync_responses) + await meta_urls_query_builder.run(session=session) + + # Add and remove URL-Agency Links + update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(self.sync_responses) + await update_url_links_query_builder.run(session=session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py new file mode 100644 index 00000000..c05b55f1 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py @@ -0,0 +1,12 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_urls_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[str]: + url_set: set[str] = set() + for response in responses: + for url in response.meta_urls: + url_set.add(url) + + return list(url_set) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py new file mode 100644 index 00000000..5511ea65 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py @@ -0,0 +1,23 @@ +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper + +def build_links_from_url_mappings_and_sync_responses( + url_mappings: list[URLMapping], + sync_responses: list[AgenciesSyncResponseInnerInfo], +) -> list[LinkURLAgencyPydantic]: + + links: list[LinkURLAgencyPydantic] = [] + + mapper = URLMapper(url_mappings) + for sync_response in sync_responses: + agency_id: int = sync_response.agency_id + for meta_url in sync_response.meta_urls: + url_id: int = mapper.get_id(meta_url) + link = LinkURLAgencyPydantic( + agency_id=agency_id, + url_id=url_id + ) + links.append(link) + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py new file mode 100644 index 00000000..37d63a03 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -0,0 +1,50 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.build import \ + 
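# UpsertAgenciesQueryBuilder composes sub-builders by calling their run()
# with the same session, so agency upserts, meta-URL upserts, and link
# updates all share one transaction. The shape of that pattern, with the
# base class reduced to a stand-in:
import asyncio

class QueryBuilderBase:  # stand-in: assumes the base only requires run(session)
    async def run(self, session) -> None:
        raise NotImplementedError

class UpsertAgencies(QueryBuilderBase):
    async def run(self, session):
        session.append("upsert agencies")

class UpsertMetaURLs(QueryBuilderBase):
    async def run(self, session):
        session.append("upsert meta urls")

class UpdateLinks(QueryBuilderBase):
    async def run(self, session):
        session.append("update url-agency links")

class UpsertAll(QueryBuilderBase):
    """Order matters: agencies first, then the URLs and links that reference them."""
    async def run(self, session):
        for step in (UpsertAgencies(), UpsertMetaURLs(), UpdateLinks()):
            await step.run(session)

ops: list[str] = []
asyncio.run(UpsertAll().run(ops))
print(ops)  # ['upsert agencies', 'upsert meta urls', 'update url-agency links']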
build_links_from_url_mappings_and_sync_responses +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import UpdateAgencyURLLinksRequester +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.extract import \ + extract_agency_ids_from_agencies_sync_response +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): + """Updates agency URL links.""" + + def __init__( + self, + sync_responses: list[AgenciesSyncResponseInnerInfo] + ): + super().__init__() + self._sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + # Get all existing links + requester = UpdateAgencyURLLinksRequester(session) + + # Build new links from sync responses and URL mappings + sync_urls: list[str] = extract_urls_from_agencies_sync_response(self._sync_responses) + url_mappings: list[URLMapping] = await requester.get_url_mappings(urls=sync_urls) + new_links: list[LinkURLAgencyPydantic] = build_links_from_url_mappings_and_sync_responses( + url_mappings=url_mappings, + sync_responses=self._sync_responses, + ) + + sync_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) + old_links: list[LinkURLAgencyPydantic] = await requester.get_current_agency_url_links( + agency_ids=sync_agency_ids, + ) + + new_set: set[LinkURLAgencyPydantic] = set(new_links) + old_set: set[LinkURLAgencyPydantic] = set(old_links) + + links_to_add: list[LinkURLAgencyPydantic] = list(new_set - old_set) + links_to_remove: list[LinkURLAgencyPydantic] = list(old_set - new_set) + + await requester.add_agency_url_links(links=links_to_add) + await requester.remove_agency_url_links(links=links_to_remove) + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py new file mode 100644 index 00000000..123bd0ba --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings + + +def filter_non_relevant_mappings( + mappings: list[AgencyURLMappings], + relevant_agency_ids: list[int] +) -> list[AgencyURLMappings]: + relevant_mappings: list[AgencyURLMappings] = [] + for mapping in mappings: + if mapping.agency_id in relevant_agency_ids: + relevant_mappings.append(mapping) + return relevant_mappings \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py new file mode 100644 index 00000000..9336deaa --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py @@ -0,0 +1,46 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import 
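# The reconciliation above is plain set algebra over link models. For the
# set difference to work, LinkURLAgencyPydantic must be hashable (e.g. a
# frozen/hashable model config) -- assumed here via a frozen dataclass:
from dataclasses import dataclass

@dataclass(frozen=True)  # frozen => hashable, so instances can live in sets
class Link:
    agency_id: int
    url_id: int

old = {Link(1, 10), Link(1, 11), Link(2, 20)}
new = {Link(1, 10), Link(2, 21)}

links_to_add = new - old     # present in the sync, absent in the DB
links_to_remove = old - new  # present in the DB, dropped from the sync

print(sorted((l.agency_id, l.url_id) for l in links_to_add))     # [(2, 21)]
print(sorted((l.agency_id, l.url_id) for l in links_to_remove))  # [(1, 11), (2, 20)]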
URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupMetaURLLinksQueryBuilder(QueryBuilderBase): + + def __init__(self, agency_ids: list[int]): + super().__init__() + self.agency_ids: list[int] = agency_ids + + async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: + + query = ( + select( + LinkURLAgency.url_id, + LinkURLAgency.agency_id + ) + .join( + URL, + LinkURLAgency.url_id == URL.id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLValidatedType.META_URL, + LinkURLAgency.agency_id.in_(self.agency_ids), + ) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + links: list[LinkURLAgencyPydantic] = [ + LinkURLAgencyPydantic(**mapping) for mapping in mappings + ] + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py new file mode 100644 index 00000000..8b526447 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py @@ -0,0 +1,31 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class LookupURLQueryBuilder(QueryBuilderBase): + + def __init__(self, urls: list[str]): + super().__init__() + self.urls: list[str] = urls + + async def run(self, session: AsyncSession) -> list[URLMapping]: + query = ( + select( + URL.id.label("url_id"), + URL.url, + ) + .where( + URL.url.in_(self.urls), + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + urls: list[URLMapping] = [ + URLMapping(**mapping) for mapping in mappings + ] + return urls \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py new file mode 100644 index 00000000..0f3c9d69 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AgencyURLMappings(BaseModel): + agency_id: int + url_ids: list[int] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py new file mode 100644 index 00000000..96887dfa --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py @@ -0,0 +1,21 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.links import LookupMetaURLLinksQueryBuilder +from 
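# Both lookup builders above hydrate rows into Pydantic models via
# Model(**mapping): the SELECT's labels must match the model fields
# exactly. A dry sketch with a stand-in for the URLMapping DTO:
from pydantic import BaseModel

class URLMapping(BaseModel):  # assumed shape: url_id + url
    url_id: int
    url: str

# sh.mappings(...) yields dict-like RowMapping objects keyed by column
# label; URL.id is labeled "url_id" so the keys line up with the fields.
rows = [{"url_id": 1, "url": "https://a.example"}, {"url_id": 2, "url": "https://b.example"}]
mappings = [URLMapping(**row) for row in rows]
print(mappings[0].url_id, mappings[0].url)  # 1 https://a.example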
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.url import LookupURLQueryBuilder +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.templates.requester import RequesterBase + + +class UpdateAgencyURLLinksRequester(RequesterBase): + + async def get_url_mappings(self, urls: list[str]) -> list[URLMapping]: + return await LookupURLQueryBuilder(urls=urls).run(session=self.session) + + async def get_current_agency_url_links(self, agency_ids: list[int]) -> list[LinkURLAgencyPydantic]: + return await LookupMetaURLLinksQueryBuilder(agency_ids=agency_ids).run(session=self.session) + + async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: + await sh.bulk_insert(self.session, models=links) + + async def remove_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: + await sh.bulk_delete(self.session, models=links) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py new file mode 100644 index 00000000..73761251 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py @@ -0,0 +1,57 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class AddMetaURLsQueryBuilder(QueryBuilderBase): + + """Add Meta URLs to DB with: + - Record type set to CONTACT_INFO_AND_AGENCY_META + - Validation Flag added as META_URL + - Source set to DATA_SOURCES + """ + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[URLMapping]: + # Add URLs + url_inserts: list[URLInsertModel] = [] + for url in self.urls: + url_inserts.append( + URLInsertModel( + url=url, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + source=URLSource.DATA_SOURCES + ) + ) + url_ids: list[int] = await sh.bulk_insert(session, models=url_inserts, return_ids=True) + + # Connect with URLs + mappings: list[URLMapping] = [ + URLMapping( + url=url, + url_id=url_id, + ) + for url, url_id in zip(self.urls, url_ids) + ] + + # Add Validation Flags + flag_inserts: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag_inserts.append( + FlagURLValidatedPydantic( + url_id=url_id, + type=URLValidatedType.META_URL + ) + ) + await sh.bulk_insert(session, models=flag_inserts) + + return mappings diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py 
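# AddMetaURLsQueryBuilder pairs each inserted URL with its new id via
# zip(); this assumes sh.bulk_insert(..., return_ids=True) returns ids in
# input order. A dry sketch with a stand-in insert helper:
from pydantic import BaseModel

class URLMapping(BaseModel):
    url_id: int
    url: str

def fake_bulk_insert(urls: list[str]) -> list[int]:
    """Stand-in for sh.bulk_insert(..., return_ids=True): one id per row, in order."""
    return list(range(1, len(urls) + 1))

urls = ["https://a.example", "https://b.example"]
url_ids = fake_bulk_insert(urls)
# zip() relies on positional correspondence between inputs and returned ids.
mappings = [URLMapping(url=url, url_id=url_id) for url, url_id in zip(urls, url_ids)]
print([(m.url, m.url_id) for m in mappings])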
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py new file mode 100644 index 00000000..8d3e8785 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.db.dtos.url.mapping import URLMapping + + +def convert_to_update_meta_urls_params( + lookups: list[MetaURLLookupResponse] +) -> list[UpdateMetaURLsParams]: + return [ + UpdateMetaURLsParams( + url_id=lookup.url_id, + validation_type=lookup.validation_type, + record_type=lookup.record_type, + ) + for lookup in lookups + ] + +def convert_url_lookups_to_url_mappings( + lookups: list[MetaURLLookupResponse] +) -> list[URLMapping]: + return [ + URLMapping( + url_id=lookup.url_id, + url=lookup.url, + ) + for lookup in lookups + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py new file mode 100644 index 00000000..6f5c3593 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py @@ -0,0 +1,33 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.requester import UpdateMetaURLsRequester +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpsertMetaUrlsQueryBuilder(QueryBuilderBase): + """Add and update meta URLs for agencies.""" + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + + requester = UpdateMetaURLsRequester(session) + sync_urls: list[str] = extract_urls_from_agencies_sync_response(self.sync_responses) + + + lookup_responses: list[MetaURLLookupResponse] = \ + await requester.lookup_meta_urls(sync_urls) + await requester.add_new_urls_to_database(lookup_responses) + + filtered_lookup_responses: list[MetaURLLookupResponse] = \ + filter_urls_in_sync(self.sync_responses, lookup_responses=lookup_responses) + await requester.update_existing_urls(filtered_lookup_responses) + + + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py new file mode 100644 index 00000000..227f0edc --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py @@ -0,0 +1,37 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def filter_urls_to_add( + lookup_responses: list[MetaURLLookupResponse] +) -> 
list[str]: + return [ + lookup_response.url + for lookup_response in lookup_responses + if not lookup_response.exists_in_db + ] + +def filter_existing_url_mappings( + lookup_responses: list[MetaURLLookupResponse] +) -> list[MetaURLLookupResponse]: + """Filter only URL mappings that already exist in the database.""" + return [ + lookup_response + for lookup_response in lookup_responses + if lookup_response.exists_in_db + ] + +def filter_urls_in_sync( + sync_responses: list[AgenciesSyncResponseInnerInfo], + lookup_responses: list[MetaURLLookupResponse] +) -> list[MetaURLLookupResponse]: + """Filter only URLs that are in sync responses.""" + sync_urls: set[str] = set( + extract_urls_from_agencies_sync_response(sync_responses) + ) + filtered_lookup_responses: list[MetaURLLookupResponse] = [] + for lookup_response in lookup_responses: + if lookup_response.url in sync_urls: + filtered_lookup_responses.append(lookup_response) + return filtered_lookup_responses \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py new file mode 100644 index 00000000..8a817bd4 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py @@ -0,0 +1,66 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupMetaURLsQueryBuilder(QueryBuilderBase): + """Lookup whether URLs exist in DB and are validated as meta URLs""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls: list[str] = urls + + async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: + url_id_label: str = "url_id" + + query = ( + select( + URL.id.label(url_id_label), + URL.url, + URL.record_type, + FlagURLValidated.type + ) + .select_from( + URL + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + URL.url.in_(self.urls) + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + urls_in_db = set() + extant_lookup_responses: list[MetaURLLookupResponse] = [] + for mapping in mappings: + url = mapping["url"] + urls_in_db.add(url) + response = MetaURLLookupResponse( + url=url, + url_id=mapping[url_id_label], + record_type=mapping["record_type"], + validation_type=mapping["type"], + ) + extant_lookup_responses.append(response) + + urls_not_in_db = set(self.urls) - set(urls_in_db) + non_extant_lookup_responses = [ + MetaURLLookupResponse( + url=url, + url_id=None, + record_type=None, + validation_type=None, + ) for url in urls_not_in_db + ] + + return extant_lookup_responses + non_extant_lookup_responses diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py new 
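# LookupMetaURLsQueryBuilder partitions the requested URLs into rows the
# database returned and the remainder, so callers can route inserts vs
# updates. The same partition in miniature (plain dicts stand in for
# MetaURLLookupResponse):
requested = ["https://a.example", "https://b.example", "https://c.example"]
db_rows = {"https://a.example": 1}  # url -> url_id, as the query returns

extant = [
    {"url": url, "url_id": url_id}
    for url, url_id in db_rows.items()
]
# Set difference recovers everything the SELECT ... IN (...) did not match.
missing = set(requested) - set(db_rows)
non_extant = [{"url": url, "url_id": None} for url in missing]

responses = extant + non_extant
print(len(responses))  # 3: one extant, two still to be inserted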
file mode 100644 index 00000000..d054f645 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py @@ -0,0 +1,10 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_agency_ids_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[int]: + return [ + response.agency_id + for response in responses + ] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py new file mode 100644 index 00000000..ff2d668d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +class MetaURLLookupResponse(BaseModel): + url: str + url_id: int | None + record_type: RecordType | None + validation_type: URLValidatedType | None + + @property + def exists_in_db(self) -> bool: + return self.url_id is not None + + @property + def is_meta_url(self) -> bool: + return self.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META + + @property + def is_validated(self) -> bool: + return self.validation_type is not None diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py new file mode 100644 index 00000000..0a3e3c76 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py @@ -0,0 +1,48 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ + convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ + filter_urls_to_add +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.core import UpdateMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.db.dtos.url.mapping import URLMapping +from src.db.templates.requester import RequesterBase + + +class UpdateMetaURLsRequester(RequesterBase): + + async def lookup_meta_urls( + self, + urls: list[str] + ) -> list[MetaURLLookupResponse]: + return await LookupMetaURLsQueryBuilder( + urls + ).run(self.session) + + async def add_new_urls_to_database( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[URLMapping]: + if len(lookup_responses) == 0: + return [] + urls_to_add: list[str] = filter_urls_to_add(lookup_responses) + if len(urls_to_add) == 0: + return [] + return await AddMetaURLsQueryBuilder(urls_to_add).run(self.session) + + async def update_existing_urls( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[URLMapping]: + existing_url_lookups: list[MetaURLLookupResponse] = ( + filter_existing_url_mappings(lookup_responses) + ) + params: list[UpdateMetaURLsParams] = \ + 
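# The requester routes lookups on MetaURLLookupResponse.exists_in_db:
# missing URLs are inserted, existing ones updated. A trimmed stand-in
# showing the property-driven split:
from pydantic import BaseModel

class Lookup(BaseModel):
    url: str
    url_id: int | None

    @property
    def exists_in_db(self) -> bool:
        return self.url_id is not None

lookups = [
    Lookup(url="https://a.example", url_id=1),
    Lookup(url="https://b.example", url_id=None),
]
to_update = [l for l in lookups if l.exists_in_db]
to_add = [l.url for l in lookups if not l.exists_in_db]
print([l.url for l in to_update], to_add)  # ['https://a.example'] ['https://b.example']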
convert_to_update_meta_urls_params(existing_url_lookups)
+        await UpdateMetaURLsQueryBuilder(params).run(self.session)
+        existing_url_mappings: list[URLMapping] = \
+            convert_url_lookups_to_url_mappings(existing_url_lookups)
+        return existing_url_mappings
+
diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py
new file mode 100644
index 00000000..1e479652
--- /dev/null
+++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py
@@ -0,0 +1,39 @@
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.filter import \
+    filter_urls_with_non_meta_record_type, filter_urls_with_non_meta_url_validation_flag, \
+    filter_urls_without_validation_flag
+from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams
+from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.requester import \
+    UpdateMetaURLsUpdateURLAndValidationFlagsRequester
+from src.db.queries.base.builder import QueryBuilderBase
+
+
+class UpdateMetaURLsQueryBuilder(QueryBuilderBase):
+    """Update meta URLs in DB
+
+    Meta URLs should be given a validation status as a Meta URL
+    and have their record type updated to CONTACT_INFO_AND_AGENCY_META
+    """
+
+    def __init__(
+        self,
+        params: list[UpdateMetaURLsParams]
+    ):
+        super().__init__()
+        self.params = params
+
+    async def run(
+        self,
+        session: AsyncSession
+    ) -> None:
+        requester = UpdateMetaURLsUpdateURLAndValidationFlagsRequester(session)
+
+        urls_with_non_meta_record_type: list[int] = filter_urls_with_non_meta_record_type(self.params)
+        await requester.update_urls(urls_with_non_meta_record_type)
+
+        urls_without_validation_flag: list[int] = filter_urls_without_validation_flag(self.params)
+        await requester.add_validation_flags(urls_without_validation_flag)
+
+        urls_with_non_meta_url_validation_flag: list[int] = filter_urls_with_non_meta_url_validation_flag(self.params)
+        await requester.update_validation_flags(urls_with_non_meta_url_validation_flag)
diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py
new file mode 100644
index 00000000..b0c32a7e
--- /dev/null
+++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py
@@ -0,0 +1,37 @@
+from src.core.enums import RecordType
+from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+
+
+def filter_urls_with_non_meta_record_type(
+    params: list[UpdateMetaURLsParams]
+) -> list[int]:
+    url_ids: list[int] = []
+    for param in params:
+        if param.record_type is None:
+            url_ids.append(param.url_id)
+        elif param.record_type != RecordType.CONTACT_INFO_AND_AGENCY_META:
+            url_ids.append(param.url_id)
+
+    return url_ids
+
+def filter_urls_without_validation_flag(
+    params: list[UpdateMetaURLsParams]
+) -> list[int]:
+    url_ids: list[int] = []
+    for param in params:
+        if param.validation_type is None:
+            url_ids.append(param.url_id)
+
return url_ids + +def filter_urls_with_non_meta_url_validation_flag( + params: list[UpdateMetaURLsParams] +) -> list[int]: + url_ids: list[int] = [] + for param in params: + if param.validation_type is None: + continue + if param.validation_type != URLValidatedType.META_URL: + url_ids.append(param.url_id) + + return url_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py new file mode 100644 index 00000000..cb74a378 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +class UpdateMetaURLsParams(BaseModel): + validation_type: URLValidatedType | None + url_id: int + record_type: RecordType | None + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py new file mode 100644 index 00000000..175b1bbf --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py @@ -0,0 +1,53 @@ +from sqlalchemy import update + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.templates.requester import RequesterBase + +from src.db.helpers.session import session_helper as sh + +class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase): + + async def update_validation_flags(self, url_ids: list[int]) -> None: + """Set validation flag for URLs to Meta URL""" + query = ( + update( + FlagURLValidated + ) + .where( + FlagURLValidated.url_id.in_(url_ids) + ) + .values( + type=URLValidatedType.META_URL + ) + ) + await self.session.execute(query) + + async def add_validation_flags(self, url_ids: list[int]) -> None: + inserts: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=URLValidatedType.META_URL, + ) + inserts.append(flag) + + await sh.bulk_insert(self.session, models=inserts) + + async def update_urls(self, url_ids: list[int]) -> None: + """Update URLs and set record type to Contact Info and Agency Meta""" + query = ( + update( + URL + ) + .values( + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + ) + .where( + URL.id.in_(url_ids) + ) + ) + await self.session.execute(query) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py index 6222d1fd..93c1cbc9 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py @@ -1,13 +1,88 @@ +from collections import defaultdict + +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.query import URLAgencyLinkUpdateQueryBuilder -from 
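# The update flow splits URL ids into three (possibly overlapping)
# buckets -- wrong record type, missing flag, wrong flag -- each handled
# by a separate statement. A dry run with (url_id, record_type, flag)
# tuples standing in for UpdateMetaURLsParams:
META_RT = "Contact Info & Agency Meta"
META_FLAG = "meta url"

params = [
    (1, META_RT, META_FLAG),      # already correct: lands in no bucket
    (2, None, None),              # wrong record type AND missing flag
    (3, META_RT, "data source"),  # record type fine, flag must be switched
]

fix_record_type = [p[0] for p in params if p[1] != META_RT]
add_flag = [p[0] for p in params if p[2] is None]
fix_flag = [p[0] for p in params if p[2] is not None and p[2] != META_FLAG]

print(fix_record_type, add_flag, fix_flag)  # [2] [2] [3]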
src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams
+from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models
+from src.db.helpers.session import session_helper as sh
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic
+from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyParams
+from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
+from src.db.queries.base.builder import QueryBuilderBase
+
+
+class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase):
+    """Given a set of URL-Agency links, remove all non-matching non-Meta URL links and add new ones."""
+
+
+    def __init__(self, models: list[UpdateLinkURLAgencyParams]):
+        super().__init__()
+        self.models = models
+        self._new_links: dict[int, list[int]] = {
+            model.url_id: model.new_agency_ids
+            for model in self.models
+        }
+        self._existing_links: dict[int, list[int]] = defaultdict(list)
+        self.existing_url_ids: set[int] = {
+            model.url_id for model in self.models
+        }
+
+    async def _get_existing_links(self, session: AsyncSession) -> None:
+        """Get existing non-meta URL agency links for provided URL IDs.
+
+        Modifies:
+            self._existing_links
+        """
+        query = (
+            select(LinkURLAgency)
+            .outerjoin(
+                FlagURLValidated,
+                FlagURLValidated.url_id == LinkURLAgency.url_id,
+            )
+            .where(
+                LinkURLAgency.url_id.in_(
+                    self.existing_url_ids
+                ),
+                FlagURLValidated.type.is_distinct_from(URLValidatedType.META_URL)
+            )
+        )
+        links = await session.scalars(query)
+        for link in links:
+            self._existing_links[link.url_id].append(link.agency_id)
+
+    async def _update_links(self, session: AsyncSession) -> None:
+        # Remove all existing links not in new links
+        links_to_delete: list[LinkURLAgencyPydantic] = []
+        links_to_insert: list[LinkURLAgencyPydantic] = []
+
+        for url_id in self.existing_url_ids:
+            new_agency_ids = self._new_links.get(url_id, [])
+            existing_agency_ids = self._existing_links.get(url_id, [])
+            # IDs to delete are existing agency ids that are not new agency ids
+            ids_to_delete = set(existing_agency_ids) - set(new_agency_ids)
+            # IDs to insert are new agency ids that are not existing agency ids
+            ids_to_insert = set(new_agency_ids) - set(existing_agency_ids)
+
+            links_to_delete.extend(
+                convert_to_link_url_agency_models(
+                    url_id=url_id,
+                    agency_ids=list(ids_to_delete)
+                )
+            )
+            links_to_insert.extend(
+                convert_to_link_url_agency_models(
+                    url_id=url_id,
+                    agency_ids=list(ids_to_insert)
+                )
+            )
+
+        await sh.bulk_delete(session=session, models=links_to_delete)
+        await sh.bulk_insert(session=session, models=links_to_insert)
+
+    async def run(self, session: AsyncSession) -> None:
+        await self._get_existing_links(session=session)
+        await self._update_links(session=session)


-async def update_agency_links(
-    session: AsyncSession,
-    params: list[UpdateLinkURLAgencyForDataSourcesSyncParams]
-) -> None:
-    """Overwrite existing url_agency links with new ones, if applicable."""
-    query = URLAgencyLinkUpdateQueryBuilder(params)
-    await query.run(session)
\ No newline at end of file
diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py
index d43bbbd8..6f8a14eb 100644
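# Why the outer join's filter must be NULL-safe: in SQL's three-valued
# logic, NULL != 'meta url' evaluates to UNKNOWN, so a bare != drops the
# very rows the outer join was meant to keep (URLs with no flag at all),
# while IS DISTINCT FROM treats NULL as an ordinary, comparable value.
# A pure-Python illustration of the two behaviours:
rows = [
    {"url_id": 1, "flag": "meta url"},     # validated meta URL -> exclude
    {"url_id": 2, "flag": "data source"},  # validated, non-meta -> keep
    {"url_id": 3, "flag": None},           # no flag row from the outer join -> keep
]

# Bare SQL `!=`: UNKNOWN for NULL, so row 3 vanishes.
bare_neq = [r["url_id"] for r in rows if r["flag"] is not None and r["flag"] != "meta url"]
# IS DISTINCT FROM compares NULL like any other value.
distinct_from = [r["url_id"] for r in rows if r["flag"] != "meta url"]

print(bare_neq)       # [2]
print(distinct_from)  # [2, 3]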
--- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py @@ -1,7 +1,7 @@ from pydantic import BaseModel -class UpdateLinkURLAgencyForDataSourcesSyncParams(BaseModel): +class UpdateLinkURLAgencyParams(BaseModel): url_id: int new_agency_ids: list[int] old_agency_ids: list[int] diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py deleted file mode 100644 index a81be905..00000000 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/query.py +++ /dev/null @@ -1,79 +0,0 @@ -from collections import defaultdict - -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.queries.base.builder import QueryBuilderBase - - -class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): - """Given a set of URL-Agency links, remove all non-matching links and add new ones.""" - - - def __init__(self, models: list[UpdateLinkURLAgencyForDataSourcesSyncParams]): - super().__init__() - self.models = models - self._new_links: dict[int, list[int]] = { - model.url_id: model.new_agency_ids - for model in self.models - } - self._existing_links: dict[int, list[int]] = defaultdict(list) - self.existing_url_ids = {model.url_id for model in self.models} - - async def _get_existing_links(self, session: AsyncSession): - """Get existing agency links for provided URLs. 
- - Modifies: - self._existing_links - """ - query = ( - select(LinkURLAgency) - .where( - LinkURLAgency.url_id.in_( - self.existing_url_ids - ) - ) - ) - links = await session.scalars(query) - for link in links: - self._existing_links[link.url_id].append(link.agency_id) - - async def _update_links(self, session: AsyncSession): - # Remove all existing links not in new links - links_to_delete: list[LinkURLAgencyPydantic] = [] - links_to_insert: list[LinkURLAgencyPydantic] = [] - - for url_id in self.existing_url_ids: - new_agency_ids = self._new_links.get(url_id, []) - existing_agency_ids = self._existing_links.get(url_id, []) - # IDs to delete are existing agency ids that are not new agency ids - ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) - # IDs to insert are new agency ids that are not existing agency ids - ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) - - links_to_delete.extend( - convert_to_link_url_agency_models( - url_id=url_id, - agency_ids=list(ids_to_delete) - ) - ) - links_to_insert.extend( - convert_to_link_url_agency_models( - url_id=url_id, - agency_ids=list(ids_to_insert) - ) - ) - - await sh.bulk_delete(session=session, models=links_to_delete) - await sh.bulk_insert(session=session, models=links_to_insert) - - async def run(self, session: AsyncSession): - await self._get_existing_links(session=session) - await self._update_links(session=session) - - diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py new file mode 100644 index 00000000..e2def8c2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/convert.py @@ -0,0 +1,24 @@ +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import URLDataSyncInfo +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.external.pdap.enums import ApprovalStatus + + +def convert_url_sync_info_to_url_mappings( + url_sync_info: URLDataSyncInfo +) -> URLMapping: + return URLMapping( + url=url_sync_info.url, + url_id=url_sync_info.url_id + ) + +def convert_approval_status_to_validated_type( + approval_status: ApprovalStatus +) -> URLValidatedType: + match approval_status: + case ApprovalStatus.APPROVED: + return URLValidatedType.DATA_SOURCE + case ApprovalStatus.REJECTED: + return URLValidatedType.NOT_RELEVANT + case _: + raise ValueError(f"Invalid approval status: {approval_status}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py index 751192f9..2b021045 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/core.py @@ -3,6 +3,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import convert_url_sync_info_to_url_mappings from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.filter import filter_for_urls_with_ids, \ get_mappings_for_urls_without_data_sources from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper @@ -14,8 +15,11 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.lookup.response import \ 
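# convert_approval_status_to_validated_type only accepts a terminal
# verdict; anything else must be filtered out by the caller before
# converting. A sketch with stand-in enums (the real ones live in
# src.external.pdap.enums and src.db.models.impl.flag.url_validated.enums):
from enum import Enum

class ApprovalStatus(Enum):
    APPROVED = "approved"
    REJECTED = "rejected"
    PENDING = "pending"

class URLValidatedType(Enum):
    DATA_SOURCE = "data source"
    NOT_RELEVANT = "not relevant"

def to_validated_type(status: ApprovalStatus) -> URLValidatedType:
    match status:
        case ApprovalStatus.APPROVED:
            return URLValidatedType.DATA_SOURCE
        case ApprovalStatus.REJECTED:
            return URLValidatedType.NOT_RELEVANT
        case _:
            # PENDING (and similar statuses) carry no validation verdict.
            raise ValueError(f"Invalid approval status: {status}")

print(to_validated_type(ApprovalStatus.APPROVED))  # URLValidatedType.DATA_SOURCE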
LookupURLForDataSourcesSyncResponse from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.queries.base.builder import QueryBuilderBase from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper @final @@ -50,24 +54,36 @@ async def run(self, session: AsyncSession) -> None: """ self._session = session - lookup_results = await self._lookup_urls() - lookups_existing_urls = filter_for_urls_with_ids(lookup_results) + lookup_results: list[LookupURLForDataSourcesSyncResponse] = await self._lookup_urls() + + # Update existing url and associated metadata + lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse] = filter_for_urls_with_ids(lookup_results) await self._update_existing_urls(lookups_existing_urls) await self._update_agency_link(lookups_existing_urls) - mappings_without_data_sources = get_mappings_for_urls_without_data_sources(lookup_results) - await self._add_new_data_sources(mappings_without_data_sources) + existing_url_mappings: list[URLMapping] = [ + convert_url_sync_info_to_url_mappings(lookup.url_info) + for lookup in lookups_existing_urls + ] - extant_urls = {lookup.url_info.url for lookup in lookups_existing_urls} - urls_to_add = list(self.urls - extant_urls) - if len(urls_to_add) == 0: - return - url_mappings = await self._add_new_urls(urls_to_add) - await self._add_new_data_sources(url_mappings) - await self._insert_agency_link(url_mappings) - - async def _lookup_urls(self): - lookup_results = await self.requester.lookup_urls(list(self.urls)) - return lookup_results + # Add new URLs and associated metadata + mappings_without_data_sources: list[URLMapping] = get_mappings_for_urls_without_data_sources(lookup_results) + await self._add_new_data_sources(mappings_without_data_sources) + extant_urls: set[str] = {lookup.url_info.url for lookup in lookups_existing_urls} + urls_to_add: list[str] = list(self.urls - extant_urls) + if len(urls_to_add) != 0: + new_url_mappings: list[URLMapping] = await self._add_new_urls(urls_to_add) + await self._add_new_data_sources(new_url_mappings) + await self._insert_agency_link(new_url_mappings) + else: + new_url_mappings: list[URLMapping] = [] + + # Upsert validated flags + all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings + mapper = URLMapper(all_url_mappings) + await self._upsert_validated_flags(mapper) + + async def _lookup_urls(self) -> list[LookupURLForDataSourcesSyncResponse]: + return await self.requester.lookup_urls(list(self.urls)) async def _insert_agency_link(self, url_mappings: list[URLMapping]): link_url_agency_insert_params = self.param_manager.insert_agency_link( @@ -81,16 +97,19 @@ async def _update_agency_link(self, lookups_existing_urls: list[LookupURLForData ) await self.requester.update_agency_links(link_url_agency_update_params) - async def _add_new_data_sources(self, url_mappings: list[URLMapping]): + async def _add_new_data_sources(self, url_mappings: list[URLMapping]) -> None: url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) await self.requester.add_new_data_sources(url_ds_insert_params) - async def _add_new_urls(self, urls: list[str]): + async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: url_insert_params: list[InsertURLForDataSourcesSyncParams] = self.param_manager.add_new_urls(urls) url_mappings = await 
self.requester.add_new_urls(url_insert_params) return url_mappings - async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]) -> None: update_params = self.param_manager.update_existing_urls(lookups_existing_urls) await self.requester.update_existing_urls(update_params) + async def _upsert_validated_flags(self, url_mapper: URLMapper) -> None: + flags: list[FlagURLValidatedPydantic] = self.param_manager.upsert_validated_flags(url_mapper) + await self.requester.upsert_validated_flags(flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py index 3240e409..168f2511 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py @@ -23,13 +23,13 @@ def convert_to_source_collector_url_status( match ds_approval_status: case ApprovalStatus.APPROVED: - return URLStatus.SUBMITTED + return URLStatus.OK case ApprovalStatus.REJECTED: return URLStatus.NOT_RELEVANT case ApprovalStatus.NEEDS_IDENTIFICATION: - return URLStatus.PENDING + return URLStatus.OK case ApprovalStatus.PENDING: - return URLStatus.PENDING + return URLStatus.OK case _: raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index 7ca8ebad..e0a7225f 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -1,5 +1,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams + UpdateLinkURLAgencyParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import \ + convert_approval_status_to_validated_type from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ convert_to_url_insert_params from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper @@ -10,8 +12,14 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import ApprovalStatus +from src.util.url_mapper import URLMapper class UpsertURLsFromDataSourcesParamManager: @@ -53,12 +61,12 @@ def add_new_urls( def update_agency_link( self, lookup_results: list[LookupURLForDataSourcesSyncResponse] - ) -> list[UpdateLinkURLAgencyForDataSourcesSyncParams]: + ) -> list[UpdateLinkURLAgencyParams]: 
results = [] for lookup_result in lookup_results: url_info = lookup_result.url_info sync_info = self._mapper.get(url_info.url) - update_params = UpdateLinkURLAgencyForDataSourcesSyncParams( + update_params = UpdateLinkURLAgencyParams( url_id=url_info.url_id, new_agency_ids=sync_info.agency_ids, old_agency_ids=url_info.agency_ids @@ -98,4 +106,21 @@ def add_new_data_sources( ) return results + def upsert_validated_flags( + self, + mapper: URLMapper + ) -> list[FlagURLValidatedPydantic]: + urls: list[str] = mapper.get_all_urls() + flags: list[FlagURLValidatedPydantic] = [] + for url in urls: + url_id: int = mapper.get_id(url) + sync_info: DataSourcesSyncResponseInnerInfo = self._mapper.get(url) + approval_status: ApprovalStatus = sync_info.approval_status + validated_type: URLValidatedType = convert_approval_status_to_validated_type(approval_status) + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=validated_type + ) + flags.append(flag) + return flags \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py index 08b5df22..eaae3a17 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py @@ -1,8 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.query import \ + UpdateLinkURLAgencyParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.core import \ URLAgencyLinkUpdateQueryBuilder from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ InsertURLForDataSourcesSyncParams @@ -14,6 +14,7 @@ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic @@ -71,8 +72,11 @@ async def add_new_agency_links( async def update_agency_links( self, - params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] + params: list[UpdateLinkURLAgencyParams] ) -> None: """Overwrite existing url_agency links with new ones, if applicable.""" query = URLAgencyLinkUpdateQueryBuilder(params) - await query.run(self.session) \ No newline at end of file + await query.run(self.session) + + async def upsert_validated_flags(self, flags: list[FlagURLValidatedPydantic]) -> None: + await sh.bulk_upsert(self.session, models=flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 83c3b100..76c707ea 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -77,6 +77,14 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True) ), + ScheduledTaskEntry( + operator=SyncAgenciesTaskOperator( + adb_client=self.async_core.adb_client, + pdap_client=self.pdap_client + ), + interval=IntervalEnum.DAILY, + enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) + ), ScheduledTaskEntry( 
operator=RunURLTasksTaskOperator(async_core=self.async_core), interval=IntervalEnum.HOURLY, @@ -88,14 +96,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True) ), - ScheduledTaskEntry( - operator=SyncAgenciesTaskOperator( - adb_client=self.async_core.adb_client, - pdap_client=self.pdap_client - ), - interval=IntervalEnum.DAILY, - enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) - ), ScheduledTaskEntry( operator=PushToHuggingFaceTaskOperator( adb_client=self.async_core.adb_client, diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index e97e0f8e..87cb5a27 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -1,6 +1,3 @@ -from apscheduler.job import Job -from apscheduler.schedulers.asyncio import AsyncIOScheduler - from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.mixins.link_urls import LinkURLsMixin @@ -25,13 +22,13 @@ def __init__( self._loader = loader self._registry = registry - # Main objects - self.scheduler = AsyncIOScheduler() - async def setup(self): self._registry.start_scheduler() await self.add_scheduled_tasks() + await self._registry.report_next_scheduled_task() + + async def add_scheduled_tasks(self): """ @@ -39,15 +36,19 @@ async def add_scheduled_tasks(self): self._registry """ entries: list[ScheduledTaskEntry] = await self._loader.load_entries() - for idx, entry in enumerate(entries): + enabled_entries: list[ScheduledTaskEntry] = [] + for entry in entries: if not entry.enabled: print(f"{entry.operator.task_type.value} is disabled. Skipping add to scheduler.") continue + enabled_entries.append(entry) + initial_lag: int = 1 + for idx, entry in enumerate(enabled_entries): await self._registry.add_job( func=self.run_task, entry=entry, - minute_lag=idx + minute_lag=idx + initial_lag ) def shutdown(self): @@ -68,3 +69,4 @@ async def run_task(self, operator: ScheduledTaskOperatorBase): operator: ScheduledTaskOperatorBase raise Exception(f"Task {operator.task_type.value} has not been linked to any URLs but is designated as a link task") await self._handler.handle_outcome(run_info) + await self._registry.report_next_scheduled_task() diff --git a/src/core/tasks/scheduled/registry/core.py b/src/core/tasks/scheduled/registry/core.py index a1928504..a622346c 100644 --- a/src/core/tasks/scheduled/registry/core.py +++ b/src/core/tasks/scheduled/registry/core.py @@ -6,6 +6,7 @@ from apscheduler.triggers.interval import IntervalTrigger from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry +from src.core.tasks.scheduled.registry.format import format_job_datetime from src.db.enums import TaskType @@ -29,8 +30,9 @@ async def add_job( Modifies: self._jobs """ - self._jobs[entry.operator.task_type] = self.scheduler.add_job( - func, + job: Job = self.scheduler.add_job( + id=entry.operator.task_type.value, + func=func, trigger=IntervalTrigger( minutes=entry.interval.value, start_date=datetime.now() + timedelta(minutes=minute_lag) @@ -38,6 +40,10 @@ async def add_job( misfire_grace_time=60, kwargs={"operator": entry.operator} ) + run_time_str: str = format_job_datetime(job.next_run_time) + print(f"Adding {job.id} task to scheduler. 
" + + f"First run at {run_time_str}") + self._jobs[entry.operator.task_type] = job def start_scheduler(self) -> None: """ @@ -48,4 +54,16 @@ def start_scheduler(self) -> None: def shutdown_scheduler(self) -> None: if self.scheduler.running: - self.scheduler.shutdown() \ No newline at end of file + self.scheduler.shutdown() + + async def report_next_scheduled_task(self): + jobs: list[Job] = self.scheduler.get_jobs() + if len(jobs) == 0: + print("No scheduled tasks found.") + return + + jobs_sorted: list[Job] = sorted(jobs, key=lambda job: job.next_run_time) + next_job: Job = jobs_sorted[0] + + run_time_str: str = format_job_datetime(next_job.next_run_time) + print(f"Next scheduled task: {run_time_str} ({next_job.id})") \ No newline at end of file diff --git a/src/core/tasks/scheduled/registry/format.py b/src/core/tasks/scheduled/registry/format.py new file mode 100644 index 00000000..23eea364 --- /dev/null +++ b/src/core/tasks/scheduled/registry/format.py @@ -0,0 +1,7 @@ +from datetime import datetime + +def format_job_datetime(dt: datetime) -> str: + date_str: str = dt.strftime("%Y-%m-%d") + format_24: str = dt.strftime("%H:%M:%S") + format_12: str = dt.strftime("%I:%M:%S %p") + return f"{date_str} {format_24} ({format_12})" \ No newline at end of file diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 45f750af..600ea1d2 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,6 +7,8 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator @@ -20,7 +22,6 @@ from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient -from src.external.internet_archives.client import InternetArchivesClient from src.external.pdap.client import PDAPClient from src.external.url_request.core import URLRequestInterface @@ -35,11 +36,13 @@ def __init__( pdap_client: PDAPClient, muckrock_api_interface: MuckrockAPIInterface, hf_inference_client: HuggingFaceInferenceClient, + nlp_processor: NLPProcessor ): # Dependencies self.adb_client = adb_client self.url_request_interface = url_request_interface self.html_parser = html_parser + self.nlp_processor = nlp_processor self.env = Env() # External clients and interfaces @@ -79,7 +82,9 @@ async def _get_agency_identification_task_operator(self) -> URLTaskEntry: adb_client=self.adb_client, loader=AgencyIdentificationSubtaskLoader( pdap_client=self.pdap_client, - muckrock_api_interface=self.muckrock_api_interface + muckrock_api_interface=self.muckrock_api_interface, + adb_client=self.adb_client, + nlp_processor=self.nlp_processor ) ) return URLTaskEntry( diff --git a/src/core/tasks/url/manager.py b/src/core/tasks/url/manager.py index 399da5b0..7fc6b4e3 100644 --- a/src/core/tasks/url/manager.py +++ b/src/core/tasks/url/manager.py @@ -56,8 +56,7 @@ async def _run_task(self, 
entry: URLTaskEntry) -> None: print(message) await self.handler.post_to_discord(message=message) break - task_id = await self.handler.initiate_task_in_db(task_type=operator.task_type) - run_info: TaskOperatorRunInfo = await operator.run_task(task_id) + run_info: TaskOperatorRunInfo = await operator.run_task() await self.conclude_task(run_info) if run_info.outcome == TaskOperatorOutcome.ERROR: break diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 8ac1f632..92ece84e 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,17 +1,21 @@ -from src.collectors.enums import CollectorType -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.output import GetAgencySuggestionsOutput -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.mixins.link_urls import LinkURLsMixin +from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError +from src.core.tasks.url.operators.agency_identification.subtasks.flags.core import SubtaskFlagger from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader +from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ + AgencyIDSubtaskSurveyQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -class AgencyIdentificationTaskOperator(URLTaskOperatorBase): +class AgencyIdentificationTaskOperator( + URLTaskOperatorBase, + LinkURLsMixin +): def __init__( self, @@ -20,90 +24,51 @@ def __init__( ): super().__init__(adb_client) self.loader = loader + self._subtask: AutoAgencyIDSubtaskType | None = None @property def task_type(self) -> TaskType: return TaskType.AGENCY_IDENTIFICATION async def meets_task_prerequisites(self) -> bool: - has_urls_without_agency_suggestions = await self.adb_client.has_urls_without_agency_suggestions() - return has_urls_without_agency_suggestions + """ + Modifies: + - self._subtask + """ + flagger = SubtaskFlagger() + allowed_subtasks: list[AutoAgencyIDSubtaskType] = flagger.get_allowed_subtasks() + + next_subtask: AutoAgencyIDSubtaskType | None = \ + await self.adb_client.run_query_builder( + AgencyIDSubtaskSurveyQueryBuilder( + allowed_subtasks=allowed_subtasks + ) + ) + self._subtask = next_subtask + if next_subtask is None: + return False + return True - async def get_pending_urls_without_agency_identification(self) -> list[AgencyIdentificationTDO]: - return await self.adb_client.get_urls_without_agency_suggestions() - async def get_subtask( + async def load_subtask( self, - collector_type: CollectorType - ) -> 
AgencyIdentificationSubtaskBase: + subtask_type: AutoAgencyIDSubtaskType + ) -> AgencyIDSubtaskOperatorBase: - """Get subtask based on collector type.""" - return await self.loader.load_subtask(collector_type) + """Get the subtask operator for the given subtask type.""" + return await self.loader.load_subtask(subtask_type, task_id=self.task_id) @staticmethod async def run_subtask( - subtask: AgencyIdentificationSubtaskBase, - url_id: int, - collector_metadata: dict | None - ) -> list[URLAgencySuggestionInfo]: - return await subtask.run( - url_id=url_id, - collector_metadata=collector_metadata - ) + subtask_operator: AgencyIDSubtaskOperatorBase, + ) -> AgencyIDSubtaskRunInfo: + return await subtask_operator.run() async def inner_task_logic(self) -> None: - tdos: list[AgencyIdentificationTDO] = await self.get_pending_urls_without_agency_identification() - await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) - output = await self._get_agency_suggestions(tdos) - - await self._process_agency_suggestions(output.agency_suggestions) - await self.adb_client.add_url_error_infos(output.error_infos) - - async def _process_agency_suggestions( - self, - suggestions: list[URLAgencySuggestionInfo] - ) -> None: - non_unknown_agency_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type != SuggestionType.UNKNOWN - ] - await self.adb_client.upsert_new_agencies(non_unknown_agency_suggestions) - confirmed_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type == SuggestionType.CONFIRMED - ] - await self.adb_client.add_confirmed_agency_url_links(confirmed_suggestions) - non_confirmed_suggestions = [ - suggestion for suggestion in suggestions - if suggestion.suggestion_type != SuggestionType.CONFIRMED - ] - await self.adb_client.add_agency_auto_suggestions(non_confirmed_suggestions) - - async def _get_agency_suggestions( - self, - tdos: list[AgencyIdentificationTDO] - ) -> GetAgencySuggestionsOutput: - error_infos = [] - all_agency_suggestions = [] - for tdo in tdos: - subtask = await self.get_subtask(tdo.collector_type) - try: - new_agency_suggestions = await self.run_subtask( - subtask, - tdo.url_id, - tdo.collector_metadata - ) - all_agency_suggestions.extend(new_agency_suggestions) - except Exception as e: - error_info = URLErrorPydanticInfo( - task_id=self.task_id, - url_id=tdo.url_id, - error=str(e), - ) - error_infos.append(error_info) - output = GetAgencySuggestionsOutput( - agency_suggestions=all_agency_suggestions, - error_infos=error_infos - ) - return output + subtask_operator: AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask) + print(f"Running Subtask: {self._subtask.value}") + run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator) + await self.link_urls_to_task(run_info.linked_url_ids) + if not run_info.is_success: + raise SubtaskError(run_info.error) diff --git a/src/core/tasks/url/operators/agency_identification/dtos/output.py b/src/core/tasks/url/operators/agency_identification/dtos/output.py deleted file mode 100644 index d7381129..00000000 --- a/src/core/tasks/url/operators/agency_identification/dtos/output.py +++ /dev/null @@ -1,9 +0,0 @@ -from pydantic import BaseModel - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo - - -class GetAgencySuggestionsOutput(BaseModel): - error_infos: list[URLErrorPydanticInfo] - agency_suggestions: list[URLAgencySuggestionInfo] \ No newline at end of file diff --git
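
The refactored operator above runs exactly one subtask per task run: meets_task_prerequisites surveys for the next eligible subtask type and caches it, and inner_task_logic loads that subtask's operator, runs it, links the touched URLs, and raises SubtaskError on failure. The base template (templates/subtask.py) is not part of this diff; the following is a minimal sketch under the assumption that it collects linked_urls and wraps inner_logic into an AgencyIDSubtaskRunInfo:

from pydantic import BaseModel

class AgencyIDSubtaskRunInfo(BaseModel):
    is_success: bool
    linked_url_ids: list[int] = []
    error: str | None = None

class SubtaskOperatorSketch:
    # Hypothetical stand-in for AgencyIDSubtaskOperatorBase; the real base
    # class lives outside this diff.
    def __init__(self, task_id: int):
        self.task_id = task_id
        self.linked_urls: list[int] = []

    async def inner_logic(self) -> None:
        raise NotImplementedError  # each subtask supplies its own logic

    async def run(self) -> AgencyIDSubtaskRunInfo:
        # Report failures instead of raising, so the parent operator decides
        # how to surface them (here, as SubtaskError).
        try:
            await self.inner_logic()
            return AgencyIDSubtaskRunInfo(is_success=True, linked_url_ids=self.linked_urls)
        except Exception as e:
            return AgencyIDSubtaskRunInfo(
                is_success=False, linked_url_ids=self.linked_urls, error=str(e)
            )
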
a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py b/src/core/tasks/url/operators/agency_identification/dtos/tdo.py deleted file mode 100644 index 72f24d97..00000000 --- a/src/core/tasks/url/operators/agency_identification/dtos/tdo.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from src.collectors.enums import CollectorType - - -class AgencyIdentificationTDO(BaseModel): - url_id: int - collector_metadata: dict | None = None - collector_type: CollectorType | None diff --git a/src/core/tasks/url/operators/agency_identification/exceptions.py b/src/core/tasks/url/operators/agency_identification/exceptions.py new file mode 100644 index 00000000..709189e3 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/exceptions.py @@ -0,0 +1,4 @@ + + +class SubtaskError(Exception): + pass \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py deleted file mode 100644 index 5eeb4355..00000000 --- a/src/core/tasks/url/operators/agency_identification/queries/get_pending_urls_without_agency_suggestions.py +++ /dev/null @@ -1,38 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus, CollectorType -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetPendingURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: - - statement = ( - select( - URL.id, - URL.collector_metadata, - Batch.strategy - ) - .select_from(URL) - .where(URL.status == URLStatus.PENDING.value) - .outerjoin(LinkBatchURL) - .outerjoin(Batch) - ) - statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - statement = statement.limit(100) - raw_results = await session.execute(statement) - return [ - AgencyIdentificationTDO( - url_id=raw_result[0], - collector_metadata=raw_result[1], - collector_type=CollectorType(raw_result[2]) if raw_result[2] is not None else None - ) - for raw_result in raw_results - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py deleted file mode 100644 index e8a0e8ce..00000000 --- a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py +++ /dev/null @@ -1,27 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class HasURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> bool: - statement = ( - select( - URL.id - ).where( - URL.status == URLStatus.PENDING.value - ) - ) - - 
statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - raw_result = await session.execute(statement) - result = raw_result.all() - return len(result) != 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py new file mode 100644 index 00000000..95c9e704 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -0,0 +1,54 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.enums import MatchAgencyResponseStatus + +def convert_match_agency_response_to_subtask_data( + url_id: int, + response: MatchAgencyResponse, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int +): + suggestions: list[AgencySuggestion] = \ + _convert_match_agency_response_to_suggestions( + response + ) + agencies_found: bool = len(suggestions) > 0 + subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( + url_id=url_id, + type=subtask_type, + agencies_found=agencies_found, + task_id=task_id + ) + return AutoAgencyIDSubtaskData( + pydantic_model=subtask_pydantic, + suggestions=suggestions + ) + +def _convert_match_agency_response_to_suggestions( + match_response: MatchAgencyResponse, +) -> list[AgencySuggestion]: + if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: + match_info: MatchAgencyInfo = match_response.matches[0] + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=100 + ) + ] + if match_response.status == MatchAgencyResponseStatus.NO_MATCH: + return [] + if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: + raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}") + total_confidence: int = 100 + confidence_per_match: int = total_confidence // len(match_response.matches) + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=confidence_per_match + ) + for match_info in match_response.matches + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py new file mode 100644 index 00000000..41997322 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py @@ -0,0 +1,26 @@ + +from environs import Env + +from src.core.tasks.url.operators.agency_identification.subtasks.flags.mappings import SUBTASK_TO_ENV_FLAG +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +class SubtaskFlagger: + """ + Manages flags allowing and disallowing subtasks + """ + def __init__(self): + self.env = Env() + + def _get_subtask_flag(self, subtask_type: AutoAgencyIDSubtaskType) -> bool: + return 
self.env.bool( + SUBTASK_TO_ENV_FLAG[subtask_type], + default=True + ) + + def get_allowed_subtasks(self) -> list[AutoAgencyIDSubtaskType]: + return [ + subtask_type + for subtask_type, flag in SUBTASK_TO_ENV_FLAG.items() + if self._get_subtask_flag(subtask_type) + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py new file mode 100644 index 00000000..d6997423 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py @@ -0,0 +1,8 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +SUBTASK_TO_ENV_FLAG: dict[AutoAgencyIDSubtaskType, str] = { + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: "AGENCY_ID_HOMEPAGE_MATCH_FLAG", + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: "AGENCY_ID_NLP_LOCATION_MATCH_FLAG", + AutoAgencyIDSubtaskType.CKAN: "AGENCY_ID_CKAN_FLAG", + AutoAgencyIDSubtaskType.MUCKROCK: "AGENCY_ID_MUCKROCK_FLAG" +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py deleted file mode 100644 index 96f98f30..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py +++ /dev/null @@ -1,16 +0,0 @@ -import abc -from abc import ABC -from typing import Optional - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo - - -class AgencyIdentificationSubtaskBase(ABC): - - @abc.abstractmethod - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - raise NotImplementedError diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py deleted file mode 100644 index 15dddf6f..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class CKANAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - pdap_client: PDAPClient - ): - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - agency_name = collector_metadata["agency_name"] - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py 
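
Each subtask can be switched off via the environment flags listed in ENV.md; SubtaskFlagger defaults every flag to enabled. A usage sketch (flag and enum names are taken from the mappings above; the os.environ assignment is purely illustrative):

import os

from src.core.tasks.url.operators.agency_identification.subtasks.flags.core import SubtaskFlagger
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType

os.environ["AGENCY_ID_CKAN_FLAG"] = "false"  # disable the CKAN subtask

allowed = SubtaskFlagger().get_allowed_subtasks()
assert AutoAgencyIDSubtaskType.CKAN not in allowed  # the other three remain enabled by default
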
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py new file mode 100644 index 00000000..d1af5391 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -0,0 +1,54 @@ +from typing import final + +from typing_extensions import override + +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ + GetCKANAgencyIDSubtaskParamsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ + AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class CKANAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[CKANAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + agency_name: str = param.collector_metadata["agency_name"] + response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_name + ) + subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=response, + subtask_type=AutoAgencyIDSubtaskType.CKAN, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + async def _get_params(self) -> list[CKANAgencyIDSubtaskParams]: + return await self.adb_client.run_query_builder( + GetCKANAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py new file mode 100644 index 00000000..ce4b7ce1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CKANAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py new file mode 100644 index 00000000..90e965e7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py @@ -0,0 +1,46 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from 
src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetCKANAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[CKANAgencyIDSubtaskParams]: + container = EligibleContainer() + query = ( + select( + container.url_id, + URL.collector_metadata + ) + .join( + URL, + URL.id == container.url_id, + ) + .where( + container.ckan, + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await sh.mappings(session, query=query) + return [ + CKANAgencyIDSubtaskParams( + url_id=mapping["id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py new file mode 100644 index 00000000..f4ba913e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py @@ -0,0 +1,47 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode, AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +def convert_params_to_subtask_entries( + params: list[GetHomepageMatchParams], + task_id: int +) -> list[URLAutoAgencyIDSubtaskPydantic]: + url_id_to_detail_code: dict[int, SubtaskDetailCode] = {} + for param in params: + url_id_to_detail_code[param.url_id] = param.detail_code + + results: list[URLAutoAgencyIDSubtaskPydantic] = [] + for url_id, detail_code in url_id_to_detail_code.items(): + result = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + agencies_found=True, + detail=detail_code, + ) + results.append(result) + return results + +def convert_subtask_mappings_and_params_to_suggestions( + mappings: list[SubtaskURLMapping], + params: list[GetHomepageMatchParams] +) -> list[AgencyIDSubtaskSuggestionPydantic]: + url_id_to_subtask_id: dict[int, int] = { + mapping.url_id: mapping.subtask_id + for mapping in mappings + } + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] + for param in params: + subtask_id = url_id_to_subtask_id.get(param.url_id) + suggestion = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=param.agency_id, + confidence=param.confidence, + ) + suggestions.append(suggestion) + return suggestions \ No newline at end of file diff --git 
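
The two converters above split the homepage-match write into dependent steps: subtask rows are inserted first (one per URL), and the returned ids are joined back to the per-agency params to build suggestion rows. A worked sketch with illustrative values, assuming the entry and mapping models defined just below:

from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.convert import \
    convert_subtask_mappings_and_params_to_suggestions
from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \
    GetHomepageMatchParams
from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \
    SubtaskURLMapping
from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode

# URL 7 matched two agencies, and one subtask row (id 42) was inserted for it.
params = [
    GetHomepageMatchParams(url_id=7, agency_id=11, confidence=50,
                           detail_code=SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY),
    GetHomepageMatchParams(url_id=7, agency_id=12, confidence=50,
                           detail_code=SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY),
]
mappings = [SubtaskURLMapping(url_id=7, subtask_id=42)]

suggestions = convert_subtask_mappings_and_params_to_suggestions(mappings, params)
# -> two suggestion rows, both carrying subtask_id=42, one per agency
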
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py new file mode 100644 index 00000000..f335cb3a --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py @@ -0,0 +1,63 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.convert import \ + convert_params_to_subtask_entries, convert_subtask_mappings_and_params_to_suggestions +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.get import \ + GetHomepageMatchSubtaskURLsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +class HomepageMatchSubtaskOperator( + AgencyIDSubtaskOperatorBase, +): + + async def inner_logic(self) -> None: + # Get Params + params: list[GetHomepageMatchParams] = \ + await self.adb_client.run_query_builder( + GetHomepageMatchSubtaskURLsQueryBuilder() + ) + + # Insert Subtask Entries + subtask_entries: list[URLAutoAgencyIDSubtaskPydantic] = convert_params_to_subtask_entries( + params=params, + task_id=self.task_id + ) + subtask_mappings: list[SubtaskURLMapping] = await self.insert_subtask_entries( + entries=subtask_entries + ) + + # Link URLs + url_ids: list[int] = [mapping.url_id for mapping in subtask_mappings] + self.linked_urls = url_ids + + # Insert Entries + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = convert_subtask_mappings_and_params_to_suggestions( + mappings=subtask_mappings, + params=params + ) + await self.adb_client.bulk_insert( + models=suggestions, + ) + + + async def insert_subtask_entries( + self, + entries: list[URLAutoAgencyIDSubtaskPydantic] + ) -> list[SubtaskURLMapping]: + subtask_ids: list[int] = await self.adb_client.bulk_insert( + models=entries, + return_ids=True + ) + mappings: list[SubtaskURLMapping] = [] + for subtask_id, entry in zip(subtask_ids, entries): + mapping = SubtaskURLMapping( + url_id=entry.url_id, + subtask_id=subtask_id, + ) + mappings.append(mapping) + return mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py new file mode 100644 index 00000000..6c65f9ad --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + + +class GetHomepageMatchParams(BaseModel): + url_id: int + agency_id: int + confidence: int = Field(..., ge=0, le=100) + 
detail_code: SubtaskDetailCode \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py new file mode 100644 index 00000000..2e4d2fbb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class SubtaskURLMapping(BaseModel): + url_id: int + subtask_id: int \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py new file mode 100644 index 00000000..d90dfed6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py @@ -0,0 +1,28 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.count_agency_per_url import \ + COUNT_AGENCY_PER_URL_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root_agencies import \ + META_ROOT_URLS_WITH_AGENCIES +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.unvalidated_urls_with_root import \ + UNVALIDATED_URLS_WITH_ROOT + +CONSOLIDATED_CTE: CTE = ( + select( + UNVALIDATED_URLS_WITH_ROOT.c.url_id, + META_ROOT_URLS_WITH_AGENCIES.c.agency_id, + COUNT_AGENCY_PER_URL_CTE.c.agency_count, + ) + .join( + COUNT_AGENCY_PER_URL_CTE, + COUNT_AGENCY_PER_URL_CTE.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id + ) + .join( + META_ROOT_URLS_WITH_AGENCIES, + META_ROOT_URLS_WITH_AGENCIES.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id + ) + .where( + COUNT_AGENCY_PER_URL_CTE.c.agency_count >= 1 + ) + .cte("consolidated") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py new file mode 100644 index 00000000..774787b7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, func, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +COUNT_AGENCY_PER_URL_CTE: CTE = ( + select( + META_ROOT_URLS_CTE.c.root_url_id, + func.count(LinkURLAgency.agency_id).label("agency_count") + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == 
LinkURLAgency.url_id + ) + .group_by( + META_ROOT_URLS_CTE.c.root_url_id + ) + .cte("count_agency_per_url") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py new file mode 100644 index 00000000..63b6b417 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py @@ -0,0 +1,23 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.views.meta_url import MetaURL + +META_ROOT_URLS_CTE: CTE = ( + select( + MetaURL.url_id.label("meta_url_id"), + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + MetaURL.url_id == LinkURLRootURL.url_id + ) + # Must be a Whitelisted Root URL + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) + .cte("meta_root_urls") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py new file mode 100644 index 00000000..86b14ee4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +META_ROOT_URLS_WITH_AGENCIES: CTE = ( + select( + META_ROOT_URLS_CTE.c.meta_url_id, + META_ROOT_URLS_CTE.c.root_url_id, + LinkURLAgency.agency_id + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id + ) + .cte( + "meta_root_urls_with_agencies" + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py new file mode 100644 index 00000000..edf9e601 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py @@ -0,0 +1,17 @@ +from sqlalchemy import CTE, select, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +MULTI_AGENCY_CASE_QUERY = ( + select( + CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + (literal(100) / CONSOLIDATED_CTE.c.agency_count).label("confidence"), + literal(SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count > 1 + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py 
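
In the multi-agency case above, the candidates for a URL share one confidence budget: literal(100) / agency_count runs as integer division in Postgres when both operands are integers, so the split truncates. The single-agency case in the next file pins confidence at 95 instead. The arithmetic, mirrored in Python:

# Integer truncation mirrors the Postgres integer division used for the split.
for agency_count in (2, 3, 4):
    print(agency_count, 100 // agency_count)  # 2 -> 50, 3 -> 33, 4 -> 25
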
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py new file mode 100644 index 00000000..5778ecb6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py @@ -0,0 +1,17 @@ +from sqlalchemy import select, CTE, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +SINGLE_AGENCY_CASE_QUERY = ( + select( + CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + literal(95).label("confidence"), + literal(SubtaskDetailCode.HOMEPAGE_SINGLE_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count == 1 + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py new file mode 100644 index 00000000..46702833 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py @@ -0,0 +1,22 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.views.unvalidated_url import UnvalidatedURL + +UNVALIDATED_URLS_WITH_ROOT: CTE = ( + select( + UnvalidatedURL.url_id, + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + UnvalidatedURL.url_id == LinkURLRootURL.url_id + ) + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) + .cte("unvalidated_urls_with_root") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py new file mode 100644 index 00000000..1af8f46c --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -0,0 +1,47 @@ +from sqlalchemy import CTE, select, func + +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.sqlalchemy import URL + +WHITELISTED_ROOT_URLS_CTE: CTE = ( + select( + URL.id + ) + .join( + FlagRootURL, + URL.id == FlagRootURL.url_id + ) + # Must be linked to other URLs + .join( + LinkURLRootURL, + URL.id == LinkURLRootURL.root_url_id + ) + # Those URLs must be meta URLS + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkURLRootURL.url_id + ) + # Get the Agency URLs for those URLs + .join( + LinkURLAgency, + LinkURLAgency.url_id == LinkURLRootURL.url_id + ) + .where( + # The connected URLs must be Meta URLs + FlagURLValidated.type == URLValidatedType.META_URL, + 
# Root URL can't be "https://catalog.data.gov" + URL.url != "https://catalog.data.gov" + ) + .group_by( + URL.id + ) + # Must have no more than two agencies connected + .having( + func.count(LinkURLAgency.agency_id) <= 2 + ) + .cte("whitelisted_root_urls") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py new file mode 100644 index 00000000..10619531 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py @@ -0,0 +1,35 @@ +from typing import Sequence + +from sqlalchemy import Select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.multi_agency_case import \ + MULTI_AGENCY_CASE_QUERY +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.single_agency_case import \ + SINGLE_AGENCY_CASE_QUERY +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode +from src.db.queries.base.builder import QueryBuilderBase + + +class GetHomepageMatchSubtaskURLsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[GetHomepageMatchParams]: + + query: Select = SINGLE_AGENCY_CASE_QUERY.union(MULTI_AGENCY_CASE_QUERY) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + results: list[GetHomepageMatchParams] = [] + for mapping in mappings: + response = GetHomepageMatchParams( + url_id=mapping["url_id"], + agency_id=mapping["agency_id"], + confidence=mapping["confidence"], + detail_code=SubtaskDetailCode(mapping["detail_code"]), + ) + results.append(response) + + return results \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py deleted file mode 100644 index 633d84ac..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.exceptions import MuckrockAPIError -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class MuckrockAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - muckrock_api_interface: MuckrockAPIInterface, - pdap_client: PDAPClient - ): - self.muckrock_api_interface = muckrock_api_interface - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - 
collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - muckrock_agency_id = collector_metadata["agency"] - agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( - muckrock_agency_id=muckrock_agency_id - ) - if agency_lookup_response.type != AgencyLookupResponseType.FOUND: - raise MuckrockAPIError( - f"Failed to lookup muckrock agency: {muckrock_agency_id}:" - f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" - ) - - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py new file mode 100644 index 00000000..4fa92c2e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -0,0 +1,93 @@ +from typing import final + +from typing_extensions import override + +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ + GetMuckrockAgencyIDSubtaskParamsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class MuckrockAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + muckrock_api_interface: MuckrockAPIInterface, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.muckrock_api_interface = muckrock_api_interface + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[MuckrockAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + muckrock_agency_id: int = param.collector_metadata["agency"] + agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( + muckrock_agency_id=muckrock_agency_id + ) + if agency_lookup_response.type != 
AgencyLookupResponseType.FOUND: + data: AutoAgencyIDSubtaskData = await self._error_subtask_data( + url_id=param.url_id, + muckrock_agency_id=muckrock_agency_id, + agency_lookup_response=agency_lookup_response + ) + subtask_data_list.append(data) + continue + match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_lookup_response.name + ) + subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=match_agency_response, + subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + + async def _error_subtask_data( + self, + url_id: int, + muckrock_agency_id: int, + agency_lookup_response: AgencyLookupResponse + ) -> AutoAgencyIDSubtaskData: + pydantic_model = URLAutoAgencyIDSubtaskPydantic( + task_id=self.task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.MUCKROCK, + agencies_found=False, + detail=SubtaskDetailCode.RETRIEVAL_ERROR + ) + error: str = f"Failed to lookup muckrock agency: {muckrock_agency_id}:" + \ + f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" + return AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=[], + error=error + ) + + async def _get_params(self) -> list[MuckrockAgencyIDSubtaskParams]: + return await self.adb_client.run_query_builder( + GetMuckrockAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py new file mode 100644 index 00000000..6010f022 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class MuckrockAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py new file mode 100644 index 00000000..6f575b4f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py @@ -0,0 +1,49 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetMuckrockAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[MuckrockAgencyIDSubtaskParams]: + container = EligibleContainer() + + query = ( + select( + container.url_id, + URL.collector_metadata + ) + .join( + URL, + URL.id == container.url_id, + ) + .where( + container.muckrock, + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await 
sh.mappings(session, query=query) + return [ + MuckrockAgencyIDSubtaskParams( + url_id=mapping["id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py new file mode 100644 index 00000000..b8b4ce4d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py @@ -0,0 +1,4 @@ + + +ITERATIONS_PER_SUBTASK = 2 +NUMBER_OF_ENTRIES_PER_ITERATION = 20 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py new file mode 100644 index 00000000..0c172e5d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -0,0 +1,57 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + ITERATIONS_PER_SUBTASK +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query import \ + GetNLPLocationMatchSubtaskInputQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +class NLPLocationMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient, + processor: NLPProcessor + ) -> None: + super().__init__(adb_client, task_id=task_id) + self.processor = AgencyIDSubtaskInternalProcessor( + nlp_processor=processor, + pdap_client=pdap_client, + task_id=task_id, + ) + + async def inner_logic(self) -> None: + for iteration in range(ITERATIONS_PER_SUBTASK): + inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() + if len(inputs) == 0: + break + await self.run_subtask_iteration(inputs) + + async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: + self.linked_urls.extend([input_.url_id for input_ in inputs]) + subtask_data_list: list[AutoAgencyIDSubtaskData] = await self._process_inputs(inputs) + + await self._upload_subtask_data(subtask_data_list) + + async def _process_inputs( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[AutoAgencyIDSubtaskData]: + return await self.processor.process( + inputs=inputs, + ) + + async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: + 
return await self.adb_client.run_query_builder( + GetNLPLocationMatchSubtaskInputQueryBuilder(), + ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py new file mode 100644 index 00000000..398c1504 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationMatchSubtaskInput(BaseModel): + url_id: int + html: str \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py new file mode 100644 index 00000000..cc16da9f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py @@ -0,0 +1,3 @@ + + +MAX_NLP_CONFIDENCE: int = 90 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py new file mode 100644 index 00000000..103580da --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -0,0 +1,162 @@ +from math import ceil + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.constants import \ + MAX_NLP_CONFIDENCE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import \ + RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.search_agency_by_location.params import 
SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +def convert_nlp_response_to_search_agency_by_location_params( + nlp_response: NLPLocationMatchResponse, + counter: RequestCounter +) -> list[SearchAgencyByLocationParams]: + params: list[SearchAgencyByLocationParams] = [] + for location in nlp_response.locations: + if nlp_response.us_state is None: + raise ValueError("US State is None; cannot convert NLP response to search agency by location params") + request_id: int = counter.next() + param = SearchAgencyByLocationParams( + request_id=request_id, + query=location, + iso=nlp_response.us_state.iso, + ) + params.append(param) + + return params + + + +def convert_search_agency_responses_to_subtask_data_list( + mappings: list[URLToSearchResponseMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + + # First, extract agency suggestions for URL + for mapping in mappings: + url_id: int = mapping.url_id + search_responses: list[SearchAgencyByLocationResponse] = mapping.search_responses + suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions( + search_responses + ) + pydantic_model: URLAutoAgencyIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( + url_id=url_id, + task_id=task_id + ) + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=suggestions + ) + subtask_data_list.append(subtask_data) + + return subtask_data_list + + +def _convert_search_agency_response_to_agency_suggestions( + responses: list[SearchAgencyByLocationResponse], +) -> list[AgencySuggestion]: + suggestions: list[AgencySuggestion] = [] + for response in responses: + for result in response.results: + agency_id: int = result.agency_id + similarity: float = result.similarity + confidence: int = min(ceil(similarity * 100), MAX_NLP_CONFIDENCE) + suggestion: AgencySuggestion = AgencySuggestion( + agency_id=agency_id, + confidence=confidence, + ) + suggestions.append(suggestion) + return suggestions + +def convert_url_ids_to_empty_subtask_data_list( + url_ids: list[int], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + results: list[AutoAgencyIDSubtaskData] = [] + for url_id in url_ids: + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=False + ), + suggestions=[] + ) + results.append(subtask_data) + + return results + + + +def convert_empty_url_search_param_mappings_to_subtask_data_list( + mappings: list[URLToSearchParamsMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + url_ids: list[int] = [] + for mapping in mappings: + url_ids.append(mapping.url_id) + + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) + +def convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings: list[URLToNLPResponseMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + url_ids: list[int] = [] + for mapping in mappings: + url_ids.append(mapping.url_id) + + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) + + +def convert_search_agency_response_to_subtask_pydantic( + url_id: int, + task_id: int +) -> URLAutoAgencyIDSubtaskPydantic: + + return URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, 
+ agencies_found=True + ) + + +def convert_urls_to_search_params( + url_to_nlp_mappings: list[URLToNLPResponseMapping] +) -> list[URLToSearchParamsMapping]: + url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] + counter = RequestCounter() + for mapping in url_to_nlp_mappings: + search_params: list[SearchAgencyByLocationParams] = \ + convert_nlp_response_to_search_agency_by_location_params( + counter=counter, + nlp_response=mapping.nlp_response, + ) + mapping = URLToSearchParamsMapping( + url_id=mapping.url_id, + search_params=search_params, + ) + url_to_search_params_mappings.append(mapping) + return url_to_search_params_mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py new file mode 100644 index 00000000..1e349318 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -0,0 +1,143 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ + convert_search_agency_responses_to_subtask_data_list, \ + convert_invalid_url_nlp_mappings_to_subtask_data_list, convert_urls_to_search_params +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.filter import \ + filter_valid_and_invalid_nlp_responses, filter_top_n_suggestions +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ + URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.preprocess import \ + preprocess_html +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class AgencyIDSubtaskInternalProcessor: + + def __init__( + self, + nlp_processor: NLPProcessor, + pdap_client: PDAPClient, + task_id: int, + ): + self._nlp_processor = nlp_processor + self._pdap_client = pdap_client + self._task_id = 
task_id + + async def process( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[AutoAgencyIDSubtaskData]: + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + + url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ + self._match_urls_to_nlp_responses(inputs) + + # Filter out valid and invalid NLP responses + nlp_response_subsets: NLPResponseSubsets = \ + filter_valid_and_invalid_nlp_responses(url_to_nlp_mappings) + + # For invalid responses, convert to subtask data with empty agencies + subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings=nlp_response_subsets.invalid, + task_id=self._task_id, + ) + subtask_data_list.extend(subtask_data_no_agency_list) + + # For valid responses, convert to search param mappings + url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ + convert_urls_to_search_params(nlp_response_subsets.valid) + + + response_mappings: list[URLToSearchResponseMapping] = \ + await self._get_pdap_info(url_to_search_params_mappings) + + subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_search_agency_responses_to_subtask_data_list( + mappings=response_mappings, + task_id=self._task_id, + ) + + filter_top_n_suggestions(subtask_data_list_agency_list) + + subtask_data_list.extend(subtask_data_list_agency_list) + + return subtask_data_list + + def _match_urls_to_nlp_responses( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[URLToNLPResponseMapping]: + url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] + for input_ in inputs: + nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) + mapping = URLToNLPResponseMapping( + url_id=input_.url_id, + nlp_response=nlp_response, + ) + url_to_nlp_mappings.append(mapping) + return url_to_nlp_mappings + + def _get_location_match( + self, + html: str + ) -> NLPLocationMatchResponse: + preprocessed_html: str = preprocess_html(html) + return self._nlp_processor.parse_for_locations(preprocessed_html) + + async def _get_pdap_info( + self, + mappings: list[URLToSearchParamsMapping] + ) -> list[URLToSearchResponseMapping]: + if len(mappings) == 0: + return [] + params: list[SearchAgencyByLocationParams] = [] + # Map request IDs to URL IDs for later use + mapper = URLRequestIDMapper() + for mapping in mappings: + for search_param in mapping.search_params: + mapper.add_mapping( + request_id=search_param.request_id, + url_id=mapping.url_id, + ) + params.append(search_param) + + url_id_to_search_responses: dict[int, list[SearchAgencyByLocationResponse]] = defaultdict(list) + + responses: list[SearchAgencyByLocationResponse] = await self._pdap_client.search_agency_by_location(params) + # Map responses to URL IDs via request IDs + for response in responses: + request_id: int = response.request_id + url_id: int = mapper.get_url_id_by_request_id(request_id) + url_id_to_search_responses[url_id].append(response) + + # Reconcile URL IDs to search responses + response_mappings: list[URLToSearchResponseMapping] = [] + for url_id, responses in url_id_to_search_responses.items(): + mapping = URLToSearchResponseMapping( + url_id=url_id, + search_responses=responses, + ) + response_mappings.append(mapping) + + return response_mappings + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py new file mode 100644 index 
00000000..12e9e048 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py @@ -0,0 +1,11 @@ + + + +class RequestCounter: + + def __init__(self): + self._counter: int = 0 + + def next(self) -> int: + self._counter += 1 + return self._counter \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py new file mode 100644 index 00000000..053f4fb5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py @@ -0,0 +1,12 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +def _extract_all_search_params( + url_to_search_params_mappings: list[URLToSearchParamsMapping] +) -> list[SearchAgencyByLocationParams]: + all_search_params: list[SearchAgencyByLocationParams] = [] + for mapping in url_to_search_params_mappings: + all_search_params.extend(mapping.search_params) + return all_search_params diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py new file mode 100644 index 00000000..ff8b2de5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py @@ -0,0 +1,59 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion + + +def filter_valid_and_invalid_nlp_responses( + mappings: list[URLToNLPResponseMapping] +) -> NLPResponseSubsets: + valid: list[URLToNLPResponseMapping] = [] + invalid: list[URLToNLPResponseMapping] = [] + for mapping in mappings: + nlp_response: NLPLocationMatchResponse = mapping.nlp_response + if nlp_response.valid: + valid.append(mapping) + else: + invalid.append(mapping) + return NLPResponseSubsets( + valid=valid, + invalid=invalid, + ) + +def filter_top_n_suggestions( + subtask_data_list: list[AutoAgencyIDSubtaskData], + n: int = 5 +) -> None: + """Filters out all but the top N suggestions for each URL. 
+
+    Modifies:
+    - AutoAgencyIDSubtaskData.suggestions
+    """
+    for subtask_data in subtask_data_list:
+        # Group suggestions by agency ID to eliminate duplicates
+        agency_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list)
+        for suggestion in subtask_data.suggestions:
+            agency_to_suggestions[suggestion.agency_id].append(suggestion)
+
+        # For each duplicated agency, keep only its highest-confidence suggestion
+        deduped_suggestions: list[AgencySuggestion] = []
+        for agency_suggestions in agency_to_suggestions.values():
+            agency_suggestions.sort(
+                key=lambda x: x.confidence,
+                reverse=True  # Descending order
+            )
+            deduped_suggestions.append(agency_suggestions[0])
+
+        # Sort suggestions by confidence and keep top N
+        suggestions_sorted: list[AgencySuggestion] = sorted(
+            deduped_suggestions,
+            key=lambda x: x.confidence,
+            reverse=True  # Descending order
+        )
+        subtask_data.suggestions = suggestions_sorted[:n]
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py
new file mode 100644
index 00000000..8192dbb6
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py
@@ -0,0 +1,10 @@
+class URLRequestIDMapper:
+
+    def __init__(self):
+        self._request_id_to_url_id_mapper: dict[int, int] = {}
+
+    def add_mapping(self, request_id: int, url_id: int) -> None:
+        self._request_id_to_url_id_mapper[request_id] = url_id
+
+    def get_url_id_by_request_id(self, request_id: int) -> int:
+        return self._request_id_to_url_id_mapper[request_id]
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py
new file mode 100644
index 00000000..7bb7e701
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py
@@ -0,0 +1,9 @@
+from pydantic import BaseModel
+
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \
+    NLPLocationMatchResponse
+
+
+class URLToNLPResponseMapping(BaseModel):
+    url_id: int
+    nlp_response: NLPLocationMatchResponse
\ No newline at end of file
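A worked sketch of the dedupe-then-truncate behavior above, using hypothetical IDs and confidences; the URLAutoAgencyIDSubtaskPydantic field values are assumptions based on how that model is constructed elsewhere in this patch.

from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.filter import \
    filter_top_n_suggestions
from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic

data = AutoAgencyIDSubtaskData(
    # Hypothetical field values, mirroring the model's use elsewhere in this patch
    pydantic_model=URLAutoAgencyIDSubtaskPydantic(
        task_id=1,
        url_id=123,
        type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH,
        agencies_found=True,
    ),
    suggestions=[
        AgencySuggestion(agency_id=7, confidence=40),
        AgencySuggestion(agency_id=7, confidence=80),  # duplicate agency ID
        AgencySuggestion(agency_id=9, confidence=60),
    ],
)
filter_top_n_suggestions([data], n=2)
# Agency 7 collapses to its highest-confidence entry, then the remaining
# suggestions are sorted descending by confidence and truncated to n.
assert [(s.agency_id, s.confidence) for s in data.suggestions] == [(7, 80), (9, 60)]

diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py
new file mode 100644
index 00000000..5ab9deac
--- /dev/null
+++ 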
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +class URLToSearchParamsMapping(BaseModel): + url_id: int + search_params: list[SearchAgencyByLocationParams] + + @property + def is_empty(self) -> bool: + return len(self.search_params) == 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py new file mode 100644 index 00000000..9a88b89d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class URLToSearchResponseMapping(BaseModel): + url_id: int + search_responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py new file mode 100644 index 00000000..22fdcf98 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping + + +class NLPResponseSubsets(BaseModel): + valid: list[URLToNLPResponseMapping] + invalid: list[URLToNLPResponseMapping] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py new file mode 100644 index 00000000..ef60e038 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py @@ -0,0 +1,9 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ + US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO + + +def is_iso_us_state(iso: str) -> bool: + return iso in US_STATE_ISO_TO_NAME + +def is_name_us_state(name: str) -> bool: + return name in US_NAME_TO_STATE_ISO \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py
new file mode 100644
index 00000000..8b9076fe
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/constants.py
@@ -0,0 +1,18 @@
+
+
+TOP_N_LOCATIONS_COUNT: int = 5
+
+INVALID_LOCATION_CHARACTERS: set[str] = {
+    "=",
+    "\\",
+    "/",
+    "\'",
+    "\""
+}
+
+# State ISOs that coincide with common English words,
+# which cannot be used in simple text scanning
+INVALID_SCAN_ISOS: set[str] = {
+    "IN",
+    "OR",
+}
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py
new file mode 100644
index 00000000..040bc466
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/convert.py
@@ -0,0 +1,27 @@
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \
+    US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \
+    USState
+
+
+def convert_us_state_iso_to_us_state(iso: str) -> USState | None:
+    name: str | None = US_STATE_ISO_TO_NAME.get(iso, None)
+
+    if name is None:
+        return None
+
+    return USState(
+        name=name,
+        iso=iso
+    )
+
+def convert_us_state_name_to_us_state(name: str) -> USState | None:
+    iso: str | None = US_NAME_TO_STATE_ISO.get(name, None)
+
+    if iso is None:
+        return None
+
+    return USState(
+        name=name,
+        iso=iso
+    )
\ No newline at end of file
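A minimal usage sketch of the check/convert helpers, with hypothetical values; both lookup directions normalize to the same frozen USState model.

from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \
    is_iso_us_state, is_name_us_state
from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \
    convert_us_state_iso_to_us_state, convert_us_state_name_to_us_state

assert is_iso_us_state("PA") and is_name_us_state("Pennsylvania")
by_iso = convert_us_state_iso_to_us_state("PA")
by_name = convert_us_state_name_to_us_state("Pennsylvania")
# USState is frozen, so equal instances also hash identically,
# which the Counter[USState] tallying in the NLP processor relies on.
assert by_iso == by_name

diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py
new file mode 100644
index 00000000..8e723aa6
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py
@@ -0,0 +1,88 @@
+from collections import Counter
+
+import spacy
+from spacy import Language
+from spacy.tokens import Doc
+
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \
+    is_name_us_state, is_iso_us_state
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \
+    INVALID_LOCATION_CHARACTERS, INVALID_SCAN_ISOS
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \
+    convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \
+    SpacyModelType
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.extract import \
+    extract_most_common_us_state, extract_top_n_locations
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \
+    NLPLocationMatchResponse
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state 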
import \ + USState + + +class NLPProcessor: + + def __init__( + self, + model_type: SpacyModelType = SpacyModelType.EN_CORE_WEB_SM + ): + self._model_type: SpacyModelType = model_type + self._model: Language | None = None + + def lazy_load_model(self) -> Language: + if self._model is None: + self._model = spacy.load(self._model_type.value, disable=['parser']) + return self._model + + + def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: + model: Language = self.lazy_load_model() + doc: Doc = model(html) + us_state_counter: Counter[USState] = Counter() + location_counter: Counter[str] = Counter() + + # Scan over tokens + for token in doc: + upper_token: str = token.text.upper() + # Disregard certain ISOs that align with common words + if upper_token in INVALID_SCAN_ISOS: + continue + if not is_iso_us_state(upper_token): + continue + + us_state: USState | None = convert_us_state_iso_to_us_state(upper_token) + if us_state is not None: + us_state_counter[us_state] += 1 + + + # Scan over entities using spacy + for ent in doc.ents: + if ent.label_ != "GPE": # Geopolitical Entity + continue + text: str = ent.text + if any(char in text for char in INVALID_LOCATION_CHARACTERS): + continue + if is_name_us_state(text): + us_state: USState | None = convert_us_state_name_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + if is_iso_us_state(text): + us_state: USState | None = convert_us_state_iso_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + location_counter[text] += 1 + + # Get most common US State if exists + most_common_us_state: USState | None = extract_most_common_us_state(us_state_counter) + + top_n_locations: list[str] = extract_top_n_locations(location_counter) + + return NLPLocationMatchResponse( + us_state=most_common_us_state, + locations=top_n_locations + ) + + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py new file mode 100644 index 00000000..9d1b987b --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class SpacyModelType(Enum): + EN_CORE_WEB_SM = "en_core_web_sm" + EN_CORE_WEB_LG = "en_core_web_lg" + EN_CORE_WEB_MD = "en_core_web_md" + EN_CORE_WEB_TRF = "en_core_web_trf" \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py new file mode 100644 index 00000000..ea732ef0 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/extract.py @@ -0,0 +1,25 @@ +from collections import Counter + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ + TOP_N_LOCATIONS_COUNT +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + + +def extract_most_common_us_state( + us_state_counter: Counter[USState] +) -> USState | None: + try: + return us_state_counter.most_common(1)[0][0] + except IndexError: + return None + +def extract_top_n_locations( + location_counter: Counter[str] +) 
-> list[str]: + top_n_locations_raw: list[tuple[str, int]] = \ + location_counter.most_common(TOP_N_LOCATIONS_COUNT) + top_n_locations: list[str] = [] + for location, _ in top_n_locations_raw: + top_n_locations.append(location) + return top_n_locations \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py new file mode 100644 index 00000000..03417480 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py @@ -0,0 +1,59 @@ + + +US_STATE_ISO_TO_NAME: dict[str, str] = { + 'AL': 'Alabama', + 'AK': 'Alaska', + 'AZ': 'Arizona', + 'AR': 'Arkansas', + 'CA': 'California', + 'CO': 'Colorado', + 'CT': 'Connecticut', + 'DE': 'Delaware', + 'FL': 'Florida', + 'GA': 'Georgia', + 'HI': 'Hawaii', + 'ID': 'Idaho', + 'IL': 'Illinois', + 'IN': 'Indiana', + 'IA': 'Iowa', + 'KS': 'Kansas', + 'KY': 'Kentucky', + 'LA': 'Louisiana', + 'ME': 'Maine', + 'MD': 'Maryland', + 'MA': 'Massachusetts', + 'MI': 'Michigan', + 'MN': 'Minnesota', + 'MS': 'Mississippi', + 'MO': 'Missouri', + 'MT': 'Montana', + 'NE': 'Nebraska', + 'NV': 'Nevada', + 'NH': 'New Hampshire', + 'NJ': 'New Jersey', + 'NM': 'New Mexico', + 'NY': 'New York', + 'NC': 'North Carolina', + 'ND': 'North Dakota', + 'OH': 'Ohio', + 'OK': 'Oklahoma', + 'OR': 'Oregon', + 'PA': 'Pennsylvania', + 'RI': 'Rhode Island', + 'SC': 'South Carolina', + 'SD': 'South Dakota', + 'TN': 'Tennessee', + 'TX': 'Texas', + 'UT': 'Utah', + 'VT': 'Vermont', + 'VA': 'Virginia', + 'WA': 'Washington', + 'WV': 'West Virginia', + 'WI': 'Wisconsin', + 'WY': 'Wyoming', + 'DC': 'District of Columbia', +} + +US_NAME_TO_STATE_ISO: dict[str, str] = { + name: iso for iso, name in US_STATE_ISO_TO_NAME.items() +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py new file mode 100644 index 00000000..79378612 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationMatchParams(BaseModel): + url_id: int + html: str \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py new file mode 100644 index 00000000..387e32de --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/response.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + + +class NLPLocationMatchResponse(BaseModel): + locations: list[str] + us_state: USState | None + + 
@property + def valid(self) -> bool: + # Valid responses must have a US State and at least one location + if self.us_state is None: + return False + if len(self.locations) == 0: + return False + return True diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py new file mode 100644 index 00000000..0b29771f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/us_state.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel, ConfigDict + + +class USState(BaseModel): + model_config = ConfigDict(frozen=True) + + name: str + iso: str diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py new file mode 100644 index 00000000..da20f4f4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py @@ -0,0 +1,20 @@ +import re + +import unicodedata +from bs4 import BeautifulSoup + + +def preprocess_html(raw_html: str) -> str: + """Preprocess HTML to extract text content.""" + soup = BeautifulSoup(raw_html, 'lxml') + + # Remove scripts, styles, and other non-textual elements + for tag in soup(['script','style','noscript','iframe','canvas','svg','header','footer','nav','aside']): + tag.decompose() + # Extract text + text = soup.get_text(separator=' ') + # Normalize text and collapse whitespace + text = unicodedata.normalize('NFKC', text) + text = re.sub(r'[ \t\u00A0]+', ' ', text) + text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) + return text.strip() \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py new file mode 100644 index 00000000..32311bd1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py @@ -0,0 +1,49 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + NUMBER_OF_ENTRIES_PER_ITERATION +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.queries.base.builder import QueryBuilderBase +from src.db.utils.compression import decompress_html + + +class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[NLPLocationMatchSubtaskInput]: + container = EligibleContainer() + query = ( + select( + container.url_id, + URLCompressedHTML.compressed_html + ) + .join( + URLCompressedHTML, + URLCompressedHTML.url_id == container.url_id, + ) + .where( + container.nlp_location, + ) + .limit(NUMBER_OF_ENTRIES_PER_ITERATION) + ) + + mappings: 
Sequence[RowMapping] = await sh.mappings(session, query=query) + inputs: list[NLPLocationMatchSubtaskInput] = [ + NLPLocationMatchSubtaskInput( + url_id=mapping["id"], + html=decompress_html(mapping["compressed_html"]), + ) + for mapping in mappings + ] + return inputs + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py deleted file mode 100644 index 7ffd57bc..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing_extensions import override, final - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase - -@final -class UnknownAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - """A subtask that returns an unknown suggestion. - - Used in cases where the agency cannot be reliably inferred from the source. - """ - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - pdap_agency_id=None, - agency_name=None, - state=None, - county=None, - locality=None - ) - ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 6ef84149..5dab9608 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,10 +1,16 @@ -from src.collectors.enums import CollectorType from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ - MuckrockAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.core import \ + HomepageMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import \ + MuckrockAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ + NLPLocationMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient @@ -14,35 +20,58 @@ class AgencyIdentificationSubtaskLoader: def __init__( self, pdap_client: PDAPClient, - muckrock_api_interface: MuckrockAPIInterface + 
muckrock_api_interface: MuckrockAPIInterface, + adb_client: AsyncDatabaseClient, + nlp_processor: NLPProcessor ): - self.pdap_client = pdap_client - self.muckrock_api_interface = muckrock_api_interface + self._pdap_client = pdap_client + self._muckrock_api_interface = muckrock_api_interface + self._nlp_processor = nlp_processor + self.adb_client = adb_client - async def _load_muckrock_subtask(self) -> MuckrockAgencyIdentificationSubtask: - return MuckrockAgencyIdentificationSubtask( - muckrock_api_interface=self.muckrock_api_interface, - pdap_client=self.pdap_client + def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: + return MuckrockAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + muckrock_api_interface=self._muckrock_api_interface, + pdap_client=self._pdap_client ) - async def _load_ckan_subtask(self) -> CKANAgencyIdentificationSubtask: - return CKANAgencyIdentificationSubtask( - pdap_client=self.pdap_client + def _load_ckan_subtask(self, task_id: int) -> CKANAgencyIDSubtaskOperator: + return CKANAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + pdap_client=self._pdap_client ) - async def load_subtask(self, collector_type: CollectorType) -> AgencyIdentificationSubtaskBase: + def _load_homepage_match_subtask(self, task_id: int) -> HomepageMatchSubtaskOperator: + return HomepageMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + ) + + def _load_nlp_location_match_subtask(self, task_id: int) -> NLPLocationMatchSubtaskOperator: + return NLPLocationMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + pdap_client=self._pdap_client, + processor=self._nlp_processor + ) + + + async def load_subtask( + self, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int + ) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" - match collector_type: - case CollectorType.MUCKROCK_SIMPLE_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_COUNTY_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_ALL_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.AUTO_GOOGLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.COMMON_CRAWLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.CKAN: - return await self._load_ckan_subtask() - return UnknownAgencyIdentificationSubtask() \ No newline at end of file + match subtask_type: + case AutoAgencyIDSubtaskType.MUCKROCK: + return self._load_muckrock_subtask(task_id) + case AutoAgencyIDSubtaskType.CKAN: + return self._load_ckan_subtask(task_id) + case AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: + return self._load_nlp_location_match_subtask(task_id) + case AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: + return self._load_homepage_match_subtask(task_id) + raise ValueError(f"Unknown subtask type: {subtask_type}") diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py new file mode 100644 index 00000000..524830e3 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel + + +class 
AgencyIDSubtaskRunInfo(BaseModel):
+    error: str | None = None
+    linked_url_ids: list[int] | None = None
+
+    @property
+    def is_success(self) -> bool:
+        return self.error is None
+
+    @property
+    def has_linked_urls(self) -> bool:
+        return self.linked_url_ids is not None and len(self.linked_url_ids) > 0
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py
new file mode 100644
index 00000000..7da0a8f5
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/subtask.py
@@ -0,0 +1,18 @@
+from pydantic import BaseModel
+
+from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
+from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
+
+
+class AutoAgencyIDSubtaskData(BaseModel):
+    pydantic_model: URLAutoAgencyIDSubtaskPydantic
+    suggestions: list[AgencySuggestion]
+    error: str | None = None
+
+    @property
+    def has_error(self) -> bool:
+        return self.error is not None
+
+    @property
+    def url_id(self) -> int:
+        return self.pydantic_model.url_id
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py
new file mode 100644
index 00000000..669c498c
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/suggestion.py
@@ -0,0 +1,6 @@
+from pydantic import BaseModel, Field
+
+
+class AgencySuggestion(BaseModel):
+    agency_id: int
+    confidence: int = Field(ge=0, le=100)
\ No newline at end of file
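A quick sketch of how these result models behave, with hypothetical values; note that guarding linked_url_ids against None keeps has_linked_urls from raising a TypeError on the default.

from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo

ok = AgencyIDSubtaskRunInfo(linked_url_ids=[1, 2])
failed = AgencyIDSubtaskRunInfo(error="lookup failed")  # hypothetical error text
assert ok.is_success and ok.has_linked_urls
# linked_url_ids defaults to None, so the None guard makes this False
# instead of raising len(None) errors.
assert not failed.is_success and not failed.has_linked_urls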
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py
new file mode 100644
index 00000000..749332e6
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py
@@ -0,0 +1,14 @@
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+# Determines priority of subtasks, all else being equal;
+# earlier entries take precedence.
+SUBTASK_HIERARCHY: list[AutoAgencyIDSubtaskType] = [
+    AutoAgencyIDSubtaskType.CKAN,
+    AutoAgencyIDSubtaskType.MUCKROCK,
+    AutoAgencyIDSubtaskType.HOMEPAGE_MATCH,
+    AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH
+]
+
+SUBTASK_HIERARCHY_MAPPING: dict[AutoAgencyIDSubtaskType, int] = {
+    subtask: idx
+    for idx, subtask in enumerate(SUBTASK_HIERARCHY)
+}
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py
new file mode 100644
index 00000000..2b81d2de
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py
@@ -0,0 +1,77 @@
+from collections import Counter
+
+from sqlalchemy import RowMapping
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.constants import SUBTASK_HIERARCHY, \
+    SUBTASK_HIERARCHY_MAPPING
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.eligible_counts import \
+    ELIGIBLE_COUNTS_QUERY
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+from src.db.queries.base.builder import QueryBuilderBase
+
+from src.db.helpers.session import session_helper as sh
+
+class AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase):
+    """
+    Survey applicable URLs to determine the next subtask to run.
+
+    URLs are "inapplicable" if they have any of the following properties:
+    - Are validated via the FlagURLValidated model
+    - Have at least one agency-suggestion annotation with confidence >= 95
+    - Have all possible subtasks completed
+
+    Returns the single highest-priority subtask among those
+    with the most applicable URLs
+    (or None if no subtask has any applicable URLs)
+    """
+
+    def __init__(
+        self,
+        allowed_subtasks: list[AutoAgencyIDSubtaskType]
+    ):
+        super().__init__()
+        self._allowed_subtasks = allowed_subtasks
+
+    async def run(self, session: AsyncSession) -> AutoAgencyIDSubtaskType | None:
+        results: RowMapping = await sh.mapping(session, ELIGIBLE_COUNTS_QUERY)
+        counts: Counter[str] = Counter(results)
+
+        allowed_counts: Counter[str] = await self._filter_allowed_counts(counts)
+        if len(allowed_counts) == 0:
+            return None
+        max_count: int = max(allowed_counts.values())
+        if max_count == 0:
+            return None
+        subtasks_with_max_count: list[str] = [
+            subtask for subtask, count in allowed_counts.items()
+            if count == max_count
+        ]
+        subtasks_as_enum_list: list[AutoAgencyIDSubtaskType] = [
+            AutoAgencyIDSubtaskType(subtask)
+            for subtask in subtasks_with_max_count
+        ]
+        # Sort subtasks by priority (a lower hierarchy index means higher priority)
+        sorted_subtasks: list[AutoAgencyIDSubtaskType] = sorted(
+            subtasks_as_enum_list,
+            key=lambda subtask: SUBTASK_HIERARCHY_MAPPING[subtask],
+        )
+        # Return the highest priority subtask
+        return sorted_subtasks[0]
+
+    async def _filter_allowed_counts(self, counts: Counter[str]) -> Counter[str]:
+        return Counter(
+            {
+                subtask: count
+                for subtask, count in counts.items()
+                if AutoAgencyIDSubtaskType(subtask) in self._allowed_subtasks
+            }
+        )
+
+
+
+
+
+
+
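To make the tie-break concrete, a worked sketch keyed directly by enum member for brevity (the builder itself works over the column-label strings returned by ELIGIBLE_COUNTS_QUERY); the counts are hypothetical.

from collections import Counter

from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.constants import \
    SUBTASK_HIERARCHY_MAPPING
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType

# Hypothetical eligible-URL counts: CKAN and MuckRock tie for the max.
counts = Counter({
    AutoAgencyIDSubtaskType.CKAN: 40,
    AutoAgencyIDSubtaskType.MUCKROCK: 40,
    AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: 12,
})
max_count = max(counts.values())
tied = [subtask for subtask, count in counts.items() if count == max_count]
# A lower hierarchy index means higher priority, so the minimum index wins the tie.
winner = min(tied, key=lambda subtask: SUBTASK_HIERARCHY_MAPPING[subtask])
assert winner == AutoAgencyIDSubtaskType.CKAN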
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/README.md b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/README.md
new file mode 100644
index 00000000..38324fa7
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/README.md
@@ -0,0 +1,3 @@
+Contains CTEs for determining validity for each subtask.
+
+Each file corresponds to the validity CTE for that subtask.
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py
new file mode 100644
index 00000000..5be64fbc
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/eligible.py
@@ -0,0 +1,57 @@
+from sqlalchemy import select, CTE, Column
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.high_confidence_annotations import \
+    HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.validated import \
+    VALIDATED_EXISTS_CONTAINER
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.ckan import \
+    CKAN_SUBTASK_CONTAINER
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.homepage import \
+    HOMEPAGE_SUBTASK_CONTAINER
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.muckrock import \
+    MUCKROCK_SUBTASK_CONTAINER
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.nlp_location import \
+    NLP_LOCATION_CONTAINER
+from src.db.models.impl.url.core.sqlalchemy import URL
+
+class EligibleContainer:
+
+    def __init__(self):
+        self._cte = (
+            select(
+                URL.id,
+                CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"),
+                MUCKROCK_SUBTASK_CONTAINER.eligible_query.label("muckrock"),
+                HOMEPAGE_SUBTASK_CONTAINER.eligible_query.label("homepage"),
+                NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"),
+            )
+            .where(
+                HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query,
+                VALIDATED_EXISTS_CONTAINER.not_exists_query,
+            )
+            .cte("eligible")
+        )
+
+    @property
+    def cte(self) -> CTE:
+        return self._cte
+
+    @property
+    def url_id(self) -> Column[int]:
+        return self._cte.c['id']
+
+    @property
+    def ckan(self) -> Column[bool]:
+        return self._cte.c['ckan']
+
+    @property
+    def muckrock(self) -> Column[bool]:
+        return self._cte.c['muckrock']
+
+    @property
+    def homepage(self) -> Column[bool]:
+        return self._cte.c['homepage']
+
+    @property
+    def nlp_location(self) -> Column[bool]:
+        return self._cte.c['nlp_location']
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py
new file mode 100644
index 00000000..d59c508c
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py
@@ -0,0 +1,33 @@
+from sqlalchemy import CTE, Column, ColumnElement, exists
+
+from src.db.models.impl.url.core.sqlalchemy import URL
+
+
+class ExistsCTEContainer:
+    """
+    Base class for CTEs that determine validity for each subtask.
+
+    Single-column CTEs consumed via correlated NOT EXISTS subqueries:
+    a URL passes only if the CTE contains no row for it.
+    """
+
+    def __init__(
+        self,
+        cte: CTE,
+    ) -> None:
+        self._cte = cte
+
+    @property
+    def cte(self) -> CTE:
+        return self._cte
+
+    @property
+    def url_id(self) -> Column[int]:
+        return self.cte.columns[0]
+
+    @property
+    def not_exists_query(self) -> ColumnElement[bool]:
+        return (
+            ~exists()
+            .where(self.url_id == URL.id)
+        )
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py
new file mode 100644
index 00000000..3ac0ced7
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/high_confidence_annotations.py
@@ -0,0 +1,29 @@
+from sqlalchemy import select
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \
+    ExistsCTEContainer
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask
+from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion
+
+cte = (
+    select(
+        URL.id
+    )
+    .join(
+        URLAutoAgencyIDSubtask,
+        URLAutoAgencyIDSubtask.url_id == URL.id,
+    )
+    .join(
+        AgencyIDSubtaskSuggestion,
+        AgencyIDSubtaskSuggestion.subtask_id == URLAutoAgencyIDSubtask.id,
+    )
+    .where(
+        AgencyIDSubtaskSuggestion.confidence >= 95,
+    )
+    .cte("high_confidence_annotations_exists")
+)
+
+HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER = ExistsCTEContainer(
+    cte,
+)
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py
new file mode 100644
index 00000000..f515c1d1
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py
@@ -0,0 +1,16 @@
+from sqlalchemy import select
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \
+    ExistsCTEContainer
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+
+cte = (
+    select(
+        FlagURLValidated.url_id
+    )
+    .cte("validated_exists")
+)
+
+VALIDATED_EXISTS_CONTAINER = ExistsCTEContainer(
+    cte,
+)
\ No newline at end of file
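
Both exclusion rules from the survey docstring reduce to this container's NOT EXISTS predicate. A minimal sketch of the composition, using only names defined in this diff:

```python
from sqlalchemy import select

from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.impl.validated import (
    VALIDATED_EXISTS_CONTAINER,
)
from src.db.models.impl.url.core.sqlalchemy import URL

# Selects URLs with no flag_url_validated row. not_exists_query renders roughly as:
#   NOT EXISTS (SELECT * FROM validated_exists WHERE validated_exists.url_id = urls.id)
unvalidated_urls = select(URL.id).where(VALIDATED_EXISTS_CONTAINER.not_exists_query)
```
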
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py
new file mode 100644
index 00000000..9782e4fd
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py
@@ -0,0 +1,40 @@
+from sqlalchemy import CTE, Column, ColumnElement, exists
+
+from src.db.models.impl.url.core.sqlalchemy import URL
+
+
+class SubtaskCTEContainer:
+    """
+    CTE for URLs eligible for a given subtask.
+    `eligible_query` renders a correlated EXISTS: a URL is eligible
+    when a CTE row exists for it and `subtask_entry_exists` is false,
+    i.e. no subtask entry has been recorded for the URL yet.
+    """
+
+    def __init__(
+        self,
+        cte: CTE,
+    ) -> None:
+        self._cte = cte
+
+    @property
+    def cte(self) -> CTE:
+        return self._cte
+
+    @property
+    def entry_exists(self) -> ColumnElement[bool]:
+        return self.cte.c['subtask_entry_exists']
+
+    @property
+    def url_id(self) -> Column[int]:
+        return self.cte.c['id']
+
+    @property
+    def eligible_query(self) -> ColumnElement[bool]:
+        return (
+            exists()
+            .where(
+                self.url_id == URL.id,
+                self.entry_exists.is_(False),
+            )
+        )
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py
new file mode 100644
index 00000000..b06442ea
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py
@@ -0,0 +1,18 @@
+from sqlalchemy import ColumnElement, exists
+
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask
+
+
+def get_exists_subtask_query(
+    subtask_type: AutoAgencyIDSubtaskType,
+) -> ColumnElement[bool]:
+    return (
+        exists()
+        .where(
+            URLAutoAgencyIDSubtask.url_id == URL.id,
+            URLAutoAgencyIDSubtask.type == subtask_type,
+        )
+        .label("subtask_entry_exists")
+    )
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py
new file mode 100644
index 00000000..b1b70cdb
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py
@@ -0,0 +1,37 @@
+from sqlalchemy import select
+
+from src.collectors.enums import CollectorType
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \
+    get_exists_subtask_query
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \
+    SubtaskCTEContainer
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+cte = (
+    select(
+        URL.id,
+        get_exists_subtask_query(
+            AutoAgencyIDSubtaskType.CKAN,
+        ),
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.url_id == URL.id,
+    )
+    .join(
+        Batch,
+        Batch.id == LinkBatchURL.batch_id,
+    )
+    .where(
+        Batch.strategy == CollectorType.CKAN.value,
+    )
+    .cte("ckan_eligible")
+)
+
+CKAN_SUBTASK_CONTAINER = SubtaskCTEContainer(
+    cte,
+)
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py
new file mode 100644
index 00000000..4d75b4e0
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py
@@ -0,0 +1,34 @@
+from sqlalchemy import select, exists
+
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \
+    CONSOLIDATED_CTE
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \
+    SubtaskCTEContainer
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \
+    get_exists_subtask_query
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+VALID_URL_FLAG = (
+    exists()
+    .where(
+        URL.id == CONSOLIDATED_CTE.c.url_id,
+    )
+)
+
+cte = (
+    select(
+        URL.id,
+        get_exists_subtask_query(
+            AutoAgencyIDSubtaskType.HOMEPAGE_MATCH,
+        )
+    )
+    .where(
+        VALID_URL_FLAG,
+    )
+    .cte("homepage_eligible")
+)
+
+HOMEPAGE_SUBTASK_CONTAINER = SubtaskCTEContainer(
+    cte,
+)
\ No newline at end of file
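
To sanity-check what `eligible_query` renders for one of these containers, a hedged sketch that compiles it against the PostgreSQL dialect; the printed SQL's shape, not its exact text, is the point:

```python
from sqlalchemy import select
from sqlalchemy.dialects import postgresql

from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.ckan import (
    CKAN_SUBTASK_CONTAINER,
)
from src.db.models.impl.url.core.sqlalchemy import URL

# Roughly: EXISTS (SELECT * FROM ckan_eligible
#                  WHERE ckan_eligible.id = urls.id
#                  AND ckan_eligible.subtask_entry_exists IS false)
query = select(URL.id, CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"))
print(query.compile(dialect=postgresql.dialect()))
```
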
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py
new file mode 100644
index 00000000..1f059e86
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py
@@ -0,0 +1,40 @@
+from sqlalchemy import select
+
+from src.collectors.enums import CollectorType
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \
+    SubtaskCTEContainer
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \
+    get_exists_subtask_query
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+cte = (
+    select(
+        URL.id,
+        get_exists_subtask_query(
+            AutoAgencyIDSubtaskType.MUCKROCK
+        )
+    )
+    .join(
+        LinkBatchURL,
+        LinkBatchURL.url_id == URL.id,
+    )
+    .join(
+        Batch,
+        Batch.id == LinkBatchURL.batch_id,
+    )
+    .where(
+        Batch.strategy.in_(
+            (CollectorType.MUCKROCK_ALL_SEARCH.value,
+             CollectorType.MUCKROCK_COUNTY_SEARCH.value,
+             CollectorType.MUCKROCK_SIMPLE_SEARCH.value,)
+        ),
+    )
+    .cte("muckrock_eligible")
+)
+
+MUCKROCK_SUBTASK_CONTAINER = SubtaskCTEContainer(
+    cte,
+)
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py
new file mode 100644
index 00000000..40533809
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py
@@ -0,0 +1,26 @@
+from sqlalchemy import select
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \
+    get_exists_subtask_query
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \
+    SubtaskCTEContainer
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+cte = (
+    select(
+        URL.id,
+        get_exists_subtask_query(
+            AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH
+        )
+    )
+    .join(
+        URLCompressedHTML
+    )
+    .cte("nlp_location_eligible")
+)
+
+NLP_LOCATION_CONTAINER = SubtaskCTEContainer(
+    cte,
+)
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py
new file mode 100644
index 00000000..96a322cb
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py
@@ -0,0 +1,25 @@
+from sqlalchemy import select, ColumnElement, Integer, func
+
+from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \
+    EligibleContainer
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+
+
+def sum_count(col: ColumnElement[bool], subtask_type: AutoAgencyIDSubtaskType) -> ColumnElement[int]:
+    return func.coalesce(
+        func.sum(
+            col.cast(Integer)
+        ),
+        0,
+    ).label(subtask_type.value)
+
+container = EligibleContainer()
+
+ELIGIBLE_COUNTS_QUERY = (
+    select(
+        sum_count(container.ckan, AutoAgencyIDSubtaskType.CKAN),
+        sum_count(container.muckrock, AutoAgencyIDSubtaskType.MUCKROCK),
+        sum_count(container.homepage, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH),
+        sum_count(container.nlp_location, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH),
+    )
+)
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py
new file mode 100644
index 00000000..02ae76a4
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py
@@ -0,0 +1,5 @@
+from pydantic import BaseModel
+
+
+class AgencyIDSubtaskOutputBase(BaseModel):
+    pass
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py
new file mode 100644
index 00000000..b366747f
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py
@@ -0,0 +1,26 @@
+from abc import ABC, abstractmethod
+
+from src.core.tasks.url.operators.agency_identification.subtasks.templates.output import AgencyIDSubtaskOutputBase
+from src.db.client.async_ import AsyncDatabaseClient
+
+
+class SubtaskPostprocessorBase(ABC):
+    """
+    An optional hook class which takes
+    the output of a subtask, along with the subtask id,
+    and adds additional information to the database.
+    """
+
+    def __init__(
+        self,
+        subtask_id: int,
+        subtask_output: AgencyIDSubtaskOutputBase,
+        adb_client: AsyncDatabaseClient
+    ):
+        self.subtask_id = subtask_id
+        self.subtask_output = subtask_output
+        self.adb_client = adb_client
+
+    @abstractmethod
+    async def run(self) -> None:
+        raise NotImplementedError
\ No newline at end of file
diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py
new file mode 100644
index 00000000..4085b6dd
--- /dev/null
+++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py
@@ -0,0 +1,82 @@
+import abc
+import traceback
+from abc import ABC
+
+from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo
+from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
+from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
+from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic
+
+
+class AgencyIDSubtaskOperatorBase(ABC):
+
+    def __init__(
+        self,
+        adb_client: AsyncDatabaseClient,
+        task_id: int
+    ) -> None:
+        self.adb_client: AsyncDatabaseClient = adb_client
+        self.task_id: int = task_id
+        self.linked_urls: list[int] = []
+
+    async def run(self) -> AgencyIDSubtaskRunInfo:
+        try:
+            await self.inner_logic()
+        except Exception as e:
+            # Get stack trace
+            stack_trace: str = traceback.format_exc()
+            return AgencyIDSubtaskRunInfo(
+                error=f"{type(e).__name__}: {str(e)}: {stack_trace}",
+                linked_url_ids=self.linked_urls
+            )
+        return AgencyIDSubtaskRunInfo(
+            linked_url_ids=self.linked_urls
+        )
+
+    @abc.abstractmethod
+    async def inner_logic(self) -> None:
+        raise NotImplementedError
+
+    async def _upload_subtask_data(
+        self,
+        subtask_data_list: list[AutoAgencyIDSubtaskData]
+    ) -> None:
+
+        subtask_models: list[URLAutoAgencyIDSubtaskPydantic] = [
+            subtask_data.pydantic_model
+            for subtask_data in subtask_data_list
+        ]
+        subtask_ids: list[int] = await self.adb_client.bulk_insert(
+            models=subtask_models,
+            return_ids=True
+        )
+        suggestions: list[AgencyIDSubtaskSuggestionPydantic] = []
+        for subtask_id, subtask_info in zip(subtask_ids, subtask_data_list):
+            for suggestion in subtask_info.suggestions:
+                suggestion_pydantic = AgencyIDSubtaskSuggestionPydantic(
+                    subtask_id=subtask_id,
+                    agency_id=suggestion.agency_id,
+                    confidence=suggestion.confidence,
+                )
+                suggestions.append(suggestion_pydantic)
+
+        await self.adb_client.bulk_insert(
+            models=suggestions,
+        )
+
+        error_infos: list[URLErrorPydanticInfo] = []
+        for subtask_info in subtask_data_list:
+            if not subtask_info.has_error:
+                continue
+            error_info = URLErrorPydanticInfo(
+                url_id=subtask_info.url_id,
+                error=subtask_info.error,
+                task_id=self.task_id,
+            )
+            error_infos.append(error_info)
+
+        await self.adb_client.bulk_insert(
+            models=error_infos,
+        )
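
A hedged sketch of a concrete operator built on this template. The class, URL ids, and confidence are illustrative, and the field set of `URLAutoAgencyIDSubtaskPydantic` is assumed from its usage here (a `url_id` plus a subtask `type`); any exception raised in `inner_logic` is caught by `run` and surfaced as the `AgencyIDSubtaskRunInfo` error:

```python
from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData
from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion
from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase
from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic


class ExampleHomepageMatchOperator(AgencyIDSubtaskOperatorBase):
    """Hypothetical subtask: one low-confidence suggestion per URL."""

    async def inner_logic(self) -> None:
        url_ids = [101, 102]  # would come from the subtask's eligibility query
        subtask_data = [
            AutoAgencyIDSubtaskData(
                pydantic_model=URLAutoAgencyIDSubtaskPydantic(
                    url_id=url_id,
                    type=AutoAgencyIDSubtaskType.HOMEPAGE_MATCH,  # field names assumed
                ),
                suggestions=[AgencySuggestion(agency_id=1, confidence=40)],
            )
            for url_id in url_ids
        ]
        # Persists subtask rows, their suggestions, and any per-URL errors.
        await self._upload_subtask_data(subtask_data)
        self.linked_urls.extend(url_ids)
```
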
diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py
index b3ba90ec..384cb5c4 100644
--- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py
+++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py
@@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[URLRelevantTDO]:
             .join(URLCompressedHTML)
             .outerjoin(AutoRelevantSuggestion)
             .where(
-                URL.status == URLStatus.PENDING.value,
+                URL.status == URLStatus.OK.value,
                 AutoRelevantSuggestion.id.is_(None),
             )
         )
diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py
index 6c22c731..19b32b5d 100644
--- a/src/core/tasks/url/operators/submit_approved/queries/get.py
+++ b/src/core/tasks/url/operators/submit_approved/queries/get.py
@@ -4,6 +4,8 @@
 
 from src.collectors.enums import URLStatus
 from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from src.db.models.impl.url.core.sqlalchemy import URL
 from src.db.queries.base.builder import QueryBuilderBase
 from src.db.helpers.session import session_helper as sh
@@ -29,7 +31,8 @@ async def _process_results(self, urls):
     async def _build_query():
         query = (
             select(URL)
-            .where(URL.status == URLStatus.VALIDATED.value)
+            .join(FlagURLValidated, FlagURLValidated.url_id == URL.id)
+            .where(FlagURLValidated.type == URLValidatedType.DATA_SOURCE)
             .options(
                 selectinload(URL.optional_data_source_metadata),
                 selectinload(URL.confirmed_agencies),
diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py
index abd94d20..5a3ff464 100644
--- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py
+++ b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py
@@ -2,6 +2,8 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from src.collectors.enums import URLStatus
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from src.db.models.impl.url.core.sqlalchemy import URL
 from src.db.queries.base.builder import QueryBuilderBase
 
@@ -11,7 +13,13 @@ class HasValidatedURLsQueryBuilder(QueryBuilderBase):
     async def run(self, session: AsyncSession) -> bool:
         query = (
             select(URL)
-            .where(URL.status == URLStatus.VALIDATED.value)
+            .join(
+                FlagURLValidated,
+                FlagURLValidated.url_id == URL.id
+            )
+            .where(
+                FlagURLValidated.type == URLValidatedType.DATA_SOURCE
+            )
         )
         urls = await session.execute(query)
         urls = urls.scalars().all()
diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py
index d2563335..4ebfef56 100644
--- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py
+++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py
@@ -19,14 +19,6 @@ async def run(self, session: AsyncSession):
             url_id = info.url_id
             data_source_id = info.data_source_id
 
-            query = (
-                update(URL)
-                .where(URL.id == url_id)
-                .values(
-                    status=URLStatus.SUBMITTED.value
-                )
-            )
-
             url_data_source_object = URLDataSource(
                 url_id=url_id,
                 data_source_id=data_source_id
@@ -35,4 +27,3 @@ async def run(self, session: AsyncSession):
             url_data_source_object.created_at = info.submitted_at
 
             session.add(url_data_source_object)
-            await session.execute(query)
\ No newline at end of file
diff --git a/src/db/client/async_.py b/src/db/client/async_.py
index 3b994f86..19cbc3f5 100644
--- a/src/db/client/async_.py
+++ b/src/db/client/async_.py
@@ -3,7 +3,7 @@
 from operator import or_
 from typing import Optional, Type, Any, List, Sequence
 
-from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row
+from sqlalchemy import select, exists, func, Select, and_, update, delete, Row
 from sqlalchemy.dialects.postgresql import insert as pg_insert
 from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
 from sqlalchemy.orm import selectinload, QueryableAttribute
@@ -26,21 +26,23 @@
 from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO
 from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO
 from src.api.endpoints.collector.manual.query import UploadManualBatchQueryBuilder
+from src.api.endpoints.metrics.backlog.query import GetBacklogMetricsQueryBuilder
 from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO
-from src.api.endpoints.metrics.batches.aggregated.query import GetBatchesAggregatedMetricsQueryBuilder
+from src.api.endpoints.metrics.batches.aggregated.query.core import GetBatchesAggregatedMetricsQueryBuilder
 from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO
 from src.api.endpoints.metrics.batches.breakdown.query import GetBatchesBreakdownMetricsQueryBuilder
-from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO
+from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO
 from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO
-from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO, \
-    GetMetricsURLsBreakdownPendingResponseInnerDTO
+from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO
 from src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO, \
     GetMetricsURLsBreakdownSubmittedInnerDTO
+from src.api.endpoints.metrics.urls.aggregated.query.core import GetURLsAggregatedMetricsQueryBuilder
+from src.api.endpoints.metrics.urls.breakdown.query.core import GetURLsBreakdownPendingMetricsQueryBuilder
 from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo
 from src.api.endpoints.review.approve.query_.core import ApproveURLQueryBuilder
 from src.api.endpoints.review.enums import RejectionReason
+from src.api.endpoints.review.next.core import GetNextURLForFinalReviewQueryBuilder
 from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse
-from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder
 from src.api.endpoints.review.reject.query import RejectURLQueryBuilder
 from src.api.endpoints.search.dtos.response import SearchURLResponse
 from src.api.endpoints.task.by_id.dto import TaskInfo
@@ -50,19 +52,14 @@
 from src.api.endpoints.url.get.query import GetURLsQueryBuilder
 from src.collectors.enums import URLStatus, CollectorType
 from src.collectors.queries.insert.urls.query import InsertURLsQueryBuilder
-from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus
+from src.core.enums import BatchStatus, RecordType, SuggestedStatus
 from src.core.env_var_manager import EnvVarManager
-from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder
-from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder
-from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
 from src.core.tasks.scheduled.impl.huggingface.queries.state import SetHuggingFaceUploadStateQueryBuilder
 from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters
 from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder
 from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query
 from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \
     get_update_agencies_sync_progress_query
-from src.core.tasks.scheduled.impl.sync.agency.queries.upsert import \
-    convert_agencies_sync_response_to_agencies_upsert
 from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters
 from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \
     GetDataSourcesSyncParametersQueryBuilder
@@ -72,11 +69,6 @@
 from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.core import \
     UpsertURLsFromDataSourcesQueryBuilder
 from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
-from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO
-from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \
-    GetPendingURLsWithoutAgencySuggestionsQueryBuilder
-from src.core.tasks.url.operators.agency_identification.queries.has_urls_without_agency_suggestions import \
-    HasURLsWithoutAgencySuggestionsQueryBuilder
 from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO
 from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder
 from src.core.tasks.url.operators.html.queries.get import \
@@ -106,9 +98,10 @@
 from src.db.helpers.session import session_helper as sh
 from src.db.models.impl.agency.sqlalchemy import Agency
 from src.db.models.impl.backlog_snapshot import BacklogSnapshot
-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from src.db.models.impl.batch.sqlalchemy import Batch
 from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from src.db.models.impl.link.task_url import LinkTaskURL
 from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
 from src.db.models.impl.log.pydantic.info import LogInfo
@@ -126,7 +119,6 @@
 from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
 from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
 from src.db.models.impl.url.probed_for_404 import URLProbedFor404
-from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
 from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
 from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
@@ -145,7 +137,6 @@
 from src.db.templates.markers.bulk.insert import BulkInsertableModel
 from src.db.templates.markers.bulk.upsert import BulkUpsertableModel
 from src.db.utils.compression import decompress_html, compress_html
-from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo
 from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo
 
 
@@ -546,7 +537,7 @@ async def get_urls_with_html_data_and_without_models(
     ):
         statement = (select(URL)
                      .options(selectinload(URL.html_content))
-                     .where(URL.status == URLStatus.PENDING.value))
+                     .where(URL.status == URLStatus.OK.value))
         statement = self.statement_composer.exclude_urls_with_extant_model(
             statement=statement,
             model=model
@@ -575,7 +566,7 @@ async def has_urls_with_html_data_and_without_models(
     ) -> bool:
         statement = (select(URL)
                      .join(URLCompressedHTML)
-                     .where(URL.status == URLStatus.PENDING.value))
+                     .where(URL.status == URLStatus.OK.value))
         # Exclude URLs with auto suggested record types
         statement = self.statement_composer.exclude_urls_with_extant_model(
             statement=statement,
@@ -614,9 +605,11 @@ async def get_urls(
         page: int,
         errors: bool
     ) -> GetURLsResponseInfo:
-        return await self.run_query_builder(GetURLsQueryBuilder(
-            page=page, errors=errors
-        ))
+        return await self.run_query_builder(
+            GetURLsQueryBuilder(
+                page=page, errors=errors
+            )
+        )
 
     @session_manager
     async def initiate_task(
@@ -657,7 +650,12 @@ async def get_html_content_info(self, url_id: int) -> list[URLHTMLContentInfo]:
         return await self.run_query_builder(GetHTMLContentInfoQueryBuilder(url_id))
 
     @session_manager
-    async def link_urls_to_task(self, session: AsyncSession, task_id: int, url_ids: list[int]):
+    async def link_urls_to_task(
+        self,
+        session: AsyncSession,
+        task_id: int,
+        url_ids: list[int]
+    ) -> None:
         for url_id in url_ids:
             link = LinkTaskURL(
                 url_id=url_id,
@@ -720,24 +718,19 @@ async def get_tasks(
             tasks=final_results
         )
 
-    async def has_urls_without_agency_suggestions(self) -> bool:
-        return await self.run_query_builder(HasURLsWithoutAgencySuggestionsQueryBuilder())
-
-    async def get_urls_without_agency_suggestions(
-        self
-    ) -> list[AgencyIdentificationTDO]:
-        """Retrieve URLs without confirmed or suggested agencies."""
-        return await self.run_query_builder(GetPendingURLsWithoutAgencySuggestionsQueryBuilder())
-
     async def get_next_url_agency_for_annotation(
         self,
         user_id: int,
         batch_id: int | None
     ) -> GetNextURLForAgencyAnnotationResponse:
-        return await self.run_query_builder(builder=GetNextURLAgencyForAnnotationQueryBuilder(
-            user_id=user_id,
-            batch_id=batch_id
-        ))
+        return await self.run_query_builder(
+            builder=GetNextURLAgencyForAnnotationQueryBuilder(
+                user_id=user_id,
+                batch_id=batch_id
+            )
+        )
 
     @session_manager
     async def upsert_new_agencies(
@@ -773,20 +766,6 @@ async def add_confirmed_agency_url_links(
             )
             session.add(confirmed_agency)
 
-    @session_manager
-    async def add_agency_auto_suggestions(
-        self,
-        session: AsyncSession,
-        suggestions: list[URLAgencySuggestionInfo]
-    ):
-        for suggestion in suggestions:
-            url_agency_suggestion = AutomatedUrlAgencySuggestion(
-                url_id=suggestion.url_id,
-                agency_id=suggestion.pdap_agency_id,
-                is_unknown=suggestion.suggestion_type == SuggestionType.UNKNOWN
-            )
-            session.add(url_agency_suggestion)
-
     @session_manager
     async def add_agency_manual_suggestion(
         self,
@@ -842,10 +821,12 @@ async def approve_url(
         approval_info: FinalReviewApprovalInfo,
         user_id: int,
     ) -> None:
-        await self.run_query_builder(ApproveURLQueryBuilder(
-            user_id=user_id,
-            approval_info=approval_info
-        ))
+        await self.run_query_builder(
+            ApproveURLQueryBuilder(
+                user_id=user_id,
+                approval_info=approval_info
+            )
+        )
 
     async def reject_url(
         self,
@@ -853,11 +834,13 @@ async def reject_url(
         user_id: int,
         rejection_reason: RejectionReason
     ) -> None:
-        await self.run_query_builder(RejectURLQueryBuilder(
-            url_id=url_id,
-            user_id=user_id,
-            rejection_reason=rejection_reason
-        ))
+        await self.run_query_builder(
+            RejectURLQueryBuilder(
+                url_id=url_id,
+                user_id=user_id,
+                rejection_reason=rejection_reason
+            )
+        )
 
     @session_manager
     async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary]:
@@ -873,10 +856,12 @@ async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary
 
     async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo]:
         """Retrieve all URLs associated with a batch."""
-        return await self.run_query_builder(GetURLsByBatchQueryBuilder(
-            batch_id=batch_id,
-            page=page
-        ))
+        return await self.run_query_builder(
+            GetURLsByBatchQueryBuilder(
+                batch_id=batch_id,
+                page=page
+            )
+        )
 
     @session_manager
     async def insert_logs(
@@ -926,8 +911,6 @@ async def insert_urls(
         )
         return await self.run_query_builder(builder)
 
-
-
     @session_manager
     async def update_batch_post_collection(
         self,
@@ -960,10 +943,12 @@ async def mark_urls_as_submitted(self, infos: list[SubmittedURLInfo]):
         await self.run_query_builder(MarkURLsAsSubmittedQueryBuilder(infos))
 
     async def get_duplicates_by_batch_id(self, batch_id: int, page: int) -> list[DuplicateInfo]:
-        return await self.run_query_builder(GetDuplicatesByBatchIDQueryBuilder(
-            batch_id=batch_id,
-            page=page
-        ))
+        return await self.run_query_builder(
+            GetDuplicatesByBatchIDQueryBuilder(
+                batch_id=batch_id,
+                page=page
+            )
+        )
 
     @session_manager
     async def get_batch_summaries(
@@ -1048,10 +1033,12 @@ async def upload_manual_batch(
         user_id: int,
         dto: ManualBatchInputDTO
     ) -> ManualBatchResponseDTO:
-        return await self.run_query_builder(UploadManualBatchQueryBuilder(
-            user_id=user_id,
-            dto=dto
-        ))
+        return await self.run_query_builder(
+            UploadManualBatchQueryBuilder(
+                user_id=user_id,
+                dto=dto
+            )
+        )
 
     @session_manager
     async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse:
@@ -1114,183 +1101,16 @@ async def get_urls_breakdown_submitted_metrics(
             entries=final_results
         )
 
-    @session_manager
-    async def get_urls_aggregated_metrics(
-        self,
-        session: AsyncSession
-    ) -> GetMetricsURLsAggregatedResponseDTO:
-        sc = StatementComposer
+    async def get_urls_aggregated_metrics(self) -> GetMetricsURLsAggregatedResponseDTO:
+        return await self.run_query_builder(GetURLsAggregatedMetricsQueryBuilder())
 
-        oldest_pending_url_query = select(
-            URL.id,
-            URL.created_at
-        ).where(
-            URL.status == URLStatus.PENDING.value
-        ).order_by(
-            URL.created_at.asc()
-        ).limit(1)
-
-        oldest_pending_url = await session.execute(oldest_pending_url_query)
-        oldest_pending_url = oldest_pending_url.one_or_none()
-        if oldest_pending_url is None:
-            oldest_pending_url_id = None
-            oldest_pending_created_at = None
-        else:
-            oldest_pending_url_id = oldest_pending_url.id
-            oldest_pending_created_at = oldest_pending_url.created_at
-
-        def case_column(status: URLStatus, label):
-            return sc.count_distinct(
-                case(
-                    (
-                        URL.status == status.value,
-                        URL.id
-                    )
-                ),
-                label=label
-            )
-
-        count_query = select(
-            sc.count_distinct(URL.id, label="count"),
-            case_column(URLStatus.PENDING, label="count_pending"),
-            case_column(URLStatus.SUBMITTED, label="count_submitted"),
-            case_column(URLStatus.VALIDATED, label="count_validated"),
-            case_column(URLStatus.NOT_RELEVANT, label="count_rejected"),
-            case_column(URLStatus.ERROR, label="count_error"),
-        )
-        raw_results = await session.execute(count_query)
-        results = raw_results.all()
-
-        return GetMetricsURLsAggregatedResponseDTO(
-            count_urls_total=results[0].count,
-            count_urls_pending=results[0].count_pending,
-            count_urls_submitted=results[0].count_submitted,
-            count_urls_validated=results[0].count_validated,
-            count_urls_rejected=results[0].count_rejected,
-            count_urls_errors=results[0].count_error,
-            oldest_pending_url_id=oldest_pending_url_id,
-            oldest_pending_url_created_at=oldest_pending_created_at,
-        )
+    async def get_urls_breakdown_pending_metrics(self) -> GetMetricsURLsBreakdownPendingResponseDTO:
+        return await self.run_query_builder(GetURLsBreakdownPendingMetricsQueryBuilder())
 
-    @session_manager
-    async def get_urls_breakdown_pending_metrics(
-        self,
-        session: AsyncSession
-    ) -> GetMetricsURLsBreakdownPendingResponseDTO:
-        sc = StatementComposer
-
-        flags = (
-            select(
-                URL.id.label("url_id"),
-                case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label(
-                    "has_user_record_type_annotation"
-                ),
-                case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label(
-                    "has_user_relevant_annotation"
-                ),
-                case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label(
-                    "has_user_agency_annotation"
-                ),
-            )
-            .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id)
-            .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id)
-            .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id)
-        ).cte("flags")
-
-        month = func.date_trunc('month', URL.created_at)
-
-        # Build the query
-        query = (
-            select(
-                month.label('month'),
-                func.count(URL.id).label('count_total'),
-                func.count(
-                    case(
-                        (flags.c.has_user_record_type_annotation == True, 1)
-                    )
-                ).label('user_record_type_count'),
-                func.count(
-                    case(
-                        (flags.c.has_user_relevant_annotation == True, 1)
-                    )
-                ).label('user_relevant_count'),
-                func.count(
-                    case(
-                        (flags.c.has_user_agency_annotation == True, 1)
-                    )
-                ).label('user_agency_count'),
-            )
-            .outerjoin(flags, flags.c.url_id == URL.id)
-            .where(URL.status == URLStatus.PENDING.value)
-            .group_by(month)
-            .order_by(month.asc())
-        )
-
-        # Execute the query and return the results
-        results = await session.execute(query)
-        all_results = results.all()
-        final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = []
-
-        for result in all_results:
-            dto = GetMetricsURLsBreakdownPendingResponseInnerDTO(
-                month=result.month.strftime("%B %Y"),
-                count_pending_total=result.count_total,
-                count_pending_relevant_user=result.user_relevant_count,
-                count_pending_record_type_user=result.user_record_type_count,
-                count_pending_agency_user=result.user_agency_count,
-            )
-            final_results.append(dto)
-        return GetMetricsURLsBreakdownPendingResponseDTO(
-            entries=final_results,
-        )
-
-    @session_manager
     async def get_backlog_metrics(
         self,
-        session: AsyncSession
     ) -> GetMetricsBacklogResponseDTO:
-        month = func.date_trunc('month', BacklogSnapshot.created_at)
-
-        # 1. Create a subquery that assigns row_number() partitioned by month
-        monthly_snapshot_subq = (
-            select(
-                BacklogSnapshot.id,
-                BacklogSnapshot.created_at,
-                BacklogSnapshot.count_pending_total,
-                month.label("month_start"),
-                func.row_number()
-                .over(
-                    partition_by=month,
-                    order_by=BacklogSnapshot.created_at.desc()
-                )
-                .label("row_number")
-            )
-            .subquery()
-        )
-
-        # 2. Filter for the top (most recent) row in each month
-        stmt = (
-            select(
-                monthly_snapshot_subq.c.month_start,
-                monthly_snapshot_subq.c.created_at,
-                monthly_snapshot_subq.c.count_pending_total
-            )
-            .where(monthly_snapshot_subq.c.row_number == 1)
-            .order_by(monthly_snapshot_subq.c.month_start)
-        )
-
-        raw_result = await session.execute(stmt)
-        results = raw_result.all()
-        final_results = []
-        for result in results:
-            final_results.append(
-                GetMetricsBacklogResponseInnerDTO(
-                    month=result.month_start.strftime("%B %Y"),
-                    count_pending_total=result.count_pending_total,
-                )
-            )
-
-        return GetMetricsBacklogResponseDTO(entries=final_results)
+        return await self.run_query_builder(GetBacklogMetricsQueryBuilder())
 
     @session_manager
     async def populate_backlog_snapshot(
@@ -1300,10 +1120,15 @@ async def populate_backlog_snapshot(
         self,
     ):
         sc = StatementComposer
         # Get count of pending URLs
-        query = select(
-            sc.count_distinct(URL.id, label="count")
-        ).where(
-            URL.status == URLStatus.PENDING.value
+        query = (
+            select(
+                sc.count_distinct(URL.id, label="count")
+            )
+            .outerjoin(FlagURLValidated, URL.id == FlagURLValidated.url_id)
+            .where(
+                URL.status == URLStatus.OK.value,
+                FlagURLValidated.url_id.is_(None),
+            )
         )
 
         raw_result = await session.execute(query)
@@ -1355,7 +1180,7 @@ async def has_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi
                 URLProbedFor404
             ).where(
                 and_(
-                    URL.status == URLStatus.PENDING.value,
+                    URL.status == URLStatus.OK.value,
                     or_(
                         URLProbedFor404.id == None,
                         URLProbedFor404.last_probed_at < month_ago
@@ -1378,7 +1203,7 @@ async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi
                 URLProbedFor404
             ).where(
                 and_(
-                    URL.status == URLStatus.PENDING.value,
+                    URL.status == URLStatus.OK.value,
                     or_(
                         URLProbedFor404.id == None,
                         URLProbedFor404.last_probed_at < month_ago
@@ -1404,14 +1229,6 @@ async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters:
             GetDataSourcesSyncParametersQueryBuilder()
         )
 
-    async def upsert_agencies(
-        self,
-        agencies: list[AgenciesSyncResponseInnerInfo]
-    ) -> None:
-        await self.bulk_upsert(
-            models=convert_agencies_sync_response_to_agencies_upsert(agencies)
-        )
-
     async def upsert_urls_from_data_sources(
         self,
         data_sources: list[DataSourcesSyncResponseInnerInfo]
@@ -1463,21 +1280,11 @@ async def add_raw_html(
             )
             session.add(compressed_html)
 
-    async def get_data_sources_raw_for_huggingface(self, page: int) -> list[GetForLoadingToHuggingFaceOutput]:
-        return await self.run_query_builder(
-            GetForLoadingToHuggingFaceQueryBuilder(page)
-        )
-
     async def set_hugging_face_upload_state(self, dt: datetime) -> None:
         await self.run_query_builder(
             SetHuggingFaceUploadStateQueryBuilder(dt=dt)
        )
 
-    async def check_valid_urls_updated(self) -> bool:
-        return await self.run_query_builder(
-            CheckValidURLsUpdatedQueryBuilder()
-        )
-
     async def get_current_database_time(self) -> datetime:
         return await self.scalar(select(func.now()))
diff --git a/src/db/client/sync.py b/src/db/client/sync.py
index 03a45d3b..04ecc892 100644
--- a/src/db/client/sync.py
+++ b/src/db/client/sync.py
@@ -1,5 +1,5 @@
 from functools import wraps
-from typing import Optional, List
+from typing import List
 
 from sqlalchemy import create_engine, update, Select
 from sqlalchemy.exc import IntegrityError
@@ -7,12 +7,12 @@
 
 from src.collectors.enums import URLStatus
 from src.db.config_manager import ConfigManager
-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo
 from src.db.dtos.url.insert import InsertURLsInfo
 from src.db.models.impl.log.pydantic.info import LogInfo
 from src.db.dtos.url.mapping import URLMapping
-from src.db.models.impl.link.batch_url import LinkBatchURL
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
 from src.db.models.impl.url.core.pydantic.info import URLInfo
 from src.db.models.templates_.base import Base
 from src.db.models.impl.duplicate.sqlalchemy import Duplicate
@@ -58,6 +58,11 @@ def wrapper(self, *args, **kwargs):
         return wrapper
 
+    @session_manager
+    def add_all(self, session: Session, objects: list[Base]):
+        session.add_all(objects)
+        session.commit()
+
     @session_manager
     def insert_batch(self, session: Session, batch_info: BatchInfo) -> int:
         """Insert a new batch into the database and return its ID."""
@@ -221,14 +226,6 @@ def mark_urls_as_submitted(
             url_id = info.url_id
             data_source_id = info.data_source_id
 
-            query = (
-                update(URL)
-                .where(URL.id == url_id)
-                .values(
-                    status=URLStatus.SUBMITTED.value
-                )
-            )
-
             url_data_source_object = URLDataSource(
                 url_id=url_id,
                 data_source_id=data_source_id
@@ -237,7 +234,6 @@ def mark_urls_as_submitted(
             url_data_source_object.created_at = info.submitted_at
 
             session.add(url_data_source_object)
-            session.execute(query)
 
 if __name__ == "__main__":
     client = DatabaseClient()
diff --git a/src/db/client/types.py b/src/db/client/types.py
index efdfdc72..02c0e39b 100644
--- a/src/db/client/types.py
+++ b/src/db/client/types.py
@@ -1,9 +1,5 @@
-from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
 from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
-from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
-from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion
 from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
 
 UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion
-AutoSuggestionModel = AutoRelevantSuggestion or AutoRecordTypeSuggestion or AutomatedUrlAgencySuggestion
diff --git a/src/db/constants.py b/src/db/constants.py
index 505a6e58..f2cdefb1 100644
--- a/src/db/constants.py
+++ b/src/db/constants.py
@@ -1,23 +1,11 @@
-from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
 from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
-from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
-from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion
 from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
 
 PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME"
 
 STANDARD_ROW_LIMIT = 100
 
-ALL_ANNOTATION_MODELS = [
-    AutoRecordTypeSuggestion,
-    AutoRelevantSuggestion,
-    AutomatedUrlAgencySuggestion,
-    UserRelevantSuggestion,
-    UserRecordTypeSuggestion,
-    UserUrlAgencySuggestion
-]
-
 USER_ANNOTATION_MODELS = [
     UserRelevantSuggestion,
     UserRecordTypeSuggestion,
diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py
index 979a3b51..b19b834d 100644
--- a/src/db/dto_converter.py
+++ b/src/db/dto_converter.py
@@ -1,21 +1,18 @@
-from typing import Optional
-
 from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo
 from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo
 from src.api.endpoints.review.next.dto import FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, \
-    FinalReviewAnnotationAgencyAutoInfo, FinalReviewAnnotationAgencyInfo
+    FinalReviewAnnotationAgencyInfo
 from src.core.enums import RecordType, SuggestionType
 from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
 from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING
 from src.db.dtos.url.html_content import URLHTMLContentInfo
-from src.db.models.impl.url.html.content.enums import HTMLContentType
 from src.db.dtos.url.with_html import URLWithHTML
 from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
-from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion
-from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion
-from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
-from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
 from src.db.models.impl.url.core.sqlalchemy import URL
+from src.db.models.impl.url.html.content.enums import HTMLContentType
+from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
+from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
+from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
 from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion
 from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
@@ -65,111 +62,6 @@ def final_review_annotation_record_type_info(
             user=user_value
         )
 
-    @staticmethod
-    def final_review_annotation_agency_auto_info(
-        automated_agency_suggestions: list[AutomatedUrlAgencySuggestion]
-    ) -> FinalReviewAnnotationAgencyAutoInfo:
-
-        if len(automated_agency_suggestions) == 0:
-            return FinalReviewAnnotationAgencyAutoInfo(
-                unknown=True,
-                suggestions=[]
-            )
-
-        if len(automated_agency_suggestions) == 1:
-            suggestion = automated_agency_suggestions[0]
-            unknown = suggestion.is_unknown
-        else:
-            unknown = False
-
-        if unknown:
-            return FinalReviewAnnotationAgencyAutoInfo(
-                unknown=True,
-                suggestions=[
-                    GetNextURLForAgencyAgencyInfo(
-                        suggestion_type=SuggestionType.UNKNOWN,
-                    )
-                ]
-            )
-
-        return FinalReviewAnnotationAgencyAutoInfo(
-            unknown=unknown,
-            suggestions=[
-                GetNextURLForAgencyAgencyInfo(
-                    suggestion_type=SuggestionType.AUTO_SUGGESTION,
-                    pdap_agency_id=suggestion.agency_id,
-                    agency_name=suggestion.agency.name,
-                    state=suggestion.agency.state,
-                    county=suggestion.agency.county,
-                    locality=suggestion.agency.locality
-                ) for suggestion in automated_agency_suggestions
-            ]
-        )
-
-    @staticmethod
-    def user_url_agency_suggestion_to_final_review_annotation_agency_user_info(
-        user_url_agency_suggestion: UserUrlAgencySuggestion
-    ) -> GetNextURLForAgencyAgencyInfo | None:
-        suggestion = user_url_agency_suggestion
-        if suggestion is None:
-            return None
-        if suggestion.is_new:
-            return GetNextURLForAgencyAgencyInfo(
-                suggestion_type=SuggestionType.NEW_AGENCY,
-            )
-        return GetNextURLForAgencyAgencyInfo(
-            suggestion_type=SuggestionType.USER_SUGGESTION,
-            pdap_agency_id=suggestion.agency_id,
-            agency_name=suggestion.agency.name,
-            state=suggestion.agency.state,
-            county=suggestion.agency.county,
-            locality=suggestion.agency.locality
-        )
-
-
-    @staticmethod
-    def confirmed_agencies_to_final_review_annotation_agency_info(
-        confirmed_agencies: list[LinkURLAgency]
-    ) -> list[GetNextURLForAgencyAgencyInfo]:
-        results = []
-        for confirmed_agency in confirmed_agencies:
-            agency = confirmed_agency.agency
-            agency_info = GetNextURLForAgencyAgencyInfo(
-                suggestion_type=SuggestionType.CONFIRMED,
-                pdap_agency_id=agency.agency_id,
-                agency_name=agency.name,
-                state=agency.state,
-                county=agency.county,
-                locality=agency.locality
-            )
-            results.append(agency_info)
-        return results
-
-
-    @staticmethod
-    def final_review_annotation_agency_info(
-        automated_agency_suggestions: list[AutomatedUrlAgencySuggestion],
-        confirmed_agencies: list[LinkURLAgency],
-        user_agency_suggestion: UserUrlAgencySuggestion
-    ):
-
-        confirmed_agency_info = DTOConverter.confirmed_agencies_to_final_review_annotation_agency_info(
-            confirmed_agencies
-        )
-
-        agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info(
-            automated_agency_suggestions
-        )
-
-        agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info(
-            user_agency_suggestion
-        )
-
-        return FinalReviewAnnotationAgencyInfo(
-            confirmed=confirmed_agency_info,
-            user=agency_user_info,
-            auto=agency_auto_info
-        )
 
     @staticmethod
diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping.py
index 18fc5be2..d48a4649 100644
--- a/src/db/dtos/url/mapping.py
+++ b/src/db/dtos/url/mapping.py
@@ -1,7 +1,9 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
 
 
 class URLMapping(BaseModel):
     """Mapping between url and url_id."""
+    model_config = ConfigDict(frozen=True)  # <- makes it immutable & hashable
+
     url: str
     url_id: int
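
The `frozen=True` change above is what lets `URLMapping` participate in sets and dict keys. A small sketch of the resulting behavior under pydantic v2 semantics:

```python
from src.db.dtos.url.mapping import URLMapping

m1 = URLMapping(url="https://example.com", url_id=1)
m2 = URLMapping(url="https://example.com", url_id=1)
# Frozen models are hashable and compare by field values,
# so equal mappings deduplicate in a set.
assert m1 == m2
assert len({m1, m2}) == 1
```
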
diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py
index a616664f..aebf236f 100644
--- a/src/db/helpers/session/session_helper.py
+++ b/src/db/helpers/session/session_helper.py
@@ -11,8 +11,8 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from src.db.helpers.session.parser import BulkActionParser
-from src.db.models.templates_.with_id import WithIDBase
 from src.db.models.templates_.base import Base
+from src.db.models.templates_.with_id import WithIDBase
 from src.db.templates.markers.bulk.delete import BulkDeletableModel
 from src.db.templates.markers.bulk.insert import BulkInsertableModel
 from src.db.templates.markers.bulk.update import BulkUpdatableModel
@@ -51,21 +51,27 @@ async def has_results(session: AsyncSession, query: sa.Select) -> bool:
 async def bulk_upsert(
     session: AsyncSession,
     models: list[BulkUpsertableModel],
-):
+) -> None:
     if len(models) == 0:
         return
 
+    # Parse models to get sa_model and id_field
     parser = BulkActionParser(models)
 
+    # Create base insert query
     query = pg_insert(parser.sa_model)
 
-    upsert_mappings = [upsert_model.model_dump() for upsert_model in models]
+    upsert_mappings: list[dict[str, Any]] = [
+        upsert_model.model_dump() for upsert_model in models
+    ]
 
+    # Set all non-id fields to the values in the upsert mapping
     set_ = {}
     for k, v in upsert_mappings[0].items():
         if k == parser.id_field:
             continue
         set_[k] = getattr(query.excluded, k)
 
+    # Add upsert logic to update on conflict
     query = query.on_conflict_do_update(
         index_elements=[parser.id_field],
         set_=set_
@@ -216,4 +222,3 @@ async def bulk_update(
     )
 
     await session.execute(stmt)
-
diff --git a/src/db/models/exceptions.py b/src/db/models/exceptions.py
new file mode 100644
index 00000000..491aa9a4
--- /dev/null
+++ b/src/db/models/exceptions.py
@@ -0,0 +1,4 @@
+
+
+class WriteToViewError(Exception):
+    pass
\ No newline at end of file
diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py
index 556bde88..032dc397 100644
--- a/src/db/models/impl/agency/sqlalchemy.py
+++ b/src/db/models/impl/agency/sqlalchemy.py
@@ -25,6 +25,6 @@ class Agency(
     locality = Column(String, nullable=True)
 
     # Relationships
-    automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency")
+    automated_suggestions = relationship("AgencyIDSubtaskSuggestion")
     user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency")
     confirmed_urls = relationship("LinkURLAgency", back_populates="agency")
diff --git a/src/db/models/impl/batch/pydantic/__init__.py b/src/db/models/impl/batch/pydantic/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/db/models/impl/batch/pydantic.py b/src/db/models/impl/batch/pydantic/info.py
similarity index 100%
rename from src/db/models/impl/batch/pydantic.py
rename to src/db/models/impl/batch/pydantic/info.py
diff --git a/src/db/models/impl/batch/pydantic/insert.py b/src/db/models/impl/batch/pydantic/insert.py
new file mode 100644
index 00000000..882ab371
--- /dev/null
+++ b/src/db/models/impl/batch/pydantic/insert.py
@@ -0,0 +1,17 @@
+from datetime import datetime
+
+from src.core.enums import BatchStatus
+from src.db.models.impl.batch.sqlalchemy import Batch
+from src.db.templates.markers.bulk.insert import BulkInsertableModel
+
+
+class BatchInsertModel(BulkInsertableModel):
+    strategy: str
+    status: BatchStatus
+    parameters: dict
+    user_id: int
+    date_generated: datetime
+
+    @classmethod
+    def sa_model(cls) -> type[Batch]:
+        return Batch
\ No newline at end of file
diff --git a/src/db/models/impl/flag/url_validated/__init__.py b/src/db/models/impl/flag/url_validated/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/db/models/impl/flag/url_validated/enums.py b/src/db/models/impl/flag/url_validated/enums.py
new file mode 100644
index 00000000..fe74b84c
--- /dev/null
+++ b/src/db/models/impl/flag/url_validated/enums.py
@@ -0,0 +1,8 @@
+from enum import Enum
+
+
+class URLValidatedType(Enum):
+    DATA_SOURCE = "data source"
+    META_URL = "meta url"
+    NOT_RELEVANT = "not relevant"
+    INDIVIDUAL_RECORD = "individual record"
\ No newline at end of file
BulkInsertableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + +type_ = type + +class FlagURLValidatedPydantic( + BulkInsertableModel, + BulkUpsertableModel +): + + url_id: int + type: URLValidatedType + + @classmethod + def sa_model(cls) -> type_[FlagURLValidated]: + return FlagURLValidated + + @classmethod + def id_field(cls) -> str: + return "url_id" \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py new file mode 100644 index 00000000..f6d4e770 --- /dev/null +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -0,0 +1,25 @@ +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagURLValidated( + URLDependentMixin, + CreatedAtMixin, + UpdatedAtMixin, + Base, +): + __tablename__ = "flag_url_validated" + __table_args__ = ( + PrimaryKeyConstraint( + 'url_id', + ), + ) + + type = enum_column( + enum_type=URLValidatedType, + name="validated_url_type", + ) diff --git a/src/db/models/impl/link/batch_url/__init__.py b/src/db/models/impl/link/batch_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/link/batch_url/pydantic.py b/src/db/models/impl/link/batch_url/pydantic.py new file mode 100644 index 00000000..143c57ce --- /dev/null +++ b/src/db/models/impl/link/batch_url/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkBatchURLPydantic(BulkInsertableModel): + batch_id: int + url_id: int + + @classmethod + def sa_model(cls) -> type[LinkBatchURL]: + return LinkBatchURL \ No newline at end of file diff --git a/src/db/models/impl/link/batch_url.py b/src/db/models/impl/link/batch_url/sqlalchemy.py similarity index 79% rename from src/db/models/impl/link/batch_url.py rename to src/db/models/impl/link/batch_url/sqlalchemy.py index 8fb8f42e..951ac539 100644 --- a/src/db/models/impl/link/batch_url.py +++ b/src/db/models/impl/link/batch_url/sqlalchemy.py @@ -13,5 +13,3 @@ class LinkBatchURL( ): __tablename__ = "link_batch_urls" - url = relationship('URL', overlaps="batch") - batch = relationship('Batch', overlaps="url") \ No newline at end of file diff --git a/src/db/models/impl/link/url_agency/pydantic.py b/src/db/models/impl/link/url_agency/pydantic.py index 77522a64..fe9194de 100644 --- a/src/db/models/impl/link/url_agency/pydantic.py +++ b/src/db/models/impl/link/url_agency/pydantic.py @@ -1,3 +1,5 @@ +from pydantic import ConfigDict + from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -7,6 +9,8 @@ class LinkURLAgencyPydantic( BulkDeletableModel, BulkInsertableModel ): + model_config = ConfigDict(frozen=True) + url_id: int agency_id: int diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index f8d72065..875fa25f 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -7,7 +7,7 @@ class LinkURLAgency(URLDependentMixin, WithIDBase): - __tablename__ = 
"link_urls_agencies" + __tablename__ = "link_urls_agency" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/impl/url/core/pydantic/info.py b/src/db/models/impl/url/core/pydantic/info.py index 07df21fe..0985b3fc 100644 --- a/src/db/models/impl/url/core/pydantic/info.py +++ b/src/db/models/impl/url/core/pydantic/info.py @@ -12,7 +12,7 @@ class URLInfo(BaseModel): batch_id: int | None= None url: str collector_metadata: dict | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK updated_at: datetime.datetime | None = None created_at: datetime.datetime | None = None name: str | None = None diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index b893e9fa..18743f1b 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -16,6 +16,6 @@ def sa_model(cls) -> type[Base]: url: str collector_metadata: dict | None = None name: str | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK record_type: RecordType | None = None source: URLSource \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index b9c38732..2001f9ed 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -40,7 +40,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): "Batch", secondary="link_batch_urls", back_populates="urls", - uselist=False + uselist=False, ) duplicates = relationship("Duplicate", back_populates="original_url") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") @@ -50,8 +50,9 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): secondary="link_task_urls", back_populates="urls", ) - automated_agency_suggestions = relationship( - "AutomatedUrlAgencySuggestion", back_populates="url") + auto_agency_subtasks = relationship( + "URLAutoAgencyIDSubtask" + ) user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( diff --git a/src/db/models/impl/url/suggestion/agency/auto.py b/src/db/models/impl/url/suggestion/agency/auto.py deleted file mode 100644 index 5ecfdf0a..00000000 --- a/src/db/models/impl/url/suggestion/agency/auto.py +++ /dev/null @@ -1,20 +0,0 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint -from sqlalchemy.orm import relationship - -from src.db.models.helpers import get_agency_id_foreign_column -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase - - -class AutomatedUrlAgencySuggestion(URLDependentMixin, WithIDBase): - __tablename__ = "automated_url_agency_suggestions" - - agency_id = get_agency_id_foreign_column(nullable=True) - is_unknown = Column(Boolean, nullable=True) - - agency = relationship("Agency", back_populates="automated_suggestions") - url = relationship("URL", back_populates="automated_agency_suggestions") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", name="uq_automated_url_agency_suggestions"), - ) diff --git a/src/db/models/impl/url/suggestion/agency/subtask/__init__.py b/src/db/models/impl/url/suggestion/agency/subtask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/subtask/enum.py b/src/db/models/impl/url/suggestion/agency/subtask/enum.py new file mode 100644 
index 00000000..f3ee7c3f --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/enum.py @@ -0,0 +1,14 @@ +from enum import Enum + + +class AutoAgencyIDSubtaskType(Enum): + HOMEPAGE_MATCH = "homepage_match" + NLP_LOCATION_MATCH = "nlp_location_match" + MUCKROCK = "muckrock_match" + CKAN = "ckan_match" + +class SubtaskDetailCode(Enum): + NO_DETAILS = "no details" + RETRIEVAL_ERROR = "retrieval error" + HOMEPAGE_SINGLE_AGENCY = "homepage-single agency" + HOMEPAGE_MULTI_AGENCY = "homepage-multi agency" \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py new file mode 100644 index 00000000..f2e9be57 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py @@ -0,0 +1,17 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + +type_alias = type + +class URLAutoAgencyIDSubtaskPydantic(BulkInsertableModel): + task_id: int + url_id: int + type: AutoAgencyIDSubtaskType + agencies_found: bool + detail: SubtaskDetailCode = SubtaskDetailCode.NO_DETAILS + + @classmethod + def sa_model(cls) -> type_alias[Base]: + return URLAutoAgencyIDSubtask \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py new file mode 100644 index 00000000..89371498 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -0,0 +1,35 @@ +from sqlalchemy.orm import relationship + +from src.db.models.helpers import enum_column +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, TaskDependentMixin +from src.db.models.templates_.with_id import WithIDBase + +import sqlalchemy as sa + +class URLAutoAgencyIDSubtask( + WithIDBase, + URLDependentMixin, + TaskDependentMixin, + CreatedAtMixin +): + + __tablename__ = "url_auto_agency_id_subtasks" + + type = enum_column( + AutoAgencyIDSubtaskType, + name="agency_auto_suggestion_method" + ) + agencies_found = sa.Column( + sa.Boolean(), + nullable=False + ) + detail = enum_column( + SubtaskDetailCode, + name="agency_id_subtask_detail_code", + ) + + suggestions = relationship( + "AgencyIDSubtaskSuggestion", + cascade="all, delete-orphan" + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py b/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py new file mode 100644 index 00000000..5a0fd2b8 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py @@ -0,0 +1,16 @@ +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class AgencyIDSubtaskSuggestionPydantic( + BulkInsertableModel, +): + subtask_id: int + agency_id: int + confidence: int + + @classmethod + def sa_model(cls) 
-> type[Base]: + """Defines the SQLAlchemy model.""" + return AgencyIDSubtaskSuggestion \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py new file mode 100644 index 00000000..de6ee029 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -0,0 +1,28 @@ +import sqlalchemy as sa +from sqlalchemy.orm import relationship + +from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin +from src.db.models.templates_.with_id import WithIDBase + + +class AgencyIDSubtaskSuggestion( + WithIDBase, + CreatedAtMixin, + AgencyDependentMixin, +): + __tablename__ = "agency_id_subtask_suggestions" + + subtask_id = sa.Column( + sa.Integer, + sa.ForeignKey("url_auto_agency_id_subtasks.id"), + nullable=False + ) + confidence = sa.Column( + sa.Integer, + sa.CheckConstraint( + "confidence BETWEEN 0 and 100" + ), + nullable=False, + ) + + agency = relationship("Agency", viewonly=True) \ No newline at end of file diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 541e5d09..d0dbbcab 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -1,5 +1,8 @@ -from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP +from typing import ClassVar +from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event + +from src.db.models.exceptions import WriteToViewError from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT @@ -58,3 +61,17 @@ class UpdatedAtMixin: server_default=CURRENT_TIME_SERVER_DEFAULT, onupdate=CURRENT_TIME_SERVER_DEFAULT ) + +class ViewMixin: + """Attach to any mapped class that represents a DB view.""" + __is_view__: ClassVar[bool] = True + + @classmethod + def __declare_last__(cls) -> None: + # Block writes on this mapped class + for evt in ("before_insert", "before_update", "before_delete"): + event.listen(cls, evt, cls._block_write) + + @staticmethod + def _block_write(mapper, connection, target): + raise WriteToViewError(f"{type(target).__name__} is a read-only view.") diff --git a/src/db/models/views/__init__.py b/src/db/models/views/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py new file mode 100644 index 00000000..bc963e11 --- /dev/null +++ b/src/db/models/views/meta_url.py @@ -0,0 +1,26 @@ +""" + CREATE OR REPLACE VIEW meta_url_view AS + SELECT + urls.id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' +""" + +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class MetaURL( + Base, + ViewMixin, + URLDependentMixin, +): + + __tablename__ = "meta_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py new file mode 100644 index 00000000..767ee960 --- /dev/null +++ b/src/db/models/views/unvalidated_url.py @@ -0,0 +1,27 @@ +""" +select + u.id as url_id +from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id +where + fuv.type is null +""" +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class UnvalidatedURL( + Base, + ViewMixin, + 
URLDependentMixin, +): + + __tablename__ = "unvalidated_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py new file mode 100644 index 00000000..7289020f --- /dev/null +++ b/src/db/models/views/url_annotations_flags.py @@ -0,0 +1,49 @@ +""" +CREATE OR REPLACE VIEW url_annotation_flags AS +( +SELECT u.id, + CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed +FROM urls u + LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.{URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME} auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + ) +""" + +from sqlalchemy import PrimaryKeyConstraint, Column, Boolean + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class URLAnnotationFlagsView( + Base, + ViewMixin, + URLDependentMixin +): + __tablename__ = "url_annotation_flags" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) + + has_auto_record_type_suggestion = Column(Boolean, nullable=False) + has_auto_relevant_suggestion = Column(Boolean, nullable=False) + has_auto_agency_suggestion = Column(Boolean, nullable=False) + has_user_record_type_suggestion = Column(Boolean, nullable=False) + has_user_relevant_suggestion = Column(Boolean, nullable=False) + has_user_agency_suggestion = Column(Boolean, nullable=False) + has_confirmed_agency = Column(Boolean, nullable=False) + was_reviewed = Column(Boolean, nullable=False) \ No newline at end of file diff --git a/src/db/queries/implementations/core/common/annotation_exists_/__init__.py b/src/db/queries/implementations/core/common/annotation_exists_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py new file mode 100644 index 00000000..ead32bc0 --- /dev/null +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -0,0 +1,15 @@ +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion + +ALL_ANNOTATION_MODELS = [ + AutoRecordTypeSuggestion, + AutoRelevantSuggestion, + URLAutoAgencyIDSubtask, + UserRelevantSuggestion, + UserRecordTypeSuggestion, + UserUrlAgencySuggestion +] diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists_/core.py similarity index 79% rename from src/db/queries/implementations/core/common/annotation_exists.py rename to src/db/queries/implementations/core/common/annotation_exists_/core.py index f8dfa654..53e8bcf6 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/core.py @@ -17,7 +17,8 @@ from sqlalchemy import case, func, Select, select from src.collectors.enums import URLStatus -from src.db.constants import ALL_ANNOTATION_MODELS +from src.db.queries.implementations.core.common.annotation_exists_.constants import ALL_ANNOTATION_MODELS +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase @@ -29,7 +30,7 @@ class AnnotationExistsCTEQueryBuilder(QueryBuilderBase): def url_id(self): return self.query.c.url_id - def get_exists_label(self, model: Type[URLDependentMixin]): + def get_exists_label(self, model: Type[URLDependentMixin]) -> str: return f"{model.__name__}_exists" def get_all(self) -> list[Any]: @@ -67,6 +68,13 @@ async def build(self) -> Any: *annotation_exists_cases_all ) anno_exists_query = await self._outer_join_models(anno_exists_query) - anno_exists_query = anno_exists_query.where(URL.status == URLStatus.PENDING.value) + anno_exists_query = anno_exists_query.outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + anno_exists_query = anno_exists_query.where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None) + ) anno_exists_query = anno_exists_query.group_by(URL.id).cte("annotations_exist") self.query = anno_exists_query diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index f9bb2ef8..86983b5c 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -9,6 +9,7 @@ from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.core.get.recent_batch_summaries.pending_url.cte import PENDING_URL_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -24,9 +25,9 @@ def __init__( batch_id: int | None = None, ): super().__init__() + self.has_pending_urls = has_pending_urls self.url_counts_cte = URLCountsCTEQueryBuilder( page=page, - has_pending_urls=has_pending_urls, collector_type=collector_type, status=status, batch_id=batch_id, @@ -49,6 +50,14 @@ async def run(self, 
session: AsyncSession) -> list[BatchSummary]: builder.query, builder.get(count_labels.batch_id) == Batch.id, ) + if self.has_pending_urls is not None: + query = query.join( + PENDING_URL_CTE, + PENDING_URL_CTE.c.batch_id == Batch.id, + ).where( + PENDING_URL_CTE.c.has_pending_urls == self.has_pending_urls + ) + raw_results = await session.execute(query) summaries: list[BatchSummary] = [] diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py new file mode 100644 index 00000000..a0722229 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py @@ -0,0 +1,30 @@ +from sqlalchemy import select, func, case, and_ + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +PENDING_URL_CTE = ( + select( + Batch.id.label("batch_id"), + case( + ( + and_( + func.count(LinkBatchURL.url_id) > func.count(FlagURLValidated.url_id), + ) + , True), + else_=False + ).label("has_pending_urls") + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id, + ) + .group_by( + Batch.id + ).cte("has_pending_urls") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 72a33336..634cf419 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -1,15 +1,22 @@ -from typing import Optional - from sqlalchemy import Select, case, Label, and_, exists -from sqlalchemy.sql.functions import count, coalesce +from sqlalchemy.sql.functions import count, coalesce, func from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.all import ALL_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.duplicate import DUPLICATE_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.error import ERROR_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.not_relevant import NOT_RELEVANT_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.pending import PENDING_CTE +from 
src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.submitted import SUBMITTED_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -18,14 +25,12 @@ class URLCountsCTEQueryBuilder(QueryBuilderBase): def __init__( self, page: int = 1, - has_pending_urls: bool | None = None, collector_type: CollectorType | None = None, status: BatchStatus | None = None, batch_id: int | None = None ): super().__init__(URLCountsLabels()) self.page = page - self.has_pending_urls = has_pending_urls self.collector_type = collector_type self.status = status self.batch_id = batch_id @@ -33,31 +38,31 @@ def __init__( def get_core_query(self): labels: URLCountsLabels = self.labels - return ( + query = ( Select( Batch.id.label(labels.batch_id), - coalesce(count(URL.id), 0).label(labels.total), - self.count_case_url_status(URLStatus.PENDING, labels.pending), - self.count_case_url_status(URLStatus.SUBMITTED, labels.submitted), - self.count_case_url_status(URLStatus.NOT_RELEVANT, labels.not_relevant), - self.count_case_url_status(URLStatus.ERROR, labels.error), - self.count_case_url_status(URLStatus.DUPLICATE, labels.duplicate), + func.coalesce(DUPLICATE_CTE.count, 0).label(labels.duplicate), + func.coalesce(SUBMITTED_CTE.count, 0).label(labels.submitted), + func.coalesce(PENDING_CTE.count, 0).label(labels.pending), + func.coalesce(ALL_CTE.count, 0).label(labels.total), + func.coalesce(NOT_RELEVANT_CTE.count, 0).label(labels.not_relevant), + func.coalesce(ERROR_CTE.count, 0).label(labels.error), ) .select_from(Batch) - .outerjoin(LinkBatchURL) - .outerjoin( - URL - ) ) + for cte in [DUPLICATE_CTE, SUBMITTED_CTE, PENDING_CTE, ALL_CTE, NOT_RELEVANT_CTE, ERROR_CTE]: + query = query.outerjoin( + cte.cte, + Batch.id == cte.batch_id + ) + return query def build(self): query = self.get_core_query() - query = self.apply_pending_urls_filter(query) query = self.apply_collector_type_filter(query) query = self.apply_status_filter(query) query = self.apply_batch_id_filter(query) - query = query.group_by(Batch.id) query = add_page_offset(query, page=self.page) query = query.order_by(Batch.id) self.query = query.cte("url_counts") @@ -67,23 +72,6 @@ def apply_batch_id_filter(self, query: Select): return query return query.where(Batch.id == self.batch_id) - def apply_pending_urls_filter(self, query: Select): - if self.has_pending_urls is None: - return query - pending_url_subquery = ( - exists( - Select(URL).join(LinkBatchURL).where( - and_( - LinkBatchURL.batch_id == Batch.id, - URL.status == URLStatus.PENDING.value - ) - ) - ) - ).correlate(Batch) - if self.has_pending_urls: - return query.where(pending_url_subquery) - return query.where(~pending_url_subquery) - def apply_collector_type_filter(self, query: Select): if self.collector_type is None: return query @@ -93,18 +81,3 @@ def apply_status_filter(self, query: Select): if self.status is None: return query return query.where(Batch.status == self.status.value) - - @staticmethod - def count_case_url_status( - url_status: URLStatus, - label: str - ) -> Label: - return ( - coalesce( - count( - case( - (URL.status == url_status.value, 1) - ) - ) - , 0).label(label) - ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py new file mode 100644 index 00000000..5cab51cf --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py @@ -0,0 +1,20 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ALL_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("total_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .group_by( + Batch.id + ).cte("total_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py new file mode 100644 index 00000000..906dd49c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +DUPLICATE_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("duplicate_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.DUPLICATE + ) + .group_by( + Batch.id + ).cte("duplicate_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py new file mode 100644 index 00000000..b74020c4 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ERROR_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("error_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.ERROR + ) + .group_by( + Batch.id + ).cte("error_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py new file mode 100644 index 00000000..e84f597b --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, func + +from 
src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +NOT_RELEVANT_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("not_relevant_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT + ) + .group_by( + Batch.id + ).cte("not_relevant_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py new file mode 100644 index 00000000..b7e4594c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py @@ -0,0 +1,33 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +PENDING_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("pending_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type.is_(None) + ) + .group_by( + Batch.id + ).cte("pending_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py new file mode 100644 index 00000000..5ab305cc --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py @@ -0,0 +1,32 @@ + + +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +SUBMITTED_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("submitted_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + URLDataSource, + URLDataSource.url_id == URL.id, + ) + .group_by( + Batch.id + ).cte("submitted_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py new file mode 100644 index 
00000000..7f769c76 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py @@ -0,0 +1,18 @@ +from sqlalchemy import CTE, Column + + +class URLCountsCTEContainer: + + def __init__( + self, + cte: CTE + ): + self.cte = cte + + @property + def batch_id(self) -> Column: + return self.cte.columns[0] + + @property + def count(self) -> Column: + return self.cte.columns[1] diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 269dfced..5d69be2a 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -11,7 +11,7 @@ from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder class PendingAnnotationExistsCTEQueryBuilder(AnnotationExistsCTEQueryBuilder): @@ -44,7 +44,7 @@ async def build(self) -> Any: URL.id == self.url_id ) .where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ).cte("pending") ) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 45a281de..8e172733 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -2,22 +2,19 @@ from typing import Any from sqlalchemy import Select, select, exists, func, Subquery, and_, not_, ColumnElement -from sqlalchemy.orm import aliased, selectinload +from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.task_url import LinkTaskURL -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.task.core import Task -from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -75,28 +72,11 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: func.count(attr_value).label(label) ).group_by(attr_value).subquery() - @staticmethod - def exclude_urls_with_agency_suggestions( - statement: Select - ): - # Aliases for clarity - AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - - # Exclude if automated suggestions exist - statement = statement.where( - ~exists().where(AutomatedSuggestion.url_id == URL.id) - ) - # Exclude if confirmed agencies exist - statement = 
statement.where( - ~exists().where(LinkURLAgency.url_id == URL.id) - ) - return statement - @staticmethod def pending_urls_missing_miscellaneous_metadata_query() -> Select: query = select(URL).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, URL.name == None, URL.description == None, URLOptionalDataSourceMetadata.url_id == None diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py new file mode 100644 index 00000000..b56af87f --- /dev/null +++ b/src/db/templates/requester.py @@ -0,0 +1,20 @@ +""" +A requester is a class that contains a session and provides methods for +performing database operations. +""" +from abc import ABC + +from sqlalchemy.ext.asyncio import AsyncSession + +import src.db.helpers.session.session_helper as sh +from src.db.queries.base.builder import QueryBuilderBase + + +class RequesterBase(ABC): + + def __init__(self, session: AsyncSession): + self.session = session + self.session_helper = sh + + async def run_query_builder(self, query_builder: QueryBuilderBase): + return await query_builder.run(session=self.session) \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index ee357ad4..0e0d5a39 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,10 +1,14 @@ -from typing import Optional +from datetime import date +from typing import Any -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType +from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ + SearchAgencyByLocationOuterResponse from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @@ -21,6 +25,38 @@ def __init__( ): self.access_manager = access_manager + async def search_agency_by_location( + self, + params: list[SearchAgencyByLocationParams] + ) -> list[SearchAgencyByLocationResponse]: + request_url: str = self.access_manager.build_url( + namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=["agencies", "search", "location"] + ) + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + + json_params: list[dict[str, Any]] = [ + param.model_dump(mode='json') + for param in params + ] + + request_info = RequestInfo( + type_=RequestType.POST, + url=request_url, + headers=headers, + json_={ + "requests": json_params + } + ) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + + outer_response = SearchAgencyByLocationOuterResponse( + **response_info.data + ) + + return outer_response.responses + async def match_agency( self, name: str, @@ -31,13 +67,13 @@ async def match_agency( """ Returns agencies, if any, that match or partially match the search criteria """ - url = self.access_manager.build_url( + url: str = 
self.access_manager.build_url( namespace=DataSourcesNamespaces.MATCH, subdomains=["agency"] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" request_info = RequestInfo( type_=RequestType.POST, url=url, @@ -49,15 +85,15 @@ async def match_agency( "locality": locality } ) - response_info = await self.access_manager.make_request(request_info) - matches = [] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + matches: list[MatchAgencyInfo] = [] for agency in response_info.data["agencies"]: mai = MatchAgencyInfo( id=agency['id'], submitted_name=agency['name'] ) if len(agency['locations']) > 0: - first_location = agency['locations'][0] + first_location: dict[str, Any] = agency['locations'][0] mai.state = first_location['state'] mai.county = first_location['county'] mai.locality = first_location['locality'] @@ -75,7 +111,7 @@ async def is_url_duplicate( """ Check if a URL is unique. Returns duplicate info otherwise """ - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.CHECK, subdomains=["unique-url"] ) @@ -86,9 +122,11 @@ async def is_url_duplicate( "url": url_to_check } ) - response_info = await self.access_manager.make_request(request_info) - duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] - is_duplicate = (len(duplicates) != 0) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + duplicates: list[UniqueURLDuplicateInfo] = [ + UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"] + ] + is_duplicate: bool = (len(duplicates) != 0) return is_duplicate async def submit_urls( @@ -105,11 +143,11 @@ async def submit_urls( ) # Build url-id dictionary - url_id_dict = {} + url_id_dict: dict[str, int] = {} for tdo in tdos: url_id_dict[tdo.url] = tdo.url_id - data_sources_json = [] + data_sources_json: list[dict[str, Any]] = [] for tdo in tdos: data_sources_json.append( { @@ -125,7 +163,7 @@ async def submit_urls( } ) - headers = await self.access_manager.jwt_header() + headers: dict[str, str] = await self.access_manager.jwt_header() request_info = RequestInfo( type_=RequestType.POST, url=request_url, @@ -134,12 +172,12 @@ async def submit_urls( "data_sources": data_sources_json } ) - response_info = await self.access_manager.make_request(request_info) - data_sources_response_json = response_info.data["data_sources"] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + data_sources_response_json: list[dict[str, Any]] = response_info.data["data_sources"] - results = [] + results: list[SubmittedURLInfo] = [] for data_source in data_sources_response_json: - url = data_source["url"] + url: str = data_source["url"] response_object = SubmittedURLInfo( url_id=url_id_dict[url], data_source_id=data_source["data_source_id"], @@ -153,25 +191,28 @@ async def sync_agencies( self, params: AgencySyncParameters ) -> AgenciesSyncResponseInfo: - url =self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "agencies", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + 
request_params: dict[str, Any] = { + "page": params.page + } + if params.cutoff_date is not None: + request_params["updated_at"]: date = params.cutoff_date + request_info = RequestInfo( type_=RequestType.GET, url=url, headers=headers, - params={ - "page": params.page, - "updated_at": params.cutoff_date - } + params=request_params ) - response_info = await self.access_manager.make_request(request_info) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) return AgenciesSyncResponseInfo( agencies=[ AgenciesSyncResponseInnerInfo(**entry) @@ -183,18 +224,18 @@ async def sync_data_sources( self, params: DataSourcesSyncParameters ) -> DataSourcesSyncResponseInfo: - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "data-sources", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" - params_dict = {"page": params.page} + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + params_dict: dict[str, Any] = {"page": params.page} if params.cutoff_date is not None: - params_dict["updated_at"] = params.cutoff_date + params_dict["updated_at"]: date = params.cutoff_date request_info = RequestInfo( type_=RequestType.GET, @@ -202,10 +243,10 @@ headers=headers, params=params_dict ) - response_info = await self.access_manager.make_request(request_info) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) return DataSourcesSyncResponseInfo( data_sources=[ DataSourcesSyncResponseInnerInfo(**entry) for entry in response_info.data["data_sources"] ] - ) \ No newline at end of file + )
diff --git a/src/external/pdap/dtos/search_agency_by_location/__init__.py b/src/external/pdap/dtos/search_agency_by_location/__init__.py new file mode 100644 index 00000000..e69de29b
diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py new file mode 100644 index 00000000..ca5a6213 --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + + +class SearchAgencyByLocationParams(BaseModel): + request_id: int + query: str + iso: str = Field( + description="US State ISO Code", + max_length=2, + ) \ No newline at end of file
diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py new file mode 100644 index 00000000..92242b5a --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + +class SearchAgencyByLocationAgencyInfo(BaseModel): + agency_id: int + similarity: float = Field(ge=0, le=1) + +class SearchAgencyByLocationResponse(BaseModel): + request_id: int + results: list[SearchAgencyByLocationAgencyInfo] = Field(min_length=1) + +class SearchAgencyByLocationOuterResponse(BaseModel): + responses: list[SearchAgencyByLocationResponse] \ No newline at end of file
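A minimal usage sketch of the new search-by-location wrapper and its DTOs, assuming an already-authenticated client instance (the method and DTO names come from this patch; the `client` variable and the query values are illustrative):

    from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams

    # Hypothetical call site for the new endpoint wrapper.
    params = [
        SearchAgencyByLocationParams(request_id=1, query="Example Police Department", iso="PA"),
        SearchAgencyByLocationParams(request_id=2, query="Example Sheriff Office", iso="OH"),
    ]
    responses = await client.search_agency_by_location(params)
    for response in responses:
        # Each response echoes its request_id; every result carries an
        # agency_id plus a similarity score constrained to [0, 1].
        for result in response.results:
            print(response.request_id, result.agency_id, result.similarity)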
diff --git a/src/external/pdap/dtos/sync/agencies.py b/src/external/pdap/dtos/sync/agencies.py index 99483107..7e569a81 100644 --- a/src/external/pdap/dtos/sync/agencies.py +++ b/src/external/pdap/dtos/sync/agencies.py @@ -3,6 +3,8 @@ from pydantic import BaseModel + + class AgenciesSyncResponseInnerInfo(BaseModel): display_name: str agency_id: int @@ -10,6 +12,7 @@ class AgenciesSyncResponseInnerInfo(BaseModel): county_name: str | None locality_name: str | None updated_at: datetime.datetime + meta_urls: list[str] = [] class AgenciesSyncResponseInfo(BaseModel): agencies: list[AgenciesSyncResponseInnerInfo]
diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 47a24cac..9df2be52 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -8,6 +8,7 @@ def switch_enum_type( new_enum_values, drop_old_enum=True, check_constraints_to_drop: list[str] = None, + conversion_mappings: dict[str, str] = None ): """ Switches an ENUM type in a PostgreSQL column by: @@ -21,6 +22,8 @@ :param enum_name: Name of the ENUM type in PostgreSQL. :param new_enum_values: List of new ENUM values. :param drop_old_enum: Whether to drop the old ENUM type. + :param check_constraints_to_drop: List of check constraints to drop before switching the ENUM type. + :param conversion_mappings: Dictionary mapping old ENUM values to the new values they should be converted to. """ # 1. Drop check constraints that reference the enum @@ -38,7 +41,21 @@ new_enum_type.create(op.get_bind()) # Alter the column type to use the new enum type - op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + if conversion_mappings is None: + op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + else: + case_when: str = "" + for old_value, new_value in conversion_mappings.items(): + case_when += f"WHEN '{old_value}' THEN '{new_value}'\n" + + op.execute(f""" + ALTER TABLE "{table_name}" + ALTER COLUMN "{column_name}" TYPE "{enum_name}" + USING CASE "{column_name}"::text + {case_when} + ELSE "{column_name}"::text + END::{enum_name}; + """) # Drop the old enum type if drop_old_enum: @@ -86,6 +103,18 @@ comment='The last time the row was updated.' ) +def task_id_column() -> sa.Column: + return sa.Column( + 'task_id', + sa.Integer(), + sa.ForeignKey( + 'tasks.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `tasks` table.' + ) + def url_id_column(name: str = 'url_id') -> sa.Column: return sa.Column( name, @@ -108,4 +137,16 @@ ), nullable=nullable, comment='A foreign key to the `batches` table.' + ) + +def agency_id_column(nullable=False) -> sa.Column: + return sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey( + 'agencies.agency_id', + ondelete='CASCADE' + ), + nullable=nullable, + comment='A foreign key to the `agencies` table.' + ) \ No newline at end of file
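A minimal sketch of how the new conversion_mappings parameter might be called from a migration, assuming illustrative table, column, enum, and value names (only the helper function itself comes from this patch):

    from src.util.alembic_helpers import switch_enum_type

    # Hypothetical upgrade() body: shrink the enum while folding retired
    # values into their replacements, so existing rows keep a valid value.
    switch_enum_type(
        table_name="urls",
        column_name="status",
        enum_name="url_status",
        new_enum_values=["ok", "error"],
        conversion_mappings={"pending": "ok"},
    )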
) \ No newline at end of file
diff --git a/tests/automated/integration/api/annotate/__init__.py b/tests/automated/integration/api/annotate/__init__.py new file mode 100644 index 00000000..e69de29b
diff --git a/tests/automated/integration/api/annotate/agency/__init__.py b/tests/automated/integration/api/annotate/agency/__init__.py new file mode 100644 index 00000000..e69de29b
diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py new file mode 100644 index 00000000..65b20b0c --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py @@ -0,0 +1,46 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestions and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that two agency_suggestions exist + assert len(next_annotation.agency_suggestions) == 2 + + for agency_suggestion in next_annotation.agency_suggestions: + assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION + assert agency_suggestion.pdap_agency_id is not None + assert agency_suggestion.agency_name is not None + assert agency_suggestion.state is not None + assert agency_suggestion.county is not None + assert agency_suggestion.locality is not None
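The fields asserted above line up with the GetNextURLForAgencyAgencyInfo DTO constructed earlier in this diff. A minimal sketch of one suggestion payload the test expects, with hypothetical agency values:

    # Illustrative auto-suggestion payload mirroring the asserted fields;
    # the concrete values are invented for the example.
    suggestion = GetNextURLForAgencyAgencyInfo(
        suggestion_type=SuggestionType.AUTO_SUGGESTION,
        pdap_agency_id=42,
        agency_name="Example Police Department",
        state="PA",
        county="Allegheny",
        locality="Pittsburgh",
    )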
diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py new file mode 100644 index 00000000..5bcb4569 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py @@ -0,0 +1,35 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions, No HTML + A URL without stored HTML content has multiple Agency Auto Suggestions and has not been annotated by the User + The user should still receive the URL, with empty HTML info + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=False + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is not present + assert next_annotation.html_info.description == "" + assert next_annotation.html_info.title == ""
diff --git a/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py new file mode 100644 index 00000000..a3ecae79 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py @@ -0,0 +1,44 @@ +import pytest + +from tests.automated.integration.api.conftest import MOCK_USER_ID +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_other_user_annotation(api_test_helper): + """ + Test Scenario: Other User Annotation + A URL has been annotated by another User + Our user should still receive this URL to annotate + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + # Test that another user can insert a suggestion + await ath.db_data_creator.manual_suggestion( + user_id=MOCK_USER_ID + 1, + url_id=url_ids[0], + ) + + # After this, test that our user does not receive this URL + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None
diff --git a/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py new file mode 100644 index 00000000..e38421e1 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py @@ -0,0 +1,22 @@ +import pytest + +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_confirmed_agency(api_test_helper): + """ + Test Scenario: Single Confirmed Agency + A URL has a single Confirmed Agency and has not been annotated by the User + The user should not receive this URL to annotate + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.confirmed_suggestions( + url_ids=buci.url_ids, + ) + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None
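The confirmed-agency behavior implies an exclusion filter in the annotation queue. A minimal sketch in the anti-join style used elsewhere in this diff, built from models in this patch (the production query may differ):

    from sqlalchemy import select

    from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
    from src.db.models.impl.url.core.sqlalchemy import URL

    # Keep only URLs with no confirmed agency link: outer join the link
    # table, then require that no matching row was found.
    stmt = (
        select(URL)
        .outerjoin(LinkURLAgency, LinkURLAgency.url_id == URL.id)
        .where(LinkURLAgency.url_id.is_(None))
    )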
b/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py @@ -0,0 +1,45 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): + """ + Test Scenario: Single Unknown Auto Suggestion + A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User + The user should receive a single Unknown Auto Suggestion lacking other detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + suggestion_type=SuggestionType.UNKNOWN + ) + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + agency_suggestion = next_annotation.agency_suggestions[0] + + assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN + assert agency_suggestion.pdap_agency_id is None + assert agency_suggestion.agency_name is None + assert agency_suggestion.state is None + assert agency_suggestion.county is None + assert agency_suggestion.locality is None diff --git a/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py new file mode 100644 index 00000000..91049daa --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py @@ -0,0 +1,42 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_and_get_next(api_test_helper): + """ + Test Scenario: Submit and Get Next (no other URL available) + A URL has been annotated by our User, and no other valid URLs remain unannotated + Our user should not receive another URL to annotate + Until another relevant URL is added + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=2 + ) + url_ids = setup_info.url_ids + + # User should submit an annotation and receive the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + ) + assert response.next_annotation is not None + + # User should submit this annotation and receive none for the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[1], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + ) + assert response.next_annotation is None diff --git
a/tests/automated/integration/api/annotate/agency/test_submit_new.py b/tests/automated/integration/api/annotate/agency/test_submit_new.py new file mode 100644 index 00000000..e82c767f --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_new.py @@ -0,0 +1,38 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_new(api_test_helper): + """ + Test Scenario: Submit New + Our user receives an annotation and marks it as `NEW` + This should complete successfully + And within the database the annotation should be marked as `NEW` + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + # User should submit an annotation and mark it as New + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=True + ) + ) + assert response.next_annotation is None + + # Within database, the annotation should be marked as `NEW` + all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_manual_suggestions) == 1 + assert all_manual_suggestions[0].is_new diff --git a/tests/automated/integration/api/annotate/all/__init__.py b/tests/automated/integration/api/annotate/all/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py new file mode 100644 index 00000000..5003f08f --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -0,0 +1,88 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all(api_test_helper): + """ + Test the happy path workflow for the all-annotations endpoint + The user should be able to get a valid URL (filtering on batch id if needed), + submit a full annotation, and receive another URL + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_2 = setup_info_2.url_mapping + + # First, get a valid URL to annotate + get_response_1 = await 
ath.request_validator.get_next_url_for_all_annotations() + + # Apply the second batch id as a filter and see that a different URL is returned + get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( + batch_id=setup_info_2.batch_id + ) + + assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id + + # Annotate the first and submit + agency_id = await ath.db_data_creator.agency() + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=False, + suggested_agency=agency_id + ) + ) + ) + assert post_response_1.next_annotation is not None + + # Confirm the second is received + assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id + + # Upon submitting the second, confirm that no more URLs are returned through either POST or GET + post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_2.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + ) + ) + assert post_response_2.next_annotation is None + + get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() + assert get_response_3.next_annotation is None + + + # Check that all annotations are present in the database + + # Should be two relevance annotations, one True and one False + all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(all_relevance_suggestions) == 2 + assert all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value + assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value + + # Should be one agency + all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_agency_suggestions) == 1 + assert all_agency_suggestions[0].is_new == False + assert all_agency_suggestions[0].agency_id == agency_id + + # Should be one record type + all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(all_record_type_suggestions) == 1 + assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value diff --git a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py new file mode 100644 index 00000000..a11c43a3 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py @@ -0,0 +1,41 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all_post_batch_filtering(api_test_helper): + """ + Batch filtering should also work when posting annotations + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await 
setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + setup_info_3 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_3 = setup_info_3.url_mapping + + # Submit the first annotation, using the third batch id, and receive the third URL + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + batch_id=setup_info_3.batch_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=True + ) + ) + ) + + assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id diff --git a/tests/automated/integration/api/annotate/all/test_validation_error.py b/tests/automated/integration/api/annotate/all/test_validation_error.py new file mode 100644 index 00000000..b805a435 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_validation_error.py @@ -0,0 +1,27 @@ +import pytest + +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.core.exceptions import FailedValidationException +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all_validation_error(api_test_helper): + """ + Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response + """ + ath = api_test_helper + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + + with pytest.raises(FailedValidationException) as e: + response = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS + ) + ) diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py new file mode 100644 index 00000000..39cfedab --- /dev/null +++ b/tests/automated/integration/api/annotate/helpers.py @@ -0,0 +1,22 @@ +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.db.dtos.url.mapping import URLMapping + + +def check_url_mappings_match( + map_1: URLMapping, + map_2: URLMapping +): + assert map_1.url_id == map_2.url_id + assert map_1.url == map_2.url + + +def check_html_info_not_empty( + html_info: ResponseHTMLInfo +): + assert not html_info_empty(html_info) + + +def html_info_empty( + html_info: ResponseHTMLInfo +) -> bool: + return html_info.description == "" and html_info.title == "" diff --git a/tests/automated/integration/api/annotate/record_type/__init__.py b/tests/automated/integration/api/annotate/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/record_type/test_record_type.py b/tests/automated/integration/api/annotate/record_type/test_record_type.py new file mode 100644 index 00000000..5e6d8917 --- /dev/null +++ b/tests/automated/integration/api/annotate/record_type/test_record_type.py @@ -0,0 +1,166 @@ +from http import HTTPStatus + +import pytest +from fastapi import HTTPException + +from
src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo +from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo +from src.core.enums import RecordType +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_record_type(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct record type is returned + assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS + + # Annotate with value 'Personnel Records' and get next URL + request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.PERSONNEL_RECORDS + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match(inner_info_2.url_info, url_2) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. 
Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value + + # If user submits annotation for same URL, the URL should be overwritten + + request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.BOOKING_REPORTS + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.record_type == RecordType.BOOKING_REPORTS.value + + +@pytest.mark.asyncio +async def test_annotate_record_type_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_record_type_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Annotate with different user (default is 1) and get conflict error + with pytest.raises(HTTPException) as exc_info: + await ath.request_validator.post_record_type_annotation_and_get_next( + url_id=creation_info.url_ids[0], + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + e = exc_info.value + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_record_type_no_html_info(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/relevancy/__init__.py b/tests/automated/integration/api/annotate/relevancy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/relevancy/test_relevancy.py
b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py new file mode 100644 index 00000000..387d68c0 --- /dev/null +++ b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py @@ -0,0 +1,213 @@ +from http import HTTPStatus + +import pytest +from fastapi import HTTPException + +from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo +from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo +from src.core.enums import SuggestedStatus +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_relevancy(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct relevant value is returned + assert inner_info_1.annotation.is_relevant is True + + # A second user should see the same URL + + + # Annotate with value 'False' and get next URL + request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match( + inner_info_2.url_info, + url_2 + ) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. 
Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.suggested_status == SuggestedStatus.RELEVANT.value + + # If user submits annotation for same URL, the URL should be overwritten + request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.suggested_status == SuggestedStatus.RELEVANT.value + + +async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): + response = ath.request_validator.post_relevance_annotation_and_get_next( + url_id=url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=annotation + ) + ) + + assert response.next_annotation is None + + results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) + assert len(results) == 1 + assert results[0].suggested_status == annotation.value + + +@pytest.mark.asyncio +async def test_annotate_relevancy_broken_page(api_test_helper): + ath = api_test_helper + + creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + annotation=SuggestedStatus.BROKEN_PAGE_404 + ) + + +@pytest.mark.asyncio
+async def test_annotate_relevancy_individual_record(api_test_helper): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + annotation=SuggestedStatus.INDIVIDUAL_RECORD + ) + + +@pytest.mark.asyncio +async def test_annotate_relevancy_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_relevant_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + suggested_status=SuggestedStatus.RELEVANT + ) + + # Annotate with different user (default is 1) and get conflict error + with pytest.raises(HTTPException) as exc_info: + await ath.request_validator.post_relevance_annotation_and_get_next( + url_id=creation_info.url_ids[0], + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + e = exc_info.value + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_relevancy_no_html(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with
outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/test_.py b/tests/automated/integration/api/annotate/test_.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/__init__.py b/tests/automated/integration/api/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/__init__.py b/tests/automated/integration/api/batch/summaries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py new file mode 100644 index 00000000..d91e1a8c --- /dev/null +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -0,0 +1,95 @@ +import pytest + +from src.core.enums import BatchStatus +from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters + + +@pytest.mark.asyncio +async def test_get_batch_summaries(api_test_helper): + ath = api_test_helper + + batch_params = [ + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=1, + status=URLCreationEnum.OK + ), + TestURLCreationParameters( + count=2, + status=URLCreationEnum.SUBMITTED + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=4, + status=URLCreationEnum.NOT_RELEVANT + ), + TestURLCreationParameters( + count=3, + status=URLCreationEnum.ERROR + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=7, + status=URLCreationEnum.DUPLICATE + ), + TestURLCreationParameters( + count=1, + status=URLCreationEnum.SUBMITTED + ) + ] + ) + ] + + batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) + batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) + batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) + + batch_1_id = batch_1_creation_info.batch_id + batch_2_id = batch_2_creation_info.batch_id + batch_3_id = batch_3_creation_info.batch_id + + + response = ath.request_validator.get_batch_statuses() + results = response.results + + assert len(results) == 3 + + result_1 = results[0] + assert result_1.id == batch_1_id + assert result_1.status == BatchStatus.READY_TO_LABEL + counts_1 = result_1.url_counts + assert counts_1.total == 3 + assert counts_1.pending == 1 + assert counts_1.submitted == 2 + assert counts_1.not_relevant == 0 + assert counts_1.duplicate == 0 + assert counts_1.errored == 0 + + result_2 = 
results[1] + assert result_2.id == batch_2_id + counts_2 = result_2.url_counts + assert counts_2.total == 7 + assert counts_2.not_relevant == 4 + assert counts_2.errored == 3 + assert counts_2.pending == 3 + assert counts_2.submitted == 0 + assert counts_2.duplicate == 0 + + result_3 = results[2] + assert result_3.id == batch_3_id + counts_3 = result_3.url_counts + assert counts_3.total == 8 + assert counts_3.not_relevant == 0 + assert counts_3.errored == 0 + assert counts_3.pending == 7 + assert counts_3.submitted == 1 + assert counts_3.duplicate == 7 diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py new file mode 100644 index 00000000..7fdc96b1 --- /dev/null +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -0,0 +1,75 @@ +import pytest + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_get_batch_summaries_pending_url_filter(api_test_helper): + ath = api_test_helper + dbdc: DBDataCreator = ath.db_data_creator + + # Add an errored out batch + batch_error: int = await dbdc.create_batch(status=BatchStatus.ERROR) + + # Add a batch with pending urls + batch_pending = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=2, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLCreationEnum.OK + ) + + # Add a batch with submitted URLs + batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] + await dbdc.create_batch_url_links( + batch_id=batch_submitted, + url_ids=submitted_url_ids + ) + + # Add an aborted batch + batch_aborted: int = await dbdc.create_batch(status=BatchStatus.ABORTED) + + # Add a batch with validated URLs + batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( + count=2 + ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] + await dbdc.create_batch_url_links( + batch_id=batch_validated, + url_ids=validated_url_ids + ) + + # Test filter for pending URLs and only retrieve the second batch + pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=True + ) + + assert len(pending_urls_results.results) == 1 + assert pending_urls_results.results[0].id == batch_pending.batch_id + + # Test filter without pending URLs and retrieve the other four batches + no_pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=False + ) + + assert len(no_pending_urls_results.results) == 4 + for result in no_pending_urls_results.results: + assert result.id in [ + batch_error, + batch_submitted, + batch_validated, + batch_aborted + ] + + # Test no filter for pending URLs and retrieve all batches + no_filter_results = ath.request_validator.get_batch_statuses() + + assert len(no_filter_results.results) == 5 diff --git a/tests/automated/integration/api/batch/test_batch.py b/tests/automated/integration/api/batch/test_batch.py 
new file mode 100644 index 00000000..86f35cfc --- /dev/null +++ b/tests/automated/integration/api/batch/test_batch.py @@ -0,0 +1,64 @@ +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.dtos.url.insert import InsertURLsInfo +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.core.enums import BatchStatus + + +def test_abort_batch(api_test_helper): + ath = api_test_helper + + dto = ExampleInputDTO( + sleep_time=1 + ) + + batch_id = ath.request_validator.example_collector(dto=dto)["batch_id"] + + response = ath.request_validator.abort_batch(batch_id=batch_id) + + assert response.message == "Batch aborted." + + bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) + + assert bi.status == BatchStatus.ABORTED + +def test_get_batch_urls(api_test_helper): + + # Insert batch and urls into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) + assert len(response.urls) == 100 + # Check that the first url corresponds to the first url inserted + assert response.urls[0].url == iui.url_mappings[0].url + # Check that the last url corresponds to the 100th url inserted + assert response.urls[-1].url == iui.url_mappings[99].url + + + # Check that a more limited set of urls exist + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) + assert len(response.urls) == 1 + # Check that this url corresponds to the last url inserted + assert response.urls[0].url == iui.url_mappings[-1].url + +def test_get_duplicate_urls(api_test_helper): + + # Insert batch and url into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + # Get a list of all url ids + url_ids = [url.url_id for url in iui.url_mappings] + + # Create a second batch which will be associated with the duplicates + dup_batch_id = ath.db_data_creator.batch() + + # Insert duplicate urls into database + ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) + + response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) + assert len(response.duplicates) == 100 + + response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) + assert len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/example_collector/test_happy_path.py b/tests/automated/integration/api/example_collector/test_happy_path.py index bbb52789..d580f546 100644 --- a/tests/automated/integration/api/example_collector/test_happy_path.py +++ b/tests/automated/integration/api/example_collector/test_happy_path.py @@ -6,7 +6,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 084762b9..4b7b4f75 100644 --- 
a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -2,44 +2,65 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ + create_batch_url_links, create_validated_flags +from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio -async def test_get_batches_aggregated_metrics(api_test_helper): +async def test_get_batches_aggregated_metrics( + api_test_helper, + wiped_database +): ath = api_test_helper + adb_client: AsyncDatabaseClient = ath.adb_client() # Create successful batches with URLs of different statuses - all_params = [] for i in range(3): - params = TestBatchCreationParameters( + batch_id = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ) - ] ) - all_params.append(params) - + url_mappings_error: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + url_mappings_ok: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.OK, + count=11, + ) + url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id, + url_ids=url_ids_all, + ) + urls_submitted: list[int] = url_ids_all[:2] + urls_not_relevant: list[int] = url_ids_all[2:5] + urls_validated: list[int] = url_ids_all[5:10] + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_validated + urls_submitted, + validation_type=URLValidatedType.DATA_SOURCE, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_not_relevant, + validation_type=URLValidatedType.NOT_RELEVANT, + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=urls_submitted, + ) + all_params = [] # Create failed batches for i in range(2): params = TestBatchCreationParameters( @@ -66,8 +87,8 @@ async def test_get_batches_aggregated_metrics(api_test_helper): assert inner_dto_manual.count_urls == 45 assert inner_dto_manual.count_successful_batches == 3 assert inner_dto_manual.count_failed_batches == 0 - assert inner_dto_manual.count_urls_pending == 3 + assert inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 assert inner_dto_manual.count_urls_errors == 12 - assert inner_dto_manual.count_urls_validated == 15 + assert inner_dto_manual.count_urls_validated == 30 diff --git 
a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index 0cce8740..0657c66f 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,79 +1,102 @@ +from datetime import datetime, timedelta + import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ + create_url_data_sources @pytest.mark.asyncio async def test_get_batches_breakdown_metrics(api_test_helper): # Create a different batch for each month, with different URLs - today = pendulum.parse('2021-01-01') + today = datetime.now() ath = api_test_helper + adb_client: AsyncDatabaseClient = ath.adb_client() - batch_1_params = TestBatchCreationParameters( + batch_id_1 = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) - batch_2_params = TestBatchCreationParameters( - strategy=CollectorType.EXAMPLE, - outcome=BatchStatus.ERROR, - created_at=today.subtract(weeks=1), + url_mappings_1: list[URLMapping] = await create_urls( + adb_client=adb_client, + count=3, + ) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] + await create_batch_url_links(adb_client=adb_client, batch_id=batch_id_1, url_ids=url_ids_1) + await create_validated_flags( + adb_client=adb_client, + url_ids=url_ids_1[:2], + validation_type=URLValidatedType.DATA_SOURCE + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=url_ids_1[:2], ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) - batch_3_params = TestBatchCreationParameters( + + batch_id_2 = await create_batch( + adb_client=adb_client, + status=BatchStatus.ERROR, + date_generated=today - timedelta(days=7), + ) + + batch_id_3 = await create_batch( + adb_client=adb_client, strategy=CollectorType.AUTO_GOOGLER, - created_at=today.subtract(weeks=2), - urls=[ - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + date_generated=today - timedelta(days=14) ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + error_url_mappings: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] + validated_url_mappings: list[URLMapping] = await create_urls( + adb_client=adb_client, + count=8, + ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[:3], + 
validation_type=URLValidatedType.NOT_RELEVANT, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[4:9], + validation_type=URLValidatedType.DATA_SOURCE, + ) + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id_3, + url_ids=error_url_ids + validated_url_ids, + ) + dto_1 = await ath.request_validator.get_batches_breakdown_metrics( page=1 ) assert len(dto_1.batches) == 3 dto_batch_1 = dto_1.batches[2] - assert dto_batch_1.batch_id == batch_1.batch_id + assert dto_batch_1.batch_id == batch_id_1 assert dto_batch_1.strategy == CollectorType.MANUAL assert dto_batch_1.status == BatchStatus.READY_TO_LABEL - assert pendulum.instance(dto_batch_1.created_at) > today assert dto_batch_1.count_url_total == 3 assert dto_batch_1.count_url_pending == 1 assert dto_batch_1.count_url_submitted == 2 assert dto_batch_1.count_url_rejected == 0 assert dto_batch_1.count_url_error == 0 - assert dto_batch_1.count_url_validated == 0 + assert dto_batch_1.count_url_validated == 2 dto_batch_2 = dto_1.batches[1] - assert dto_batch_2.batch_id == batch_2.batch_id + assert dto_batch_2.batch_id == batch_id_2 assert dto_batch_2.status == BatchStatus.ERROR assert dto_batch_2.strategy == CollectorType.EXAMPLE - assert pendulum.instance(dto_batch_2.created_at) == today.subtract(weeks=1) assert dto_batch_2.count_url_total == 0 assert dto_batch_2.count_url_submitted == 0 assert dto_batch_2.count_url_pending == 0 @@ -82,16 +105,15 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_2.count_url_validated == 0 dto_batch_3 = dto_1.batches[0] - assert dto_batch_3.batch_id == batch_3.batch_id + assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert pendulum.instance(dto_batch_3.created_at) == today.subtract(weeks=2) assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 0 + assert dto_batch_3.count_url_pending == 5 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 assert dto_batch_3.count_url_error == 4 - assert dto_batch_3.count_url_validated == 5 + assert dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( page=2 diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index a6807a23..e48db202 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -3,9 +3,13 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import SuggestedStatus +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio @@ -14,29 +18,22 @@ async def test_get_backlog_metrics(api_test_helper): ath = api_test_helper adb_client = ath.adb_client() + ddc: DBDataCreator = ath.db_data_creator # Populate the backlog table and test that backlog metrics returned on a monthly basis # Ensure that multiple days in 
each month are added to the backlog table, with different values - - batch_1_params = TestBatchCreationParameters( - strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] + batch_1_id: int = await ddc.create_batch() + url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] + await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) + submitted_url_ids_1: list[int] = url_ids_1[:2] + await ddc.create_validated_flags( + url_ids=submitted_url_ids_1, + validation_type=URLValidatedType.DATA_SOURCE ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + await ddc.create_url_data_sources(url_ids=submitted_url_ids_1) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=3).naive() @@ -46,23 +43,20 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=2, days=3).naive() ) - batch_2_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - ] + batch_2_id: int = await ddc.create_batch() + not_relevant_url_mappings_2: list[URLMapping] = await ddc.create_urls(count=6) + not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] + await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) + await ddc.create_validated_flags( + url_ids=not_relevant_url_ids_2[:4], + validation_type=URLValidatedType.NOT_RELEVANT + ) + error_url_mappings_2: list[URLMapping] = await ddc.create_urls( + status=URLStatus.ERROR, + count=2 ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings_2] + await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() @@ -72,23 +66,15 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=1, days=4).naive() ) - batch_3_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + batch_3_id: int = await ddc.create_batch() + url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] + await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) + await ddc.create_validated_flags( + url_ids=url_ids_3[:5], + validation_type=URLValidatedType.DATA_SOURCE ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + await adb_client.populate_backlog_snapshot( dt=today.subtract(months=1).naive() @@ -100,5 +86,5 @@ async def test_get_backlog_metrics(api_test_helper): # Test that the count closest to the beginning of the month is returned for each month assert dto.entries[0].count_pending_total == 1 - assert 
dto.entries[1].count_pending_total == 5 - assert dto.entries[2].count_pending_total == 12 + assert dto.entries[1].count_pending_total == 3 + assert dto.entries[2].count_pending_total == 10 diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index c8957952..08c52845 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,75 +1,70 @@ +from datetime import datetime, timedelta, timezone + import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_get_urls_aggregated_metrics(api_test_helper): ath = api_test_helper - today = pendulum.parse('2021-01-01') + today = datetime.now() + + ddc: DBDataCreator = ath.db_data_creator batch_0_params = TestBatchCreationParameters( strategy=CollectorType.MANUAL, - created_at=today.subtract(days=1), + created_at=today - timedelta(days=1), urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, ), ] ) - batch_0 = await ath.db_data_creator.batch_v2(batch_0_params) - oldest_url_id = batch_0.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id - + batch_0: int = await ddc.create_batch( + strategy=CollectorType.MANUAL, + date_generated=today - timedelta(days=1) + ) + url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + oldest_url_id: int = url_mappings_0[0].url_id - batch_1_params = TestBatchCreationParameters( + batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] + await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) - batch_2_params = TestBatchCreationParameters( + batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=1, - status=URLStatus.VALIDATED - ), - TestURLCreationParameters( - count=5, - status=URLStatus.NOT_RELEVANT - ), - ] ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, 
validation_type=URLValidatedType.DATA_SOURCE) + url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLValidatedType.NOT_RELEVANT) + url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] + url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] + await ddc.create_batch_url_links( + url_ids=url_ids_2_validated + url_ids_2_not_relevant, + batch_id=batch_2 + ) + + dto = await ath.request_validator.get_urls_aggregated_metrics() assert dto.oldest_pending_url_id == oldest_url_id - assert dto.oldest_pending_url_created_at == today.subtract(days=1).in_timezone('UTC').naive() - assert dto.count_urls_pending == 6 assert dto.count_urls_rejected == 5 assert dto.count_urls_errors == 2 - assert dto.count_urls_validated == 1 + assert dto.count_urls_validated == 8 assert dto.count_urls_submitted == 2 assert dto.count_urls_total == 16 diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index e81d6ec7..02f1aae2 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -6,6 +6,7 @@ from src.core.enums import SuggestedStatus, RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -27,14 +28,14 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.NOT_RELEVANT ) ), TestURLCreationParameters( count=2, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), ] ) @@ -44,7 +45,7 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.CALLS_FOR_SERVICE @@ -60,15 +61,15 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), TestURLCreationParameters( count=4, - status=URLStatus.ERROR + status=URLCreationEnum.ERROR ), TestURLCreationParameters( count=5, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.INCARCERATION_RECORDS, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index 71e00e51..cbd30f8b 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from 
 
 
@@ -18,11 +19,11 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper):
         urls=[
             TestURLCreationParameters(
                 count=1,
-                status=URLStatus.PENDING
+                status=URLCreationEnum.OK
             ),
             TestURLCreationParameters(
                 count=2,
-                status=URLStatus.SUBMITTED
+                status=URLCreationEnum.SUBMITTED
             ),
         ]
     )
@@ -32,7 +33,7 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper):
         urls=[
             TestURLCreationParameters(
                 count=3,
-                status=URLStatus.SUBMITTED
+                status=URLCreationEnum.SUBMITTED
             )
         ],
         created_at=today.subtract(weeks=1),
@@ -44,15 +45,15 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper):
         urls=[
             TestURLCreationParameters(
                 count=3,
-                status=URLStatus.SUBMITTED
+                status=URLCreationEnum.SUBMITTED
             ),
             TestURLCreationParameters(
                 count=4,
-                status=URLStatus.ERROR
+                status=URLCreationEnum.ERROR
             ),
             TestURLCreationParameters(
                 count=5,
-                status=URLStatus.VALIDATED
+                status=URLCreationEnum.VALIDATED
             ),
         ]
     )
diff --git a/tests/automated/integration/api/review/conftest.py b/tests/automated/integration/api/review/conftest.py
index e4345821..59d76930 100644
--- a/tests/automated/integration/api/review/conftest.py
+++ b/tests/automated/integration/api/review/conftest.py
@@ -5,32 +5,18 @@
 from src.core.enums import SuggestedStatus, RecordType
 from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo
 from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters
 
 
 @pytest_asyncio.fixture
 async def batch_url_creation_info(db_data_creator):
-    simple_parameter_statuses = [
-        URLStatus.VALIDATED,
-        URLStatus.SUBMITTED,
-        URLStatus.INDIVIDUAL_RECORD,
-        URLStatus.NOT_RELEVANT,
-        URLStatus.ERROR,
-        URLStatus.DUPLICATE,
-        URLStatus.NOT_FOUND
-    ]
-    simple_parameters = [
-        TestURLCreationParameters(
-            status=status
-        ) for status in simple_parameter_statuses
-    ]
     parameters = TestBatchCreationParameters(
         urls=[
-            *simple_parameters,
             TestURLCreationParameters(
                 count=2,
-                status=URLStatus.PENDING,
+                status=URLCreationEnum.OK,
                 annotation_info=AnnotationInfo(
                     user_relevant=SuggestedStatus.RELEVANT,
                     user_record_type=RecordType.ARREST_RECORDS,
diff --git a/tests/automated/integration/api/review/rejection/test_individual_record.py b/tests/automated/integration/api/review/rejection/test_individual_record.py
index 6e81d378..33addd91 100644
--- a/tests/automated/integration/api/review/rejection/test_individual_record.py
+++ b/tests/automated/integration/api/review/rejection/test_individual_record.py
@@ -2,14 +2,21 @@
 
 from src.api.endpoints.review.enums import RejectionReason
 from src.collectors.enums import URLStatus
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from tests.automated.integration.api.review.rejection.helpers import run_rejection_test
+from tests.helpers.api_test_helper import APITestHelper
 
 
 @pytest.mark.asyncio
-async def test_rejection_individual_record(api_test_helper):
+async def test_rejection_individual_record(api_test_helper: APITestHelper):
     await run_rejection_test(
         api_test_helper,
         rejection_reason=RejectionReason.INDIVIDUAL_RECORD,
-        url_status=URLStatus.INDIVIDUAL_RECORD
+        url_status=URLStatus.OK
     )
+    # Get FlagURLValidated and confirm Individual Record
+    flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0]
+    assert flag.type == URLValidatedType.INDIVIDUAL_RECORD
+
diff --git a/tests/automated/integration/api/review/rejection/test_not_relevant.py b/tests/automated/integration/api/review/rejection/test_not_relevant.py
index 1ad2847f..03ee72d3 100644
--- a/tests/automated/integration/api/review/rejection/test_not_relevant.py
+++ b/tests/automated/integration/api/review/rejection/test_not_relevant.py
@@ -2,6 +2,8 @@
 
 from src.api.endpoints.review.enums import RejectionReason
 from src.collectors.enums import URLStatus
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from tests.automated.integration.api.review.rejection.helpers import run_rejection_test
 
 
@@ -10,5 +12,9 @@ async def test_rejection_not_relevant(api_test_helper):
     await run_rejection_test(
         api_test_helper,
         rejection_reason=RejectionReason.NOT_RELEVANT,
-        url_status=URLStatus.NOT_RELEVANT
+        url_status=URLStatus.OK
     )
+
+    # Get FlagURLValidated and confirm Not Relevant
+    flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0]
+    assert flag.type == URLValidatedType.NOT_RELEVANT
\ No newline at end of file
diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py
index bfa126b1..69cf13d2 100644
--- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py
+++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py
@@ -6,6 +6,8 @@
 from src.core.enums import RecordType
 from src.db.constants import PLACEHOLDER_AGENCY_NAME
 from src.db.models.impl.agency.sqlalchemy import Agency
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
 from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
 from src.db.models.impl.url.core.sqlalchemy import URL
 from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata
@@ -55,7 +57,7 @@ async def test_approve_and_get_next_source_for_review(api_test_helper):
     url = urls[0]
     assert url.id == url_mapping.url_id
     assert url.record_type == RecordType.ARREST_RECORDS
-    assert url.status == URLStatus.VALIDATED
+    assert url.status == URLStatus.OK
     assert url.name == "New Test Name"
     assert url.description == "New Test Description"
 
@@ -76,3 +78,8 @@ async def test_approve_and_get_next_source_for_review(api_test_helper):
     for agency in agencies:
         if agency.agency_id == additional_agency:
             assert agency.name == PLACEHOLDER_AGENCY_NAME
+
+    # Confirm presence of FlagURLValidated
+    flag_url_validated = await adb_client.get_all(FlagURLValidated)
+    assert len(flag_url_validated) == 1
+    assert flag_url_validated[0].type == URLValidatedType.DATA_SOURCE
\ No newline at end of file
diff --git a/tests/automated/integration/api/review/test_batch_filtering.py b/tests/automated/integration/api/review/test_batch_filtering.py
index 2e8aa63c..481f7e90 100644
--- a/tests/automated/integration/api/review/test_batch_filtering.py
+++ b/tests/automated/integration/api/review/test_batch_filtering.py
@@ -1,21 +1,37 @@
 import pytest
 
+from src.collectors.enums import URLStatus
+from src.db.dtos.url.mapping import URLMapping
+from tests.helpers.data_creator.core import DBDataCreator
+from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo
+
 
 @pytest.mark.asyncio
 async def test_batch_filtering(
-        batch_url_creation_info,
+        batch_url_creation_info: BatchURLCreationInfo,
         api_test_helper
 ):
     ath = api_test_helper
     rv = ath.request_validator
+    dbdc: DBDataCreator = ath.db_data_creator
+
+    batch_id: int = batch_url_creation_info.batch_id
+
+    validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls(count=4)
+    validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings]
+    await dbdc.create_batch_url_links(
+        url_ids=validated_url_ids,
+        batch_id=batch_id
+    )
+
     # Receive null batch info if batch id not provided
     outer_result_no_batch_info = await rv.review_next_source()
     assert outer_result_no_batch_info.next_source.batch_info is None
 
     # Get batch info if batch id is provided
     outer_result = await ath.request_validator.review_next_source(
-        batch_id=batch_url_creation_info.batch_id
+        batch_id=batch_id
     )
     assert outer_result.remaining == 2
     batch_info = outer_result.next_source.batch_info
diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py
deleted file mode 100644
index 51688765..00000000
--- a/tests/automated/integration/api/test_annotate.py
+++ /dev/null
@@ -1,756 +0,0 @@
-from http import HTTPStatus
-
-import pytest
-from fastapi import HTTPException
-
-from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo
-from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo
-from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo
-from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo
-from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo
-from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo
-from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
-from src.db.dtos.url.insert import InsertURLsInfo
-from src.db.dtos.url.mapping import URLMapping
-from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion
-from src.core.error_manager.enums import ErrorTypes
-from src.core.enums import RecordType, SuggestionType, SuggestedStatus
-from src.core.exceptions import FailedValidationException
-from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion
-from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion
-from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo
-from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review
-from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency
-from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo
-from tests.automated.integration.api.conftest import MOCK_USER_ID
-
-def check_url_mappings_match(
-    map_1: URLMapping,
-    map_2: URLMapping
-):
-    assert map_1.url_id == map_2.url_id
-    assert map_2.url == map_2.url
-
-def check_html_info_not_empty(
-    html_info: ResponseHTMLInfo
-):
-    assert not html_info_empty(html_info)
-
-def html_info_empty(
-    html_info: ResponseHTMLInfo
-) -> bool:
-    return html_info.description == "" and html_info.title == ""
-
-@pytest.mark.asyncio
-async def test_annotate_relevancy(api_test_helper):
-    ath = api_test_helper
-
-    batch_id = ath.db_data_creator.batch()
-
-    # Create 2 URLs with outcome `pending`
-    iui: InsertURLsInfo = 
ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct relevant value is returned - assert inner_info_1.annotation.is_relevant is True - - # A second user should see the same URL - - - # Annotate with value 'False' and get next URL - request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match( - inner_info_2.url_info, - url_2 - ) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. 
Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.suggested_status == SuggestedStatus.RELEVANT.value - - # If user submits annotation for same URL, the URL should be overwritten - request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert results[0].suggested_status == SuggestedStatus.RELEVANT.value - -async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): - response = ath.request_validator.post_relevance_annotation_and_get_next( - url_id=url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=annotation - ) - ) - - assert response.next_annotation is None - - results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) - assert len(results) == 1 - assert results[0].suggested_status == annotation.value - -@pytest.mark.asyncio -async def test_annotate_relevancy_broken_page(api_test_helper): - ath = api_test_helper - - creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - annotation=SuggestedStatus.BROKEN_PAGE_404 - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_individual_record(api_test_helper): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - annotation=SuggestedStatus.INDIVIDUAL_RECORD - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_already_annotated_by_different_user( - api_test_helper -): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_relevant_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - suggested_status=SuggestedStatus.RELEVANT - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_relevance_annotation_and_get_next( - url_id=creation_info.url_ids[0], - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_relevancy_no_html(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome 
`pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio -async def test_annotate_record_type(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct record type is returned - assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS - - # Annotate with value 'Personnel Records' and get next URL - request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.PERSONNEL_RECORDS - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match(inner_info_2.url_info, url_2) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. 
Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value - - # If user submits annotation for same URL, the URL should be overwritten - - request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.BOOKING_REPORTS - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert result.record_type == RecordType.BOOKING_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_record_type_already_annotated_by_different_user( - api_test_helper -): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_record_type_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_record_type_annotation_and_get_next( - url_id=creation_info.url_ids[0], - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_record_type_no_html_info(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio -async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full 
detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that two agency_suggestions exist - assert len(next_annotation.agency_suggestions) == 2 - - for agency_suggestion in next_annotation.agency_suggestions: - assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION - assert agency_suggestion.pdap_agency_id is not None - assert agency_suggestion.agency_name is not None - assert agency_suggestion.state is not None - assert agency_suggestion.county is not None - assert agency_suggestion.locality is not None - - -@pytest.mark.asyncio -async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=False - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is not present - assert next_annotation.html_info.description == "" - assert next_annotation.html_info.title == "" - -@pytest.mark.asyncio -async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): - """ - Test Scenario: Single Unknown Auto Suggestion - A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User - The user should receive a single Unknown Auto Suggestion lacking other detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN - ) - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - agency_suggestion = next_annotation.agency_suggestions[0] - - assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN - assert agency_suggestion.pdap_agency_id is 
None - assert agency_suggestion.agency_name is None - assert agency_suggestion.state is None - assert agency_suggestion.county is None - assert agency_suggestion.locality is None - - -@pytest.mark.asyncio -async def test_annotate_agency_single_confirmed_agency(api_test_helper): - """ - Test Scenario: Single Confirmed Agency - A URL has a single Confirmed Agency and has not been annotated by the User - The user should not receive this URL to annotate - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.confirmed_suggestions( - url_ids=buci.url_ids, - ) - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_other_user_annotation(api_test_helper): - """ - Test Scenario: Other User Annotation - A URL has been annotated by another User - Our user should still receive this URL to annotate - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - # Test that another user can insert a suggestion - await ath.db_data_creator.manual_suggestion( - user_id=MOCK_USER_ID + 1, - url_id=url_ids[0], - ) - - # After this, text that our user does not receive this URL - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_submit_and_get_next(api_test_helper): - """ - Test Scenario: Submit and Get Next (no other URL available) - A URL has been annotated by our User, and no other valid URLs have not been annotated - Our user should not receive another URL to annotate - Until another relevant URL is added - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=2 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and receive the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - - ) - assert response.next_annotation is not None - - # User should submit this annotation and receive none for the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[1], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - ) - assert response.next_annotation is None - - -@pytest.mark.asyncio -async def test_annotate_agency_submit_new(api_test_helper): - """ - Test Scenario: Submit New - Our user receives an annotation and marks it as `NEW` - This should complete successfully - And within the database the annotation should be marked as `NEW` - 
""" - ath = api_test_helper - adb_client = ath.adb_client() - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and mark it as New - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=True - ) - ) - assert response.next_annotation is None - - # Within database, the annotation should be marked as `NEW` - all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_manual_suggestions) == 1 - assert all_manual_suggestions[0].is_new - -@pytest.mark.asyncio -async def test_annotate_all(api_test_helper): - """ - Test the happy path workflow for the all-annotations endpoint - The user should be able to get a valid URL (filtering on batch id if needed), - submit a full annotation, and receive another URL - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_2 = setup_info_2.url_mapping - - # First, get a valid URL to annotate - get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() - - # Apply the second batch id as a filter and see that a different URL is returned - get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( - batch_id=setup_info_2.batch_id - ) - - assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id - - # Annotate the first and submit - agency_id = await ath.db_data_creator.agency() - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=False, - suggested_agency=agency_id - ) - ) - ) - assert post_response_1.next_annotation is not None - - # Confirm the second is received - assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id - - # Upon submitting the second, confirm that no more URLs are returned through either POST or GET - post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_2.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - ) - ) - assert post_response_2.next_annotation is None - - get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() - assert get_response_3.next_annotation is None - - - # Check that all annotations are present in the database - - # Should be two relevance annotations, one True and one False - all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(all_relevance_suggestions) == 2 - assert all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value - assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value - - # Should be one agency - all_agency_suggestions = await 
adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_agency_suggestions) == 1 - assert all_agency_suggestions[0].is_new == False - assert all_agency_suggestions[0].agency_id == agency_id - - # Should be one record type - all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(all_record_type_suggestions) == 1 - assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_all_post_batch_filtering(api_test_helper): - """ - Batch filtering should also work when posting annotations - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - setup_info_3 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_3 = setup_info_3.url_mapping - - # Submit the first annotation, using the third batch id, and receive the third URL - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - batch_id=setup_info_3.batch_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=True - ) - ) - ) - - assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id - - -@pytest.mark.asyncio -async def test_annotate_all_validation_error(api_test_helper): - """ - Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response - """ - ath = api_test_helper - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - - with pytest.raises(FailedValidationException) as e: - response = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS - ) - ) diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py deleted file mode 100644 index 4dd21a49..00000000 --- a/tests/automated/integration/api/test_batch.py +++ /dev/null @@ -1,237 +0,0 @@ -import pytest - -from src.db.models.impl.batch.pydantic import BatchInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.impl.example.dtos.input import ExampleInputDTO -from src.collectors.enums import CollectorType, URLStatus -from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters - - -@pytest.mark.asyncio -async def test_get_batch_summaries(api_test_helper): - ath = api_test_helper - - batch_params = [ - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.NOT_RELEVANT - ), - 
TestURLCreationParameters( - count=3, - status=URLStatus.ERROR - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.DUPLICATE - ), - TestURLCreationParameters( - count=1, - status=URLStatus.SUBMITTED - ) - ] - ) - ] - - batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) - batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) - batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) - - batch_1_id = batch_1_creation_info.batch_id - batch_2_id = batch_2_creation_info.batch_id - batch_3_id = batch_3_creation_info.batch_id - - - response = ath.request_validator.get_batch_statuses() - results = response.results - - assert len(results) == 3 - - result_1 = results[0] - assert result_1.id == batch_1_id - assert result_1.status == BatchStatus.READY_TO_LABEL - counts_1 = result_1.url_counts - assert counts_1.total == 3 - assert counts_1.pending == 1 - assert counts_1.submitted == 2 - assert counts_1.not_relevant == 0 - assert counts_1.duplicate == 0 - assert counts_1.errored == 0 - - result_2 = results[1] - assert result_2.id == batch_2_id - counts_2 = result_2.url_counts - assert counts_2.total == 7 - assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 0 - assert counts_2.submitted == 0 - assert counts_2.duplicate == 0 - - result_3 = results[2] - assert result_3.id == batch_3_id - counts_3 = result_3.url_counts - assert counts_3.total == 8 - assert counts_3.not_relevant == 0 - assert counts_3.errored == 0 - assert counts_3.pending == 0 - assert counts_3.submitted == 1 - assert counts_3.duplicate == 7 - - - - - - -@pytest.mark.asyncio -async def test_get_batch_summaries_pending_url_filter(api_test_helper): - ath = api_test_helper - - # Add an errored out batch - batch_error = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.ERROR - ) - - # Add a batch with pending urls - batch_pending = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.PENDING - ) - - # Add a batch with submitted URLs - batch_submitted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.SUBMITTED - ) - - # Add an aborted batch - batch_aborted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.ABORTED - ) - - # Add a batch with validated URLs - batch_validated = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.VALIDATED - ) - - # Test filter for pending URLs and only retrieve the second batch - pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=True - ) - - assert len(pending_urls_results.results) == 1 - assert pending_urls_results.results[0].id == batch_pending.batch_id - - # Test filter without pending URLs and retrieve the other four batches - no_pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=False - ) - - assert len(no_pending_urls_results.results) == 4 - for result in no_pending_urls_results.results: - assert result.id in [ - batch_error.batch_id, - 
batch_submitted.batch_id, - batch_validated.batch_id, - batch_aborted.batch_id - ] - - # Test no filter for pending URLs and retrieve all batches - no_filter_results = ath.request_validator.get_batch_statuses() - - assert len(no_filter_results.results) == 5 - - - - -def test_abort_batch(api_test_helper): - ath = api_test_helper - - dto = ExampleInputDTO( - sleep_time=1 - ) - - batch_id = ath.request_validator.example_collector(dto=dto)["batch_id"] - - response = ath.request_validator.abort_batch(batch_id=batch_id) - - assert response.message == "Batch aborted." - - bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) - - assert bi.status == BatchStatus.ABORTED - -def test_get_batch_urls(api_test_helper): - - # Insert batch and urls into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) - assert len(response.urls) == 100 - # Check that the first url corresponds to the first url inserted - assert response.urls[0].url == iui.url_mappings[0].url - # Check that the last url corresponds to the 100th url inserted - assert response.urls[-1].url == iui.url_mappings[99].url - - - # Check that a more limited set of urls exist - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) - assert len(response.urls) == 1 - # Check that this url corresponds to the last url inserted - assert response.urls[0].url == iui.url_mappings[-1].url - -def test_get_duplicate_urls(api_test_helper): - - # Insert batch and url into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - # Get a list of all url ids - url_ids = [url.url_id for url in iui.url_mappings] - - # Create a second batch which will be associated with the duplicates - dup_batch_id = ath.db_data_creator.batch() - - # Insert duplicate urls into database - ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) - assert len(response.duplicates) == 100 - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) - assert len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9b3fb326..1d2e595d 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,7 +2,7 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch diff --git a/tests/automated/integration/core/async_/run_task/test_break_loop.py b/tests/automated/integration/core/async_/run_task/test_break_loop.py index 0d8a9bc2..71b5704f 100644 --- a/tests/automated/integration/core/async_/run_task/test_break_loop.py +++ b/tests/automated/integration/core/async_/run_task/test_break_loop.py @@ -21,9 +21,9 @@ async def 
test_run_task_break_loop(db_data_creator: DBDataCreator): and an alert should be sent to discord """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, outcome=TaskOperatorOutcome.SUCCESS, task_type=TaskType.HTML ) diff --git a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index a7724a45..e5425fd9 100644 --- a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -18,12 +18,11 @@ async def test_run_task_prereq_met(db_data_creator: DBDataCreator): """ When a task pre-requisite is met, the task should be run - And a task entry should be created in the database """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, task_type=TaskType.HTML, outcome=TaskOperatorOutcome.SUCCESS, ) @@ -48,9 +47,4 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: # There should be two calls to meets_task_prerequisites mock_operator.meets_task_prerequisites.assert_has_calls([call(), call()]) - results = await db_data_creator.adb_client.get_all(Task) - - assert len(results) == 1 - assert results[0].task_status == BatchStatus.IN_PROCESS.value - core.task_manager.conclude_task.assert_called_once() diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 2a7f9569..62f215fb 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,6 +3,7 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -42,10 +43,16 @@ async def test_approve_url_basic(db_data_creator: DBDataCreator): url = urls[0] assert url.id == url_mapping.url_id assert url.record_type == RecordType.ARREST_RECORDS - assert url.status == URLStatus.VALIDATED + assert url.status == URLStatus.OK assert url.name == "Test Name" assert url.description == "Test Description" + # Confirm presence of validated flag + validated_flags: list[FlagURLValidated] = await adb_client.get_all(FlagURLValidated) + assert len(validated_flags) == 1 + assert validated_flags[0].url_id == url_mapping.url_id + + confirmed_agency: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py deleted file mode 100644 index 72430fec..00000000 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.core.enums import 
SuggestedStatus, RecordType, SuggestionType -from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_get_next_url_for_final_review_new_agency(db_data_creator: DBDataCreator): - """ - Test that a URL with a new agency is properly returned - """ - - # Apply batch v2 - parameters = TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.RELEVANT, - user_agency=URLAgencyAnnotationPostInfo( - is_new=True - ), - user_record_type=RecordType.ARREST_RECORDS - ) - ) - ] - ) - creation_info = await db_data_creator.batch_v2(parameters) - outer_result = await db_data_creator.adb_client.get_next_url_for_final_review( - batch_id=None - ) - result = outer_result.next_source - - assert result is not None - user_suggestion = result.annotations.agency.user - assert user_suggestion.suggestion_type == SuggestionType.NEW_AGENCY - assert user_suggestion.pdap_agency_id is None - assert user_suggestion.agency_name is None diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py index 7e68ada4..72706aaf 100644 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py +++ b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -14,7 +15,7 @@ async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator url_mapping = db_data_creator.urls( batch_id=batch_id, url_count=1, - outcome=URLStatus.SUBMITTED + outcome=URLCreationEnum.SUBMITTED ).url_mappings[0] result = await db_data_creator.adb_client.get_next_url_for_final_review( diff --git a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py index 95e40847..ab5acd59 100644 --- a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py +++ b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.setup.annotation.core import setup_for_get_next_url_for_annotation from tests.helpers.data_creator.core import DBDataCreator @@ -12,19 +13,12 @@ async def test_get_next_url_for_user_relevance_annotation_validated( """ A validated URL should not turn up in get_next_url_for_user_annotation """ - - setup_info = await setup_for_get_next_url_for_annotation( - db_data_creator=db_data_creator, - url_count=1, - outcome=URLStatus.VALIDATED - ) - - - url_1 = setup_info.insert_urls_info.url_mappings[0] + dbdc = db_data_creator + url_1: int = (await dbdc.create_validated_urls())[0].url_id # Add `Relevancy` attribute with value `True` await 
db_data_creator.auto_relevant_suggestions(
-        url_id=url_1.url_id,
+        url_id=url_1,
         relevant=True
     )
diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py
index 78578c6b..f2d73f00 100644
--- a/tests/automated/integration/db/client/test_insert_urls.py
+++ b/tests/automated/integration/db/client/test_insert_urls.py
@@ -1,8 +1,8 @@
 import pytest
 
 from src.core.enums import BatchStatus
-from src.db.models.impl.batch.pydantic import BatchInfo
-from src.db.models.impl.link.batch_url import LinkBatchURL
+from src.db.models.impl.batch.pydantic.info import BatchInfo
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL
 from src.db.models.impl.url.core.enums import URLSource
 from src.db.models.impl.url.core.pydantic.info import URLInfo
 from src.db.models.impl.url.core.sqlalchemy import URL
diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py
new file mode 100644
index 00000000..81bef537
--- /dev/null
+++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py
@@ -0,0 +1,30 @@
+from unittest.mock import AsyncMock
+
+from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator
+from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
+
+
+def check_results_called(
+    operator: PushToHuggingFaceTaskOperator,
+    expected_outputs: list[GetForLoadingToHuggingFaceOutput]
+) -> None:
+    mock_hf_client: AsyncMock = operator.hf_client
+    mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub
+    outputs: list[GetForLoadingToHuggingFaceOutput] = mock_push.call_args.args[0]
+    outputs = sorted(outputs, key=lambda x: x.url_id)
+    expected_outputs = sorted(expected_outputs, key=lambda x: x.url_id)
+    for output, expected_output in zip(outputs, expected_outputs):
+        assert output.url_id == expected_output.url_id
+        assert output.url == expected_output.url
+        assert output.relevant == expected_output.relevant, f"Expected {expected_output.relevant}, got {output.relevant}"
+        assert output.record_type_fine == expected_output.record_type_fine
+        assert output.record_type_coarse == expected_output.record_type_coarse
+        assert output.html == expected_output.html
+
+
+def check_not_called(
+    operator: PushToHuggingFaceTaskOperator,
+) -> None:
+    mock_hf_client: AsyncMock = operator.hf_client
+    mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub
+    mock_push.assert_not_called()
\ No newline at end of file
diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py
index 64a16f9f..e7a9a69b 100644
--- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py
+++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py
@@ -1,71 +1,30 @@
-from src.collectors.enums import URLStatus
 from src.core.enums import RecordType
 from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse
-from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry \
-    import TestPushToHuggingFaceURLSetupEntry as Entry
-from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \
-    TestPushToHuggingFaceURLSetupExpectedOutput as Output
-from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \
-    TestPushToHuggingFaceURLSetupEntryInput as Input
+from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
+
+
+def get_test_url(i: int) -> str:
+    return f"www.testPushToHuggingFaceURLSetupEntry.com/{i}"
+
+def get_test_html(i: int) -> str:
+    return f"Test Push to Hugging Face URL Setup Entry {i}"
+
+def generate_expected_outputs(
+    url_ids: list[int],
+    relevant: bool,
+    record_type_fine: RecordType,
+    record_type_coarse: RecordTypeCoarse
+) -> list[GetForLoadingToHuggingFaceOutput]:
+    results: list[GetForLoadingToHuggingFaceOutput] = []
+    for i in range(2):
+        output = GetForLoadingToHuggingFaceOutput(
+            url_id=url_ids[i],
+            url=get_test_url(i),
+            relevant=relevant,
+            record_type_fine=record_type_fine,
+            record_type_coarse=record_type_coarse,
+            html=get_test_html(i)
+        )
+        results.append(output)
+    return results
-
-ENTRIES = [
-    # Because pending, should not be picked up
-    Entry(
-        input=Input(
-            status=URLStatus.PENDING,
-            has_html_content=True,
-            record_type=RecordType.INCARCERATION_RECORDS
-        ),
-        expected_output=Output(
-            picked_up=False,
-        )
-    ),
-    # Because no html content, should not be picked up
-    Entry(
-        input=Input(
-            status=URLStatus.SUBMITTED,
-            has_html_content=False,
-            record_type=RecordType.RECORDS_REQUEST_INFO
-        ),
-        expected_output=Output(
-            picked_up=False,
-        )
-    ),
-    # Remainder should be picked up
-    Entry(
-        input=Input(
-            status=URLStatus.VALIDATED,
-            has_html_content=True,
-            record_type=RecordType.RECORDS_REQUEST_INFO
-        ),
-        expected_output=Output(
-            picked_up=True,
-            coarse_record_type=RecordTypeCoarse.AGENCY_PUBLISHED_RESOURCES,
-            relevant=True
-        )
-    ),
-    Entry(
-        input=Input(
-            status=URLStatus.SUBMITTED,
-            has_html_content=True,
-            record_type=RecordType.INCARCERATION_RECORDS
-        ),
-        expected_output=Output(
-            picked_up=True,
-            coarse_record_type=RecordTypeCoarse.JAILS_AND_COURTS,
-            relevant=True
-        )
-    ),
-    Entry(
-        input=Input(
-            status=URLStatus.NOT_RELEVANT,
-            has_html_content=True,
-            record_type=None
-        ),
-        expected_output=Output(
-            picked_up=True,
-            coarse_record_type=RecordTypeCoarse.NOT_RELEVANT,
-            relevant=False
-        )
-    ),
-]
diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py
new file mode 100644
index 00000000..0bb8cc87
--- /dev/null
+++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/enums.py
@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class PushToHuggingFaceTestSetupStatusEnum(Enum):
+    NOT_VALIDATED = "NOT_VALIDATED"
+    NOT_RELEVANT = "NOT_RELEVANT"
+    DATA_SOURCE = "DATA_SOURCE"
diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py
new file mode 100644
index 00000000..bbb40067
--- /dev/null
+++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/helper.py
@@ -0,0 +1,16 @@
+from src.db.client.async_ import AsyncDatabaseClient
+from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \
+    TestPushToHuggingFaceURLSetupEntryInput
+from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.setup import \
+    SetupTestPushToHuggingFaceEntryQueryBuilder
+
+
+async def setup_urls(
+    dbc: AsyncDatabaseClient,
+    inp: TestPushToHuggingFaceURLSetupEntryInput
+) -> list[int]:
+    # Set up 2 URLs
+    builder = SetupTestPushToHuggingFaceEntryQueryBuilder(inp)
+    return await dbc.run_query_builder(builder)
+
+
diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py
deleted file mode 100644
index d6438472..00000000
--- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/manager.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput
src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from src.db.client.async_ import AsyncDatabaseClient -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.record import \ - TestPushToHuggingFaceRecordSetupRecord as Record, TestPushToHuggingFaceRecordSetupRecord -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.setup import \ - SetupTestPushToHuggingFaceEntryQueryBuilder - - -class PushToHuggingFaceTestSetupManager: - - def __init__(self, adb_client: AsyncDatabaseClient): - self.adb_client = adb_client - self.entries = ENTRIES - # Connects a URL ID to the expectation that it will be picked up - self._id_to_record: dict[int, TestPushToHuggingFaceRecordSetupRecord] = {} - - async def setup(self) -> None: - records: list[Record] = await self.adb_client.run_query_builder( - SetupTestPushToHuggingFaceEntryQueryBuilder(self.entries) - ) - for record in records: - if not record.expected_output.picked_up: - continue - self._id_to_record[record.url_id] = record - - def check_results(self, outputs: list[GetForLoadingToHuggingFaceOutput]) -> None: - # Check that both expected and actual results are same length - length_expected = len(self._id_to_record.keys()) - length_actual = len(outputs) - assert length_expected == length_actual, f"Expected {length_expected} results, got {length_actual}" - - # Check attributes of each URL match what is expected - for output in outputs: - url_id = output.url_id - record = self._id_to_record[url_id] - - expected_output = record.expected_output - assert output.relevant == expected_output.relevant - assert output.record_type_coarse == expected_output.coarse_record_type, \ - f"Expected {expected_output.coarse_record_type} but got {output.record_type_coarse}" - assert output.record_type_fine == record.record_type_fine - diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py deleted file mode 100644 index 16bb74aa..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/entry.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ - TestPushToHuggingFaceURLSetupEntryInput -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput - - -class TestPushToHuggingFaceURLSetupEntry(BaseModel): - input: TestPushToHuggingFaceURLSetupEntryInput - expected_output: TestPushToHuggingFaceURLSetupExpectedOutput - diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py index b5128375..2bdf21a5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/input.py @@ -1,10 +1,11 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus from src.core.enums import RecordType +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum class TestPushToHuggingFaceURLSetupEntryInput(BaseModel): - status: URLStatus + status: 
PushToHuggingFaceTestSetupStatusEnum record_type: RecordType | None has_html_content: bool diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py deleted file mode 100644 index 736bd97e..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/output.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import Self - -from pydantic import BaseModel, model_validator - -from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse - - -class TestPushToHuggingFaceURLSetupExpectedOutput(BaseModel): - picked_up: bool - relevant: bool | None = None - coarse_record_type: RecordTypeCoarse | None = None - - @model_validator(mode='after') - def validate_coarse_record_type_and_relevant(self) -> Self: - if not self.picked_up: - return self - if self.coarse_record_type is None: - raise ValueError('Coarse record type should be provided if picked up') - if self.relevant is None: - raise ValueError('Relevant should be provided if picked up') - return self diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py deleted file mode 100644 index 4ce15770..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/models/record.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput - - -class TestPushToHuggingFaceRecordSetupRecord(BaseModel): - expected_output: TestPushToHuggingFaceURLSetupExpectedOutput - record_type_fine: RecordType | None - url_id: int \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py new file mode 100644 index 00000000..2fb5b2d0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/convert.py @@ -0,0 +1,14 @@ +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum + +def convert_test_status_to_validated_status( + status: PushToHuggingFaceTestSetupStatusEnum +) -> URLValidatedType: + match status: + case PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE: + return URLValidatedType.DATA_SOURCE + case PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT: + return URLValidatedType.NOT_RELEVANT + case _: + raise ValueError(f"Invalid test status for function: {status}") \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 8e01c86b..05b829df 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -1,57 +1,66 @@ from sqlalchemy.ext.asyncio import AsyncSession +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.enums import URLSource -from 
src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import compress_html -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry import \ - TestPushToHuggingFaceURLSetupEntry as Entry -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.record import \ - TestPushToHuggingFaceRecordSetupRecord as Record +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import get_test_url, get_test_html +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.queries.convert import \ + convert_test_status_to_validated_status class SetupTestPushToHuggingFaceEntryQueryBuilder(QueryBuilderBase): def __init__( self, - entries: list[Entry] + inp: TestPushToHuggingFaceURLSetupEntryInput ): super().__init__() - self.entries = entries + self.inp = inp - async def run(self, session: AsyncSession) -> list[Record]: - records = [] - for idx, entry in enumerate(self.entries): - if idx % 2 == 0: + async def run(self, session: AsyncSession) -> list[int]: + url_ids: list[int] = [] + for i in range(2): + if i % 2 == 0: name = "Test Push to Hugging Face URL Setup Entry" description = "This is a test push to Hugging Face URL setup entry" else: name = None description = None - inp = entry.input url = URL( - url=f"www.testPushToHuggingFaceURLSetupEntry.com/{idx}", - status=inp.status, + url=get_test_url(i), + status=URLStatus.OK, name=name, description=description, - record_type=inp.record_type, + record_type=self.inp.record_type, source=URLSource.COLLECTOR ) session.add(url) await session.flush() - if entry.input.has_html_content: + url_ids.append(url.id) + if self.inp.status in ( + PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT + ): + flag = FlagURLValidated( + url_id=url.id, + type=convert_test_status_to_validated_status(self.inp.status), + ) + session.add(flag) + + if self.inp.has_html_content: compressed_html = URLCompressedHTML( url_id=url.id, - compressed_html=compress_html(f"
Test Push to Hugging Face URL Setup Entry {idx}
"), + compressed_html=compress_html(get_test_html(i)), ) session.add(compressed_html) - record = Record( - url_id=url.id, - expected_output=entry.expected_output, - record_type_fine=inp.record_type - ) - records.append(record) - return records + return url_ids diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py deleted file mode 100644 index d3c3e056..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_happy_path.py +++ /dev/null @@ -1,42 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest - -from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator -from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput -from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.manager import PushToHuggingFaceTestSetupManager -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_happy_path( - operator: PushToHuggingFaceTaskOperator, - db_data_creator: DBDataCreator -): - hf_client = operator.hf_client - push_function: AsyncMock = hf_client.push_data_sources_raw_to_hub - - # Check, prior to adding URLs, that task does not run - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_not_called() - - # Add URLs - manager = PushToHuggingFaceTestSetupManager(adb_client=db_data_creator.adb_client) - await manager.setup() - - # Run task - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_called_once() - - call_args: list[GetForLoadingToHuggingFaceOutput] = push_function.call_args.args[0] - - # Check for calls to HF Client - manager.check_results(call_args) - - # Test that after update, running again yields no results - task_info = await operator.run_task() - assert_task_ran_without_error(task_info) - push_function.assert_called_once() \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py new file mode 100644 index 00000000..25c4d09d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_no_html_content_not_picked_up.py @@ -0,0 +1,45 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_not_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def 
test_huggingface_task_no_html_content_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.ACCIDENT_REPORTS + + # Add URLs with no html content + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + has_html_content=False + ) + _ = await setup_urls(adb_client_test, inp=inp) + + # Confirm task does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task as though it did meet prerequisites + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task still does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm no URLs were picked up + check_not_called(operator) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py new file mode 100644 index 00000000..b4abc0ee --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_relevant_picked_up.py @@ -0,0 +1,58 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_not_relevant_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.COMPLAINTS_AND_MISCONDUCT + rt_coarse = RecordTypeCoarse.INFO_ABOUT_OFFICERS + + # Add URLs with not relevant status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.NOT_RELEVANT, + has_html_content=True + ) + url_ids: list[int] = await setup_urls(adb_client_test, inp=inp) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task no longer meets prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm expected URLs picked up + expected_outputs: list[GetForLoadingToHuggingFaceOutput] = generate_expected_outputs( + url_ids=url_ids, + relevant=False, + record_type_fine=record_type, + 
record_type_coarse=rt_coarse, + ) + check_results_called( + operator=operator, + expected_outputs=expected_outputs, + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py new file mode 100644 index 00000000..8fa07928 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_not_validated_not_picked_up.py @@ -0,0 +1,44 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_not_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_not_validated_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.COURT_CASES + + # Add URLs with non-validated status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.NOT_VALIDATED, + has_html_content=True + ) + _ = await setup_urls(adb_client_test, inp=inp) + + # Confirm task doesn't meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task as though it did meet prerequisites + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task still doesn't meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm non-validated URLs not picked up + check_not_called(operator) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py new file mode 100644 index 00000000..4ca89aa1 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_validated_picked_up.py @@ -0,0 +1,59 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput +from src.db.client.async_ import AsyncDatabaseClient +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.check import check_results_called +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.data import generate_expected_outputs +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.helper import setup_urls +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ + 
TestPushToHuggingFaceURLSetupEntryInput +from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.enums import \ + PushToHuggingFaceTestSetupStatusEnum +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_huggingface_task_validated_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator +): + record_type = RecordType.GEOGRAPHIC + rt_coarse = RecordTypeCoarse.INFO_ABOUT_AGENCIES + + # Add URLs with validated status + inp = TestPushToHuggingFaceURLSetupEntryInput( + record_type=record_type, + status=PushToHuggingFaceTestSetupStatusEnum.DATA_SOURCE, + has_html_content=True + ) + url_ids: list[int] = await setup_urls(adb_client_test, inp=inp) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm task ran without error + assert_task_ran_without_error(run_info) + + # Confirm task no longer meets prerequisites + assert not await operator.meets_task_prerequisites() + + # Confirm expected URLs picked up + expected_outputs: list[GetForLoadingToHuggingFaceOutput] = generate_expected_outputs( + url_ids=url_ids, + relevant=True, + record_type_fine=record_type, + record_type_coarse=rt_coarse, + ) + check_results_called( + operator=operator, + expected_outputs=expected_outputs, + ) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py index 5b0539e7..85b9f1bc 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/conftest.py @@ -1,20 +1,30 @@ import pytest_asyncio from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import update_existing_agencies_updated_at, \ add_existing_agencies + +@pytest_asyncio.fixture +async def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> SyncAgenciesTaskOperator: + return SyncAgenciesTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + @pytest_asyncio.fixture async def setup( db_data_creator, - mock_pdap_client + operator ) -> SyncAgenciesTaskOperator: await add_existing_agencies(db_data_creator) await update_existing_agencies_updated_at(db_data_creator) - return SyncAgenciesTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) + return operator diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py new file mode 100644 index 00000000..cb84b014 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/setup/core.py @@ -0,0 +1,53 @@ +from contextlib import contextmanager +from datetime import timedelta, datetime +from unittest.mock import patch, AsyncMock + +from src.core.enums import RecordType +from 
src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo, AgenciesSyncResponseInnerInfo +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.simple_test_data_functions import generate_test_name + + +def set_up_mock_pdap_client_responses( + mock_pdap_client: PDAPClient, + responses: list[AgenciesSyncResponseInfo | Exception] +) -> None: + """ + Modifies: + - pdap_client.sync_agencies + """ + mock_sync_agencies = AsyncMock( + side_effect=responses + [AgenciesSyncResponseInfo(agencies=[])] + ) + mock_pdap_client.sync_agencies = mock_sync_agencies + +async def set_up_urls( + db_data_creator: DBDataCreator, + record_type: RecordType, + validated_type: URLValidatedType | None = None, + agency_ids: list[int] | None = None, +) -> list[int]: + """Create 2 Test URLs in database.""" + url_ids: list[int] = await db_data_creator.create_urls(record_type=record_type, count=2) + if validated_type is not None: + await db_data_creator.create_validated_flags(url_ids=url_ids, validation_type=validated_type) + if agency_ids is not None: + await db_data_creator.create_url_agency_links(url_ids=url_ids, agency_ids=agency_ids) + return url_ids + +def set_up_sync_response_info( + agency_id: int, + meta_urls: list[str], +) -> AgenciesSyncResponseInfo: + yesterday = datetime.now() - timedelta(days=1) + return AgenciesSyncResponseInfo(agencies=[AgenciesSyncResponseInnerInfo( + agency_id=agency_id, + meta_urls=meta_urls, + updated_at=yesterday, + state_name=None, + county_name=None, + locality_name=None, + display_name=generate_test_name(agency_id) + )]) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py new file mode 100644 index 00000000..42384615 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_ds_url_in_db_not_sync.py @@ -0,0 +1,90 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_data_sources_url_in_db_not_meta_url_sync( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + In an Agency Sync, a URL validated as a Data Source linked to the agency + should be untouched if the URL is not in the sync response. 
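+ + A rough sketch of the expected state transition (entity shapes are + illustrative, not literal rows; flag names follow URLValidatedType): + + Before sync: Agency(1) <-link-> URL_ds [flag: DATA_SOURCE] + Sync response: Agency(1) -> ['https://example.com/meta-url-1'] + After sync: Agency(1) <-link-> URL_ds [flag: DATA_SOURCE] (untouched) + Agency(1) <-link-> URL_meta [flag: META_URL] (added)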
+ """ + db_client: AsyncDatabaseClient = operator.adb_client + + agency_id: int = 1 + + # Create agency + await db_data_creator.create_agency(agency_id) + + # Set up sync response with new meta URL + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=agency_id, + meta_urls=[ + "https://example.com/meta-url-1", + ] + ) + + # Create additional URL Validated as data source and link to agency + ds_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.DATA_SOURCE, + record_type=RecordType.ACCIDENT_REPORTS + ))[0] + ds_url_id: int = ds_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[ds_url_id], + agency_ids=[agency_id] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 2 URLs in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 2 + assert set(url.record_type for url in urls) == { + RecordType.CONTACT_INFO_AND_AGENCY_META, + RecordType.ACCIDENT_REPORTS + } + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 2 + assert all(link.agency_id == 1 for link in links) + assert set(link.url_id for link in links) == set(url.id for url in urls) + + # Confirm 2 Validated Flags with different Validation Types + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 2 + assert set(flag.type for flag in flags) == { + URLValidatedType.META_URL, + URLValidatedType.DATA_SOURCE + } + assert set(flag.url_id for flag in flags) == set(url.id for url in urls) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py index bf4ff81e..80b338db 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_interruption.py @@ -22,15 +22,12 @@ async def test_agency_sync_interruption( operator = setup db_client = operator.adb_client - - with patch_sync_agencies( [FIRST_CALL_RESPONSE, ValueError("test error")] ): run_info = await operator.run_task() assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message - # Get current updated_ats from database for the 5 recently updated query = ( select( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py new file mode 100644 index 00000000..9db57ec7 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_meta_url_in_db_not_sync.py @@ -0,0 +1,78 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from 
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_meta_url_in_db_not_sync( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + In an Agency Sync, a URL in the DB validated as a Meta URL linked to the agency + but not included in the most recent sync response should have its link to the agency removed. + """ + db_client: AsyncDatabaseClient = operator.adb_client + + # Create Meta URL and link to Agency + agency_id: int = 1 + await db_data_creator.create_agency(agency_id) + meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.META_URL, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META + ))[0] + meta_url_id: int = meta_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[meta_url_id], + agency_ids=[agency_id] + ) + + # Create Sync Response for agency with no Meta URLs + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=agency_id, + meta_urls=[] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 1 URL in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 1 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm no Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 0 + + # Confirm 1 Validated Flag + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 1 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert all(flag.url_id == meta_url_id for flag in flags) + + + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py similarity index 95% rename from tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py rename to tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py index d783b5cb..772139f4 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_no_meta_urls.py @@ -17,6 +17,9 @@ async def test_agency_sync_happy_path( wiped_database, setup: SyncAgenciesTaskOperator ): + """ + Test behavior of Agency sync where no meta URLs are returned. 
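+ + In this scenario each synced agency arrives with an empty meta_urls list; + a minimal illustrative response (remaining fields elided) would be: + + AgenciesSyncResponseInfo(agencies=[ + AgenciesSyncResponseInnerInfo(agency_id=1, meta_urls=[], ...) + ])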
+ """ operator = setup db_client = operator.adb_client diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py new file mode 100644 index 00000000..9a0e920b --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_same_meta_url_diff_agency.py @@ -0,0 +1,77 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_same_meta_url_diff_agency( + wiped_database, + operator: SyncAgenciesTaskOperator, + db_data_creator: DBDataCreator +): + """ + Test that, in the case of a Meta URL already linked with one agency in the DB and + a new sync response with the same Meta URL but linked to a different agency, + the link to the original agency should be untouched while the link to the new agency + should be added. 
+ """ + db_client: AsyncDatabaseClient = operator.adb_client + existing_agency_id: int = 1 + + await db_data_creator.create_agency(existing_agency_id) + meta_url_mapping: URLMapping = (await db_data_creator.create_validated_urls( + validation_type=URLValidatedType.META_URL, + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META + ))[0] + meta_url_id: int = meta_url_mapping.url_id + await db_data_creator.create_url_agency_links( + url_ids=[meta_url_id], + agency_ids=[existing_agency_id] + ) + + new_agency_id: int = 2 + meta_url: str = meta_url_mapping.url + sync_response: AgenciesSyncResponseInfo = set_up_sync_response_info( + agency_id=new_agency_id, + meta_urls=[meta_url] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm two agencies in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 2 + + # Confirm 1 URL in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 1 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 2 + + # Confirm 2 Validated Flag + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 1 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert all(flag.url_id == meta_url_id for flag in flags) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py new file mode 100644 index 00000000..13a8eb20 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/agency/test_with_meta_url_not_in_database.py @@ -0,0 +1,67 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.agency.operator import SyncAgenciesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.agency.helpers import patch_sync_agencies, \ + check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.agency.setup.core import set_up_sync_response_info, \ + set_up_mock_pdap_client_responses +from tests.helpers.asserts import assert_task_run_success + + +@pytest.mark.asyncio +async def test_with_meta_url_not_in_database( + wiped_database, + operator: SyncAgenciesTaskOperator +): + """ + In an Agency Sync, a Meta URL included in the sync response + but not present in the DB should be added to the DB with: + - The URLValidationFlag set to `Meta URL` + - The Record Type set to `Contact Info and Agency Meta` + - The link to the agency added + """ + db_client: AsyncDatabaseClient = operator.adb_client + + sync_response: AgenciesSyncResponseInfo = 
set_up_sync_response_info( + agency_id=1, + meta_urls=[ + "https://example.com/meta-url-1", + "https://example.com/meta-url-2", + ] + ) + + set_up_mock_pdap_client_responses(operator.pdap_client, [sync_response]) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + await check_sync_concluded(db_client) + + # Confirm one agency in the database + agencies: list[Agency] = await db_client.get_all(Agency) + assert len(agencies) == 1 + + # Confirm 2 URLs in database + urls: list[URL] = await db_client.get_all(URL) + assert len(urls) == 2 + assert all(url.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META for url in urls) + + # Confirm 2 Agency-URL Links + links: list[LinkURLAgency] = await db_client.get_all(LinkURLAgency) + assert len(links) == 2 + assert all(link.agency_id == 1 for link in links) + assert set(link.url_id for link in links) == set(url.id for url in urls) + + # Confirm 2 Validated Flags + flags: list[FlagURLValidated] = await db_client.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all(flag.type == URLValidatedType.META_URL for flag in flags) + assert set(flag.url_id for flag in flags) == set(url.id for url in urls) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py index 12428d7d..dcc1fc23 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/check.py @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import timedelta, datetime from sqlalchemy import select, cast, func, TIMESTAMP @@ -9,14 +9,9 @@ async def check_sync_concluded( db_client: AsyncDatabaseClient, + current_db_datetime: datetime, check_updated_at: bool = True -): - - current_db_datetime = await db_client.scalar( - select( - cast(func.now(), TIMESTAMP) - ) - ) +) -> None: sync_state_results = await db_client.scalar( select( diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py index 44239db8..e91461ea 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/conftest.py @@ -1,12 +1,16 @@ +from datetime import datetime + import pytest_asyncio from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.sqlalchemy import Agency from src.external.pdap.client import PDAPClient from tests.helpers.data_creator.core import DBDataCreator @pytest_asyncio.fixture -async def test_operator( +async def operator( db_data_creator: DBDataCreator, mock_pdap_client: PDAPClient ) -> SyncDataSourcesTaskOperator: @@ -14,3 +18,30 @@ async def test_operator( adb_client=db_data_creator.adb_client, pdap_client=mock_pdap_client ) + +@pytest_asyncio.fixture +async def current_db_time( + adb_client_test: AsyncDatabaseClient +) -> datetime: + return (await adb_client_test.get_current_database_time()).replace(tzinfo=None) + + +@pytest_asyncio.fixture +async def agency_ids( + adb_client_test: AsyncDatabaseClient +) -> list[int]: + """Creates and returns the ids of 4 agencies""" + agencies: list[Agency] = [] + agency_ids: list[int] = [] + for i in range(4): + agency = Agency( + agency_id=i, + name=f"Test Agency 
{i}", + state="test_state", + county="test_county", + locality="test_locality" + ) + agency_ids.append(i) + agencies.append(agency) + await adb_client_test.add_all(agencies) + return agency_ids diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py deleted file mode 100644 index 4007c38d..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/existence_checker.py +++ /dev/null @@ -1,42 +0,0 @@ -from collections import defaultdict - -from src.db.models.impl.link.url_agency_.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo - - -class URLExistenceChecker: - - def __init__( - self, - responses: list[DataSourcesSyncResponseInfo], - url_ds_links: list[URLDataSource], - url_agency_links: list[LinkURLAgency] - ): - self._ds_id_response_dict: dict[int, DataSourcesSyncResponseInnerInfo] = {} - for response in responses: - for data_source in response.data_sources: - self._ds_id_response_dict[data_source.id] = data_source - self._ds_id_url_link_dict = {} - for link in url_ds_links: - self._ds_id_url_link_dict[link.data_source_id] = link.url_id - self._url_id_agency_link_dict = defaultdict(list) - for link in url_agency_links: - self._url_id_agency_link_dict[link.url_id].append(link.agency_id) - - - def check(self, url: URL): - ds_id = self._ds_id_url_link_dict.get(url.id) - if ds_id is None: - raise AssertionError(f"URL {url.id} has no data source link") - response = self._ds_id_response_dict.get(ds_id) - if response is None: - raise AssertionError(f"Data source {ds_id} has no response") - - assert response.url == url.url - assert response.description == url.description - assert response.name == url.name - - agency_ids = self._url_id_agency_link_dict.get(url.id) - assert set(response.agency_ids) == set(agency_ids) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py index 932d2518..f7cd3337 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/core.py @@ -1,7 +1,17 @@ from contextlib import contextmanager -from unittest.mock import patch +from datetime import datetime, timedelta +from unittest.mock import patch, create_autospec, AsyncMock +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo, DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.url import \ + TestDataSourcesSyncURLSetupQueryBuilder +from tests.helpers.simple_test_data_functions import generate_test_url @contextmanager @@ -11,4 +21,68 @@ def patch_sync_data_sources(side_effects: list): "sync_data_sources", side_effect=side_effects ): - yield \ No newline at end 
of file + yield + + + +def set_up_mock_pdap_client_responses( + mock_pdap_client: PDAPClient, + responses: list[DataSourcesSyncResponseInfo | Exception] +) -> None: + """ + Modifies: + - pdap_client.sync_data_sources + """ + mock_sync_data_sources = AsyncMock( + side_effect=responses + [DataSourcesSyncResponseInfo(data_sources=[])] + ) + mock_pdap_client.sync_data_sources = mock_sync_data_sources + +async def set_up_urls( + adb_client: AsyncDatabaseClient, + record_type: RecordType, + validated_type: URLValidatedType | None = None, + previously_synced: bool = False, +) -> list[int]: + """Creates 2 test URLs.""" + + builder = TestDataSourcesSyncURLSetupQueryBuilder( + record_type=record_type, + validated_type=validated_type, + previously_synced=previously_synced, + ) + + return await adb_client.run_query_builder(builder) + +def _generate_test_data_source_name(i: int) -> str: + return f"Test Data Source {i}" + +def _generate_test_data_source_description(i: int) -> str: + return f"Test Data Source Description {i}" + +def set_up_sync_response_info( + ids: list[int], + record_type: RecordType, + agency_ids: list[int], + approval_status: ApprovalStatus, + ds_url_status: DataSourcesURLStatus, +) -> DataSourcesSyncResponseInfo: + yesterday = datetime.now() - timedelta(days=1) + inner_info_list: list[DataSourcesSyncResponseInnerInfo] = [] + for id_ in ids: + inner_info_list.append( + DataSourcesSyncResponseInnerInfo( + id=id_, + url=generate_test_url(id_), + name=_generate_test_data_source_name(id_), + description=_generate_test_data_source_description(id_), + record_type=record_type, + agency_ids=agency_ids, + approval_status=approval_status, + url_status=ds_url_status, + updated_at=yesterday, + ) + ) + return DataSourcesSyncResponseInfo( + data_sources=inner_info_list, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py deleted file mode 100644 index e4094b38..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/data.py +++ /dev/null @@ -1,100 +0,0 @@ -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder, AgencyAssigned -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import TestSCURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry - -ENTRIES = [ - TestURLSetupEntry( - # A URL in both DBs that should be overwritten - url='https://example.com/1', - ds_info=TestDSURLSetupEntry( - id=100, - name='Overwritten URL 1 Name', - description='Overwritten URL 1 Description', - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.ACCIDENT_REPORTS, - agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.TWO], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 1 Name', - description='Pre-existing URL 1 Description', - record_type=RecordType.ACCIDENT_REPORTS, - url_status=URLStatus.PENDING, - agencies_assigned=[AgencyAssigned.ONE, 
AgencyAssigned.THREE] - ), - final_url_status=URLStatus.SUBMITTED - ), - TestURLSetupEntry( - # A DS-only approved but broken URL - url='https://example.com/2', - ds_info=TestDSURLSetupEntry( - id=101, - name='New URL 2 Name', - description='New URL 2 Description', - url_status=DataSourcesURLStatus.BROKEN, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.INCARCERATION_RECORDS, - agencies_assigned=[AgencyAssigned.TWO], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=None, - final_url_status=URLStatus.NOT_FOUND - ), - TestURLSetupEntry( - # An SC-only pending URL, should be unchanged. - url='https://example.com/3', - ds_info=None, - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 3 Name', - description='Pre-existing URL 3 Description', - record_type=RecordType.FIELD_CONTACTS, - url_status=URLStatus.PENDING, - agencies_assigned=[AgencyAssigned.ONE, AgencyAssigned.THREE] - ), - final_url_status=URLStatus.PENDING - ), - TestURLSetupEntry( - # A DS-only rejected URL - url='https://example.com/4', - ds_info=TestDSURLSetupEntry( - id=102, - name='New URL 4 Name', - description=None, - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.REJECTED, - record_type=RecordType.ACCIDENT_REPORTS, - agencies_assigned=[AgencyAssigned.ONE], - sync_response_order=SyncResponseOrder.FIRST - ), - sc_info=None, - final_url_status=URLStatus.NOT_RELEVANT - ), - TestURLSetupEntry( - # A pre-existing URL in the second response - url='https://example.com/5', - ds_info=TestDSURLSetupEntry( - id=103, - name='New URL 5 Name', - description=None, - url_status=DataSourcesURLStatus.OK, - approval_status=ApprovalStatus.APPROVED, - record_type=RecordType.INCARCERATION_RECORDS, - agencies_assigned=[AgencyAssigned.ONE], - sync_response_order=SyncResponseOrder.SECOND - ), - sc_info=TestSCURLSetupEntry( - name='Pre-existing URL 5 Name', - description='Pre-existing URL 5 Description', - record_type=None, - url_status=URLStatus.PENDING, - agencies_assigned=[] - ), - final_url_status=URLStatus.SUBMITTED - ) -] - diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py deleted file mode 100644 index fd1e1da2..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/enums.py +++ /dev/null @@ -1,16 +0,0 @@ -from enum import Enum - - -class SyncResponseOrder(Enum): - """Represents which sync response the entry is in.""" - FIRST = 1 - SECOND = 2 - # No entries should be in 3 - THIRD = 3 - - -class AgencyAssigned(Enum): - """Represents which of several pre-created agencies the entry is assigned to.""" - ONE = 1 - TWO = 2 - THREE = 3 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py deleted file mode 100644 index 0321aec9..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/agency.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlalchemy import select - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned - - -class AgencyAssignmentManager: - - def __init__(self, adb_client: AsyncDatabaseClient): - self.adb_client = adb_client - self._dict: dict[AgencyAssigned, int] = {} - - async def 
setup(self): - agencies = [] - for ag_enum in AgencyAssigned: - agency = Agency( - agency_id=ag_enum.value, - name=f"Test Agency {ag_enum.name}", - state="test_state", - county="test_county", - locality="test_locality" - ) - agencies.append(agency) - await self.adb_client.add_all(agencies) - agency_ids = await self.adb_client.scalars(select(Agency.agency_id)) - for ag_enum, agency_id in zip(AgencyAssigned, agency_ids): - self._dict[ag_enum] = agency_id - - async def get(self, ag_enum: AgencyAssigned) -> int: - return self._dict[ag_enum] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py deleted file mode 100644 index 8f1ab8fa..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/core.py +++ /dev/null @@ -1,111 +0,0 @@ -from collections import defaultdict - -from src.db.client.async_ import AsyncDatabaseClient -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo, DataSourcesSyncResponseInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.agency import AgencyAssignmentManager -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.queries.check import \ - CheckURLQueryBuilder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.url import URLSetupFunctor -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord - - -class DataSourcesSyncTestSetupManager: - - def __init__( - self, - adb_client: AsyncDatabaseClient, - entries: list[TestURLSetupEntry], - ): - self.adb_client = adb_client - self.entries = entries - self.agency_assignment_manager = AgencyAssignmentManager(self.adb_client) - - self.url_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} - self.ds_id_to_setup_record: dict[int, TestURLPostSetupRecord] = {} - self.sync_response_order_to_setup_record: dict[ - SyncResponseOrder, list[TestURLPostSetupRecord] - ] = defaultdict(list) - - self.response_dict: dict[ - SyncResponseOrder, list[DataSourcesSyncResponseInnerInfo] - ] = defaultdict(list) - - async def setup(self): - await self.setup_agencies() - await self.setup_entries() - - async def setup_entries(self): - for entry in self.entries: - await self.setup_entry(entry) - - async def setup_entry( - self, - entry: TestURLSetupEntry - ) -> None: - """ - Modifies: - self.url_id_to_setup_record - self.ds_id_to_setup_record - self.response_dict - """ - functor = URLSetupFunctor( - entry=entry, - agency_assignment_manager=self.agency_assignment_manager, - adb_client=self.adb_client - ) - result = await functor() - response_info = result.ds_response_info - if response_info is not None: - self.response_dict[entry.ds_info.sync_response_order].append(response_info) - if result.url_id is not None: - self.url_id_to_setup_record[result.url_id] = result - if result.data_sources_id is not None: - self.ds_id_to_setup_record[result.data_sources_id] = result - if entry.ds_info is not None: - self.sync_response_order_to_setup_record[ - entry.ds_info.sync_response_order - ].append(result) - - async def setup_agencies(self): - await 
self.agency_assignment_manager.setup() - - async def get_data_sources_sync_responses( - self, - orders: list[SyncResponseOrder | ValueError] - ) -> list[DataSourcesSyncResponseInfo]: - results = [] - for order in orders: - results.append( - DataSourcesSyncResponseInfo( - data_sources=self.response_dict[order] - ) - ) - return results - - async def check_via_url(self, url_id: int): - builder = CheckURLQueryBuilder( - record=self.url_id_to_setup_record[url_id] - ) - await self.adb_client.run_query_builder(builder) - - async def check_via_data_source(self, data_source_id: int): - builder = CheckURLQueryBuilder( - record=self.ds_id_to_setup_record[data_source_id] - ) - await self.adb_client.run_query_builder(builder) - - async def check_results(self): - for url_id in self.url_id_to_setup_record.keys(): - await self.check_via_url(url_id) - for data_source_id in self.ds_id_to_setup_record.keys(): - await self.check_via_data_source(data_source_id) - - async def check_via_sync_response_order(self, order: SyncResponseOrder): - records = self.sync_response_order_to_setup_record[order] - for record in records: - builder = CheckURLQueryBuilder( - record=record - ) - await self.adb_client.run_query_builder(builder) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py deleted file mode 100644 index ad1bc4c0..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/check.py +++ /dev/null @@ -1,46 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload - -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.db.queries.base.builder import QueryBuilderBase -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord -from src.db.helpers.session import session_helper as sh - - -class CheckURLQueryBuilder(QueryBuilderBase): - - def __init__(self, record: TestURLPostSetupRecord): - super().__init__() - self.record = record - - async def run(self, session: AsyncSession) -> None: - """Check if url and associated properties match record. 
- Raises: - AssertionError: if url and associated properties do not match record - """ - query = ( - select(URL) - .options( - selectinload(URL.data_source), - selectinload(URL.confirmed_agencies), - ) - .outerjoin(URLDataSource, URL.id == URLDataSource.url_id) - ) - if self.record.url_id is not None: - query = query.where(URL.id == self.record.url_id) - if self.record.data_sources_id is not None: - query = query.where(URLDataSource.data_source_id == self.record.data_sources_id) - - result = await sh.one_or_none(session=session, query=query) - assert result is not None, f"URL not found for {self.record}" - await self.check_results(result) - - async def check_results(self, url: URL): - assert url.record_type == self.record.final_record_type - assert url.description == self.record.final_description - assert url.name == self.record.final_name - agencies = [agency.agency_id for agency in url.confirmed_agencies] - assert set(agencies) == set(self.record.final_agency_ids) - assert url.status == self.record.final_url_status diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py deleted file mode 100644 index 81eaa50f..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/url.py +++ /dev/null @@ -1,97 +0,0 @@ -from pendulum import today - -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.enums import URLSource -from src.db.models.impl.url.core.sqlalchemy import URL -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.agency import AgencyAssignmentManager -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.core import TestURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import \ - TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.post import TestURLPostSetupRecord -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry - - -class URLSetupFunctor: - - def __init__( - self, - entry: TestURLSetupEntry, - agency_assignment_manager: AgencyAssignmentManager, - adb_client: AsyncDatabaseClient - ): - self.adb_client = adb_client - self.agency_assignment_manager = agency_assignment_manager - self.prime_entry = entry - self.sc_agency_ids = None - self.ds_agency_ids = None - self.sc_url_id = None - self.ds_response_info = None - - async def __call__(self) -> TestURLPostSetupRecord: - await self.setup_entry() - return TestURLPostSetupRecord( - url_id=self.sc_url_id, - sc_setup_entry=self.prime_entry.sc_info, - ds_setup_entry=self.prime_entry.ds_info, - sc_agency_ids=self.sc_agency_ids, - ds_agency_ids=self.ds_agency_ids, - ds_response_info=self.ds_response_info, - final_url_status=self.prime_entry.final_url_status, - ) - - async def setup_entry(self): - if self.prime_entry.sc_info is not None: - self.sc_url_id = await self.setup_sc_entry(self.prime_entry.sc_info) - if self.prime_entry.ds_info is not None: - self.ds_response_info = await 
self.setup_ds_entry(self.prime_entry.ds_info) - - async def get_agency_ids(self, ags: list[AgencyAssigned]): - results = [] - for ag in ags: - results.append(await self.agency_assignment_manager.get(ag)) - return results - - async def setup_sc_entry( - self, - entry: TestSCURLSetupEntry - ) -> int: - """Set up source collector entry and return url id.""" - self.sc_agency_ids = await self.get_agency_ids(self.prime_entry.sc_info.agencies_assigned) - url = URL( - url=self.prime_entry.url, - name=entry.name, - description=entry.description, - collector_metadata={}, - status=entry.url_status.value, - record_type=entry.record_type.value if entry.record_type is not None else None, - source=URLSource.COLLECTOR - ) - url_id = await self.adb_client.add(url, return_id=True) - links = [] - for ag_id in self.sc_agency_ids: - link = LinkURLAgency(url_id=url_id, agency_id=ag_id) - links.append(link) - await self.adb_client.add_all(links) - return url_id - - async def setup_ds_entry( - self, - ds_entry: TestDSURLSetupEntry - ) -> DataSourcesSyncResponseInnerInfo: - """Set up data source entry and return response info.""" - self.ds_agency_ids = await self.get_agency_ids(self.prime_entry.ds_info.agencies_assigned) - return DataSourcesSyncResponseInnerInfo( - id=ds_entry.id, - url=self.prime_entry.url, - name=ds_entry.name, - description=ds_entry.description, - url_status=ds_entry.url_status, - approval_status=ds_entry.approval_status, - record_type=ds_entry.record_type, - updated_at=today(), - agency_ids=self.ds_agency_ids - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py deleted file mode 100644 index 155a3ace..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/core.py +++ /dev/null @@ -1,14 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry - - -class TestURLSetupEntry(BaseModel): - url: str - ds_info: TestDSURLSetupEntry | None # Represents URL previously existing in DS DB - sc_info: TestSCURLSetupEntry | None # Represents URL previously existing in SC DB - - final_url_status: URLStatus diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py deleted file mode 100644 index 47809293..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/data_sources.py +++ /dev/null @@ -1,20 +0,0 @@ -from pydantic import BaseModel - -from src.core.enums import RecordType -from src.external.pdap.enums import DataSourcesURLStatus, ApprovalStatus -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned, SyncResponseOrder - - -class TestDSURLSetupEntry(BaseModel): - """Represents URL previously existing in DS DB. 
- - These values should overwrite any SC values - """ - id: int # ID of URL in DS App - name: str - description: str | None - url_status: DataSourcesURLStatus - approval_status: ApprovalStatus - record_type: RecordType - agencies_assigned: list[AgencyAssigned] - sync_response_order: SyncResponseOrder diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py deleted file mode 100644 index e535cd56..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/post.py +++ /dev/null @@ -1,50 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.data_sources import \ - TestDSURLSetupEntry -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.models.url.source_collector import \ - TestSCURLSetupEntry - - -class TestURLPostSetupRecord(BaseModel): - """Stores a setup entry along with relevant database-generated ids""" - url_id: int | None - sc_setup_entry: TestSCURLSetupEntry | None - ds_setup_entry: TestDSURLSetupEntry | None - sc_agency_ids: list[int] | None - ds_agency_ids: list[int] | None - ds_response_info: DataSourcesSyncResponseInnerInfo | None - final_url_status: URLStatus - - @property - def data_sources_id(self) -> int | None: - if self.ds_setup_entry is None: - return None - return self.ds_setup_entry.id - - @property - def final_record_type(self) -> RecordType: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.record_type - return self.sc_setup_entry.record_type - - @property - def final_name(self) -> str: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.name - return self.sc_setup_entry.name - - @property - def final_description(self) -> str: - if self.ds_setup_entry is not None: - return self.ds_setup_entry.description - return self.sc_setup_entry.description - - @property - def final_agency_ids(self) -> list[int] | None: - if self.ds_setup_entry is not None: - return self.ds_agency_ids - return self.sc_agency_ids \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py deleted file mode 100644 index c151d783..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/source_collector.py +++ /dev/null @@ -1,17 +0,0 @@ -from pydantic import BaseModel - -from src.collectors.enums import URLStatus -from src.core.enums import RecordType -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import AgencyAssigned - - -class TestSCURLSetupEntry(BaseModel): - """Represents URL previously existing in SC DB. 
- - These values should be overridden by any DS values - """ - name: str - description: str - record_type: RecordType | None - url_status: URLStatus - agencies_assigned: list[AgencyAssigned] diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py new file mode 100644 index 00000000..a514b151 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/requester.py @@ -0,0 +1,59 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.db.templates.requester import RequesterBase +from tests.helpers.simple_test_data_functions import generate_test_name, generate_test_url + + +class TestDataSourcesSyncURLSetupQueryRequester(RequesterBase): + + async def insert_urls( + self, + record_type: RecordType, + ) -> list[int]: + + insert_models: list[URLInsertModel] = [] + for i in range(2): + url = URLInsertModel( + url=generate_test_url(i), + name=generate_test_name(i), + record_type=record_type, + source=URLSource.COLLECTOR, + ) + insert_models.append(url) + + return await self.session_helper.bulk_insert(self.session, models=insert_models, return_ids=True) + + async def insert_validated_flags( + self, + url_ids: list[int], + validated_type: URLValidatedType + ) -> None: + to_insert: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=validated_type, + ) + to_insert.append(flag) + + await self.session_helper.bulk_insert(self.session, models=to_insert) + + async def insert_data_source_entry( + self, + url_ids: list[int], + ): + to_insert: list[URLDataSourcePydantic] = [ + URLDataSourcePydantic( + url_id=url_id, + data_source_id=url_id, + ) + for url_id in url_ids + ] + + await self.session_helper.bulk_insert(self.session, models=to_insert) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py new file mode 100644 index 00000000..0176a95f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/queries/url_/url.py @@ -0,0 +1,35 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.queries.base.builder import QueryBuilderBase +from 
tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.queries.url_.requester import \ + TestDataSourcesSyncURLSetupQueryRequester + + +class TestDataSourcesSyncURLSetupQueryBuilder(QueryBuilderBase): + + def __init__( + self, + record_type: RecordType, + validated_type: URLValidatedType | None = None, + previously_synced: bool = False, + ): + super().__init__() + self.record_type = record_type + self.validated_type = validated_type + self.previously_synced = previously_synced + + async def run(self, session: AsyncSession) -> list[int]: + requester = TestDataSourcesSyncURLSetupQueryRequester(session=session) + + url_ids: list[int] = await requester.insert_urls(record_type=self.record_type) + + if self.validated_type is not None: + await requester.insert_validated_flags(url_ids=url_ids, validated_type=self.validated_type) + + if self.previously_synced: + await requester.insert_data_source_entry(url_ids=url_ids) + + return url_ids + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py new file mode 100644 index 00000000..87cf163a --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_db_only.py @@ -0,0 +1,76 @@ +from datetime import datetime + +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_urls + +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_db_only( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + current_db_time: datetime +): + """ + Test that operator does nothing with entries only in the database, and nothing is returned by the endpoint. 
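+
+    The pre-existing URLs should keep their status and record type,
+    and no validated flags should be created.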
+ """ + + # Add URLs to database + url_ids: list[int] = await set_up_urls( + adb_client=adb_client_test, + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + validated_type=None, + ) + + # Set up pdap client to return nothing + set_up_mock_pdap_client_responses( + operator.pdap_client, + responses=[ + DataSourcesSyncResponseInfo(data_sources=[]) + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Check sync concluded + assert operator.pdap_client.sync_data_sources.call_count == 1 + assert operator.pdap_client.sync_data_sources.call_args[0][0] == DataSourcesSyncParameters( + cutoff_date=None, + page=1 + ) + + # Confirm URLs are unchanged in database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == len(url_ids) + assert {url.id for url in urls} == set(url_ids) + assert all(url.status == URLStatus.OK for url in urls) + assert all(url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls) + + # Confirm presence of sync status row with cutoff date and last updated at after initial db time + await check_sync_concluded( + adb_client_test, + check_updated_at=False, + current_db_datetime=current_db_time + ) + + # Confirm no validated flags + flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 0 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py deleted file mode 100644 index 41f38b2a..00000000 --- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_happy_path.py +++ /dev/null @@ -1,62 +0,0 @@ -from unittest.mock import MagicMock, call - -import pytest - -from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator -from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder -from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \ - DataSourcesSyncTestSetupManager -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_data_sources_sync_happy_path( - test_operator: SyncDataSourcesTaskOperator -): - adb_client = test_operator.adb_client - - manager = DataSourcesSyncTestSetupManager( - adb_client=adb_client, - entries=ENTRIES - ) - await manager.setup() - - with patch_sync_data_sources( - await manager.get_data_sources_sync_responses([order for order in SyncResponseOrder]) - ): - run_info = await test_operator.run_task() - assert_task_run_success(run_info) - mock_func: MagicMock = test_operator.pdap_client.sync_data_sources - - mock_func.assert_has_calls( - [ - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=1 - ) - ), - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=2 - ) - ), - call( - DataSourcesSyncParameters( - cutoff_date=None, - page=3 - ) - ) - ] - ) - await check_sync_concluded(adb_client, check_updated_at=False) - - 
# Check results according to expectations.
-    await manager.check_results()
-
-
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py
index 0441a102..3aa26866 100644
--- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py
+++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_interruption.py
@@ -1,50 +1,73 @@
+from datetime import datetime
+
 import pytest
 from sqlalchemy import select
 
+from src.core.enums import RecordType
+from src.core.tasks.base.run_info import TaskOperatorRunInfo
 from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
 from src.core.tasks.url.enums import TaskOperatorOutcome
+from src.db.client.async_ import AsyncDatabaseClient
 from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \
-    DataSourcesSyncTestSetupManager
-
-
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo
+from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus
+from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources, \
+    set_up_mock_pdap_client_responses, set_up_sync_response_info
+from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error
 
 @pytest.mark.asyncio
 async def test_data_sources_sync_interruption(
-    test_operator: SyncDataSourcesTaskOperator
+    operator: SyncDataSourcesTaskOperator,
+    adb_client_test: AsyncDatabaseClient,
+    current_db_time: datetime,
+    agency_ids: list[int]
 ):
-    adb_client = test_operator.adb_client
+    """
+    Test that, in the case of an interruption, the data sources sync
+    resumes from the last processed page.
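+
+    The first run errors partway through and leaves the sync state
+    pointing at the failed page; the second run picks up from there
+    and completes the sync.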
+ """ - manager = DataSourcesSyncTestSetupManager( - adb_client=adb_client, - entries=ENTRIES + # Set up endpoint to return URLs on page 1, raise error on page 2 + # return URLs on page 2 on the second call, and return nothing on page 3 + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + ValueError("test ds sync error"), + set_up_sync_response_info( + ids=[2, 3], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] ) - await manager.setup() - first_response = await manager.get_data_sources_sync_responses( - [SyncResponseOrder.FIRST] - ) - with patch_sync_data_sources( - side_effects= - first_response + - [ValueError("test error")] - ): - run_info = await test_operator.run_task() - assert run_info.outcome == TaskOperatorOutcome.ERROR, run_info.message + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() - await manager.check_via_sync_response_order(SyncResponseOrder.FIRST) + # Confirm presence of error + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert "test ds sync error" in run_info.message - # Second response should not be processed - with pytest.raises(AssertionError): - await manager.check_via_sync_response_order(SyncResponseOrder.SECOND) + # Confirm first URLs added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 - # Check sync state results - sync_state_results = await adb_client.scalar( + # Confirm sync status updated to page 2 and cutoff date is null + sync_state_results = await adb_client_test.scalar( select( DataSourcesSyncState ) @@ -53,13 +76,22 @@ async def test_data_sources_sync_interruption( assert sync_state_results.last_full_sync_at is None assert sync_state_results.current_cutoff_date is None - second_response = await manager.get_data_sources_sync_responses( - [SyncResponseOrder.SECOND, SyncResponseOrder.THIRD] - ) - with patch_sync_data_sources(second_response): - await test_operator.run_task() + # Run operator again + run_info: TaskOperatorRunInfo = await operator.run_task() - await check_sync_concluded(adb_client) + # Confirm operator ran without error + assert_task_ran_without_error(run_info) - await manager.check_via_sync_response_order(SyncResponseOrder.SECOND) - await manager.check_via_sync_response_order(SyncResponseOrder.THIRD) \ No newline at end of file + # Confirm second URLs added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 4 + + # Confirm page updated to null and cutoff date updated + sync_state_results = await adb_client_test.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at is not None + assert sync_state_results.current_cutoff_date is not None diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py new file mode 100644 index 00000000..51d40d6f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_meta_url_not_modified.py @@ -0,0 +1,88 @@ +import pytest + 
+from src.collectors.enums import URLStatus
+from src.core.enums import RecordType
+from src.core.tasks.base.run_info import TaskOperatorRunInfo
+from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus
+from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \
+    set_up_mock_pdap_client_responses, set_up_sync_response_info
+from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error
+from tests.helpers.data_creator.core import DBDataCreator
+
+
+@pytest.mark.asyncio
+async def test_meta_url_not_modified(
+    operator: SyncDataSourcesTaskOperator,
+    adb_client_test: AsyncDatabaseClient,
+    agency_ids: list[int],
+    db_data_creator: DBDataCreator,
+):
+    """
+    In a Data Sources sync, a validated Meta URL linked to an agency should be untouched
+    if the sync response includes that same agency on other data source URLs.
+    """
+    original_url_ids: list[int] = await set_up_urls(
+        adb_client=adb_client_test,
+        record_type=RecordType.CONTACT_INFO_AND_AGENCY_META,
+        validated_type=URLValidatedType.META_URL,
+    )
+    # Link URLs to existing agencies
+    await db_data_creator.create_url_agency_links(
+        url_ids=original_url_ids,
+        agency_ids=agency_ids,
+    )
+
+    set_up_mock_pdap_client_responses(
+        mock_pdap_client=operator.pdap_client,
+        responses=[
+            set_up_sync_response_info(
+                ids=[2, 3],
+                record_type=RecordType.COMPLAINTS_AND_MISCONDUCT,
+                agency_ids=agency_ids,
+                approval_status=ApprovalStatus.APPROVED,
+                ds_url_status=DataSourcesURLStatus.OK,
+            ),
+        ]
+    )
+
+    # Run operator
+    run_info: TaskOperatorRunInfo = await operator.run_task()
+
+    # Confirm operator ran without error
+    assert_task_ran_without_error(run_info)
+
+    # Check sync concluded
+    assert operator.pdap_client.sync_data_sources.call_count == 2
+
+    # Confirm presence of 4 URLs in database
+    urls: list[URL] = await adb_client_test.get_all(URL)
+    assert len(urls) == 4
+    assert all([url.status == URLStatus.OK for url in urls])
+    assert set([url.record_type for url in urls]) == {
+        RecordType.CONTACT_INFO_AND_AGENCY_META,
+        RecordType.COMPLAINTS_AND_MISCONDUCT
+    }
+    all_url_ids: list[int] = [url.id for url in urls]
+    # Check that all original URLs are present
+    assert set(all_url_ids) >= set(original_url_ids)
+
+    links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency)
+    assert len(links) == 16
+    assert set(link.url_id for link in links) == set(all_url_ids)
+    assert set(link.agency_id for link in links) == set(agency_ids)
+
+    # Confirm presence of validated flag
+    flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated)
+    assert len(flags) == 4
+    assert set([flag.type for flag in flags]) == {
+        URLValidatedType.META_URL,
+        URLValidatedType.DATA_SOURCE,
+    }
+    assert set(flag.url_id for flag in flags) == set(all_url_ids)
+
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py
new file mode 100644
index 00000000..0ae831bd
--- /dev/null
+++
b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_multiple_calls.py @@ -0,0 +1,107 @@ +from datetime import datetime, timedelta + +import pytest +from sqlalchemy import select + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator +from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInfo +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_ds_sync_multiple_calls( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + current_db_time: datetime, + agency_ids: list[int] +): + """ + Test that operator properly handles multiple calls to sync endpoint. + """ + + # Set up endpoint to return URLs on page 1 and 2, and stop on page 3 + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + set_up_sync_response_info( + ids=[2, 3], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=agency_ids, + approval_status=ApprovalStatus.APPROVED, + ds_url_status=DataSourcesURLStatus.OK, + ), + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + + # Confirm URLs are added to database + urls: list[URL] = await adb_client_test.get_all(URL) + assert all(url.status == URLStatus.OK for url in urls) + assert all(url.record_type == RecordType.ACCIDENT_REPORTS for url in urls) + url_ids: list[int] = [url.id for url in urls] + + # Confirm 3 calls to pdap_client.sync_data_sources + assert operator.pdap_client.sync_data_sources.call_count == 3 + + # Confirm sync status updated + sync_state_results = await adb_client_test.scalar( + select( + DataSourcesSyncState + ) + ) + assert sync_state_results.current_page is None + assert sync_state_results.last_full_sync_at > current_db_time - timedelta(minutes=5) + assert sync_state_results.current_cutoff_date > (current_db_time - timedelta(days=2)).date() + + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + DataSourcesSyncResponseInfo( + data_sources=[], + ) + ] + ) + + # Run operator again + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + # Confirm no new URLs added + urls: list[URL] = await adb_client_test.get_all(URL) + assert set([url.id for url in urls]) == set(url_ids) + + # Confirm call to pdap_client.sync_data_sources made with cutoff_date + assert 
operator.pdap_client.sync_data_sources.call_args[0][0] == DataSourcesSyncParameters(
+        cutoff_date=sync_state_results.current_cutoff_date,
+        page=1
+    )
\ No newline at end of file
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py
deleted file mode 100644
index ebcbe856..00000000
--- a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_no_new_results.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from datetime import datetime
-from unittest.mock import MagicMock
-
-import pytest
-
-from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
-from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters
-from src.db.models.impl.state.sync.data_sources import DataSourcesSyncState
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import patch_sync_data_sources
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.data import ENTRIES
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.enums import SyncResponseOrder
-from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.manager.core import \
-    DataSourcesSyncTestSetupManager
-from tests.helpers.asserts import assert_task_run_success
-
-
-@pytest.mark.asyncio
-async def test_data_sources_sync_no_new_results(
-    test_operator: SyncDataSourcesTaskOperator
-):
-    adb_client = test_operator.adb_client
-
-    cutoff_date = datetime(2025, 5, 1).date()
-
-    manager = DataSourcesSyncTestSetupManager(
-        adb_client=adb_client,
-        entries=ENTRIES
-    )
-    await manager.setup()
-
-    first_response = await manager.get_data_sources_sync_responses(
-        [SyncResponseOrder.THIRD]
-    )
-
-    # Add cutoff date to database
-    await adb_client.add(
-        DataSourcesSyncState(
-            current_cutoff_date=cutoff_date
-        )
-    )
-
-    with patch_sync_data_sources(first_response):
-        run_info = await test_operator.run_task()
-        assert_task_run_success(run_info)
-    mock_func: MagicMock = test_operator.pdap_client.sync_data_sources
-
-    mock_func.assert_called_once_with(
-        DataSourcesSyncParameters(
-            cutoff_date=cutoff_date,
-            page=1
-        )
-    )
-    await check_sync_concluded(adb_client, check_updated_at=False)
-
-    # Check no syncs occurred
-    for sync_response_order in [SyncResponseOrder.FIRST, SyncResponseOrder.SECOND]:
-        with pytest.raises(AssertionError):
-            await manager.check_via_sync_response_order(sync_response_order)
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py
new file mode 100644
index 00000000..7878c83f
--- /dev/null
+++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_broken_approved.py
@@ -0,0 +1,85 @@
+from datetime import datetime
+
+import pytest
+
+from src.collectors.enums import URLStatus
+from src.core.enums import RecordType
+from src.core.tasks.base.run_info import TaskOperatorRunInfo
+from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters
+from src.db.client.async_ import AsyncDatabaseClient
+from
src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus
+from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.check import check_sync_concluded
+from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \
+    set_up_mock_pdap_client_responses, set_up_sync_response_info
+from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error
+
+
+@pytest.mark.asyncio
+async def test_url_broken_approved(
+    operator: SyncDataSourcesTaskOperator,
+    adb_client_test: AsyncDatabaseClient,
+    agency_ids: list[int],
+    current_db_time: datetime
+):
+    """
+    Test that a data source with
+    - a broken URL status
+    - an approved status
+    is added to the database with a `404 not found` status.
+    """
+
+    # Set up pdap client to return url with broken url status but approved
+    set_up_mock_pdap_client_responses(
+        mock_pdap_client=operator.pdap_client,
+        responses=[
+            set_up_sync_response_info(
+                ids=[0, 1],
+                record_type=RecordType.COMPLAINTS_AND_MISCONDUCT,
+                agency_ids=agency_ids,
+                approval_status=ApprovalStatus.APPROVED,
+                ds_url_status=DataSourcesURLStatus.BROKEN,
+            ),
+        ]
+    )
+
+    # Run operator
+    run_info: TaskOperatorRunInfo = await operator.run_task()
+
+    # Confirm operator ran without error
+    assert_task_ran_without_error(run_info)
+
+    # Check sync concluded
+    assert operator.pdap_client.sync_data_sources.call_count == 2
+
+    # Confirm presence of URL with status of `404 not found`
+    urls: list[URL] = await adb_client_test.get_all(URL)
+    assert len(urls) == 2
+    assert all([url.status == URLStatus.NOT_FOUND for url in urls])
+    assert all([url.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT for url in urls])
+    url_ids: list[int] = [url.id for url in urls]
+
+    # Confirm presence of agencies
+    links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency)
+    assert len(links) == 8
+    assert set(link.url_id for link in links) == set(url_ids)
+    assert set(link.agency_id for link in links) == set(agency_ids)
+
+    # Confirm presence of validated flag
+    flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated)
+    assert len(flags) == 2
+    assert all([flag.type == URLValidatedType.DATA_SOURCE for flag in flags])
+    assert set(flag.url_id for flag in flags) == set(url_ids)
+
+    # Confirm presence of sync status row
+    await check_sync_concluded(
+        adb_client_test,
+        current_db_datetime=current_db_time
+    )
+
+
+
diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py
new file mode 100644
index 00000000..e1c7f33c
--- /dev/null
+++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_in_db_overwritten_by_ds.py
@@ -0,0 +1,94 @@
+import pytest
+
+from src.collectors.enums import URLStatus
+from src.core.enums import RecordType
+from src.core.tasks.base.run_info import TaskOperatorRunInfo
+from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus +from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import set_up_urls, \ + set_up_mock_pdap_client_responses, set_up_sync_response_info +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error + + +@pytest.mark.asyncio +async def test_url_in_db_overwritten_by_ds( + operator: SyncDataSourcesTaskOperator, + adb_client_test: AsyncDatabaseClient, + agency_ids: list[int] +): + """ + Test that a URL in the database is overwritten by a data source with the same URL, + if their information is different. + """ + old_agency_ids: list[int] = agency_ids[:2] + new_agency_ids: list[int] = agency_ids[2:4] + + + # Add URLs to database + url_ids: list[int] = await set_up_urls( + adb_client=adb_client_test, + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + validated_type=URLValidatedType.DATA_SOURCE, + ) + # Link URLs to 2 existing agencies + links: list[LinkURLAgency] = [] + for url_id in url_ids: + for agency_id in old_agency_ids: + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id, + ) + links.append(link) + await adb_client_test.add_all(links) + + # Set up pdap client to return same URLs with different information + # - different name + # - different description + # - different status + # - different approval status (approved vs. not relevant) + # - different record type + # - different agencies assigned + set_up_mock_pdap_client_responses( + mock_pdap_client=operator.pdap_client, + responses=[ + set_up_sync_response_info( + ids=[0, 1], + record_type=RecordType.ACCIDENT_REPORTS, + agency_ids=new_agency_ids, + approval_status=ApprovalStatus.REJECTED, + ds_url_status=DataSourcesURLStatus.BROKEN, + ), + ] + ) + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + + # Confirm URL name, description, record type, and status are overwritten + urls: list[URL] = await adb_client_test.get_all(URL) + assert len(urls) == 2 + assert all([url.status == URLStatus.NOT_FOUND for url in urls]) + assert all([url.record_type == RecordType.ACCIDENT_REPORTS for url in urls]) + url_ids: list[int] = [url.id for url in urls] + + # Confirm agencies are overwritten + links: list[LinkURLAgency] = await adb_client_test.get_all(LinkURLAgency) + assert len(links) == 4 + assert set(link.url_id for link in links) == set(url_ids) + assert set(link.agency_id for link in links) == set(new_agency_ids) + + # Confirm validated types overwritten + flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated) + assert len(flags) == 2 + assert all([flag.type == URLValidatedType.NOT_RELEVANT for flag in flags]) + assert set(flag.url_id for flag in flags) == set(url_ids) + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py new file mode 100644 index 00000000..eeff4028 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/test_url_ok_approved.py @@ -0,0 +1,63 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from 
src.core.tasks.base.run_info import TaskOperatorRunInfo
+from src.core.tasks.scheduled.impl.sync.data_sources.operator import SyncDataSourcesTaskOperator
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.url.core.sqlalchemy import URL
+from src.external.pdap.enums import ApprovalStatus, DataSourcesURLStatus
+from tests.automated.integration.tasks.scheduled.impl.sync.data_sources.setup.core import \
+    set_up_mock_pdap_client_responses, set_up_sync_response_info
+from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error
+
+
+@pytest.mark.asyncio
+async def test_url_ok_approved(
+    operator: SyncDataSourcesTaskOperator,
+    adb_client_test: AsyncDatabaseClient,
+    agency_ids: list[int]
+):
+    """
+    Test that a URL with an OK URL status and an approved status
+    is added to the database with an OK status
+    and a validated flag of type `data source`.
+    """
+
+    # Set up pdap client to return url with ok url status and approved
+    set_up_mock_pdap_client_responses(
+        mock_pdap_client=operator.pdap_client,
+        responses=[
+            set_up_sync_response_info(
+                ids=[0, 1],
+                record_type=RecordType.OTHER,
+                agency_ids=agency_ids,
+                approval_status=ApprovalStatus.APPROVED,
+                ds_url_status=DataSourcesURLStatus.OK,
+            ),
+        ]
+    )
+
+    # Run operator
+    run_info: TaskOperatorRunInfo = await operator.run_task()
+
+    # Confirm operator ran without error
+    assert_task_ran_without_error(run_info)
+
+    # Check sync concluded
+    assert operator.pdap_client.sync_data_sources.call_count == 2
+
+    # Confirm URL is added to database with OK status
+    urls: list[URL] = await adb_client_test.get_all(URL)
+    assert len(urls) == 2
+    assert all([url.status == URLStatus.OK for url in urls])
+    assert all([url.record_type == RecordType.OTHER for url in urls])
+    url_ids: list[int] = [url.id for url in urls]
+
+    # Confirm presence of validated flag
+    flags: list[FlagURLValidated] = await adb_client_test.get_all(FlagURLValidated)
+    assert len(flags) == 2
+    assert all([flag.type == URLValidatedType.DATA_SOURCE for flag in flags])
+    assert set(flag.url_id for flag in flags) == set(url_ids)
diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py
similarity index 69%
rename from tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py
rename to tests/automated/integration/tasks/url/impl/agency_identification/conftest.py
index b6787899..7feb6d61 100644
--- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/conftest.py
+++ b/tests/automated/integration/tasks/url/impl/agency_identification/conftest.py
@@ -1,29 +1,29 @@
-from unittest.mock import create_autospec, AsyncMock
+from unittest.mock import create_autospec
 
 import pytest
 
 from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface
 from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \
+    NLPProcessor
 from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
 from src.db.client.async_ import AsyncDatabaseClient
 from src.external.pdap.client import PDAPClient
-from
tests.automated.integration.tasks.url.impl.agency_identification.happy_path.mock import mock_run_subtask @pytest.fixture def operator( adb_client_test: AsyncDatabaseClient -): +) -> AgencyIdentificationTaskOperator: operator = AgencyIdentificationTaskOperator( adb_client=adb_client_test, loader=AgencyIdentificationSubtaskLoader( pdap_client=create_autospec(PDAPClient), - muckrock_api_interface=create_autospec(MuckrockAPIInterface) - ) - ) - operator.run_subtask = AsyncMock( - side_effect=mock_run_subtask + muckrock_api_interface=create_autospec(MuckrockAPIInterface), + adb_client=adb_client_test, + nlp_processor=create_autospec(NLPProcessor) + ), ) return operator diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py deleted file mode 100644 index c7818e77..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/asserts.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion - - -async def assert_expected_confirmed_and_auto_suggestions(adb_client: AsyncDatabaseClient): - confirmed_suggestions = await adb_client.get_urls_with_confirmed_agencies() - - # The number of confirmed suggestions is dependent on how often - # the subtask iterated through the sample agency suggestions defined in `data.py` - assert len(confirmed_suggestions) == 3, f"Expected 3 confirmed suggestions, got {len(confirmed_suggestions)}" - agencies = await adb_client.get_all(Agency) - assert len(agencies) == 2 - auto_suggestions = await adb_client.get_all(AutomatedUrlAgencySuggestion) - assert len(auto_suggestions) == 4, f"Expected 4 auto suggestions, got {len(auto_suggestions)}" - # Of the auto suggestions, 2 should be unknown - assert len([s for s in auto_suggestions if s.is_unknown]) == 2 - # Of the auto suggestions, 2 should not be unknown - assert len([s for s in auto_suggestions if not s.is_unknown]) == 2 diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py deleted file mode 100644 index ea224c37..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/data.py +++ /dev/null @@ -1,34 +0,0 @@ - - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo - -SAMPLE_AGENCY_SUGGESTIONS = [ - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.UNKNOWN, - pdap_agency_id=None, - agency_name=None, - state=None, - county=None, - locality=None - ), - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=-1, - agency_name="Test Agency", - state="Test State", - county="Test County", - locality="Test Locality" - ), - URLAgencySuggestionInfo( - url_id=-1, # This will be overwritten - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=-1, - agency_name="Test Agency 2", - state="Test State 2", - county="Test County 2", - locality="Test Locality 2" - ) -] diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py 
b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py deleted file mode 100644 index a4dcb227..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/mock.py +++ /dev/null @@ -1,19 +0,0 @@ -from copy import deepcopy -from typing import Optional - -from src.core.enums import SuggestionType -from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.data import SAMPLE_AGENCY_SUGGESTIONS - - -async def mock_run_subtask( - subtask, - url_id: int, - collector_metadata: Optional[dict] -): - """A mocked version of run_subtask that returns a single suggestion for each url_id.""" - - # Deepcopy to prevent using the same instance in memory - suggestion = deepcopy(SAMPLE_AGENCY_SUGGESTIONS[url_id % 3]) - suggestion.url_id = url_id - suggestion.pdap_agency_id = (url_id % 3) if suggestion.suggestion_type != SuggestionType.UNKNOWN else None - return [suggestion] diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py deleted file mode 100644 index dc261c12..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/test_happy_path.py +++ /dev/null @@ -1,129 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest -from aiohttp import ClientSession - -from src.collectors.enums import CollectorType, URLStatus -from src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ - MuckrockAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask -from tests.automated.integration.tasks.url.impl.agency_identification.happy_path.asserts import \ - assert_expected_confirmed_and_auto_suggestions -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 - - -@pytest.mark.asyncio -async def test_agency_identification_task( - db_data_creator: DBDataCreator, - test_client_session: ClientSession, - operator: AgencyIdentificationTaskOperator, -): - """Test full flow of AgencyIdentificationTaskOperator""" - - # Confirm does not yet meet prerequisites - assert not await operator.meets_task_prerequisites() - - collector_type_to_url_id: dict[CollectorType | None, int] = {} - - # Create six urls, one from each strategy - for strategy in [ - CollectorType.COMMON_CRAWLER, - CollectorType.AUTO_GOOGLER, - CollectorType.MUCKROCK_COUNTY_SEARCH, - CollectorType.MUCKROCK_SIMPLE_SEARCH, - CollectorType.MUCKROCK_ALL_SEARCH, - CollectorType.CKAN, - ]: - # Create two URLs for each, one pending and one errored - creation_info: BatchURLCreationInfoV2 = await db_data_creator.batch_v2( - parameters=TestBatchCreationParameters( - strategy=strategy, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - with_html_content=True - ), - TestURLCreationParameters( - count=1, - 
status=URLStatus.ERROR, - with_html_content=True - ) - ] - ) - ) - collector_type_to_url_id[strategy] = creation_info.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id - - # Create an additional two urls with no collector. - response = await db_data_creator.url_v2( - parameters=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - with_html_content=True - ), - TestURLCreationParameters( - count=1, - status=URLStatus.ERROR, - with_html_content=True - ) - ] - ) - collector_type_to_url_id[None] = response.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id - - - # Confirm meets prerequisites - assert await operator.meets_task_prerequisites() - # Run task - run_info = await operator.run_task() - assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - - # Confirm tasks are piped into the correct subtasks - # * common_crawler into common_crawler_subtask - # * auto_googler into auto_googler_subtask - # * muckrock_county_search into muckrock_subtask - # * muckrock_simple_search into muckrock_subtask - # * muckrock_all_search into muckrock_subtask - # * ckan into ckan_subtask - - - mock_run_subtask: AsyncMock = operator.run_subtask - - # Check correct number of calls to run_subtask - assert mock_run_subtask.call_count == 7 - - # Confirm subtask classes are correct for the given urls - d2 = {} - for call_arg in mock_run_subtask.call_args_list: - subtask_class = call_arg[0][0].__class__ - url_id = call_arg[0][1] - d2[url_id] = subtask_class - - - subtask_class_collector_type = [ - (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_ALL_SEARCH), - (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_COUNTY_SEARCH), - (MuckrockAgencyIdentificationSubtask, CollectorType.MUCKROCK_SIMPLE_SEARCH), - (CKANAgencyIdentificationSubtask, CollectorType.CKAN), - (UnknownAgencyIdentificationSubtask, CollectorType.COMMON_CRAWLER), - (UnknownAgencyIdentificationSubtask, CollectorType.AUTO_GOOGLER), - (UnknownAgencyIdentificationSubtask, None) - ] - - for subtask_class, collector_type in subtask_class_collector_type: - url_id = collector_type_to_url_id[collector_type] - assert d2[url_id] == subtask_class - - # Confirm task again does not meet prerequisites - assert not await operator.meets_task_prerequisites() - # # Check confirmed and auto suggestions - adb_client = db_data_creator.adb_client - # TODO: This component appears to be affected by the order of other tests being run - # but does pass when run alone. Resolve. 
- # await assert_expected_confirmed_and_auto_suggestions(adb_client) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py new file mode 100644 index 00000000..90aacfa5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py @@ -0,0 +1,100 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.collectors.enums import CollectorType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.external.pdap.enums import MatchAgencyResponseStatus +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.enums import SuggestionType +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_ckan_subtask( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator +): + # Test that ckan subtask correctly sends agency id to + # CKANAPIInterface, sends resultant agency name to + # PDAPClient and adds received suggestions to + # url_agency_suggestions + adb_client: AsyncDatabaseClient = operator.adb_client + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency_name": "Test Agency" + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.CKAN + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is CKAN + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.CKAN + + pdap_client_mock = operator.loader._pdap_client + pdap_client_mock.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) + + # Create agencies + await db_data_creator.create_agency(1) + await db_data_creator.create_agency(2) + + # Run the operator + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm prerequisite no longer met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Verify results + 
subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.CKAN + assert subtask.url_id == applicable_url_id + subtask_id: int = subtask.id + + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all( + AgencyIDSubtaskSuggestion + ) + assert len(suggestions) == 2 + assert {suggestion.confidence for suggestion in suggestions} == {50} + assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} + + # Assert methods called as expected + pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency") diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py new file mode 100644 index 00000000..05a9e2bb --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py @@ -0,0 +1,51 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_blacklist( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test that the survey does not pick up a URL for Homepage + Match when the meta URLs sharing its root URL are linked + to more than two agencies.""" + # Create Root URL + root_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([root_url_id]) + + # Create ineligible URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Link Root URL to ineligible URL + await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) + + # Create Meta URLs + meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + count=3, + validation_type=URLValidatedType.META_URL + ) + + # Create 3 agencies + agency_ids: list[int] = await db_data_creator.create_agencies(count=3) + + # Link Meta URLs to Agencies + await db_data_creator.link_urls_to_agencies( + url_ids=[url.url_id for url in meta_urls], + agency_ids=agency_ids + ) + + # Link Meta URLs to Root URL + await db_data_creator.link_urls_to_root( + url_ids=[url.url_id for url in meta_urls], + root_url_id=root_url_id + ) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites()
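
A minimal sketch of the rule test_blacklist exercises, shown only for orientation; the helper name and signature are hypothetical, not the operator's implementation:

# Hypothetical helper, not part of the codebase: a URL is eligible for
# the Homepage Match subtask only while the validated meta URLs that
# share its root URL are linked to at most two distinct agencies.
def eligible_for_homepage_match(meta_url_agency_ids: set[int]) -> bool:
    # meta_url_agency_ids: distinct agency ids linked to meta URLs
    # under the same root URL as the candidate URL.
    return len(meta_url_agency_ids) <= 2

diff --git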
a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py new file mode 100644 index 00000000..a9576768 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_no_validated_meta_urls.py @@ -0,0 +1,29 @@ + +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_no_validated_meta_urls( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test that the survey does not pick up for Homepage Match + URLs whose Root URLs have no validated meta URLs.""" + + # Create Root URL + root_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([root_url_id]) + + # Create ineligible URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Link Root URL to ineligible URL + await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py new file mode 100644 index 00000000..627dd05a --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_root_urls.py @@ -0,0 +1,21 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_root_urls( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """Test survey does not pick up root URLs for Homepage Match.""" + + # Create URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Flag as Root + await db_data_creator.flag_as_root([url_id]) + + # Run survey and confirm prerequisites not met + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py new file mode 100644 index 00000000..43a1677c --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py @@ -0,0 +1,159 @@ +from collections import defaultdict + +import pytest + +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType,
SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_homepage_match( + db_data_creator: DBDataCreator, + operator: AgencyIdentificationTaskOperator, +): + """ + Test the following cases: + Single Agency: A URL whose Root URL has one meta URL is properly linked + Multi Agency: A URL whose Root URL has multiple meta URLs is properly linked + """ + + # Create 2 root URLs + root_url_mappings: list[URLMapping] = ( + await db_data_creator.create_urls(count=2) + ) + root_url_ids: list[int] = [url_mapping.url_id for url_mapping in root_url_mappings] + + # Flag as Root + await db_data_creator.flag_as_root(root_url_ids) + + # Separate Root URLs + single_agency_root_url_id: int = root_url_ids[0] + multi_agency_root_url_id: int = root_url_ids[1] + + # Create 3 agencies + agency_ids: list[int] = await db_data_creator.create_agencies(count=3) + single_agency_id: int = agency_ids[0] + multi_agency_ids: list[int] = agency_ids[1:] + + # Create 1 Meta URL for single agency case + single_meta_url_id: int = (await db_data_creator.create_validated_urls( + count=1, + validation_type=URLValidatedType.META_URL + ))[0].url_id + # Link single meta URL to single agency + await db_data_creator.create_url_agency_links( + url_ids=[single_meta_url_id], + agency_ids=[single_agency_id]) + # Link single meta URL to root + await db_data_creator.link_urls_to_root( + url_ids=[single_meta_url_id], + root_url_id=single_agency_root_url_id + ) + + + # Create 2 Meta URLs and agencies for multi agency case + multi_meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + count=2, + validation_type=URLValidatedType.META_URL + ) + multi_meta_url_ids: list[int] = [url_mapping.url_id for url_mapping in multi_meta_urls] + # Link multi meta URLs to agencies + await db_data_creator.create_url_agency_links( + url_ids=[multi_meta_url_ids[0]], + agency_ids=[multi_agency_ids[0]] + ) + await db_data_creator.create_url_agency_links( + url_ids=[multi_meta_url_ids[1]], + agency_ids=[multi_agency_ids[1]] + ) + # Link multi meta URLs to root + await db_data_creator.link_urls_to_root( + url_ids=multi_meta_url_ids, + root_url_id=multi_agency_root_url_id + ) + + # Check operator does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Set up eligible URLs + eligible_urls: list[URLMapping] = await db_data_creator.create_urls( + count=2, + ) + single_url_id: int = eligible_urls[0].url_id + multi_url_id: int = eligible_urls[1].url_id + + # Link eligible URLs to each root + await db_data_creator.link_urls_to_root( + url_ids=[single_url_id], + root_url_id=single_agency_root_url_id + ) + await db_data_creator.link_urls_to_root( + url_ids=[multi_url_id], + root_url_id=multi_agency_root_url_id + ) + + # Check operator now meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.HOMEPAGE_MATCH + + # Run operator + run_info: TaskOperatorRunInfo = await operator.run_task() + + # Confirm operator ran without error + assert_task_ran_without_error(run_info) + + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + + # Confirm presence of subtasks + subtasks: list[URLAutoAgencyIDSubtask] = 
await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 2 + + # Confirm both subtasks report agencies found + assert all(subtask.agencies_found for subtask in subtasks) + + url_id_to_subtask: dict[int, URLAutoAgencyIDSubtask] = { + subtask.url_id: subtask for subtask in subtasks + } + single_subtask: URLAutoAgencyIDSubtask = url_id_to_subtask[single_url_id] + multi_subtask: URLAutoAgencyIDSubtask = url_id_to_subtask[multi_url_id] + + # Check subtasks have expected detail codes + assert single_subtask.detail == SubtaskDetailCode.HOMEPAGE_SINGLE_AGENCY + assert multi_subtask.detail == SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY + + + # Get suggestions + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 3 + + # Confirm each suggestion properly linked to expected subtask + subtask_id_to_suggestions: dict[int, list[AgencyIDSubtaskSuggestion]] = defaultdict(list) + for suggestion in suggestions: + subtask_id_to_suggestions[suggestion.subtask_id].append(suggestion) + + # Check Single Agency Case Suggestion + single_suggestion: AgencyIDSubtaskSuggestion = \ + subtask_id_to_suggestions[single_subtask.id][0] + # Check Single Agency Case Suggestion has expected agency + assert single_suggestion.agency_id == single_agency_id + # Confirm confidence is 95 + assert single_suggestion.confidence == 95 + + # Check Multi Agency Case Suggestion + multi_suggestions: list[AgencyIDSubtaskSuggestion] = subtask_id_to_suggestions[multi_subtask.id] + # Check Multi Agency Case Suggestion has expected agencies + assert {suggestion.agency_id for suggestion in multi_suggestions} \ == set(multi_agency_ids) + # Confirm confidence for each is 50 + assert all(suggestion.confidence == 50 for suggestion in multi_suggestions) + + # Test operator no longer meets prerequisites + assert not await operator.meets_task_prerequisites() \ No newline at end of file
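
Distilled from the two cases above, the confidence rule is simple. The helper below is hypothetical and only restates what the asserts verify; it is not the subtask's implementation:

# Hypothetical restatement of the confidence rule checked above:
# a single agency behind a root URL yields one strong suggestion (95),
# while several agencies yield one weak suggestion each (50).
def homepage_match_confidences(agency_ids: list[int]) -> dict[int, int]:
    confidence = 95 if len(agency_ids) == 1 else 50
    return {agency_id: confidence for agency_id in agency_ids}

diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py new file mode 100644 index 00000000..7cf72c5e --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py @@ -0,0 +1,148 @@ +from unittest.mock import MagicMock + +import pytest + +from src.collectors.enums import CollectorType +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType +from src.core.enums import SuggestionType +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from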
src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.enums import MatchAgencyResponseStatus +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_muckrock_subtask( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator +): + adb_client: AsyncDatabaseClient = operator.adb_client + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Add validated URL and confirm no next subtask + await db_data_creator.create_validated_urls(count=1) + + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Add unvalidated URL without collector type + inapplicable_url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + # Should still not have subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Create Auto Googler batch and link it to the unvalidated URL + inapplicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.AUTO_GOOGLER + ) + await db_data_creator.create_batch_url_links( + url_ids=[inapplicable_url_id], + batch_id=inapplicable_batch_id + ) + + # Confirm prerequisite not met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + # Create Muckrock batch and link it to a new URL with Muckrock agency metadata + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency": 123 + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.MUCKROCK_SIMPLE_SEARCH + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is Muckrock + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.MUCKROCK + + # Test that muckrock subtask correctly sends agency name to + # MatchAgenciesInterface and adds received suggestions to + # url_agency_suggestions + + # Retrieve the mock instances injected via the loader + muckrock_api_interface_mock = operator.loader._muckrock_api_interface + pdap_client_mock = operator.loader._pdap_client + + # Set up mock return values for method calls + muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( + type=AgencyLookupResponseType.FOUND, + name="Mock Agency Name", + error=None + ) + + # Create agencies + await db_data_creator.create_agency(1) + await db_data_creator.create_agency(2) + + pdap_client_mock.match_agency.return_value = MatchAgencyResponse( + status=MatchAgencyResponseStatus.PARTIAL_MATCH, + matches=[ + MatchAgencyInfo( + id=1, + submitted_name="Mock Agency Name", + ), + MatchAgencyInfo( + id=2, + submitted_name="Another Mock Agency Name", + ) + ] + ) + + # Run the operator + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm prerequisite no longer met + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + #
Verify results + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 1 + subtask: URLAutoAgencyIDSubtask = subtasks[0] + assert subtask.type == AutoAgencyIDSubtaskType.MUCKROCK + assert subtask.url_id == applicable_url_id + subtask_id: int = subtask.id + + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all( + AgencyIDSubtaskSuggestion + ) + assert len(suggestions) == 2 + assert {suggestion.confidence for suggestion in suggestions} == {50} + assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} + + + # Assert methods called as expected + muckrock_api_interface_mock.lookup_agency.assert_called_once_with( + muckrock_agency_id=123 + ) + pdap_client_mock.match_agency.assert_called_once_with( + name="Mock Agency Name" + )
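
The flow the mocks and asserts above pin down can be read at a glance from this sketch. It is illustrative only: the function name is made up, the call order is inferred from the test, and the operator's real wiring is not shown here:

# Illustrative sketch, not the implementation: what the Muckrock
# subtask is expected to do, per the mocks and asserts above.
async def muckrock_flow(collector_metadata: dict, muckrock_api, pdap_client) -> list[tuple[int, int]]:
    # collector_metadata["agency"] carries a Muckrock agency id.
    lookup = await muckrock_api.lookup_agency(
        muckrock_agency_id=collector_metadata["agency"]
    )
    # The resolved agency name is matched against PDAP agencies...
    match = await pdap_client.match_agency(name=lookup.name)
    # ...and every returned match becomes a suggestion at confidence 50.
    return [(info.id, 50) for info in match.matches]

diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py new file mode 100644 index 00000000..766a7ca5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/conftest.py @@ -0,0 +1,15 @@ +import pytest_asyncio + +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest_asyncio.fixture +async def url_ids( + db_data_creator: DBDataCreator, +) -> list[int]: + # Create 2 URLs with compressed HTML + url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_ids: list[int] = [url.url_id for url in url_mappings] + await db_data_creator.html_data(url_ids=url_ids) + return url_ids diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py new file mode 100644 index 00000000..2c3ed419 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/end_to_end/test_core.py @@ -0,0 +1,118 @@ +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from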
src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.task_url import LinkTaskURL +from src.db.models.impl.url.error_info.sqlalchemy import URLErrorInfo +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from tests.helpers.asserts import assert_task_run_success +from tests.helpers.data_creator.core import DBDataCreator + +PATCH_ROOT = ( + "src.core.tasks.url.operators.agency_identification.subtasks." + + "impl.nlp_location_match_.core.AgencyIDSubtaskInternalProcessor.process" +) + + + +@pytest.mark.asyncio +async def test_nlp_location_match( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, + url_ids: list[int], + monkeypatch +): + # Confirm operator meets prerequisites + assert await operator.meets_task_prerequisites() + assert operator._subtask == AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + + happy_path_url_id: int = url_ids[0] + error_url_id: int = url_ids[1] + + agency_ids: list[int] = await db_data_creator.create_agencies(count=2) + agency_id_25: int = agency_ids[0] + agency_id_75: int = agency_ids[1] + + async def mock_process_response( + self: AgencyIDSubtaskInternalProcessor, + inputs: list[NLPLocationMatchSubtaskInput], + ) -> list[AutoAgencyIDSubtaskData]: + response = [ + AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=self._task_id, + url_id=happy_path_url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=True, + ), + suggestions=[ + AgencySuggestion( + agency_id=agency_id_25, + confidence=25 + ), + AgencySuggestion( + agency_id=agency_id_75, + confidence=75 + ) + ] + ), + AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=self._task_id, + url_id=error_url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=False, + ), + suggestions=[], + error="Test error" + ) + ] + return response + + monkeypatch.setattr(AgencyIDSubtaskInternalProcessor, "process", mock_process_response) + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + adb_client: AsyncDatabaseClient = operator.adb_client + # Confirm two URLs linked to the task + task_links: list[LinkTaskURL] = await adb_client.get_all(LinkTaskURL) + assert len(task_links) == 2 + assert {task_link.url_id for task_link in task_links} == set(url_ids) + assert {task_link.task_id for task_link in task_links} == {operator._task_id} + + # Confirm two subtasks were created + subtasks: list[URLAutoAgencyIDSubtask] = await adb_client.get_all(URLAutoAgencyIDSubtask) + assert len(subtasks) == 2 + assert {subtask.url_id for subtask in subtasks} == set(url_ids) + assert {subtask.task_id for subtask in subtasks} == {operator._task_id} + assert {subtask.type for subtask in subtasks} == {AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH} + assert {subtask.agencies_found for subtask in subtasks} == {True, False} + + + # Confirm one URL error info + error_infos: list[URLErrorInfo] = await adb_client.get_all(URLErrorInfo) + assert len(error_infos) == 1 + assert error_infos[0].task_id == operator._task_id + assert 
error_infos[0].url_id == error_url_id + assert error_infos[0].error == "Test error" + + # Confirm two suggestions for happy path URL id + suggestions: list[AgencyIDSubtaskSuggestion] = await adb_client.get_all(AgencyIDSubtaskSuggestion) + assert len(suggestions) == 2 + # Confirm expected agency ids + assert {suggestion.agency_id for suggestion in suggestions} == set(agency_ids) + # Confirm both have the expected confidence values + assert {suggestion.confidence for suggestion in suggestions} == {25, 75} + diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py new file mode 100644 index 00000000..2abee544 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/conftest.py @@ -0,0 +1,18 @@ +from unittest.mock import AsyncMock + +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.external.pdap.client import PDAPClient + + +@pytest.fixture
def internal_processor() -> AgencyIDSubtaskInternalProcessor: + return AgencyIDSubtaskInternalProcessor( + nlp_processor=AsyncMock(spec=NLPProcessor), + pdap_client=AsyncMock(spec=PDAPClient), + task_id=1 + ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py new file mode 100644 index 00000000..01899f30 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_empty.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_empty( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has neither a US State nor locations, + no result is returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py
b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py new file mode 100644 index 00000000..5fbbc6b5 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_no_state_any_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_no_state_any_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has no US State, no result is + returned regardless of locations + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py new file mode 100644 index 00000000..6e7aef6a --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_multiple_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_state_multiple_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and multiple locations + then multiple results are returned with separate request ids + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py new file mode 100644 index 00000000..c0b1cef4 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_no_locations.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_state_no_locations( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and no locations + then no result is returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py new file mode 100644 index 00000000..7b4ef303 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/internal_processor/match_urls_to_search_params/test_state_one_location.py @@ -0,0 +1,14 @@
+import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor + + +@pytest.mark.asyncio() +async def test_state_one_location( + internal_processor: AgencyIDSubtaskInternalProcessor, +): + """ + Test that when an input has a US State and one location + then one result is returned + """ \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py new file mode 100644 index 00000000..ea81341c --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/nlp_location_match/test_nlp_response_valid.py @@ -0,0 +1,57 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + +US_STATE = USState( + name="Pennsylvania", + iso="PA", +) + +SINGLE_LOCATION: list[str] = ["Pittsburgh"] +MULTIPLE_LOCATION: list[str] = ["Pittsburgh", "Allegheny"] + +@pytest.mark.parametrize( + argnames="nlp_response, expected_result", + argvalues=[ + ( + NLPLocationMatchResponse( + locations=SINGLE_LOCATION, + us_state=US_STATE + ), + True, + ), + ( + NLPLocationMatchResponse( + locations=MULTIPLE_LOCATION, + us_state=US_STATE, + ), + True + ), + ( + NLPLocationMatchResponse( + locations=MULTIPLE_LOCATION, + us_state=None, + ), + False, + ), + ( + NLPLocationMatchResponse( + locations=[], + us_state=US_STATE, + ), + False, + ), + ( + NLPLocationMatchResponse( + locations=[], + us_state=None, + ), + False + ) + ], +) +def test_nlp_response_valid(nlp_response: NLPLocationMatchResponse, expected_result: bool): + assert nlp_response.valid == expected_result \ No newline at end of file
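
The parametrized cases above pin down `valid` exactly: it requires both a US state and at least one extracted location. A standalone sketch of that rule (the real NLPLocationMatchResponse is presumably a pydantic model; this dataclass stands in for illustration only):

# Illustration only, not the real model: `valid` as implied by the
# five parametrized cases above.
from dataclasses import dataclass


@dataclass
class NLPResponseSketch:
    locations: list[str]
    us_state: object | None

    @property
    def valid(self) -> bool:
        # Both signals must be present: a recognized US state and
        # at least one extracted location name.
        return self.us_state is not None and len(self.locations) > 0

diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py deleted file mode 100644 index 6a2e4fed..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_ckan.py +++ /dev/null @@ -1,58 +0,0 @@ -from unittest.mock import AsyncMock - -import pytest - -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.enums import SuggestionType -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_ckan_subtask(db_data_creator: DBDataCreator): - # Test that ckan subtask correctly sends agency id to - # CKANAPIInterface, sends resultant agency name to - # PDAPClient and adds received suggestions to - # url_agency_suggestions - - pdap_client = AsyncMock() - pdap_client.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) # Assuming MatchAgencyResponse is a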
class - - # Create an instance of CKANAgencyIdentificationSubtask - task = CKANAgencyIdentificationSubtask(pdap_client) - - # Call the run method with static values - collector_metadata = {"agency_name": "Test Agency"} - url_id = 1 - - # Call the run method - result = await task.run(url_id, collector_metadata) - - # Check the result - assert len(result) == 2 - assert result[0].url_id == 1 - assert result[0].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert result[0].pdap_agency_id == 1 - assert result[0].agency_name == "Mock Agency Name" - assert result[1].url_id == 1 - assert result[1].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert result[1].pdap_agency_id == 2 - assert result[1].agency_name == "Another Mock Agency Name" - - # Assert methods called as expected - pdap_client.match_agency.assert_called_once_with(name="Test Agency") - diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py deleted file mode 100644 index 80f92ec4..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_muckrock.py +++ /dev/null @@ -1,80 +0,0 @@ -from unittest.mock import MagicMock - -import pytest - -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import MuckrockAgencyIdentificationSubtask -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_muckrock_subtask(db_data_creator: DBDataCreator): - # Test that muckrock subtask correctly sends agency name to - # MatchAgenciesInterface and adds received suggestions to - # url_agency_suggestions - - # Create mock instances for dependency injections - muckrock_api_interface_mock = MagicMock(spec=MuckrockAPIInterface) - pdap_client_mock = MagicMock(spec=PDAPClient) - - # Set up mock return values for method calls - muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( - type=AgencyLookupResponseType.FOUND, - name="Mock Agency Name", - error=None - ) - - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) - - # Create an instance of MuckrockAgencyIdentificationSubtask with mock dependencies - muckrock_agency_identification_subtask = MuckrockAgencyIdentificationSubtask( - muckrock_api_interface=muckrock_api_interface_mock, - pdap_client=pdap_client_mock - ) - - # Run the subtask - results: list[URLAgencySuggestionInfo] = await muckrock_agency_identification_subtask.run( - url_id=1, - collector_metadata={ - "agency": 123 - } - ) - - # Verify the results - assert len(results) == 2 - assert results[0].url_id 
== 1 - assert results[0].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert results[0].pdap_agency_id == 1 - assert results[0].agency_name == "Mock Agency Name" - assert results[1].url_id == 1 - assert results[1].suggestion_type == SuggestionType.AUTO_SUGGESTION - assert results[1].pdap_agency_id == 2 - assert results[1].agency_name == "Another Mock Agency Name" - - # Assert methods called as expected - muckrock_api_interface_mock.lookup_agency.assert_called_once_with( - muckrock_agency_id=123 - ) - pdap_client_mock.match_agency.assert_called_once_with( - name="Mock Agency Name" - ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py deleted file mode 100644 index aab59dca..00000000 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/test_unknown.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask - - -@pytest.mark.asyncio -async def test_unknown_agency_identification_subtask(): - # Test that no_collector subtask correctly adds URL to - # url_agency_suggestions with label 'Unknown' - subtask = UnknownAgencyIdentificationSubtask() - results: list[URLAgencySuggestionInfo] = await subtask.run(url_id=1, collector_metadata={}) - assert len(results) == 1 - assert results[0].url_id == 1 - assert results[0].suggestion_type == SuggestionType.UNKNOWN \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/survey/__init__.py b/tests/automated/integration/tasks/url/impl/agency_identification/survey/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py new file mode 100644 index 00000000..8ace042e --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/agency_identification/survey/test_survey_flag.py @@ -0,0 +1,49 @@ +import pytest + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from tests.helpers.data_creator.core import DBDataCreator + +@pytest.mark.asyncio +async def test_survey_flag( + operator: AgencyIdentificationTaskOperator, + db_data_creator: DBDataCreator, + monkeypatch +): + """ + Test that survey correctly disables Subtask flags + when the environment variable is set to disable that subtask + """ + + # Run basic survey and confirm no next subtask + assert not await operator.meets_task_prerequisites() + assert operator._subtask is None + + applicable_url_id: int = ( + await db_data_creator.create_urls( + count=1, + collector_metadata={ + "agency_name": "Test Agency" + } + ) + )[0].url_id + applicable_batch_id: int = await db_data_creator.create_batch( + strategy=CollectorType.CKAN + ) + await db_data_creator.create_batch_url_links( + url_ids=[applicable_url_id], + batch_id=applicable_batch_id + ) + + # Confirm prerequisite met and subtask is CKAN + assert await operator.meets_task_prerequisites() + assert 
operator._subtask == AutoAgencyIDSubtaskType.CKAN + + # Set flag to disable CKAN Subtask + monkeypatch.setenv( + "AGENCY_ID_CKAN_FLAG", "0" + ) + + # Confirm prerequisite no longer met. + assert not await operator.meets_task_prerequisites() \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py index 81b03070..5943213b 100644 --- a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py @@ -32,7 +32,7 @@ async def test_url_auto_relevant_task(db_data_creator): assert len(urls) == 3 counter = Counter([url.status for url in urls]) assert counter[URLStatus.ERROR] == 1 - assert counter[URLStatus.PENDING] == 2 + assert counter[URLStatus.OK] == 2 # Confirm two annotations were created suggestions: list[AutoRelevantSuggestion] = await adb_client.get_all(AutoRelevantSuggestion) diff --git a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py index 76f1969e..c0dbef6a 100644 --- a/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py +++ b/tests/automated/integration/tasks/url/impl/html/mocks/url_request_interface/setup.py @@ -3,7 +3,6 @@ from src.external.url_request.dtos.url_response import URLResponseInfo from tests.automated.integration.tasks.url.impl.html.setup.data import TEST_ENTRIES from tests.automated.integration.tasks.url.impl.html.setup.models.entry import TestURLHTMLTaskSetupEntry, TestErrorType -from tests.helpers.simple_test_data_functions import generate_test_html def _get_success( @@ -29,6 +28,19 @@ def _get_content_type( return None return "text/html" +def _generate_test_html() -> str: + return """ + <!DOCTYPE html> + <html> + <head> + <title>Example HTML</title> + </head> + <body> + <h1>Example HTML</h1> + <p>This is an example of HTML content.</p> + </body> + </html> + """ def setup_url_to_response_info( ) -> dict[str, URLResponseInfo]: @@ -37,7 +49,7 @@ response_info = URLResponseInfo( success=_get_success(entry), status=get_http_status(entry), - html=generate_test_html() if _get_success(entry) else None, + html=_generate_test_html() if _get_success(entry) else None, content_type=_get_content_type(entry), exception=None if _get_success(entry) else "Error" ) diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index e9495ad4..5615392c 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -11,7 +11,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://happy-path.com/pending", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -66,7 +66,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://not-200-path.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -83,7 +83,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://no-web-metadata.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=None, expected_result=ExpectedResult( diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py index 404f00e1..e788fff1 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py @@ -1,15 +1,19 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_url_probe_task_error( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 500 error response (or any other error), @@ -28,15 +32,20 @@ ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.SUBMITTED) + url_id: int = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.DATA_SOURCE) + await db_data_creator.create_url_data_sources([url_id]) + assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.SUBMITTED + expected_status=URLStatus.OK ) + + await check_manager.check_web_metadata( url_id=url_id, status_code=500,
index 97937c15..7fc54da4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py @@ -1,15 +1,18 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_url_probe_task_not_found( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 404 error response, @@ -29,14 +32,15 @@ async def test_url_probe_task_not_found( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.NOT_RELEVANT) + url_id = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.NOT_RELEVANT) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.NOT_RELEVANT + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py index a02f1ba4..ecaec084 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py @@ -28,14 +28,14 @@ async def test_url_probe_task_no_redirect_ok( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await setup_manager.setup_url(URLStatus.OK) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index 0c1da5fd..cfd1f68f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -31,8 +31,8 @@ async def test_two_urls( ] ) assert not await operator.meets_task_prerequisites() - url_id_1 = await setup_manager.setup_url(URLStatus.PENDING, url=url_1) - url_id_2 = await setup_manager.setup_url(URLStatus.NOT_RELEVANT, url=url_2) + url_id_1 = await setup_manager.setup_url(URLStatus.OK, url=url_1) + url_id_2 = await setup_manager.setup_url(URLStatus.OK, url=url_2) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) diff --git 
a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py index 88098b16..df695021 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py @@ -28,12 +28,12 @@ async def test_url_probe_task_redirect_dest_new_ok( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.PENDING) + source_url_id = await setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, @@ -45,7 +45,7 @@ async def test_url_probe_task_redirect_dest_new_ok( dest_url_id = await check_manager.check_redirect(source_url_id) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 0744f3b9..b52dce6b 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -29,8 +29,8 @@ async def test_url_probe_task_redirect_dest_exists_in_db( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.INDIVIDUAL_RECORD) - dest_url_id = await setup_manager.setup_url(URLStatus.PENDING, url=TEST_DEST_URL) + source_url_id = await setup_manager.setup_url(URLStatus.OK) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, @@ -44,11 +44,11 @@ async def test_url_probe_task_redirect_dest_exists_in_db( assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.INDIVIDUAL_RECORD + expected_status=URLStatus.OK ) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py index ed9c38ac..5a66af3d 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py @@ -27,11 +27,11 @@ async def test_url_probe_task_redirect_infinite( redirect_url=TEST_URL ) ) - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index 267d9015..f0e113ff 
diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
index 267d9015..f0e113ff 100644
--- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
+++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py
@@ -34,17 +34,17 @@ async def test_url_probe_task_redirect_two_urls_same_dest(
             ),
         ]
     )
-    source_url_id_1 = await setup_manager.setup_url(URLStatus.PENDING)
-    source_url_id_2 = await setup_manager.setup_url(URLStatus.PENDING, url="https://example.com/2")
+    source_url_id_1 = await setup_manager.setup_url(URLStatus.OK)
+    source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2")
     run_info = await operator.run_task()
     assert_task_ran_without_error(run_info)
     await check_manager.check_url(
         url_id=source_url_id_1,
-        expected_status=URLStatus.PENDING
+        expected_status=URLStatus.OK
     )
     await check_manager.check_url(
         url_id=source_url_id_2,
-        expected_status=URLStatus.PENDING
+        expected_status=URLStatus.OK
     )
     redirect_url_id_1 = await check_manager.check_redirect(
         source_url_id=source_url_id_1
diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py
index 7d56ddcf..f992fbb6 100644
--- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py
+++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py
@@ -16,9 +16,9 @@

 @pytest.mark.asyncio
 async def test_submit_approved_url_task(
-        db_data_creator,
-        mock_pdap_client: PDAPClient,
-        monkeypatch
+    db_data_creator,
+    mock_pdap_client: PDAPClient,
+    monkeypatch
 ):
     """
     The submit_approved_url_task should submit
@@ -37,7 +37,7 @@ async def test_submit_approved_url_task(

     # Create URLs with status 'validated' in database and all requisite URL values
     # Ensure they have optional metadata as well
-    urls = await setup_validated_urls(db_data_creator)
+    urls: list[str] = await setup_validated_urls(db_data_creator)
     mock_make_request(mock_pdap_client, urls)

     # Check Task Operator does meet pre-requisites
@@ -50,14 +50,14 @@ async def test_submit_approved_url_task(
     assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message

     # Get URLs
-    urls = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id")
-    url_1 = urls[0]
-    url_2 = urls[1]
-    url_3 = urls[2]
+    urls: list[URL] = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id")
+    url_1: URL = urls[0]
+    url_2: URL = urls[1]
+    url_3: URL = urls[2]

     # Check URLs have been marked as 'submitted'
-    assert url_1.status == URLStatus.SUBMITTED
-    assert url_2.status == URLStatus.SUBMITTED
+    assert url_1.status == URLStatus.OK
+    assert url_2.status == URLStatus.OK
     assert url_3.status == URLStatus.ERROR

     # Get URL Data Source Links
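[Reviewer note: not part of the patch] test_submit_approved_url_task now expects successfully submitted URLs to remain at URLStatus.OK rather than flip to SUBMITTED. A sketch of what "submitted" means under the new model; the helper name make_submitted_urls is hypothetical, but the body mirrors create_submitted_urls added later in this diff:

    from src.db.models.impl.flag.url_validated.enums import URLValidatedType

    async def make_submitted_urls(db_data_creator):
        # "Submitted" = status stays OK, plus a validated flag of type
        # DATA_SOURCE, plus a URLDataSource row linking to the data source.
        mappings = await db_data_creator.create_validated_urls(
            validation_type=URLValidatedType.DATA_SOURCE
        )
        await db_data_creator.create_url_data_sources(
            url_ids=[m.url_id for m in mappings]
        )
        return mappings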
diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py
new file mode 100644
index 00000000..5f927159
--- /dev/null
+++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py
@@ -0,0 +1,42 @@
+import pytest
+
+from src.core.tasks.base.run_info import TaskOperatorRunInfo
+from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource
+from src.external.pdap.client import PDAPClient
+from tests.helpers.asserts import assert_task_run_success
+
+
+@pytest.mark.asyncio
+async def test_validated_meta_url_not_included(
+    db_data_creator,
+    mock_pdap_client: PDAPClient,
+    monkeypatch
+):
+    """
+    If a validated Meta URL is present in the database,
+    it should not be included in the submit approved task.
+    """
+
+    # Get Task Operator
+    operator = SubmitApprovedURLTaskOperator(
+        adb_client=db_data_creator.adb_client,
+        pdap_client=mock_pdap_client
+    )
+
+    dbdc = db_data_creator
+    url_1: int = (await dbdc.create_validated_urls(
+        validation_type=URLValidatedType.META_URL
+    ))[0].url_id
+
+    # Test task operator does not meet prerequisites
+    assert not await operator.meets_task_prerequisites()
+
+    # Run task and confirm runs without error
+    run_info: TaskOperatorRunInfo = await operator.run_task()
+    assert_task_run_success(run_info)
+
+    # Confirm entry not included in database
+    ds_urls: list[URLDataSource] = await dbdc.adb_client.get_all(URLDataSource)
+    assert len(ds_urls) == 0
diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py
index 630f7f4e..e55ad9ad 100644
--- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py
+++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py
@@ -12,6 +12,7 @@
 from src.collectors.enums import URLStatus
 from src.core.tasks.url.enums import TaskOperatorOutcome
 from src.external.url_request.dtos.url_response import URLResponseInfo
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.data_creator.core import DBDataCreator
 from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters
 from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters
@@ -19,6 +20,7 @@

 @pytest.mark.asyncio
 async def test_url_404_probe_task(
+    wiped_database,
     db_data_creator: DBDataCreator
 ):

@@ -84,12 +86,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn
         urls=[
             TestURLCreationParameters(
                 count=3,
-                status=URLStatus.PENDING,
+                status=URLCreationEnum.OK,
                 with_html_content=True
             ),
             TestURLCreationParameters(
                 count=1,
-                status=URLStatus.ERROR,
+                status=URLCreationEnum.ERROR,
                 with_html_content=False
             ),
         ]
@@ -104,12 +106,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn

     assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message

-    pending_url_mappings = creation_info.urls_by_status[URLStatus.PENDING].url_mappings
+    pending_url_mappings = creation_info.urls_by_status[URLCreationEnum.OK].url_mappings
     url_id_success = pending_url_mappings[0].url_id
     url_id_404 = pending_url_mappings[1].url_id
     url_id_error = pending_url_mappings[2].url_id

-    url_id_initial_error = creation_info.urls_by_status[URLStatus.ERROR].url_mappings[0].url_id
+    url_id_initial_error = creation_info.urls_by_status[URLCreationEnum.ERROR].url_mappings[0].url_id

     # Check that URLProbedFor404 has been appropriately populated
     probed_for_404_objects: list[URLProbedFor404] = await db_data_creator.adb_client.get_all(URLProbedFor404)
@@ -128,9 +130,9 @@ def find_url(url_id: int) -> URL:
                 return url
         raise Exception(f"URL with id {url_id} not found")

-    assert find_url(url_id_success).status == URLStatus.PENDING
+    assert find_url(url_id_success).status == URLStatus.OK
     assert find_url(url_id_404).status == URLStatus.NOT_FOUND
-    assert find_url(url_id_error).status == URLStatus.PENDING
+    assert find_url(url_id_error).status == URLStatus.OK
     assert find_url(url_id_initial_error).status == URLStatus.ERROR

     # Check that meets_task_prerequisites now returns False
diff --git a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py
index 045236f9..52a17b5e 100644
--- a/tests/automated/integration/tasks/url/loader/conftest.py
+++ b/tests/automated/integration/tasks/url/loader/conftest.py
@@ -4,10 +4,11 @@

 from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface
 from src.core.tasks.url.loader import URLTaskOperatorLoader
+from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \
+    NLPProcessor
 from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
 from src.db.client.async_ import AsyncDatabaseClient
 from src.external.huggingface.inference.client import HuggingFaceInferenceClient
-from src.external.internet_archives.client import InternetArchivesClient
 from src.external.pdap.client import PDAPClient
 from src.external.url_request.core import URLRequestInterface

@@ -22,4 +23,5 @@ def loader() -> URLTaskOperatorLoader:
         pdap_client=AsyncMock(spec=PDAPClient),
         muckrock_api_interface=AsyncMock(spec=MuckrockAPIInterface),
         hf_inference_client=AsyncMock(spec=HuggingFaceInferenceClient),
+        nlp_processor=AsyncMock(spec=NLPProcessor)
     )
\ No newline at end of file
diff --git a/tests/helpers/batch_creation_parameters/core.py b/tests/helpers/batch_creation_parameters/core.py
index dfc33644..4562cbdf 100644
--- a/tests/helpers/batch_creation_parameters/core.py
+++ b/tests/helpers/batch_creation_parameters/core.py
@@ -9,10 +9,10 @@

 class TestBatchCreationParameters(BaseModel):
-    created_at: Optional[datetime.datetime] = None
+    created_at: datetime.datetime | None = None
     outcome: BatchStatus = BatchStatus.READY_TO_LABEL
     strategy: CollectorType = CollectorType.EXAMPLE
-    urls: Optional[list[TestURLCreationParameters]] = None
+    urls: list[TestURLCreationParameters] | None = None

     @model_validator(mode='after')
     def validate_urls(self):
diff --git a/tests/helpers/batch_creation_parameters/enums.py b/tests/helpers/batch_creation_parameters/enums.py
new file mode 100644
index 00000000..d61a2793
--- /dev/null
+++ b/tests/helpers/batch_creation_parameters/enums.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class URLCreationEnum(Enum):
+    OK = "ok"
+    SUBMITTED = "submitted"
+    VALIDATED = "validated"
+    ERROR = "error"
+    NOT_RELEVANT = "not_relevant"
+    DUPLICATE = "duplicate"
+    NOT_FOUND = "not_found"
\ No newline at end of file
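[Reviewer note: not part of the patch] URLCreationEnum separates what a test asks for (an "ok", "validated", or "submitted" URL) from what actually gets stored (a URLStatus plus optional flag rows). A sketch of a fixture description under the new enum:

    from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters
    from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
    from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters

    params = TestBatchCreationParameters(
        urls=[
            TestURLCreationParameters(count=3, status=URLCreationEnum.OK, with_html_content=True),
            TestURLCreationParameters(count=1, status=URLCreationEnum.VALIDATED),
        ]
    )
    # The VALIDATED rows land in the database as URLStatus.OK; the "validated"
    # half of the request becomes a flag_url_validated row during creation.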
diff --git a/tests/helpers/batch_creation_parameters/url_creation_parameters.py b/tests/helpers/batch_creation_parameters/url_creation_parameters.py
index 2e30cca0..701a239b 100644
--- a/tests/helpers/batch_creation_parameters/url_creation_parameters.py
+++ b/tests/helpers/batch_creation_parameters/url_creation_parameters.py
@@ -1,23 +1,26 @@
 from pydantic import BaseModel, model_validator

 from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo
-from src.collectors.enums import URLStatus
 from src.core.enums import RecordType
 from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum


 class TestURLCreationParameters(BaseModel):
     count: int = 1
-    status: URLStatus = URLStatus.PENDING
+    status: URLCreationEnum = URLCreationEnum.OK
     with_html_content: bool = False
     annotation_info: AnnotationInfo = AnnotationInfo()

     @model_validator(mode='after')
     def validate_annotation_info(self):
-        if self.status == URLStatus.NOT_RELEVANT:
+        if self.status == URLCreationEnum.NOT_RELEVANT:
             self.annotation_info.final_review_approved = False
             return self
-        if self.status != URLStatus.VALIDATED:
+        if self.status not in (
+            URLCreationEnum.SUBMITTED,
+            URLCreationEnum.VALIDATED
+        ):
             return self

         # Assume is validated
diff --git a/tests/helpers/counter.py b/tests/helpers/counter.py
new file mode 100644
index 00000000..8d9de1a0
--- /dev/null
+++ b/tests/helpers/counter.py
@@ -0,0 +1,7 @@
+
+from itertools import count
+
+COUNTER = count(1)
+
+def next_int() -> int:
+    return next(COUNTER)
\ No newline at end of file
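[Reviewer note: not part of the patch] The module-level counter gives every generated fixture a unique integer for the lifetime of the test session, which is what lets generate.py (later in this diff) mint non-colliding URLs and names:

    from tests.helpers.counter import next_int

    val = next_int()                     # 1, 2, 3, ... across all callers
    url = f"http://example.com/{val}"    # pattern used by generate_urls
    name = f"Example {val}"              # pattern used for generated names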
diff --git a/tests/helpers/data_creator/commands/impl/annotate.py b/tests/helpers/data_creator/commands/impl/annotate.py
index 5f341326..1f549615 100644
--- a/tests/helpers/data_creator/commands/impl/annotate.py
+++ b/tests/helpers/data_creator/commands/impl/annotate.py
@@ -7,7 +7,7 @@
 from src.core.enums import SuggestionType
 from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo
 from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
-from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand
+from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core import AgencyAutoSuggestionsCommand
 from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand
 from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand
 from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand
diff --git a/tests/helpers/data_creator/commands/impl/batch.py b/tests/helpers/data_creator/commands/impl/batch.py
index 69583a45..6871661d 100644
--- a/tests/helpers/data_creator/commands/impl/batch.py
+++ b/tests/helpers/data_creator/commands/impl/batch.py
@@ -3,7 +3,7 @@

 from src.collectors.enums import CollectorType
 from src.core.enums import BatchStatus
-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py
deleted file mode 100644
index 96743df8..00000000
--- a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from typing import final
-
-from typing_extensions import override
-
-from src.core.enums import SuggestionType
-from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
-from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
-from tests.helpers.data_creator.commands.impl.agency import AgencyCommand
-
-@final
-class AgencyAutoSuggestionsCommand(DBDataCreatorCommandBase):
-
-    def __init__(
-        self,
-        url_id: int,
-        count: int,
-        suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION
-    ):
-        super().__init__()
-        if suggestion_type == SuggestionType.UNKNOWN:
-            count = 1  # Can only be one auto suggestion if unknown
-        self.url_id = url_id
-        self.count = count
-        self.suggestion_type = suggestion_type
-
-    @override
-    async def run(self) -> None:
-        suggestions = []
-        for _ in range(self.count):
-            if self.suggestion_type == SuggestionType.UNKNOWN:
-                pdap_agency_id = None
-            else:
-                pdap_agency_id = await self.run_command(AgencyCommand())
-            suggestion = URLAgencySuggestionInfo(
-                url_id=self.url_id,
-                suggestion_type=self.suggestion_type,
-                pdap_agency_id=pdap_agency_id,
-                state="Test State",
-                county="Test County",
-                locality="Test Locality"
-            )
-            suggestions.append(suggestion)
-
-        await self.adb_client.add_agency_auto_suggestions(
-            suggestions=suggestions
-        )
\ No newline at end of file
diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py
new file mode 100644
index 00000000..fe54c6f9
--- /dev/null
+++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py
@@ -0,0 +1,78 @@
+from typing import final
+
+from typing_extensions import override
+
+from src.core.enums import SuggestionType
+from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
+from src.db.enums import TaskType
+from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
+from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
+from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic
+from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
+from tests.helpers.data_creator.commands.impl.agency import AgencyCommand
+
+@final
+class AgencyAutoSuggestionsCommand(DBDataCreatorCommandBase):
+
+    def __init__(
+        self,
+        url_id: int,
+        count: int,
+        suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION,
+        subtask_type: AutoAgencyIDSubtaskType = AutoAgencyIDSubtaskType.HOMEPAGE_MATCH,
+        confidence: int = 50
+    ):
+        super().__init__()
+        if suggestion_type == SuggestionType.UNKNOWN:
+            count = 1  # Can only be one auto suggestion if unknown
+            agencies_found = False
+        else:
+            agencies_found = True
+        self.url_id = url_id
+        self.count = count
+        self.suggestion_type = suggestion_type
+        self.subtask_type = subtask_type
+        self.confidence = confidence
+        self.agencies_found = agencies_found
+
+    @override
+    async def run(self) -> None:
+        task_id: int = await self.add_task()
+        subtask_id: int = await self.create_subtask(task_id)
+        if not self.agencies_found:
+            return
+
+        suggestions: list[AgencyIDSubtaskSuggestionPydantic] = []
+        for _ in range(self.count):
+            pdap_agency_id: int = await self.run_command(AgencyCommand())
+
+            suggestion = AgencyIDSubtaskSuggestionPydantic(
+                subtask_id=subtask_id,
+                agency_id=pdap_agency_id,
+                confidence=self.confidence,
+            )
+            suggestions.append(suggestion)
+
+        await self.adb_client.bulk_insert(
+            models=suggestions,
+        )
+
+    async def add_task(self) -> int:
+        task_id: int = await self.adb_client.initiate_task(
+            task_type=TaskType.AGENCY_IDENTIFICATION,
+        )
+        return task_id
+
+    async def create_subtask(self, task_id: int) -> int:
+        obj: URLAutoAgencyIDSubtaskPydantic = URLAutoAgencyIDSubtaskPydantic(
+            task_id=task_id,
+            type=self.subtask_type,
+            url_id=self.url_id,
+            agencies_found=self.agencies_found,
+        )
+        subtask_id: int = (await self.adb_client.bulk_insert(
+            models=[obj],
+            return_ids=True
+        ))[0]
+        return subtask_id
+
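[Reviewer note: not part of the patch] The rewritten command reflects the new three-level persistence shape for automated agency identification: a task row, one subtask row per URL and method, and zero or more suggestion rows carrying an agency ID and a confidence score. A condensed sketch of the same flow; the wrapper name record_homepage_match is hypothetical, the calls mirror the command above:

    from src.db.enums import TaskType
    from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType
    from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic
    from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic

    async def record_homepage_match(adb_client, url_id: int, agency_id: int) -> None:
        # One task row per run...
        task_id = await adb_client.initiate_task(task_type=TaskType.AGENCY_IDENTIFICATION)
        # ...one subtask row per (URL, method)...
        subtask = URLAutoAgencyIDSubtaskPydantic(
            task_id=task_id,
            type=AutoAgencyIDSubtaskType.HOMEPAGE_MATCH,
            url_id=url_id,
            agencies_found=True,
        )
        subtask_id = (await adb_client.bulk_insert(models=[subtask], return_ids=True))[0]
        # ...and zero or more suggestions carrying agency_id plus confidence.
        await adb_client.bulk_insert(models=[
            AgencyIDSubtaskSuggestionPydantic(subtask_id=subtask_id, agency_id=agency_id, confidence=50)
        ])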
diff --git a/tests/helpers/data_creator/commands/impl/urls_/__init__.py b/tests/helpers/data_creator/commands/impl/urls_/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py
new file mode 100644
index 00000000..d76edfe5
--- /dev/null
+++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py
@@ -0,0 +1,36 @@
+from src.collectors.enums import URLStatus
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
+
+
+def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) -> URLStatus:
+    match url_creation_enum:
+        case URLCreationEnum.OK:
+            return URLStatus.OK
+        case URLCreationEnum.SUBMITTED:
+            return URLStatus.OK
+        case URLCreationEnum.VALIDATED:
+            return URLStatus.OK
+        case URLCreationEnum.NOT_RELEVANT:
+            return URLStatus.OK
+        case URLCreationEnum.ERROR:
+            return URLStatus.ERROR
+        case URLCreationEnum.DUPLICATE:
+            return URLStatus.DUPLICATE
+        case URLCreationEnum.NOT_FOUND:
+            return URLStatus.NOT_FOUND
+        case _:
+            raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}")
+
+def convert_url_creation_enum_to_validated_type(
+    url_creation_enum: URLCreationEnum
+) -> URLValidatedType:
+    match url_creation_enum:
+        case URLCreationEnum.SUBMITTED:
+            return URLValidatedType.DATA_SOURCE
+        case URLCreationEnum.VALIDATED:
+            return URLValidatedType.DATA_SOURCE
+        case URLCreationEnum.NOT_RELEVANT:
+            return URLValidatedType.NOT_RELEVANT
+        case _:
+            raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}")
\ No newline at end of file
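[Reviewer note: not part of the patch] The two converters split the single test-facing enum into its two storage dimensions. For example:

    from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
    from tests.helpers.data_creator.commands.impl.urls_.convert import (
        convert_url_creation_enum_to_url_status,
        convert_url_creation_enum_to_validated_type,
    )

    convert_url_creation_enum_to_url_status(URLCreationEnum.VALIDATED)
    # -> URLStatus.OK
    convert_url_creation_enum_to_validated_type(URLCreationEnum.VALIDATED)
    # -> URLValidatedType.DATA_SOURCE
    convert_url_creation_enum_to_validated_type(URLCreationEnum.ERROR)
    # -> raises ValueError: only SUBMITTED/VALIDATED/NOT_RELEVANT carry a flag type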
diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls_/query.py
similarity index 79%
rename from tests/helpers/data_creator/commands/impl/urls.py
rename to tests/helpers/data_creator/commands/impl/urls_/query.py
index ee9ef954..7587abfb 100644
--- a/tests/helpers/data_creator/commands/impl/urls.py
+++ b/tests/helpers/data_creator/commands/impl/urls_/query.py
@@ -1,11 +1,12 @@
 from datetime import datetime

-from src.collectors.enums import URLStatus
 from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo
 from src.db.dtos.url.insert import InsertURLsInfo
 from src.db.models.impl.url.core.enums import URLSource
 from src.db.models.impl.url.core.pydantic.info import URLInfo
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
+from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_url_status
 from tests.helpers.simple_test_data_functions import generate_test_urls
@@ -16,7 +17,7 @@ def __init__(
         batch_id: int | None,
         url_count: int,
         collector_metadata: dict | None = None,
-        status: URLStatus = URLStatus.PENDING,
+        status: URLCreationEnum = URLCreationEnum.OK,
         created_at: datetime | None = None
     ):
         super().__init__()
@@ -36,8 +37,11 @@ def run_sync(self) -> InsertURLsInfo:
             url_infos.append(
                 URLInfo(
                     url=url,
-                    status=self.status,
-                    name="Test Name" if self.status == URLStatus.VALIDATED else None,
+                    status=convert_url_creation_enum_to_url_status(self.status),
+                    name="Test Name" if self.status in (
+                        URLCreationEnum.VALIDATED,
+                        URLCreationEnum.SUBMITTED,
+                    ) else None,
                     collector_metadata=self.collector_metadata,
                     created_at=self.created_at,
                     source=URLSource.COLLECTOR
@@ -50,7 +54,7 @@ def run_sync(self) -> InsertURLsInfo:
         )

         # If outcome is submitted, also add entry to DataSourceURL
-        if self.status == URLStatus.SUBMITTED:
+        if self.status == URLCreationEnum.SUBMITTED:
             submitted_url_infos = []
             for url_id in url_insert_info.url_ids:
                 submitted_url_info = SubmittedURLInfo(
diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/core.py b/tests/helpers/data_creator/commands/impl/urls_v2/core.py
index c80dc447..f7042720 100644
--- a/tests/helpers/data_creator/commands/impl/urls_v2/core.py
+++ b/tests/helpers/data_creator/commands/impl/urls_v2/core.py
@@ -1,14 +1,16 @@
 from datetime import datetime

-from src.collectors.enums import URLStatus
 from src.db.dtos.url.insert import InsertURLsInfo
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters
 from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
 from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand
 from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand
-from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand
+from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_validated_type
+from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand
 from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response
-from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2
+from tests.helpers.data_creator.generate import generate_validated_flags
 from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo
@@ -26,7 +28,7 @@ def __init__(
         self.created_at = created_at

     async def run(self) -> URLsV2Response:
-        urls_by_status: dict[URLStatus, URLCreationInfo] = {}
+        urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {}
         urls_by_order: list[URLCreationInfo] = []
         # Create urls
         for url_parameters in self.parameters:
diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/response.py b/tests/helpers/data_creator/commands/impl/urls_v2/response.py
index db19328e..74aa8e20 100644
--- a/tests/helpers/data_creator/commands/impl/urls_v2/response.py
+++ b/tests/helpers/data_creator/commands/impl/urls_v2/response.py
@@ -1,9 +1,10 @@
 from pydantic import BaseModel

 from src.collectors.enums import URLStatus
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo


 class URLsV2Response(BaseModel):
-    urls_by_status: dict[URLStatus, URLCreationInfo] = {}
+    urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {}
     urls_by_order: list[URLCreationInfo] = []
\ No newline at end of file
diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py
index 096bad32..57ee3576 100644
--- a/tests/helpers/data_creator/core.py
+++ b/tests/helpers/data_creator/core.py
@@ -5,8 +5,15 @@
 from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo
 from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo
 from src.db.client.async_ import AsyncDatabaseClient
+from src.db.dtos.url.mapping import URLMapping
+from src.db.models.impl.agency.sqlalchemy import Agency
 from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo
 from src.db.dtos.url.insert import InsertURLsInfo
+from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
+from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL
+from src.db.models.impl.url.core.enums import URLSource
 from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo
 from src.db.client.sync import DatabaseClient
 from src.db.enums import TaskType
@@ -14,26 +21,31 @@
 from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO
 from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus
 from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters
+from tests.helpers.counter import next_int
 from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase
 from tests.helpers.data_creator.commands.impl.agency import AgencyCommand
 from tests.helpers.data_creator.commands.impl.batch import DBDataCreatorBatchCommand
 from tests.helpers.data_creator.commands.impl.batch_v2 import BatchV2Command
 from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand
 from tests.helpers.data_creator.commands.impl.suggestion.agency_confirmed import AgencyConfirmedSuggestionCommand
-from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand
+from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core import AgencyAutoSuggestionsCommand
 from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand
 from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand
 from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand
 from tests.helpers.data_creator.commands.impl.suggestion.user.record_type import UserRecordTypeSuggestionCommand
 from tests.helpers.data_creator.commands.impl.suggestion.user.relevant import UserRelevantSuggestionCommand
 from tests.helpers.data_creator.commands.impl.url_metadata import URLMetadataCommand
-from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand
+from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand
 from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command
 from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response
+from tests.helpers.data_creator.create import create_urls, create_batch, create_batch_url_links, create_validated_flags, \
+    create_url_data_sources
 from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer
 from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo
 from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2
+from tests.helpers.simple_test_data_functions import generate_test_name


 class DBDataCreator:
@@ -105,7 +117,7 @@ async def batch_and_urls(
         url_count: int = 3,
         with_html_content: bool = False,
         batch_status: BatchStatus = BatchStatus.READY_TO_LABEL,
-        url_status: URLStatus = URLStatus.PENDING
+        url_status: URLCreationEnum = URLCreationEnum.OK
     ) -> BatchURLCreationInfo:
         batch_id = self.batch(
             strategy=strategy,
@@ -194,23 +206,14 @@ async def auto_suggestions(
             raise ValueError(f"suggestion_type must be one of {allowed_suggestion_types}")
         if suggestion_type == SuggestionType.UNKNOWN and num_suggestions > 1:
             raise ValueError("num_suggestions must be 1 when suggestion_type is unknown")
-
+
         for url_id in url_ids:
-            suggestions = []
-            for i in range(num_suggestions):
-                if suggestion_type == SuggestionType.UNKNOWN:
-                    agency_id = None
-                else:
-                    agency_id = await self.agency()
-                suggestion = URLAgencySuggestionInfo(
+            await self.run_command(
+                AgencyAutoSuggestionsCommand(
                     url_id=url_id,
-                    suggestion_type=suggestion_type,
-                    pdap_agency_id=agency_id
+                    count=num_suggestions,
+                    suggestion_type=suggestion_type
                 )
-                suggestions.append(suggestion)
-
-            await self.adb_client.add_agency_auto_suggestions(
-                suggestions=suggestions
             )

     async def confirmed_suggestions(self, url_ids: list[int]):
@@ -239,7 +242,7 @@ def urls(
         batch_id: int,
         url_count: int,
         collector_metadata: dict | None = None,
-        outcome: URLStatus = URLStatus.PENDING,
+        outcome: URLCreationEnum = URLCreationEnum.OK,
         created_at: datetime | None = None
     ) -> InsertURLsInfo:
         command = URLsDBDataCreatorCommand(
@@ -259,7 +262,7 @@ async def url_miscellaneous_metadata(
         record_formats: Optional[list[str]] = None,
         data_portal_type: Optional[str] = "Test Data Portal Type",
         supplying_entity: Optional[str] = "Test Supplying Entity"
-    ):
+    ) -> None:
         if record_formats is None:
             record_formats = ["Test Record Format", "Test Record Format 2"]
@@ -277,7 +280,11 @@ async def url_miscellaneous_metadata(

         await self.adb_client.add_miscellaneous_metadata([tdo])

-    def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]):
+    def duplicate_urls(
+        self,
+        duplicate_batch_id: int,
+        url_ids: list[int]
+    ) -> None:
         """
         Create duplicates for all given url ids,
         and associate them with the given batch
@@ -302,7 +309,7 @@ async def error_info(
         self,
         url_ids: list[int],
         task_id: Optional[int] = None
-    ):
+    ) -> None:
         if task_id is None:
             task_id = await self.task()
         error_infos = []
@@ -368,3 +375,173 @@ async def url_metadata(
                 status_code=status_code
             )
         )
+
+    async def create_validated_urls(
+        self,
+        record_type: RecordType = RecordType.RESOURCES,
+        validation_type: URLValidatedType = URLValidatedType.DATA_SOURCE,
+        count: int = 1
+    ) -> list[URLMapping]:
+        url_mappings: list[URLMapping] = await self.create_urls(
+            record_type=record_type,
+            count=count
+        )
+        url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings]
+        await self.create_validated_flags(
+            url_ids=url_ids,
+            validation_type=validation_type
+        )
+        return url_mappings
+
+    async def create_submitted_urls(
+        self,
+        record_type: RecordType = RecordType.RESOURCES,
+        count: int = 1
+    ) -> list[URLMapping]:
+        url_mappings: list[URLMapping] = await self.create_urls(
+            record_type=record_type,
+            count=count
+        )
+        url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings]
+        await self.create_validated_flags(
+            url_ids=url_ids,
+            validation_type=URLValidatedType.DATA_SOURCE
+        )
+        await self.create_url_data_sources(url_ids=url_ids)
+        return url_mappings
+
+
+    async def create_urls(
+        self,
+        status: URLStatus = URLStatus.OK,
+        source: URLSource = URLSource.COLLECTOR,
+        record_type: RecordType | None = RecordType.RESOURCES,
+        collector_metadata: dict | None = None,
+        count: int = 1,
+        batch_id: int | None = None
+    ) -> list[URLMapping]:
+
+        url_mappings: list[URLMapping] = await create_urls(
+            adb_client=self.adb_client,
+            status=status,
+            source=source,
+            record_type=record_type,
+            collector_metadata=collector_metadata,
+            count=count
+        )
+        url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings]
+        if batch_id is not None:
+            await self.create_batch_url_links(
+                url_ids=url_ids,
+                batch_id=batch_id
+            )
+        return url_mappings
+
+    async def create_batch(
+        self,
+        status: BatchStatus = BatchStatus.READY_TO_LABEL,
+        strategy: CollectorType = CollectorType.EXAMPLE,
+        date_generated: datetime = datetime.now(),
+    ) -> int:
+        return await create_batch(
+            adb_client=self.adb_client,
+            status=status,
+            strategy=strategy,
+            date_generated=date_generated
+        )
+
+    async def create_batch_url_links(
+        self,
+        url_ids: list[int],
+        batch_id: int,
+    ) -> None:
+        await create_batch_url_links(
+            adb_client=self.adb_client,
+            url_ids=url_ids,
+            batch_id=batch_id
+        )
+
+    async def create_validated_flags(
+        self,
+        url_ids: list[int],
+        validation_type: URLValidatedType,
+    ) -> None:
+        await create_validated_flags(
+            adb_client=self.adb_client,
+            url_ids=url_ids,
+            validation_type=validation_type
+        )
+
+    async def create_url_data_sources(
+        self,
+        url_ids: list[int],
+    ) -> None:
+        await create_url_data_sources(
+            adb_client=self.adb_client,
+            url_ids=url_ids
+        )
+
+    async def create_url_agency_links(
+        self,
+        url_ids: list[int],
+        agency_ids: list[int],
+    ) -> None:
+        links: list[LinkURLAgency] = []
+        for url_id in url_ids:
+            for agency_id in agency_ids:
+                link = LinkURLAgency(
+                    url_id=url_id,
+                    agency_id=agency_id,
+                )
+                links.append(link)
+        await self.adb_client.add_all(links)
+
+    async def create_agency(self, agency_id: int = 1) -> None:
+        agency = Agency(
+            agency_id=agency_id,
+            name=generate_test_name(agency_id),
+            state=None,
+            county=None,
+            locality=None
+        )
+        await self.adb_client.add_all([agency])
+
+    async def create_agencies(self, count: int = 3) -> list[int]:
+        agencies: list[Agency] = []
+        agency_ids: list[int] = []
+        for _ in range(count):
+            agency_id = next_int()
+            agency = Agency(
+                agency_id=agency_id,
+                name=generate_test_name(agency_id),
+                state=None,
+                county=None,
+                locality=None
+            )
+            agencies.append(agency)
+            agency_ids.append(agency_id)
+        await self.adb_client.add_all(agencies)
+        return agency_ids
+
+    async def flag_as_root(self, url_ids: list[int]) -> None:
+        flag_root_urls: list[FlagRootURL] = [
+            FlagRootURL(url_id=url_id) for url_id in url_ids
+        ]
+        await self.adb_client.add_all(flag_root_urls)
+
+    async def link_urls_to_root(self, url_ids: list[int], root_url_id: int) -> None:
+        links: list[LinkURLRootURL] = [
+            LinkURLRootURL(url_id=url_id, root_url_id=root_url_id) for url_id in url_ids
+        ]
+        await self.adb_client.add_all(links)
+
+    async def link_urls_to_agencies(self, url_ids: list[int], agency_ids: list[int]) -> None:
+        assert len(url_ids) == len(agency_ids)
+        links: list[LinkURLAgency] = []
+        for url_id, agency_id in zip(url_ids, agency_ids):
+            link = LinkURLAgency(
+                url_id=url_id,
+                agency_id=agency_id
+            )
+            links.append(link)
+        await self.adb_client.add_all(links)
\ No newline at end of file
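[Reviewer note: not part of the patch] The new DBDataCreator helpers make common fixture states one-liners. A hypothetical composite fixture; urls_with_agencies is an invented name, the method calls are from the diff above (link_urls_to_agencies zips the two lists pairwise and asserts equal lengths):

    async def urls_with_agencies(db_data_creator, n: int = 2):
        # n URLs and n agencies, linked one-to-one.
        agency_ids = await db_data_creator.create_agencies(count=n)
        mappings = await db_data_creator.create_urls(count=n)
        await db_data_creator.link_urls_to_agencies(
            url_ids=[m.url_id for m in mappings],
            agency_ids=agency_ids,
        )
        return mappings, agency_ids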
diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py
new file mode 100644
index 00000000..83b2e3f5
--- /dev/null
+++ b/tests/helpers/data_creator/create.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+
+from src.collectors.enums import CollectorType, URLStatus
+from src.core.enums import BatchStatus, RecordType
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.dtos.url.mapping import URLMapping
+from src.db.models.impl.batch.pydantic.insert import BatchInsertModel
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic
+from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic
+from src.db.models.impl.url.core.enums import URLSource
+from src.db.models.impl.url.core.pydantic.insert import URLInsertModel
+from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic
+from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \
+    generate_url_data_sources, generate_batch_url_links
+
+
+async def create_batch(
+    adb_client: AsyncDatabaseClient,
+    status: BatchStatus = BatchStatus.READY_TO_LABEL,
+    strategy: CollectorType = CollectorType.EXAMPLE,
+    date_generated: datetime = datetime.now(),
+) -> int:
+    batch: BatchInsertModel = generate_batch(status=status, strategy=strategy, date_generated=date_generated)
+    return (await adb_client.bulk_insert([batch], return_ids=True))[0]
+
+async def create_urls(
+    adb_client: AsyncDatabaseClient,
+    status: URLStatus = URLStatus.OK,
+    source: URLSource = URLSource.COLLECTOR,
+    record_type: RecordType | None = RecordType.RESOURCES,
+    collector_metadata: dict | None = None,
+    count: int = 1
+) -> list[URLMapping]:
+    urls: list[URLInsertModel] = generate_urls(
+        status=status,
+        source=source,
+        record_type=record_type,
+        collector_metadata=collector_metadata,
+        count=count,
+    )
+    url_ids = await adb_client.bulk_insert(urls, return_ids=True)
+    return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)]
+
+async def create_validated_flags(
+    adb_client: AsyncDatabaseClient,
+    url_ids: list[int],
+    validation_type: URLValidatedType,
+) -> None:
+    validated_flags: list[FlagURLValidatedPydantic] = generate_validated_flags(
+        url_ids=url_ids,
+        validation_type=validation_type,
+    )
+    await adb_client.bulk_insert(validated_flags)
+
+async def create_url_data_sources(
+    adb_client: AsyncDatabaseClient,
+    url_ids: list[int],
+) -> None:
+    url_data_sources: list[URLDataSourcePydantic] = generate_url_data_sources(
+        url_ids=url_ids,
+    )
+    await adb_client.bulk_insert(url_data_sources)
+
+async def create_batch_url_links(
+    adb_client: AsyncDatabaseClient,
+    url_ids: list[int],
+    batch_id: int,
+) -> None:
+    batch_url_links: list[LinkBatchURLPydantic] = generate_batch_url_links(
+        url_ids=url_ids,
+        batch_id=batch_id,
+    )
+    await adb_client.bulk_insert(batch_url_links)
+
diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py
new file mode 100644
index 00000000..5dabc016
--- /dev/null
+++ b/tests/helpers/data_creator/generate.py
@@ -0,0 +1,82 @@
+from datetime import datetime
+
+from src.collectors.enums import URLStatus, CollectorType
+from src.core.enums import BatchStatus, RecordType
+from src.db.models.impl.batch.pydantic.insert import BatchInsertModel
+from src.db.models.impl.flag.url_validated.enums import URLValidatedType
+from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic
+from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated
+from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic
+from src.db.models.impl.url.core.enums import URLSource
+from src.db.models.impl.url.core.pydantic.insert import URLInsertModel
+from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic
+from tests.helpers.counter import next_int
+
+
+def generate_batch(
+    status: BatchStatus,
+    strategy: CollectorType = CollectorType.EXAMPLE,
+    date_generated: datetime = datetime.now(),
+) -> BatchInsertModel:
+    return BatchInsertModel(
+        strategy=strategy.value,
+        status=status,
+        parameters={},
+        user_id=1,
+        date_generated=date_generated,
+    )
+
+def generate_batch_url_links(
+    url_ids: list[int],
+    batch_id: int
+) -> list[LinkBatchURLPydantic]:
+    return [
+        LinkBatchURLPydantic(
+            url_id=url_id,
+            batch_id=batch_id,
+        )
+        for url_id in url_ids
+    ]
+
+def generate_urls(
+    status: URLStatus = URLStatus.OK,
+    source: URLSource = URLSource.COLLECTOR,
+    record_type: RecordType | None = RecordType.RESOURCES,
+    collector_metadata: dict | None = None,
+    count: int = 1
+) -> list[URLInsertModel]:
+    results: list[URLInsertModel] = []
+    for i in range(count):
+        val: int = next_int()
+        results.append(URLInsertModel(
+            url=f"http://example.com/{val}",
+            status=status,
+            source=source,
+            name=f"Example {val}",
+            collector_metadata=collector_metadata,
+            record_type=record_type,
+        ))
+    return results
+
+def generate_validated_flags(
+    url_ids: list[int],
+    validation_type: URLValidatedType,
+) -> list[FlagURLValidatedPydantic]:
+    return [
+        FlagURLValidatedPydantic(
+            url_id=url_id,
+            type=validation_type,
+        )
+        for url_id in url_ids
+    ]
+
+def generate_url_data_sources(
+    url_ids: list[int],
+) -> list[URLDataSourcePydantic]:
+    return [
+        URLDataSourcePydantic(
+            url_id=url_id,
+            data_source_id=url_id,
+        )
+        for url_id in url_ids
+    ]
\ No newline at end of file
diff --git a/tests/helpers/data_creator/insert.py b/tests/helpers/data_creator/insert.py
new file mode 100644
index 00000000..06b207e3
--- /dev/null
+++ b/tests/helpers/data_creator/insert.py
@@ -0,0 +1,10 @@
+from src.db.client.async_ import AsyncDatabaseClient
+from src.db.templates.markers.bulk.insert import BulkInsertableModel
+
+
+async def bulk_insert_all(
+    adb_client: AsyncDatabaseClient,
+    lists_of_models: list[list[BulkInsertableModel]],
+):
+    for list_of_models in lists_of_models:
+        await adb_client.bulk_insert(list_of_models)
\ No newline at end of file
diff --git a/tests/helpers/data_creator/models/creation_info/batch/v2.py b/tests/helpers/data_creator/models/creation_info/batch/v2.py
index 3e6ed74a..52d7e37d 100644
--- a/tests/helpers/data_creator/models/creation_info/batch/v2.py
+++ b/tests/helpers/data_creator/models/creation_info/batch/v2.py
@@ -1,12 +1,12 @@
 from pydantic import BaseModel

-from src.collectors.enums import URLStatus
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo


 class BatchURLCreationInfoV2(BaseModel):
     batch_id: int
-    urls_by_status: dict[URLStatus, URLCreationInfo] = {}
+    urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {}

     @property
     def url_ids(self) -> list[int]:
diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py
index 082769e7..16c45a0a 100644
--- a/tests/helpers/data_creator/models/creation_info/url.py
+++ b/tests/helpers/data_creator/models/creation_info/url.py
@@ -5,11 +5,12 @@
 from src.collectors.enums import URLStatus
 from src.db.dtos.url.mapping import URLMapping
 from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum


 class URLCreationInfo(BaseModel):
     url_mappings: list[URLMapping]
-    outcome: URLStatus
+    outcome: URLCreationEnum
     annotation_info: Optional[AnnotationInfo] = None

     @property
diff --git a/tests/helpers/setup/annotation/core.py b/tests/helpers/setup/annotation/core.py
index ff5105cd..70123cb9 100644
--- a/tests/helpers/setup/annotation/core.py
+++ b/tests/helpers/setup/annotation/core.py
@@ -1,4 +1,5 @@
 from src.collectors.enums import URLStatus
+from tests.helpers.batch_creation_parameters.enums import URLCreationEnum
 from tests.helpers.data_creator.core import DBDataCreator
 from tests.helpers.setup.annotation.model import AnnotationSetupInfo
@@ -6,7 +7,7 @@
 async def setup_for_get_next_url_for_annotation(
     db_data_creator: DBDataCreator,
     url_count: int,
-    outcome: URLStatus = URLStatus.PENDING
+    outcome: URLCreationEnum = URLCreationEnum.OK
 ) -> AnnotationSetupInfo:
     batch_id = db_data_creator.batch()
     insert_urls_info = db_data_creator.urls(
diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py
index 6c4a3498..58b1ae49 100644
--- a/tests/helpers/setup/final_review/core.py
+++ b/tests/helpers/setup/final_review/core.py
@@ -37,7 +37,7 @@ async def add_agency_suggestion() -> int:
         )
         return agency_id

-    async def add_record_type_suggestion(record_type: RecordType):
+    async def add_record_type_suggestion(record_type: RecordType) -> None:
         await db_data_creator.user_record_type_suggestion(
             url_id=url_mapping.url_id,
             record_type=record_type
diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py
index 630d0f71..e81c266d 100644
--- a/tests/helpers/setup/wipe.py
+++ b/tests/helpers/setup/wipe.py
@@ -8,5 +8,7 @@ def wipe_database(connection_string: str) -> None:
     engine = create_engine(connection_string)
     with engine.connect() as connection:
         for table in reversed(Base.metadata.sorted_tables):
+            if table.info == "view":
+                continue
             connection.execute(table.delete())
         connection.commit()
diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py
index df455e0e..7c42fd8d 100644
--- a/tests/helpers/simple_test_data_functions.py
+++ b/tests/helpers/simple_test_data_functions.py
@@ -13,16 +13,15 @@ def generate_test_urls(count: int) -> list[str]:

     return results

-def generate_test_html() -> str:
-    return """
-    <html>
-    <head>
-    <title>Example HTML</title>
-    </head>
-    <body>
-    <div>Example HTML</div>
-    <div>This is an example of HTML content.</div>
-    </body>
-    </html>
-    """
\ No newline at end of file
+
+def generate_test_url(i: int) -> str:
+    return f"https://test.com/{i}"
+
+def generate_test_name(i: int) -> str:
+    return f"Test Name {i}"
+
+def generate_test_description(i: int) -> str:
+    return f"Test description {i}"
+
+def generate_test_html(i: int) -> str:
+    return f"<html><body>Test {i}</body></html>"
\ No newline at end of file
" \ No newline at end of file diff --git a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py new file mode 100644 index 00000000..c38a52b1 --- /dev/null +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -0,0 +1,22 @@ +import pytest + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor + +SAMPLE_HTML: str = """ + +I live in Pittsburgh, Allegheny, Pennsylvania. + +""" + +@pytest.mark.asyncio +async def test_nlp_processor_happy_path(): + nlp_processor = NLPProcessor() + response = nlp_processor.parse_for_locations(SAMPLE_HTML) + print(response) + +@pytest.mark.asyncio +async def test_nlp_processor_empty_html(): + nlp_processor = NLPProcessor() + response = nlp_processor.parse_for_locations("") + print(response) \ No newline at end of file diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index 584facdd..bc9b5dfa 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -2,7 +2,7 @@ import dotenv -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py index 9a896392..66020a92 100644 --- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py +++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py @@ -1,4 +1,4 @@ -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from src.collectors.impl.ckan import group_search, package_search, organization_search diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py index 417e7240..216638dc 100644 --- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py +++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py @@ -1,4 +1,4 @@ -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors import CollectorType from src.core.enums import BatchStatus from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion diff --git a/tests/manual/external/pdap/test_sc_agency_search_location.py b/tests/manual/external/pdap/test_sc_agency_search_location.py new file mode 100644 index 00000000..9b0aac28 --- /dev/null +++ b/tests/manual/external/pdap/test_sc_agency_search_location.py @@ -0,0 +1,34 @@ +""" + +Location ID, Agency ID +10464,9873, "Boonsboro, Washington, Maryland" +15648,9878, "Smithsburg, Washington, Maryland" +15656,9879, "Williamsport, Washington, Maryland" + +""" +import pytest + +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +@pytest.mark.asyncio +async def test_sc_agency_search_location(pdap_client_dev: PDAPClient): + params: 
diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py
index 584facdd..bc9b5dfa 100644
--- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py
+++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py
@@ -2,7 +2,7 @@

 import dotenv

-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from src.collectors import CollectorType
 from src.core.enums import BatchStatus
 from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion
diff --git a/tests/manual/core/lifecycle/test_ckan_lifecycle.py b/tests/manual/core/lifecycle/test_ckan_lifecycle.py
index 9a896392..66020a92 100644
--- a/tests/manual/core/lifecycle/test_ckan_lifecycle.py
+++ b/tests/manual/core/lifecycle/test_ckan_lifecycle.py
@@ -1,4 +1,4 @@
-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from src.collectors import CollectorType
 from src.core.enums import BatchStatus
 from src.collectors.impl.ckan import group_search, package_search, organization_search
diff --git a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py
index 417e7240..216638dc 100644
--- a/tests/manual/core/lifecycle/test_muckrock_lifecycles.py
+++ b/tests/manual/core/lifecycle/test_muckrock_lifecycles.py
@@ -1,4 +1,4 @@
-from src.db.models.impl.batch.pydantic import BatchInfo
+from src.db.models.impl.batch.pydantic.info import BatchInfo
 from src.collectors import CollectorType
 from src.core.enums import BatchStatus
 from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion
diff --git a/tests/manual/external/pdap/test_sc_agency_search_location.py b/tests/manual/external/pdap/test_sc_agency_search_location.py
new file mode 100644
index 00000000..9b0aac28
--- /dev/null
+++ b/tests/manual/external/pdap/test_sc_agency_search_location.py
@@ -0,0 +1,34 @@
+"""
+
+Location ID, Agency ID
+10464,9873, "Boonsboro, Washington, Maryland"
+15648,9878, "Smithsburg, Washington, Maryland"
+15656,9879, "Williamsport, Washington, Maryland"
+
+"""
+import pytest
+
+from src.external.pdap.client import PDAPClient
+from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams
+from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse
+
+
+@pytest.mark.asyncio
+async def test_sc_agency_search_location(pdap_client_dev: PDAPClient):
+    params: list[SearchAgencyByLocationParams] = [
+        SearchAgencyByLocationParams(
+            request_id=1,
+            query="Boonsboro, Washington, Maryland"
+        ),
+        SearchAgencyByLocationParams(
+            request_id=0,
+            query="Smithsburg, Washington, Maryland"
+        ),
+        SearchAgencyByLocationParams(
+            request_id=-99,
+            query="Williamsport, Washington, Maryland"
+        )
+    ]
+    response: list[SearchAgencyByLocationResponse] = await pdap_client_dev.search_agency_by_location(params)
+    print(response)
+
diff --git a/uv.lock b/uv.lock
index c97b9828..3dffe619 100644
--- a/uv.lock
+++ b/uv.lock
@@ -151,6 +151,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/ae/9a053dd9229c0fde6b1f1f33f609ccff1ee79ddda364c756a924c6d8563b/APScheduler-3.11.0-py3-none-any.whl", hash = "sha256:fc134ca32e50f5eadcc4938e3a4545ab19131435e851abb40b34d63d5141c6da", size = 64004, upload_time = "2024-11-24T19:39:24.442Z" },
 ]

+[[package]]
+name = "asgiref"
+version = "3.9.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/90/61/0aa957eec22ff70b830b22ff91f825e70e1ef732c06666a805730f28b36b/asgiref-3.9.1.tar.gz", hash = "sha256:a5ab6582236218e5ef1648f242fd9f10626cfd4de8dc377db215d5d5098e3142", size = 36870, upload_time = "2025-07-08T09:07:43.344Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7c/3c/0464dcada90d5da0e71018c04a140ad6349558afb30b3051b4264cc5b965/asgiref-3.9.1-py3-none-any.whl", hash = "sha256:f3bba7092a48005b5f5bacd747d36ee4a5a61f4a269a6df590b43144355ebd2c", size = 23790, upload_time = "2025-07-08T09:07:41.548Z" },
+]
+
 [[package]]
 name = "asyncpg"
 version = "0.30.0"
@@ -205,6 +214,35 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" },
 ]

+[[package]]
+name = "blis"
+version = "1.2.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/aa/0743c994884de83472c854bb534c9edab8d711e1880d4fa194e6d876bb60/blis-1.2.1.tar.gz", hash = "sha256:1066beedbedc2143c22bd28742658de05694afebacde8d8c2d14dd4b5a96765a", size = 2510297, upload_time = "2025-04-01T12:01:56.849Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/67/57/ae6596b1e27859886e0b81fb99497bcfff139895585a9e2284681c8a8846/blis-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:778c4f72b71f97187e3304acfbd30eab98c9ba1a5b03b65128bc3875400ae604", size = 6976808, upload_time = "2025-04-01T12:01:21.175Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/35/6225e6ad2bccf23ac124448d59112c098d63a8917462e9f73967bc217168/blis-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5c5f2ffb0ae9c1f5aaa95b9681bcdd9a777d007c501fa220796329b939ca2790", size = 1281913, upload_time = "2025-04-01T12:01:23.202Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/84/c6a6d1c0a8a00799d2ec5db05d676bd9a9b0472cac4d3eff2e2fd1953521/blis-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db4dc5d2d57106bb411633603a5c7d178a0845267c3efc7e5ea4fa7a44772976", size = 3104139, upload_time = "2025-04-01T12:01:24.781Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/6c/c5fab7ed1fe6e8bdcda732017400d1adc53db5b6dd2c2a6046acab91f4fa/blis-1.2.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c621271c2843101927407e052b35a67f853da59d5c74e9e070e982c7f82e2e04", size = 3304143, upload_time = "2025-04-01T12:01:27.363Z" },
+    { url = "https://files.pythonhosted.org/packages/22/d1/85f03269886253758546fcfdbeddee7e717d843ea134596b60db9c2648c4/blis-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43f65f882250b817566d7543abd1f6da297f1662e5dd9936e14c04b88285a497", size = 11660080, upload_time = "2025-04-01T12:01:29.478Z" },
+    { url = "https://files.pythonhosted.org/packages/78/c8/c81ed3036e8ce0d6ce0d19a032c7f3d69247f221c5357e18548dea9380d3/blis-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78a0613d559ccc426c101c67e8f84e1f93491e29d722c370872c538ee652bd07", size = 3133133, upload_time = "2025-04-01T12:01:31.537Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/42/7c296e04b979204777ecae2fe9287ac7b0255d8c4c2111d2a735c439b9d7/blis-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f5e32e5e5635fc7087b724b53120dbcd86201f56c0405882ce254bc0e493392", size = 4360695, upload_time = "2025-04-01T12:01:33.449Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/ab/aa5c8dfd0068d2cc976830797dd092779259860f964286db05739154e3a7/blis-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d339c97cc83f53e39c1013d0dcd7d5278c853dc102d931132eeb05b226e28429", size = 14828081, upload_time = "2025-04-01T12:01:35.129Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/c0/047fef3ac4a531903c52ba7c108fd608556627723bfef7554f040b10e556/blis-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:8d284323cc994e9b818c32046f1aa3e57bcc41c74e02daebdf0d3bc3e14355cb", size = 6232639, upload_time = "2025-04-01T12:01:37.268Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/f1/2aecd2447de0eb5deea3a13e471ab43e42e8561afe56a13d830f95c58909/blis-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1cd35e94a1a97b37b31b11f097f998a3a0e75ac06d57e6edf7d9597200f55756", size = 6989811, upload_time = "2025-04-01T12:01:39.013Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/39/4c097508f6b9ef7df27dd5ada0a175e8169f58cbe33d40a303a844abdaea/blis-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7b6394d27f2259c580df8d13ebe9c0a188a6ace0a689e93d6e49cb15018d4d9c", size = 1282669, upload_time = "2025-04-01T12:01:41.418Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/8e/b8a5eafa9824fcc7f3339a283e910f7af110d749fd09f52e83f432124543/blis-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9c127159415dc772f345abc3575e1e2d02bb1ae7cb7f532267d67705be04c66", size = 3063750, upload_time = "2025-04-01T12:01:43.277Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/7a/f88e935f2cd3ad52ef363beeddf9a537d5038e519aa7b09dc18c762fbb66/blis-1.2.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5f9fa589aa72448009fd5001afb05e69f3bc953fe778b44580fd7d79ee8201a1", size = 3260903, upload_time = "2025-04-01T12:01:44.815Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/26/283f1392974e5c597228f8485f45f89de33f2c85becebc25e846d0485e44/blis-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1aa6150259caf4fa0b527bfc8c1e858542f9ca88a386aa90b93e1ca4c2add6df", size = 11616588, upload_time = "2025-04-01T12:01:46.356Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/86/57047b688e42c92e35d0581ef9db15ee3bdf14deff4d9a2481ce331f2dae/blis-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3ba67c09883cae52da3d9e9d3f4305464efedd336032c4d5c6c429b27b16f4c1", size = 3072892, upload_time = "2025-04-01T12:01:48.314Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/db/85b6f5fa2a2515470cc5a2cbeaedd25aa465fa572801f18d14c24c9e5102/blis-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7d9c5fca21b01c4b2f3cb95b71ce7ef95e58b3b62f0d79d1f699178c72c1e03e", size = 4310005, upload_time = "2025-04-01T12:01:49.815Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/ae/6e610e950476ebc9868a0207a827d67433ef65e2b14b837d317e60248e5a/blis-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6952a4a1f15e0d1f73cc1206bd71368b32551f2e94852dae288b50c4ea0daf31", size = 14790198, upload_time = "2025-04-01T12:01:52.601Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/0e/353e29e8dd3d31bba25a3eabbbfb798d82bd19ca2d24fd00583b6d3992f3/blis-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:bd0360427b1669684cd35a8355be126d7a33992ccac6dcb1fbef5e100f4e3026", size = 6260640, upload_time = "2025-04-01T12:01:54.849Z" },
+]
+
 [[package]]
 name = "boltons"
 version = "25.0.0"
@@ -289,6 +327,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload_time = "2025-02-20T21:01:16.647Z" },
 ]

+[[package]]
+name = "catalogue"
+version = "2.0.10"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/38/b4/244d58127e1cdf04cf2dc7d9566f0d24ef01d5ce21811bab088ecc62b5ea/catalogue-2.0.10.tar.gz", hash = "sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15", size = 19561, upload_time = "2023-09-25T06:29:24.962Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/96/d32b941a501ab566a16358d68b6eb4e4acc373fab3c3c4d7d9e649f7b4bb/catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f", size = 17325, upload_time = "2023-09-25T06:29:23.337Z" },
+]
+
 [[package]]
 name = "certifi"
 version = "2025.4.26"
@@ -375,6 +422,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a2/58/1f37bf81e3c689cc74ffa42102fa8915b59085f54a6e4a80bc6265c0f6bf/click-8.2.0-py3-none-any.whl", hash = "sha256:6b303f0b2aa85f1cb4e5303078fadcbcd4e476f114fab9b5007005711839325c", size = 102156, upload_time = "2025-05-10T22:21:01.352Z" },
 ]

+[[package]]
+name = "cloudpathlib"
+version = "0.22.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/bc/d7345595a4467144b9e0b32e5eda9e4633ea6e4982262b0696935adb2229/cloudpathlib-0.22.0.tar.gz", hash = "sha256:6c0cb0ceab4f66a3a05a84055f9318fb8316cae5e096819f3f8e4be64feab6e9", size = 52304, upload_time = "2025-08-30T05:20:04.6Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f1/72/e8e53d8232e801e040f4b557ff3a453cecbb630d53ae107bd5e66a206bb9/cloudpathlib-0.22.0-py3-none-any.whl", hash = "sha256:2fdfaf5c4f85810ae8374d336d04dee371914d0e41a984695ae67308d7a5a009", size = 61520, upload_time = "2025-08-30T05:20:03.232Z" },
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -384,6 +440,48 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" },
 ]

+[[package]]
+name = "confection"
+version = "0.1.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pydantic" },
+    { name = "srsly" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/51/d3/57c6631159a1b48d273b40865c315cf51f89df7a9d1101094ef12e3a37c2/confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e", size = 38924, upload_time = "2024-05-31T16:17:01.559Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/00/3106b1854b45bd0474ced037dfe6b73b90fe68a68968cef47c23de3d43d2/confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14", size = 35451, upload_time = "2024-05-31T16:16:59.075Z" },
+]
+
+[[package]]
+name = "cymem"
+version = "2.0.11"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f2/4a/1acd761fb6ac4c560e823ce40536a62f886f2d59b2763b5c3fc7e9d92101/cymem-2.0.11.tar.gz", hash = "sha256:efe49a349d4a518be6b6c6b255d4a80f740a341544bde1a807707c058b88d0bd", size = 10346, upload_time = "2025-01-16T21:50:41.045Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/03/e3/d98e3976f4ffa99cddebc1ce379d4d62e3eb1da22285267f902c99cc3395/cymem-2.0.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3ee54039aad3ef65de82d66c40516bf54586287b46d32c91ea0530c34e8a2745", size = 42005, upload_time = "2025-01-16T21:49:34.977Z" },
+    { url = "https://files.pythonhosted.org/packages/41/b4/7546faf2ab63e59befc95972316d62276cec153f7d4d60e7b0d5e08f0602/cymem-2.0.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c05ef75b5db217be820604e43a47ccbbafea98ab6659d07cea92fa3c864ea58", size = 41747, upload_time = "2025-01-16T21:49:36.108Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/4e/042f372e5b3eb7f5f3dd7677161771d301de2b6fa3f7c74e1cebcd502552/cymem-2.0.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8d5381e5793ce531bac0dbc00829c8381f18605bb67e4b61d34f8850463da40", size = 217647, upload_time = "2025-01-16T21:49:37.433Z" },
+    { url = "https://files.pythonhosted.org/packages/48/cb/2207679e4b92701f78cf141e1ab4f81f55247dbe154eb426b842a0a993de/cymem-2.0.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2b9d3f42d7249ac81802135cad51d707def058001a32f73fc7fbf3de7045ac7", size = 218857, upload_time = "2025-01-16T21:49:40.09Z" },
+    { url = "https://files.pythonhosted.org/packages/31/7a/76ae3b7a39ab2531029d281e43fcfcaad728c2341b150a81a3a1f5587cf3/cymem-2.0.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:39b78f2195d20b75c2d465732f6b8e8721c5d4eb012777c2cb89bdb45a043185", size = 206148, upload_time = "2025-01-16T21:49:41.383Z" },
+    { url = "https://files.pythonhosted.org/packages/25/f9/d0fc0191ac79f15638ddb59237aa76f234691374d7d7950e10f384bd8a25/cymem-2.0.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2203bd6525a80d8fd0c94654a263af21c0387ae1d5062cceaebb652bf9bad7bc", size = 207112, upload_time = "2025-01-16T21:49:43.986Z" },
+    { url = "https://files.pythonhosted.org/packages/56/c8/75f75889401b20f4c3a7c5965dda09df42913e904ddc2ffe7ef3bdf25061/cymem-2.0.11-cp311-cp311-win_amd64.whl", hash = "sha256:aa54af7314de400634448da1f935b61323da80a49484074688d344fb2036681b", size = 39360, upload_time = "2025-01-16T21:49:45.479Z" },
+    { url = "https://files.pythonhosted.org/packages/71/67/0d74f7e9d79f934368a78fb1d1466b94bebdbff14f8ae94dd3e4ea8738bb/cymem-2.0.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a0fbe19ce653cd688842d81e5819dc63f911a26e192ef30b0b89f0ab2b192ff2", size = 42621, upload_time = "2025-01-16T21:49:46.585Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/d6/f7a19c63b48efc3f00a3ee8d69070ac90202e1e378f6cf81b8671f0cf762/cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de72101dc0e6326f6a2f73e05a438d1f3c6110d41044236d0fbe62925091267d", size = 42249, upload_time = "2025-01-16T21:49:48.973Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/60/cdc434239813eef547fb99b6d0bafe31178501702df9b77c4108c9a216f6/cymem-2.0.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee4395917f6588b8ac1699499128842768b391fe8896e8626950b4da5f9a406", size = 224758, upload_time = "2025-01-16T21:49:51.382Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/68/8fa6efae17cd3b2ba9a2f83b824867c5b65b06f7aec3f8a0d0cabdeffb9b/cymem-2.0.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02f2b17d760dc3fe5812737b1ce4f684641cdd751d67761d333a3b5ea97b83", size = 227995, upload_time = "2025-01-16T21:49:54.538Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/f3/ceda70bf6447880140602285b7c6fa171cb7c78b623d35345cc32505cd06/cymem-2.0.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:04ee6b4041ddec24512d6e969ed6445e57917f01e73b9dabbe17b7e6b27fef05", size = 215325, upload_time = "2025-01-16T21:49:57.229Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/47/6915eaa521e1ce7a0ba480eecb6870cb4f681bcd64ced88c2f0ed7a744b4/cymem-2.0.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e1048dae7e627ee25f22c87bb670b13e06bc0aecc114b89b959a798d487d1bf4", size = 216447, upload_time = "2025-01-16T21:50:00.432Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/be/8e02bdd31e557f642741a06c8e886782ef78f0b00daffd681922dc9bbc88/cymem-2.0.11-cp312-cp312-win_amd64.whl", hash = "sha256:0c269c7a867d74adeb9db65fa1d226342aacf44d64b7931282f0b0eb22eb6275", size = 39283, upload_time = "2025-01-16T21:50:03.384Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/90/b064e2677e27a35cf3605146abc3285d4f599cc1b6c18fc445ae876dd1e3/cymem-2.0.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4a311c82f743275c84f708df89ac5bf60ddefe4713d532000c887931e22941f", size = 42389, upload_time = "2025-01-16T21:50:05.925Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/60/7aa0561a6c1f0d42643b02c4fdeb2a16181b0ff4e85d73d2d80c6689e92a/cymem-2.0.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:02ed92bead896cca36abad00502b14fa651bdf5d8319461126a2d5ac8c9674c5", size = 41948, upload_time = "2025-01-16T21:50:08.375Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/4e/88a29cc5575374982e527b4ebcab3781bdc826ce693c6418a0f836544246/cymem-2.0.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44ddd3588379f8f376116384af99e3fb5f90091d90f520c341942618bf22f05e", size = 219382, upload_time = "2025-01-16T21:50:13.089Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/3a/8f96e167e93b7f7ec105ed7b25c77bbf215d15bcbf4a24082cdc12234cd6/cymem-2.0.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87ec985623624bbd298762d8163fc194a096cb13282731a017e09ff8a60bb8b1", size = 222974, upload_time = "2025-01-16T21:50:17.969Z" },
+    { url =
"https://files.pythonhosted.org/packages/6a/fc/ce016bb0c66a4776345fac7508fddec3b739b9dd4363094ac89cce048832/cymem-2.0.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3385a47285435848e0ed66cfd29b35f3ed8703218e2b17bd7a0c053822f26bf", size = 213426, upload_time = "2025-01-16T21:50:19.349Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c8/accf7cc768f751447a5050b14a195af46798bc22767ac25f49b02861b1eb/cymem-2.0.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5461e65340d6572eb64deadce79242a446a1d39cb7bf70fe7b7e007eb0d799b0", size = 219195, upload_time = "2025-01-16T21:50:21.407Z" }, + { url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload_time = "2025-01-16T21:50:24.239Z" }, +] + [[package]] name = "data-source-identification" version = "0.1.0" @@ -410,6 +508,7 @@ dependencies = [ { name = "marshmallow" }, { name = "openai" }, { name = "pdap-access-manager" }, + { name = "pip" }, { name = "playwright" }, { name = "psycopg", extra = ["binary"] }, { name = "psycopg2-binary" }, @@ -417,6 +516,8 @@ dependencies = [ { name = "pyjwt" }, { name = "python-dotenv" }, { name = "requests" }, + { name = "side-effects" }, + { name = "spacy" }, { name = "sqlalchemy" }, { name = "starlette" }, { name = "tqdm" }, @@ -458,6 +559,7 @@ requires-dist = [ { name = "marshmallow", specifier = "~=3.23.2" }, { name = "openai", specifier = "~=1.60.1" }, { name = "pdap-access-manager", specifier = "==0.3.6" }, + { name = "pip", specifier = ">=25.2" }, { name = "playwright", specifier = "~=1.49.1" }, { name = "psycopg", extras = ["binary"], specifier = "~=3.1.20" }, { name = "psycopg2-binary", specifier = "~=2.9.6" }, @@ -465,6 +567,8 @@ requires-dist = [ { name = "pyjwt", specifier = "~=2.10.1" }, { name = "python-dotenv", specifier = "~=1.0.1" }, { name = "requests", specifier = "~=2.32.3" }, + { name = "side-effects", specifier = ">=1.6.dev0" }, + { name = "spacy", specifier = ">=3.8.7" }, { name = "sqlalchemy", specifier = "~=2.0.36" }, { name = "starlette", specifier = "~=0.45.3" }, { name = "tqdm", specifier = ">=4.64.1" }, @@ -551,6 +655,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload_time = "2023-12-24T09:54:30.421Z" }, ] +[[package]] +name = "django" +version = "3.2.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asgiref" }, + { name = "pytz" }, + { name = "sqlparse" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ec/68/0e744f07b57bfdf99abbb6b3eb14fcba188867021c05f4a104e04f6d56b8/Django-3.2.25.tar.gz", hash = "sha256:7ca38a78654aee72378594d63e51636c04b8e28574f5505dff630895b5472777", size = 9836336, upload_time = "2024-03-04T08:57:02.257Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/30/8e/cc23c762c5dcd1d367d73cf006a326e0df2bd0e785cba18b658b39904c1e/Django-3.2.25-py3-none-any.whl", hash = "sha256:a52ea7fcf280b16f7b739cec38fa6d3f8953a5456986944c3ca97e79882b4e38", size = 7890550, upload_time = "2024-03-04T08:56:47.529Z" }, +] + [[package]] name = "dnspython" version = "2.7.0" @@ -1044,6 +1162,30 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/47/3729f00f35a696e68da15d64eb9283c330e776f3b5789bac7f2c0c4df209/jiter-0.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6f7838bc467ab7e8ef9f387bd6de195c43bad82a569c1699cb822f6609dd4cdf", size = 206867, upload_time = "2025-03-10T21:36:25.843Z" }, ] +[[package]] +name = "langcodes" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "language-data" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/7a/5a97e327063409a5caa21541e6d08ae4a0f2da328447e9f2c7b39e179226/langcodes-3.5.0.tar.gz", hash = "sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801", size = 191030, upload_time = "2024-11-19T10:23:45.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c3/6b/068c2ea7a712bf805c62445bd9e9c06d7340358ef2824150eceac027444b/langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33", size = 182974, upload_time = "2024-11-19T10:23:42.824Z" }, +] + +[[package]] +name = "language-data" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "marisa-trie" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/ce/3f144716a9f2cbf42aa86ebc8b085a184be25c80aa453eea17c294d239c1/language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec", size = 5129310, upload_time = "2024-11-19T10:21:37.912Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/e9/5a5ffd9b286db82be70d677d0a91e4d58f7912bb8dd026ddeeb4abe70679/language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf", size = 5385760, upload_time = "2024-11-19T10:21:36.005Z" }, +] + [[package]] name = "lxml" version = "5.1.1" @@ -1082,6 +1224,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload_time = "2025-04-10T12:50:53.297Z" }, ] +[[package]] +name = "marisa-trie" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c5/e3/c9066e74076b90f9701ccd23d6a0b8c1d583feefdec576dc3e1bb093c50d/marisa_trie-1.3.1.tar.gz", hash = "sha256:97107fd12f30e4f8fea97790343a2d2d9a79d93697fe14e1b6f6363c984ff85b", size = 212454, upload_time = "2025-08-26T15:13:18.401Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/bf/2f1fe6c9fcd2b509c6dfaaf26e35128947d6d3718d0b39510903c55b7bed/marisa_trie-1.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ef045f694ef66079b4e00c4c9063a00183d6af7d1ff643de6ea5c3b0d9af01b", size = 174027, upload_time = "2025-08-26T15:12:01.434Z" }, + { url = "https://files.pythonhosted.org/packages/a9/5a/de7936d58ed0de847180cee2b95143d420223c5ade0c093d55113f628237/marisa_trie-1.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cbd28f95d5f30d9a7af6130869568e75bfd7ef2e0adfb1480f1f44480f5d3603", size = 158478, upload_time = "2025-08-26T15:12:02.429Z" }, + { url = "https://files.pythonhosted.org/packages/48/cc/80611aadefcd0bcf8cd1795cb4643bb27213319a221ba04fe071da0b75cd/marisa_trie-1.3.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b173ec46d521308f7c97d96d6e05cf2088e0548f82544ec9a8656af65593304d", size = 1257535, upload_time = 
"2025-08-26T15:12:04.271Z" }, + { url = "https://files.pythonhosted.org/packages/36/89/c4eeefb956318047036e6bdc572b6112b2059d595e85961267a90aa40458/marisa_trie-1.3.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:954fef9185f8a79441b4e433695116636bf66402945cfee404f8983bafa59788", size = 1275566, upload_time = "2025-08-26T15:12:05.874Z" }, + { url = "https://files.pythonhosted.org/packages/c4/63/d775a2fdfc4b555120381cd2aa6dff1845576bc14fb13796ae1b1e8dbaf7/marisa_trie-1.3.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ca644534f15f85bba14c412afc17de07531e79a766ce85b8dbf3f8b6e7758f20", size = 2199831, upload_time = "2025-08-26T15:12:07.175Z" }, + { url = "https://files.pythonhosted.org/packages/50/aa/e5053927dc3cac77acc9b27f6f87e75c880f5d3d5eac9111fe13b1d8bf6f/marisa_trie-1.3.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3834304fdeaa1c9b73596ad5a6c01a44fc19c13c115194704b85f7fbdf0a7b8e", size = 2283830, upload_time = "2025-08-26T15:12:08.319Z" }, + { url = "https://files.pythonhosted.org/packages/71/3e/e314906d0de5b1a44780a23c79bb62a9aafd876e2a4e80fb34f58c721da4/marisa_trie-1.3.1-cp311-cp311-win32.whl", hash = "sha256:70b4c96f9119cfeb4dc6a0cf4afc9f92f0b002cde225bcd910915d976c78e66a", size = 117335, upload_time = "2025-08-26T15:12:09.776Z" }, + { url = "https://files.pythonhosted.org/packages/b0/2b/85623566621135de3d57497811f94679b4fb2a8f16148ef67133c2abab7a/marisa_trie-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:986eaf35a7f63c878280609ecd37edf8a074f7601c199acfec81d03f1ee9a39a", size = 143985, upload_time = "2025-08-26T15:12:10.988Z" }, + { url = "https://files.pythonhosted.org/packages/3f/40/ee7ea61b88d62d2189b5c4a27bc0fc8d9c32f8b8dc6daf1c93a7b7ad34ac/marisa_trie-1.3.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5b7c1e7fa6c3b855e8cfbabf38454d7decbaba1c567d0cd58880d033c6b363bd", size = 173454, upload_time = "2025-08-26T15:12:12.13Z" }, + { url = "https://files.pythonhosted.org/packages/9c/fc/58635811586898041004b2197a085253706ede211324a53ec01612a50e20/marisa_trie-1.3.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c12b44c190deb0d67655021da1f2d0a7d61a257bf844101cf982e68ed344f28d", size = 155305, upload_time = "2025-08-26T15:12:13.374Z" }, + { url = "https://files.pythonhosted.org/packages/fe/98/88ca0c98d37034a3237acaf461d210cbcfeb6687929e5ba0e354971fa3ed/marisa_trie-1.3.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9688c7b45f744366a4ef661e399f24636ebe440d315ab35d768676c59c613186", size = 1244834, upload_time = "2025-08-26T15:12:14.795Z" }, + { url = "https://files.pythonhosted.org/packages/f3/5f/93b3e3607ccd693a768eafee60829cd14ea1810b75aa48e8b20e27b332c4/marisa_trie-1.3.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99a00cab4cf9643a87977c87a5c8961aa44fff8d5dd46e00250135f686e7dedf", size = 1265148, upload_time = "2025-08-26T15:12:16.229Z" }, + { url = "https://files.pythonhosted.org/packages/db/6e/051d7d25c7fb2b3df605c8bd782513ebbb33fddf3bae6cf46cf268cca89f/marisa_trie-1.3.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:83efc045fc58ca04c91a96c9b894d8a19ac6553677a76f96df01ff9f0405f53d", size = 2172726, upload_time = "2025-08-26T15:12:18.467Z" }, + { url = "https://files.pythonhosted.org/packages/58/da/244d9d4e414ce6c73124cba4cc293dd140bf3b04ca18dec64c2775cca951/marisa_trie-1.3.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0b9816ab993001a7854b02a7daec228892f35bd5ab0ac493bacbd1b80baec9f1", size = 2256104, upload_time = "2025-08-26T15:12:20.168Z" }, 
+ { url = "https://files.pythonhosted.org/packages/c4/f1/1a36ecd7da6668685a7753522af89a19928ffc80f1cc1dbc301af216f011/marisa_trie-1.3.1-cp312-cp312-win32.whl", hash = "sha256:c785fd6dae9daa6825734b7b494cdac972f958be1f9cb3fb1f32be8598d2b936", size = 115624, upload_time = "2025-08-26T15:12:21.233Z" }, + { url = "https://files.pythonhosted.org/packages/35/b2/aabd1c9f1c102aa31d66633ed5328c447be166e0a703f9723e682478fd83/marisa_trie-1.3.1-cp312-cp312-win_amd64.whl", hash = "sha256:9868b7a8e0f648d09ffe25ac29511e6e208cc5fb0d156c295385f9d5dc2a138e", size = 138562, upload_time = "2025-08-26T15:12:22.632Z" }, + { url = "https://files.pythonhosted.org/packages/46/a2/8331b995c1b3eee83aa745f4a6502d737ec523d5955a48f167d4177db105/marisa_trie-1.3.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9de573d933db4753a50af891bcb3ffbfe14e200406214c223aa5dfe2163f316d", size = 172272, upload_time = "2025-08-26T15:12:24.016Z" }, + { url = "https://files.pythonhosted.org/packages/97/b8/7b9681b5c0ea1bb950f907a4e3919eb7f7b7b3febafaae346f3b3f199f6f/marisa_trie-1.3.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f4bae4f920f2a1082eaf766c1883df7da84abdf333bafa15b8717c10416a615e", size = 154671, upload_time = "2025-08-26T15:12:25.013Z" }, + { url = "https://files.pythonhosted.org/packages/ca/16/929c1f83fdcff13f8d08500f434aaa18c21c8168d16cf81585d69085e980/marisa_trie-1.3.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf9f2b97fcfd5e2dbb0090d0664023872dcde990df0b545eca8d0ce95795a409", size = 1238754, upload_time = "2025-08-26T15:12:26.217Z" }, + { url = "https://files.pythonhosted.org/packages/0f/0a/b0e04d3ef91a87d4c7ea0b66c004fdfc6e65c9ed83edaebecfb482dfe0ed/marisa_trie-1.3.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecdb19d33b26738a32602ef432b06cc6deeca4b498ce67ba8e5e39c8a7c19745", size = 1262653, upload_time = "2025-08-26T15:12:27.422Z" }, + { url = "https://files.pythonhosted.org/packages/de/1f/0ecf610ddc9a209ee63116baabb47584d5b8ecd01610091a593d9429537e/marisa_trie-1.3.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7416f1a084eb889c5792c57317875aeaa86abfe0bdc6f167712cebcec1d36ee", size = 2172399, upload_time = "2025-08-26T15:12:28.926Z" }, + { url = "https://files.pythonhosted.org/packages/ac/74/6b47deff3b3920449c135b9187c80f0d656adcdc5d41463745a61b012ea1/marisa_trie-1.3.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ee428575377e29c636f2b4b3b0488875dcea310c6c5b3412ec4ef997f7bb37cc", size = 2255138, upload_time = "2025-08-26T15:12:30.271Z" }, + { url = "https://files.pythonhosted.org/packages/bd/fa/3dbcbe93dfaa626a5b3e741e7bcf3d7389aa5777175213bd8d9a9d3c992d/marisa_trie-1.3.1-cp313-cp313-win32.whl", hash = "sha256:d0f87bdf660f01e88ab3a507955697b2e3284065afa0b94fc9e77d6ad153ed5e", size = 115391, upload_time = "2025-08-26T15:12:31.465Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ce/ddfab303646b21aef07ff9dbc83fba92e5d493f49d3bc03d899ffd45c86f/marisa_trie-1.3.1-cp313-cp313-win_amd64.whl", hash = "sha256:a83f5f7ae3494e0cc25211296252b1b86901c788ed82c83adda19d0c98f828d6", size = 139130, upload_time = "2025-08-26T15:12:32.4Z" }, + { url = "https://files.pythonhosted.org/packages/5a/1e/734b618048ad05c50cb1673ce2c6e836dc38ddeeeb011ed1804af07327a4/marisa_trie-1.3.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a850b151bd1e3a5d9afef113adc22727d696603659d575d7e84f994bd8d04bf1", size = 175131, upload_time = "2025-08-26T15:12:33.728Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/78/c7051147cc918cb8ff4a2920e11a9b17d9dcb4d8fc122122694b486e2bfe/marisa_trie-1.3.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9dc61fb8f8993589544f6df268229c6cf0a56ad4ed3e8585a9cd23c5ad79527b", size = 163094, upload_time = "2025-08-26T15:12:35.312Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b8/3b904178d7878319aacaabae5131c1f281519aaac0f8c68c8ed312912ccf/marisa_trie-1.3.1-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4bd41a6e73c0d0adafe4de449b6d35530a4ce6a836a6ee839baf117785ecfd7", size = 1279812, upload_time = "2025-08-26T15:12:36.831Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bf/e77a1284247b980560b4104bbdd5d06ed2c2ae3d56ab954f97293b6dbbcd/marisa_trie-1.3.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c8b2386d2d22c57880ed20a913ceca86363765623175671137484a7d223f07a", size = 1285690, upload_time = "2025-08-26T15:12:38.754Z" }, + { url = "https://files.pythonhosted.org/packages/48/82/f6f10db5ec72de2642499f3a6e4e8607bbd2cfb28269ea08d0d8ddac3313/marisa_trie-1.3.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9c56001badaf1779afae5c24b7ab85938644ab8ef3c5fd438ab5d49621b84482", size = 2197943, upload_time = "2025-08-26T15:12:40.584Z" }, + { url = "https://files.pythonhosted.org/packages/2a/d0/74b6c3011b1ebf4a8131430156b14c3af694082cf34c392fff766096fd4b/marisa_trie-1.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83a3748088d117a9b15d8981c947df9e4f56eb2e4b5456ae34fe1f83666c9185", size = 2280132, upload_time = "2025-08-26T15:12:42.059Z" }, + { url = "https://files.pythonhosted.org/packages/28/b2/b8b0cb738fa3ab07309ed92025c6e1b278f84c7255e976921a52b30d8d1b/marisa_trie-1.3.1-cp313-cp313t-win32.whl", hash = "sha256:137010598d8cebc53dbfb7caf59bde96c33a6af555e3e1bdbf30269b6a157e1e", size = 126446, upload_time = "2025-08-26T15:12:43.339Z" }, + { url = "https://files.pythonhosted.org/packages/b6/c6/2381648d0c946556ef51c673397cea40712d945444ceed0a0a0b51a174d2/marisa_trie-1.3.1-cp313-cp313t-win_amd64.whl", hash = "sha256:ec633e108f277f2b7f4671d933a909f39bba549910bf103e2940b87a14da2783", size = 153885, upload_time = "2025-08-26T15:12:44.309Z" }, + { url = "https://files.pythonhosted.org/packages/40/8a/590f25a281e08879791aabec7b8584c7934ff3d5f9d52859197d587246ec/marisa_trie-1.3.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:389721481c14a92fa042e4b91ae065bff13e2bc567c85a10aa9d9de80aaa8622", size = 172803, upload_time = "2025-08-26T15:12:45.342Z" }, + { url = "https://files.pythonhosted.org/packages/20/7f/fd19a4aa57ad169d08e518a6ee2438e7e77bfba7786c59f65891db69d202/marisa_trie-1.3.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e6f3b45def6ff23e254eeaa9079267004f0069d0a34eba30a620780caa4f2cb", size = 155506, upload_time = "2025-08-26T15:12:46.701Z" }, + { url = "https://files.pythonhosted.org/packages/e3/05/857832b8fe6b2ec441de1154eadc66dee067ce5fb6673c3ee0b8616108ee/marisa_trie-1.3.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a96ef3e461ecc85ec7d2233ddc449ff5a3fbdc520caea752bc5bc8faa975231", size = 1239979, upload_time = "2025-08-26T15:12:47.943Z" }, + { url = "https://files.pythonhosted.org/packages/4c/08/f9ea8b720a627d54e8e19f19a0ec1cc2011e01aa2b4f40d078e7f5e9e21f/marisa_trie-1.3.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5370f9ef6c008e502537cc1ff518c80ddf749367ce90179efa0e7f6275903a76", size = 1255705, upload_time = "2025-08-26T15:12:49.24Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/c3/42360fb38cdfde5db1783e2d7cfeb8b91eea837f89ef678f308ee026d794/marisa_trie-1.3.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0dcd42774e367ceb423c211a4fc8e7ce586acfaf0929c9c06d98002112075239", size = 2175092, upload_time = "2025-08-26T15:12:50.602Z" }, + { url = "https://files.pythonhosted.org/packages/09/ba/215b0d821fd37cdc600e834a75708aa2e117124dcf495c9a6c6dc7fdcb6b/marisa_trie-1.3.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:3e2a0e1be95237981bd375a388f44b33d69ea5669a2f79fea038e45fff326595", size = 2250454, upload_time = "2025-08-26T15:12:52.435Z" }, + { url = "https://files.pythonhosted.org/packages/f5/a3/292ab31a12ec1cb356e6bc8b9cc8aaec920aa892a805757c011d77e8cd93/marisa_trie-1.3.1-cp314-cp314-win32.whl", hash = "sha256:c7a33506d0451112911c69f38d55da3e0e050f2be0ea4e5176865cf03baf26a9", size = 119101, upload_time = "2025-08-26T15:12:53.615Z" }, + { url = "https://files.pythonhosted.org/packages/95/83/0ea5de53209993cf301dd9d18d4cb22c20c84c753b4357b66660a8b9eb48/marisa_trie-1.3.1-cp314-cp314-win_amd64.whl", hash = "sha256:68678816818efcd4a1787b557af81f215b989ec88680a86c85c34c914d413690", size = 142886, upload_time = "2025-08-26T15:12:54.835Z" }, + { url = "https://files.pythonhosted.org/packages/37/00/c7e063867988067992a9d9d2aceaede0be7787ca6d77ef34f2eca9d2708e/marisa_trie-1.3.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9e467e13971c64db6aed8afe4c2a131c3f73f048bec3f788a6141216acda598d", size = 175163, upload_time = "2025-08-26T15:12:55.908Z" }, + { url = "https://files.pythonhosted.org/packages/5f/64/eaf49d10c8506ecd717bbbeda907e474842c298354a444b875741ef4a0d9/marisa_trie-1.3.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:076731f79f8603cb3216cb6e5bbbc56536c89f63f175ad47014219ecb01e5996", size = 163119, upload_time = "2025-08-26T15:12:58.054Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/f24dd9c98ce6fc8c8d554b556e1c43f326c5df414b79aba33bd7d2d2fbfd/marisa_trie-1.3.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82de2de90488d0fbbf74cf9f20e1afd62e320693b88f5e9565fc80b28f5bbad3", size = 1277783, upload_time = "2025-08-26T15:12:59.225Z" }, + { url = "https://files.pythonhosted.org/packages/b2/1a/efd63e75d1374e08f8ebe2e15ff1b1ed5f6d5cf57614a5b0884bd9c882ee/marisa_trie-1.3.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c2bc6bee737f4d47fce48c5b03a7bd3214ef2d83eb5c9f84210091370a5f195", size = 1282309, upload_time = "2025-08-26T15:13:00.797Z" }, + { url = "https://files.pythonhosted.org/packages/33/4c/0cefa1eceec7858766af5939979857ac079c6c5251e00c6991c1a26bb1b7/marisa_trie-1.3.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:56043cf908ddf3d7364498085dbc2855d4ea8969aff3bf2439a79482a79e68e2", size = 2196594, upload_time = "2025-08-26T15:13:02.158Z" }, + { url = "https://files.pythonhosted.org/packages/bb/64/900f4132fc345be4b40073e66284707afa4cc203d8d0f1fe78c6b111cd47/marisa_trie-1.3.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9651daa1fdc471df5a5fa6a4833d3b01e76ac512eea141a5995681aebac5555f", size = 2277730, upload_time = "2025-08-26T15:13:03.528Z" }, + { url = "https://files.pythonhosted.org/packages/62/ab/6d6cf25a5c8835589a601a9a916ec5cdee740e277fed8ee620df546834bb/marisa_trie-1.3.1-cp314-cp314t-win32.whl", hash = "sha256:c6571462417cda2239b1ade86ceaf3852da9b52c6286046e87d404afc6da20a7", size = 131409, upload_time = "2025-08-26T15:13:05.106Z" }, + { url = 
"https://files.pythonhosted.org/packages/9a/61/c4efc044141429e67e8fd5536be86d76303f250179c7f92b2cc0c72e8d0b/marisa_trie-1.3.1-cp314-cp314t-win_amd64.whl", hash = "sha256:9e6496bbad3068e3bbbb934b1e1307bf1a9cb4609f9ec47b57e8ea37f1b5ee40", size = 162564, upload_time = "2025-08-26T15:13:06.112Z" }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1256,6 +1454,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload_time = "2024-01-28T18:52:31.981Z" }, ] +[[package]] +name = "murmurhash" +version = "1.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/e9/02efbc6dfc2dd2085da3daacf9a8c17e8356019eceaedbfa21555e32d2af/murmurhash-1.0.13.tar.gz", hash = "sha256:737246d41ee00ff74b07b0bd1f0888be304d203ce668e642c86aa64ede30f8b7", size = 13258, upload_time = "2025-05-22T12:35:57.019Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/d1/9d13a02d9c8bfff10b1f68d19df206eaf2a8011defeccf7eb05ea0b8c54e/murmurhash-1.0.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b20d168370bc3ce82920121b78ab35ae244070a9b18798f4a2e8678fa03bd7e0", size = 26410, upload_time = "2025-05-22T12:35:20.786Z" }, + { url = "https://files.pythonhosted.org/packages/14/b0/3ee762e98cf9a8c2df9c8b377c326f3dd4495066d4eace9066fca46eba7a/murmurhash-1.0.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cef667d2e83bdceea3bc20c586c491fa442662ace1aea66ff5e3a18bb38268d8", size = 26679, upload_time = "2025-05-22T12:35:21.808Z" }, + { url = "https://files.pythonhosted.org/packages/39/06/24618f79cd5aac48490932e50263bddfd1ea90f7123d49bfe806a5982675/murmurhash-1.0.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:507148e50929ba1fce36898808573b9f81c763d5676f3fc6e4e832ff56b66992", size = 125970, upload_time = "2025-05-22T12:35:23.222Z" }, + { url = "https://files.pythonhosted.org/packages/e8/09/0e7afce0a422692506c85474a26fb3a03c1971b2b5f7e7745276c4b3de7f/murmurhash-1.0.13-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d50f6173d266ad165beb8bca6101d824217fc9279f9e9981f4c0245c1e7ee6", size = 123390, upload_time = "2025-05-22T12:35:24.303Z" }, + { url = "https://files.pythonhosted.org/packages/22/4c/c98f579b1a951b2bcc722a35270a2eec105c1e21585c9b314a02079e3c4d/murmurhash-1.0.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0f272e15a84a8ae5f8b4bc0a68f9f47be38518ddffc72405791178058e9d019a", size = 124007, upload_time = "2025-05-22T12:35:25.446Z" }, + { url = "https://files.pythonhosted.org/packages/df/f8/1b0dcebc8df8e091341617102b5b3b97deb6435f345b84f75382c290ec2c/murmurhash-1.0.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9423e0b0964ed1013a06c970199538c7ef9ca28c0be54798c0f1473a6591761", size = 123705, upload_time = "2025-05-22T12:35:26.709Z" }, + { url = "https://files.pythonhosted.org/packages/79/17/f2a38558e150a0669d843f75e128afb83c1a67af41885ea2acb940e18e2a/murmurhash-1.0.13-cp311-cp311-win_amd64.whl", hash = "sha256:83b81e7084b696df3d853f2c78e0c9bda6b285d643f923f1a6fa9ab145d705c5", size = 24572, upload_time = "2025-05-22T12:35:30.38Z" }, + { url = "https://files.pythonhosted.org/packages/e1/53/56ce2d8d4b9ab89557cb1d00ffce346b80a2eb2d8c7944015e5c83eacdec/murmurhash-1.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash 
= "sha256:bbe882e46cb3f86e092d8a1dd7a5a1c992da1ae3b39f7dd4507b6ce33dae7f92", size = 26859, upload_time = "2025-05-22T12:35:31.815Z" }, + { url = "https://files.pythonhosted.org/packages/f8/85/3a0ad54a61257c31496545ae6861515d640316f93681d1dd917e7be06634/murmurhash-1.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:52a33a12ecedc432493692c207c784b06b6427ffaa897fc90b7a76e65846478d", size = 26900, upload_time = "2025-05-22T12:35:34.267Z" }, + { url = "https://files.pythonhosted.org/packages/d0/cd/6651de26744b50ff11c79f0c0d41244db039625de53c0467a7a52876b2d8/murmurhash-1.0.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:950403a7f0dc2d9c8d0710f07c296f2daab66299d9677d6c65d6b6fa2cb30aaa", size = 131367, upload_time = "2025-05-22T12:35:35.258Z" }, + { url = "https://files.pythonhosted.org/packages/50/6c/01ded95ddce33811c9766cae4ce32e0a54288da1d909ee2bcaa6ed13b9f1/murmurhash-1.0.13-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fde9fb5d2c106d86ff3ef2e4a9a69c2a8d23ba46e28c6b30034dc58421bc107b", size = 128943, upload_time = "2025-05-22T12:35:36.358Z" }, + { url = "https://files.pythonhosted.org/packages/ab/27/e539a9622d7bea3ae22706c1eb80d4af80f9dddd93b54d151955c2ae4011/murmurhash-1.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3aa55d62773745616e1ab19345dece122f6e6d09224f7be939cc5b4c513c8473", size = 129108, upload_time = "2025-05-22T12:35:37.864Z" }, + { url = "https://files.pythonhosted.org/packages/7a/84/18af5662e07d06839ad4db18ce026e6f8ef850d7b0ba92817b28dad28ba6/murmurhash-1.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:060dfef1b405cf02c450f182fb629f76ebe7f79657cced2db5054bc29b34938b", size = 129175, upload_time = "2025-05-22T12:35:38.928Z" }, + { url = "https://files.pythonhosted.org/packages/fe/8d/b01d3ee1f1cf3957250223b7c6ce35454f38fbf4abe236bf04a3f769341d/murmurhash-1.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:a8e79627d44a6e20a6487effc30bfe1c74754c13d179106e68cc6d07941b022c", size = 24869, upload_time = "2025-05-22T12:35:40.035Z" }, + { url = "https://files.pythonhosted.org/packages/00/b4/8919dfdc4a131ad38a57b2c5de69f4bd74538bf546637ee59ebaebe6e5a4/murmurhash-1.0.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8a7f8befd901379b6dc57a9e49c5188454113747ad6aa8cdd951a6048e10790", size = 26852, upload_time = "2025-05-22T12:35:41.061Z" }, + { url = "https://files.pythonhosted.org/packages/b4/32/ce78bef5d6101568bcb12f5bb5103fabcbe23723ec52e76ff66132d5dbb7/murmurhash-1.0.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f741aab86007510199193eee4f87c5ece92bc5a6ca7d0fe0d27335c1203dface", size = 26900, upload_time = "2025-05-22T12:35:42.097Z" }, + { url = "https://files.pythonhosted.org/packages/0c/4c/0f47c0b4f6b31a1de84d65f9573832c78cd47b4b8ce25ab5596a8238d150/murmurhash-1.0.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82614f18fa6d9d83da6bb0918f3789a3e1555d0ce12c2548153e97f79b29cfc9", size = 130033, upload_time = "2025-05-22T12:35:43.113Z" }, + { url = "https://files.pythonhosted.org/packages/e0/cb/e47233e32fb792dcc9fb18a2cf65f795d47179b29c2b4a2034689f14c707/murmurhash-1.0.13-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91f22a48b9454712e0690aa0b76cf0156a5d5a083d23ec7e209cfaeef28f56ff", size = 130619, upload_time = "2025-05-22T12:35:44.229Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/f1/f89911bf304ba5d385ccd346cc7fbb1c1450a24f093b592c3bfe87768467/murmurhash-1.0.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c4bc7938627b8fcb3d598fe6657cc96d1e31f4eba6a871b523c1512ab6dacb3e", size = 127643, upload_time = "2025-05-22T12:35:45.369Z" }, + { url = "https://files.pythonhosted.org/packages/a4/24/262229221f6840c1a04a46051075e99675e591571abcca6b9a8b6aa1602b/murmurhash-1.0.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58a61f1fc840f9ef704e638c39b8517bab1d21f1a9dbb6ba3ec53e41360e44ec", size = 127981, upload_time = "2025-05-22T12:35:46.503Z" }, + { url = "https://files.pythonhosted.org/packages/18/25/addbc1d28f83252732ac3e57334d42f093890b4c2cce483ba01a42bc607c/murmurhash-1.0.13-cp313-cp313-win_amd64.whl", hash = "sha256:c451a22f14c2f40e7abaea521ee24fa0e46fbec480c4304c25c946cdb6e81883", size = 24880, upload_time = "2025-05-22T12:35:47.625Z" }, +] + [[package]] name = "numpy" version = "1.26.4" @@ -1416,6 +1643,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/23/e98758924d1b3aac11a626268eabf7f3cf177e7837c28d47bf84c64532d0/pendulum-3.1.0-py3-none-any.whl", hash = "sha256:f9178c2a8e291758ade1e8dd6371b1d26d08371b4c7730a6e9a3ef8b16ebae0f", size = 111799, upload_time = "2025-04-19T14:02:34.739Z" }, ] +[[package]] +name = "pip" +version = "25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload_time = "2025-07-30T21:50:15.401Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload_time = "2025-07-30T21:50:13.323Z" }, +] + [[package]] name = "playwright" version = "1.49.1" @@ -1443,6 +1679,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload_time = "2024-04-20T21:34:40.434Z" }, ] +[[package]] +name = "preshed" +version = "3.0.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cymem" }, + { name = "murmurhash" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/3a/db814f67a05b6d7f9c15d38edef5ec9b21415710705b393883de92aee5ef/preshed-3.0.10.tar.gz", hash = "sha256:5a5c8e685e941f4ffec97f1fbf32694b8107858891a4bc34107fac981d8296ff", size = 15039, upload_time = "2025-05-26T15:18:33.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/99/c3709638f687da339504d1daeca48604cadb338bf3556a1484d1f0cd95e6/preshed-3.0.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d96c4fe2b41c1cdcc8c4fc1fdb10f922a6095c0430a3ebe361fe62c78902d068", size = 131486, upload_time = "2025-05-26T15:17:52.231Z" }, + { url = "https://files.pythonhosted.org/packages/e0/27/0fd36b63caa8bbf57b31a121d9565d385bbd7521771d4eb93e17d326873d/preshed-3.0.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cb01ea930b96f3301526a2ab26f41347d07555e4378c4144c6b7645074f2ebb0", size = 127938, upload_time = "2025-05-26T15:17:54.19Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/54/6a876d9cc8d401a9c1fb6bb8ca5a31b3664d0bcb888a9016258a1ae17344/preshed-3.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dd1f0a7b7d150e229d073fd4fe94f72610cae992e907cee74687c4695873a98", size = 842263, upload_time = "2025-05-26T15:17:55.398Z" }, + { url = "https://files.pythonhosted.org/packages/1c/7d/ff19f74d15ee587905bafa3582883cfe2f72b574e6d691ee64dc690dc276/preshed-3.0.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fd7b350c280137f324cd447afbf6ba9a849af0e8898850046ac6f34010e08bd", size = 842913, upload_time = "2025-05-26T15:17:56.687Z" }, + { url = "https://files.pythonhosted.org/packages/f1/3a/1c345a26463345557705b61965e1e0a732cc0e9c6dfd4787845dbfa50b4a/preshed-3.0.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cf6a5fdc89ad06079aa6ee63621e417d4f4cf2a3d8b63c72728baad35a9ff641", size = 820548, upload_time = "2025-05-26T15:17:58.057Z" }, + { url = "https://files.pythonhosted.org/packages/7f/6b/71f25e2b7a23dba168f43edfae0bb508552dbef89114ce65c73f2ea7172f/preshed-3.0.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b4c29a7bd66985808ad181c9ad05205a6aa7400cd0f98426acd7bc86588b93f8", size = 840379, upload_time = "2025-05-26T15:17:59.565Z" }, + { url = "https://files.pythonhosted.org/packages/3a/86/d8f32b0b31a36ee8770a9b1a95321430e364cd0ba4bfebb7348aed2f198d/preshed-3.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:1367c1fd6f44296305315d4e1c3fe3171787d4d01c1008a76bc9466bd79c3249", size = 117655, upload_time = "2025-05-26T15:18:00.836Z" }, + { url = "https://files.pythonhosted.org/packages/c3/14/322a4f58bc25991a87f216acb1351800739b0794185d27508ee86c35f382/preshed-3.0.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6e9c46933d55c8898c8f7a6019a8062cd87ef257b075ada2dd5d1e57810189ea", size = 131367, upload_time = "2025-05-26T15:18:02.408Z" }, + { url = "https://files.pythonhosted.org/packages/38/80/67507653c35620cace913f617df6d6f658b87e8da83087b851557d65dd86/preshed-3.0.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c4ebc4f8ef0114d55f2ffdce4965378129c7453d0203664aeeb03055572d9e4", size = 126535, upload_time = "2025-05-26T15:18:03.589Z" }, + { url = "https://files.pythonhosted.org/packages/db/b1/ab4f811aeaf20af0fa47148c1c54b62d7e8120d59025bd0a3f773bb67725/preshed-3.0.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ab5ab4c6dfd3746fb4328e7fbeb2a0544416b872db02903bfac18e6f5cd412f", size = 864907, upload_time = "2025-05-26T15:18:04.794Z" }, + { url = "https://files.pythonhosted.org/packages/fb/db/fe37c1f99cfb26805dd89381ddd54901307feceb267332eaaca228e9f9c1/preshed-3.0.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40586fd96ae3974c552a7cd78781b6844ecb1559ee7556586f487058cf13dd96", size = 869329, upload_time = "2025-05-26T15:18:06.353Z" }, + { url = "https://files.pythonhosted.org/packages/a7/fd/efb6a6233d1cd969966f3f65bdd8e662579c3d83114e5c356cec1927b1f7/preshed-3.0.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a606c24cda931306b98e0edfafed3309bffcf8d6ecfe07804db26024c4f03cd6", size = 846829, upload_time = "2025-05-26T15:18:07.716Z" }, + { url = "https://files.pythonhosted.org/packages/14/49/0e4ce5db3bf86b081abb08a404fb37b7c2dbfd7a73ec6c0bc71b650307eb/preshed-3.0.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:394015566f9354738be903447039e8dbc6d93ba5adf091af694eb03c4e726b1e", size = 874008, 
upload_time = "2025-05-26T15:18:09.364Z" }, + { url = "https://files.pythonhosted.org/packages/6f/17/76d6593fc2d055d4e413b68a8c87b70aa9b7697d4972cb8062559edcf6e9/preshed-3.0.10-cp312-cp312-win_amd64.whl", hash = "sha256:fd7e38225937e580420c84d1996dde9b4f726aacd9405093455c3a2fa60fede5", size = 116701, upload_time = "2025-05-26T15:18:11.905Z" }, + { url = "https://files.pythonhosted.org/packages/bf/5e/87671bc58c4f6c8cf0a5601ccd74b8bb50281ff28aa4ab3e3cad5cd9d06a/preshed-3.0.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:23e6e0581a517597f3f76bc24a4cdb0ba5509933d4f61c34fca49649dd71edf9", size = 129184, upload_time = "2025-05-26T15:18:13.331Z" }, + { url = "https://files.pythonhosted.org/packages/92/69/b3969a3c95778def5bf5126484a1f7d2ad324d1040077f55f56e027d8ea4/preshed-3.0.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:574e6d6056981540310ff181b47a2912f4bddc91bcace3c7a9c6726eafda24ca", size = 124258, upload_time = "2025-05-26T15:18:14.497Z" }, + { url = "https://files.pythonhosted.org/packages/32/df/6e828ec4565bf33bd4803a3eb3b1102830b739143e5d6c132bf7181a58ec/preshed-3.0.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd658dd73e853d1bb5597976a407feafa681b9d6155bc9bc7b4c2acc2a6ee96", size = 825445, upload_time = "2025-05-26T15:18:15.71Z" }, + { url = "https://files.pythonhosted.org/packages/05/3d/478b585f304920e51f328c9231e22f30dc64baa68e079e08a46ab72be738/preshed-3.0.10-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b95396046328ffb461a68859ce2141aca4815b8624167832d28ced70d541626", size = 831690, upload_time = "2025-05-26T15:18:17.08Z" }, + { url = "https://files.pythonhosted.org/packages/c3/65/938f21f77227e8d398d46fb10b9d1b3467be859468ce8db138fc3d50589c/preshed-3.0.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3e6728b2028bbe79565eb6cf676b5bae5ce1f9cc56e4bf99bb28ce576f88054d", size = 808593, upload_time = "2025-05-26T15:18:18.535Z" }, + { url = "https://files.pythonhosted.org/packages/6c/1c/2a3961fc88bc72300ff7e4ca54689bda90d2d77cc994167cc09a310480b6/preshed-3.0.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c4ef96cb28bf5f08de9c070143113e168efccbb68fd4961e7d445f734c051a97", size = 837333, upload_time = "2025-05-26T15:18:19.937Z" }, + { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload_time = "2025-05-26T15:18:21.842Z" }, +] + [[package]] name = "propcache" version = "0.3.1" @@ -1897,6 +2166,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload_time = "2024-01-23T06:32:58.246Z" }, ] +[[package]] +name = "python-env-utils" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/57/96/c49c675b9a8cfb79b7377bb5e357feafb810dd2831201cde4e499c0a5e52/python-env-utils-0.4.1.tar.gz", hash = "sha256:6357d9ae024e5039158ce337bafeca662453f41cd7789a4517217c1a9093ce57", size = 5711, upload_time = "2017-04-09T18:43:59.347Z" } + [[package]] name = "python-multipart" version = "0.0.20" @@ -2050,6 +2328,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload_time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "side-effects" +version = "1.6.dev0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "django" }, + { name = "python-env-utils" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/39/c7feca6a6154195b135a4539802bc3c909b931e296c868d6974ff0c9d819/side-effects-1.6.dev0.tar.gz", hash = "sha256:9d069359fc46dbcb78938ca4a7c1e6266db84de0cdf5fc2d8ce664bfe5cae255", size = 16186, upload_time = "2020-01-01T21:29:09.983Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6c/24/a6def6872e165cc8d3846e5b9c2615f6f566c424d5eb6d99a15eaad7c558/side_effects-1.6.dev0-py3-none-any.whl", hash = "sha256:343f8f34de51f477238e03b0c33d79a5ef31604991a44c187ebfce0fae628c97", size = 13563, upload_time = "2020-01-01T21:29:13.045Z" }, +] + [[package]] name = "simplejson" version = "3.20.1" @@ -2107,6 +2398,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload_time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smart-open" +version = "7.3.0.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/2b/5e7234c68ed5bc872ad6ae77b8a421c2ed70dcb1190b44dc1abdeed5e347/smart_open-7.3.0.post1.tar.gz", hash = "sha256:ce6a3d9bc1afbf6234ad13c010b77f8cd36d24636811e3c52c3b5160f5214d1e", size = 51557, upload_time = "2025-07-03T10:06:31.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4", size = 61946, upload_time = "2025-07-03T10:06:29.599Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -2125,6 +2428,74 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload_time = "2025-04-20T18:50:07.196Z" }, ] +[[package]] +name = "spacy" +version = "3.8.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "catalogue" }, + { name = "cymem" }, + { name = "jinja2" }, + { name = "langcodes" }, + { name = "murmurhash" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "preshed" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "spacy-legacy" }, + { name = "spacy-loggers" }, + { name = "srsly" }, + { name = "thinc" }, + { name = "tqdm" }, + { name = "typer" }, + { name = "wasabi" }, + { name = "weasel" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1e/9e/fb4e1cefe3fbd51ea6a243e5a3d2bc629baa9a28930bf4be6fe5672fa1ca/spacy-3.8.7.tar.gz", hash = "sha256:700fd174c6c552276be142c48e70bb53cae24c4dd86003c4432af9cb93e4c908", size = 1316143, upload_time = "2025-05-23T08:55:39.538Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/29/c5/5fbb3a4e694d4855a5bab87af9664377c48b89691f180ad3cde4faeaf35c/spacy-3.8.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdff8b9b556468a6dd527af17f0ddf9fb0b0bee92ee7703339ddf542361cff98", size = 6746140, upload_time = "2025-05-23T08:54:23.483Z" }, + { url = "https://files.pythonhosted.org/packages/03/2a/43afac516eb82409ca47d7206f982beaf265d2ba06a72ca07cf06b290c20/spacy-3.8.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9194b7cf015ed9b4450ffb162da49c8a9305e76b468de036b0948abdfc748a37", size = 6392440, upload_time = "2025-05-23T08:54:25.12Z" }, + { url = "https://files.pythonhosted.org/packages/6f/83/2ea68c18e2b1b9a6f6b30ef63eb9d07e979626b9595acfdb5394f18923c4/spacy-3.8.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7dc38b78d48b9c2a80a3eea95f776304993f63fc307f07cdd104441442f92f1e", size = 32699126, upload_time = "2025-05-23T08:54:27.385Z" }, + { url = "https://files.pythonhosted.org/packages/0a/0a/bb90e9aa0b3c527876627567d82517aabab08006ccf63796c33b0242254d/spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e43bd70772751b8fc7a14f338d087a3d297195d43d171832923ef66204b23ab", size = 33008865, upload_time = "2025-05-23T08:54:30.248Z" }, + { url = "https://files.pythonhosted.org/packages/39/dd/8e906ba378457107ab0394976ea9f7b12fdb2cad682ef1a2ccf473d61e5f/spacy-3.8.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c402bf5dcf345fd96d202378c54bc345219681e3531f911d99567d569328c45f", size = 31933169, upload_time = "2025-05-23T08:54:33.199Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b5/42df07eb837a923fbb42509864d5c7c2072d010de933dccdfb3c655b3a76/spacy-3.8.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4234189861e486d86f1269e50542d87e8a6391a1ee190652479cf1a793db115f", size = 32776322, upload_time = "2025-05-23T08:54:36.891Z" }, + { url = "https://files.pythonhosted.org/packages/92/e7/8176484801c67dcd814f141991fe0a3c9b5b4a3583ea30c2062e93d1aa6b/spacy-3.8.7-cp311-cp311-win_amd64.whl", hash = "sha256:e9d12e2eb7f36bc11dd9edae011032fe49ea100d63e83177290d3cbd80eaa650", size = 14938936, upload_time = "2025-05-23T08:54:40.322Z" }, + { url = "https://files.pythonhosted.org/packages/a5/10/89852f40f926e0902c11c34454493ba0d15530b322711e754b89a6d7dfe6/spacy-3.8.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:88b397e37793cea51df298e6c651a763e49877a25bead5ba349761531a456687", size = 6265335, upload_time = "2025-05-23T08:54:42.876Z" }, + { url = "https://files.pythonhosted.org/packages/16/fb/b5d54522969a632c06f4af354763467553b66d5bf0671ac39f3cceb3fd54/spacy-3.8.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f70b676955fa6959347ca86ed6edd8ff0d6eb2ba20561fdfec76924bd3e540f9", size = 5906035, upload_time = "2025-05-23T08:54:44.824Z" }, + { url = "https://files.pythonhosted.org/packages/3a/03/70f06753fd65081404ade30408535eb69f627a36ffce2107116d1aa16239/spacy-3.8.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c4b5a624797ade30c25b5b69daa35a93ee24bcc56bd79b0884b2565f76f35d6", size = 33420084, upload_time = "2025-05-23T08:54:46.889Z" }, + { url = "https://files.pythonhosted.org/packages/f9/19/b60e1ebf4985ee2b33d85705b89a5024942b65dad04dbdc3fb46f168b410/spacy-3.8.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9d83e006df66decccefa3872fa958b3756228fb216d83783595444cf42ca10c", size = 33922188, upload_time = "2025-05-23T08:54:49.781Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/a3/1fb1a49dc6d982d96fffc30c3a31bb431526008eea72ac3773f6518720a6/spacy-3.8.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dca25deba54f3eb5dcfbf63bf16e613e6c601da56f91c4a902d38533c098941", size = 31939285, upload_time = "2025-05-23T08:54:53.162Z" }, + { url = "https://files.pythonhosted.org/packages/2d/55/6cf1aff8e5c01ee683e828f3ccd9282d2aff7ca1143a9349ee3d0c1291ff/spacy-3.8.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5eef3f805a1c118d9b709a23e2d378f5f20da5a0d6258c9cfdc87c4cb234b4fc", size = 32988845, upload_time = "2025-05-23T08:54:57.776Z" }, + { url = "https://files.pythonhosted.org/packages/8c/47/c17ee61b51aa8497d8af0999224b4b62485111a55ec105a06886685b2c68/spacy-3.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:25d7a68e445200c9e9dc0044f8b7278ec0ef01ccc7cb5a95d1de2bd8e3ed6be2", size = 13918682, upload_time = "2025-05-23T08:55:00.387Z" }, + { url = "https://files.pythonhosted.org/packages/2a/95/7125bea6d432c601478bf922f7a568762c8be425bbde5b66698260ab0358/spacy-3.8.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dda7d57f42ec57c19fbef348095a9c82504e4777bca7b8db4b0d8318ba280fc7", size = 6235950, upload_time = "2025-05-23T08:55:02.92Z" }, + { url = "https://files.pythonhosted.org/packages/96/c3/d2362846154d4d341136774831605df02d61f49ac637524a15f4f2794874/spacy-3.8.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:de0e0bddb810ed05bce44bcb91460eabe52bc56323da398d2ca74288a906da35", size = 5878106, upload_time = "2025-05-23T08:55:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/50/b6/b2943acfbfc4fc12642dac9feb571e712dd1569ab481db8f3daedee045fe/spacy-3.8.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a2e58f92b684465777a7c1a65d5578b1dc36fe55c48d9964fb6d46cc9449768", size = 33085866, upload_time = "2025-05-23T08:55:06.65Z" }, + { url = "https://files.pythonhosted.org/packages/65/98/c4415cbb217ac0b502dbb3372136015c699dd16a0c47cd6d338cd15f4bed/spacy-3.8.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46330da2eb357d6979f40ea8fc16ee5776ee75cd0c70aac2a4ea10c80364b8f3", size = 33398424, upload_time = "2025-05-23T08:55:10.477Z" }, + { url = "https://files.pythonhosted.org/packages/12/45/12a198858f1f11c21844876e039ba90df59d550527c72996d418c1faf78d/spacy-3.8.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:86b6a6ad23ca5440ef9d29c2b1e3125e28722c927db612ae99e564d49202861c", size = 31530066, upload_time = "2025-05-23T08:55:13.329Z" }, + { url = "https://files.pythonhosted.org/packages/9c/df/80524f99822eb96c9649200042ec5912357eec100cf0cd678a2e9ef0ecb3/spacy-3.8.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ccfe468cbb370888153df145ce3693af8e54dae551940df49057258081b2112f", size = 32613343, upload_time = "2025-05-23T08:55:16.711Z" }, + { url = "https://files.pythonhosted.org/packages/02/99/881f6f24c279a5a70b8d69aaf8266fd411a0a58fd1c8848112aaa348f6f6/spacy-3.8.7-cp313-cp313-win_amd64.whl", hash = "sha256:ca81e416ff35209769e8b5dd5d13acc52e4f57dd9d028364bccbbe157c2ae86b", size = 13911250, upload_time = "2025-05-23T08:55:19.606Z" }, +] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d9/79/91f9d7cc8db5642acad830dcc4b49ba65a7790152832c4eceb305e46d681/spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774", size = 23806, upload_time = "2023-01-23T09:04:15.104Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f", size = 29971, upload_time = "2023-01-23T09:04:13.45Z" }, +] + +[[package]] +name = "spacy-loggers" +version = "1.0.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/67/3d/926db774c9c98acf66cb4ed7faf6c377746f3e00b84b700d0868b95d0712/spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24", size = 20811, upload_time = "2023-09-11T12:26:52.323Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343, upload_time = "2023-09-11T12:26:50.586Z" }, +] + [[package]] name = "sqlalchemy" version = "2.0.40" @@ -2162,6 +2533,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/7c/5fc8e802e7506fe8b55a03a2e1dab156eae205c91bee46305755e086d2e2/sqlalchemy-2.0.40-py3-none-any.whl", hash = "sha256:32587e2e1e359276957e6fe5dad089758bc042a971a8a09ae8ecf7a8fe23d07a", size = 1903894, upload_time = "2025-03-27T18:40:43.796Z" }, ] +[[package]] +name = "sqlparse" +version = "0.5.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/40/edede8dd6977b0d3da179a342c198ed100dd2aba4be081861ee5911e4da4/sqlparse-0.5.3.tar.gz", hash = "sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272", size = 84999, upload_time = "2024-12-10T12:05:30.728Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/5c/bfd6bd0bf979426d405cc6e71eceb8701b148b16c21d2dc3c261efc61c7b/sqlparse-0.5.3-py3-none-any.whl", hash = "sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca", size = 44415, upload_time = "2024-12-10T12:05:27.824Z" }, +] + +[[package]] +name = "srsly" +version = "2.5.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "catalogue" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/e8/eb51b1349f50bac0222398af0942613fdc9d1453ae67cbe4bf9936a1a54b/srsly-2.5.1.tar.gz", hash = "sha256:ab1b4bf6cf3e29da23dae0493dd1517fb787075206512351421b89b4fc27c77e", size = 466464, upload_time = "2025-01-17T09:26:26.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/9c/a248bb49de499fe0990e3cb0fb341c2373d8863ef9a8b5799353cade5731/srsly-2.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:58f0736794ce00a71d62a39cbba1d62ea8d5be4751df956e802d147da20ecad7", size = 635917, upload_time = "2025-01-17T09:25:25.109Z" }, + { url = "https://files.pythonhosted.org/packages/41/47/1bdaad84502df973ecb8ca658117234cf7fb20e1dec60da71dce82de993f/srsly-2.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a8269c40859806d71920396d185f4f38dc985cdb6a28d3a326a701e29a5f629", size = 634374, upload_time = "2025-01-17T09:25:26.609Z" }, + { url = "https://files.pythonhosted.org/packages/e5/2a/d73c71989fcf2a6d1fa518d75322aff4db01a8763f167f8c5e00aac11097/srsly-2.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:889905900401fefc1032e22b73aecbed8b4251aa363f632b2d1f86fc16f1ad8e", size = 1108390, upload_time = "2025-01-17T09:25:29.32Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/a3/9eda9997a8bd011caed18fdaa5ce606714eb06d8dab587ed0522b3e92ab1/srsly-2.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf454755f22589df49c25dc799d8af7b47dce3d861dded35baf0f0b6ceab4422", size = 1110712, upload_time = "2025-01-17T09:25:31.051Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ef/4b50bc05d06349f905b27f824cc23b652098efd4be19aead3af4981df647/srsly-2.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cc0607c8a59013a51dde5c1b4e465558728e9e0a35dcfa73c7cbefa91a0aad50", size = 1081244, upload_time = "2025-01-17T09:25:32.611Z" }, + { url = "https://files.pythonhosted.org/packages/90/af/d4a2512d9a5048d2b18efead39d4c4404bddd4972935bbc68211292a736c/srsly-2.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d5421ba3ab3c790e8b41939c51a1d0f44326bfc052d7a0508860fb79a47aee7f", size = 1091692, upload_time = "2025-01-17T09:25:34.15Z" }, + { url = "https://files.pythonhosted.org/packages/bb/da/657a685f63028dcb00ccdc4ac125ed347c8bff6fa0dab6a9eb3dc45f3223/srsly-2.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:b96ea5a9a0d0379a79c46d255464a372fb14c30f59a8bc113e4316d131a530ab", size = 632627, upload_time = "2025-01-17T09:25:37.36Z" }, + { url = "https://files.pythonhosted.org/packages/fb/f6/bebc20d75bd02121fc0f65ad8c92a5dd2570e870005e940faa55a263e61a/srsly-2.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:683b54ed63d7dfee03bc2abc4b4a5f2152f81ec217bbadbac01ef1aaf2a75790", size = 636717, upload_time = "2025-01-17T09:25:40.236Z" }, + { url = "https://files.pythonhosted.org/packages/b6/e8/9372317a4742c70b87b413335adfcdfb2bee4f88f3faba89fabb9e6abf21/srsly-2.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:459d987130e57e83ce9e160899afbeb871d975f811e6958158763dd9a8a20f23", size = 634697, upload_time = "2025-01-17T09:25:43.605Z" }, + { url = "https://files.pythonhosted.org/packages/d5/00/c6a7b99ab27b051a27bd26fe1a8c1885225bb8980282bf9cb99f70610368/srsly-2.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:184e3c98389aab68ff04aab9095bd5f1a8e5a72cc5edcba9d733bac928f5cf9f", size = 1134655, upload_time = "2025-01-17T09:25:45.238Z" }, + { url = "https://files.pythonhosted.org/packages/c2/e6/861459e8241ec3b78c111081bd5efa414ef85867e17c45b6882954468d6e/srsly-2.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:00c2a3e4856e63b7efd47591d049aaee8e5a250e098917f50d93ea68853fab78", size = 1143544, upload_time = "2025-01-17T09:25:47.485Z" }, + { url = "https://files.pythonhosted.org/packages/2d/85/8448fe874dd2042a4eceea5315cfff3af03ac77ff5073812071852c4e7e2/srsly-2.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:366b4708933cd8d6025c13c2cea3331f079c7bb5c25ec76fca392b6fc09818a0", size = 1098330, upload_time = "2025-01-17T09:25:52.55Z" }, + { url = "https://files.pythonhosted.org/packages/ef/7e/04d0e1417da140b2ac4053a3d4fcfc86cd59bf4829f69d370bb899f74d5d/srsly-2.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c8a0b03c64eb6e150d772c5149befbadd981cc734ab13184b0561c17c8cef9b1", size = 1110670, upload_time = "2025-01-17T09:25:54.02Z" }, + { url = "https://files.pythonhosted.org/packages/96/1a/a8cd627eaa81a91feb6ceab50155f4ceff3eef6107916cb87ef796958427/srsly-2.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:7952538f6bba91b9d8bf31a642ac9e8b9ccc0ccbb309feb88518bfb84bb0dc0d", size = 632598, upload_time = "2025-01-17T09:25:55.499Z" }, + { url = 
"https://files.pythonhosted.org/packages/42/94/cab36845aad6e2c22ecee1178accaa365657296ff87305b805648fd41118/srsly-2.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84b372f7ef1604b4a5b3cee1571993931f845a5b58652ac01bcb32c52586d2a8", size = 634883, upload_time = "2025-01-17T09:25:58.363Z" }, + { url = "https://files.pythonhosted.org/packages/67/8b/501f51f4eaee7e1fd7327764799cb0a42f5d0de042a97916d30dbff770fc/srsly-2.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6ac3944c112acb3347a39bfdc2ebfc9e2d4bace20fe1c0b764374ac5b83519f2", size = 632842, upload_time = "2025-01-17T09:25:59.777Z" }, + { url = "https://files.pythonhosted.org/packages/07/be/5b8fce4829661e070a7d3e262d2e533f0e297b11b8993d57240da67d7330/srsly-2.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6118f9c4b221cde0a990d06a42c8a4845218d55b425d8550746fe790acf267e9", size = 1118516, upload_time = "2025-01-17T09:26:01.234Z" }, + { url = "https://files.pythonhosted.org/packages/91/60/a34e97564eac352c0e916c98f44b6f566b7eb6a9fb60bcd60ffa98530762/srsly-2.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7481460110d9986781d9e4ac0f5f991f1d6839284a80ad268625f9a23f686950", size = 1127974, upload_time = "2025-01-17T09:26:04.007Z" }, + { url = "https://files.pythonhosted.org/packages/70/a2/f642334db0cabd187fa86b8773257ee6993c6009338a6831d4804e2c5b3c/srsly-2.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e57b8138082f09e35db60f99757e16652489e9e3692471d8e0c39aa95180688", size = 1086098, upload_time = "2025-01-17T09:26:05.612Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/be48e185c5a010e71b5135e4cdf317ff56b8ac4bc08f394bbf882ac13b05/srsly-2.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bab90b85a63a1fe0bbc74d373c8bb9bb0499ddfa89075e0ebe8d670f12d04691", size = 1100354, upload_time = "2025-01-17T09:26:07.215Z" }, + { url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload_time = "2025-01-17T09:26:10.018Z" }, +] + [[package]] name = "starlette" version = "0.45.3" @@ -2183,6 +2595,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8", size = 78154, upload_time = "2019-08-30T21:37:03.543Z" }, ] +[[package]] +name = "thinc" +version = "8.3.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blis" }, + { name = "catalogue" }, + { name = "confection" }, + { name = "cymem" }, + { name = "murmurhash" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "preshed" }, + { name = "pydantic" }, + { name = "setuptools" }, + { name = "srsly" }, + { name = "wasabi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/ff/60c9bcfe28e56c905aac8e61a838c7afe5dc3073c9beed0b63a26ace0bb7/thinc-8.3.4.tar.gz", hash = "sha256:b5925482498bbb6dca0771e375b35c915818f735891e93d93a662dab15f6ffd8", size = 193903, upload_time = "2025-01-13T12:47:51.698Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/47/68187c78a04cdc31cbd3ae393068f994b60476b5ecac6dfe7d04b124aacf/thinc-8.3.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8bb4b47358a1855803b375f4432cefdf373f46ef249b554418d2e77c7323040", size = 
839320, upload_time = "2025-01-13T12:47:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/49/ea/066dd415e61fcef20083bbca41c2c02e640fea71326531f2619708efee1e/thinc-8.3.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:00ed92f9a34b9794f51fcd48467c863f4eb7c5b41559aef6ef3c980c21378fec", size = 774196, upload_time = "2025-01-13T12:47:15.315Z" }, + { url = "https://files.pythonhosted.org/packages/8c/68/36c1a92a374891e0d496677c59f5f9fdc1e57bbb214c487bb8bb3e9290c2/thinc-8.3.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85691fca84a6a1506f7ddbd2c1706a5524d56f65582e76b2e260a06d9e83e86d", size = 3922504, upload_time = "2025-01-13T12:47:22.07Z" }, + { url = "https://files.pythonhosted.org/packages/ec/8a/48e463240a586e91f83c87660986e520aa91fbd839f6631ee9bc0fbb3cbd/thinc-8.3.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:eae1573fc19e514defc1bfd4f93f0b4bfc1dcefdb6d70bad1863825747f24800", size = 4932946, upload_time = "2025-01-13T12:47:24.177Z" }, + { url = "https://files.pythonhosted.org/packages/d9/98/f910b8d8113ab9b955a68e9bbf0d5bd0e828f22dd6d3c226af6ec3970817/thinc-8.3.4-cp311-cp311-win_amd64.whl", hash = "sha256:81e8638f9bdc38e366674acc4b63cf7c6267266a15477963a5db21b3d9f1aa36", size = 1490133, upload_time = "2025-01-13T12:47:26.152Z" }, + { url = "https://files.pythonhosted.org/packages/90/ff/d1b5d7e1a7f95581e9a736f50a5a9aff72327ddbbc629a68070c36acefd9/thinc-8.3.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c9da6375b106df5186bd2bfd1273bc923c01ab7d482f8942e4ee528a28965c3a", size = 825099, upload_time = "2025-01-13T12:47:27.881Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0b/d207c917886dc40671361de0880ec3ea0443a718aae9dbb0a50ac0849f92/thinc-8.3.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:07091c6b5faace50857c4cf0982204969d77388d0a6f156dd2442297dceeb838", size = 761024, upload_time = "2025-01-13T12:47:29.739Z" }, + { url = "https://files.pythonhosted.org/packages/4b/a3/3ec5e9d7cbebc3257b8223a3d188216b91ab6ec1e66b6fdd99d22394bc62/thinc-8.3.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd40ad71bcd8b1b9daa0462e1255b1c1e86e901c2fd773966601f44a95878032", size = 3710390, upload_time = "2025-01-13T12:47:33.019Z" }, + { url = "https://files.pythonhosted.org/packages/40/ee/955c74e4e6ff2f694c99dcbbf7be8d478a8868503aeb3474517277c07667/thinc-8.3.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb10823b3a3f1c6440998b11bf9a3571dd859feaed0fdb510a1c1097d9dc6a86", size = 4731524, upload_time = "2025-01-13T12:47:35.203Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/3786431e5c1eeebed3d7a4c97122896ca6d4a502b03d02c2171c417052fd/thinc-8.3.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5e5e7bf5dae142fd50ed9785971292c4aab4d9ed18e4947653b6a0584d5227c", size = 1455883, upload_time = "2025-01-13T12:47:36.914Z" }, +] + [[package]] name = "tqdm" version = "4.67.1" @@ -2320,6 +2764,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload_time = "2024-10-14T23:38:10.888Z" }, ] +[[package]] +name = "wasabi" +version = "1.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload_time = "2024-05-31T16:56:18.99Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload_time = "2024-05-31T16:56:16.699Z" }, +] + [[package]] name = "watchfiles" version = "1.0.5" @@ -2369,6 +2825,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/b4/c57b99518fadf431f3ef47a610839e46e5f8abf9814f969859d1c65c02c7/watchfiles-1.0.5-cp313-cp313-win_amd64.whl", hash = "sha256:f436601594f15bf406518af922a89dcaab416568edb6f65c4e5bbbad1ea45c11", size = 291087, upload_time = "2025-04-08T10:35:52.458Z" }, ] +[[package]] +name = "weasel" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cloudpathlib" }, + { name = "confection" }, + { name = "packaging" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "smart-open" }, + { name = "srsly" }, + { name = "typer" }, + { name = "wasabi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/1a/9c522dd61b52939c217925d3e55c95f9348b73a66a956f52608e1e59a2c0/weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9", size = 38417, upload_time = "2024-05-15T08:52:54.765Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/87/abd57374044e1f627f0a905ac33c1a7daab35a3a815abfea4e1bafd3fdb1/weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c", size = 50270, upload_time = "2024-05-15T08:52:52.977Z" }, +] + [[package]] name = "websockets" version = "15.0.1" @@ -2411,6 +2887,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload_time = "2025-03-05T20:03:39.41Z" }, ] +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload_time = "2025-08-12T05:53:21.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/db/00e2a219213856074a213503fdac0511203dceefff26e1daa15250cc01a0/wrapt-1.17.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:273a736c4645e63ac582c60a56b0acb529ef07f78e08dc6bfadf6a46b19c0da7", size = 53482, upload_time = "2025-08-12T05:51:45.79Z" }, + { url = "https://files.pythonhosted.org/packages/5e/30/ca3c4a5eba478408572096fe9ce36e6e915994dd26a4e9e98b4f729c06d9/wrapt-1.17.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5531d911795e3f935a9c23eb1c8c03c211661a5060aab167065896bbf62a5f85", size = 38674, upload_time = "2025-08-12T05:51:34.629Z" }, + { url = "https://files.pythonhosted.org/packages/31/25/3e8cc2c46b5329c5957cec959cb76a10718e1a513309c31399a4dad07eb3/wrapt-1.17.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0610b46293c59a3adbae3dee552b648b984176f8562ee0dba099a56cfbe4df1f", size 
= 38959, upload_time = "2025-08-12T05:51:56.074Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8f/a32a99fc03e4b37e31b57cb9cefc65050ea08147a8ce12f288616b05ef54/wrapt-1.17.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b32888aad8b6e68f83a8fdccbf3165f5469702a7544472bdf41f582970ed3311", size = 82376, upload_time = "2025-08-12T05:52:32.134Z" }, + { url = "https://files.pythonhosted.org/packages/31/57/4930cb8d9d70d59c27ee1332a318c20291749b4fba31f113c2f8ac49a72e/wrapt-1.17.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cccf4f81371f257440c88faed6b74f1053eef90807b77e31ca057b2db74edb1", size = 83604, upload_time = "2025-08-12T05:52:11.663Z" }, + { url = "https://files.pythonhosted.org/packages/a8/f3/1afd48de81d63dd66e01b263a6fbb86e1b5053b419b9b33d13e1f6d0f7d0/wrapt-1.17.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8a210b158a34164de8bb68b0e7780041a903d7b00c87e906fb69928bf7890d5", size = 82782, upload_time = "2025-08-12T05:52:12.626Z" }, + { url = "https://files.pythonhosted.org/packages/1e/d7/4ad5327612173b144998232f98a85bb24b60c352afb73bc48e3e0d2bdc4e/wrapt-1.17.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:79573c24a46ce11aab457b472efd8d125e5a51da2d1d24387666cd85f54c05b2", size = 82076, upload_time = "2025-08-12T05:52:33.168Z" }, + { url = "https://files.pythonhosted.org/packages/bb/59/e0adfc831674a65694f18ea6dc821f9fcb9ec82c2ce7e3d73a88ba2e8718/wrapt-1.17.3-cp311-cp311-win32.whl", hash = "sha256:c31eebe420a9a5d2887b13000b043ff6ca27c452a9a22fa71f35f118e8d4bf89", size = 36457, upload_time = "2025-08-12T05:53:03.936Z" }, + { url = "https://files.pythonhosted.org/packages/83/88/16b7231ba49861b6f75fc309b11012ede4d6b0a9c90969d9e0db8d991aeb/wrapt-1.17.3-cp311-cp311-win_amd64.whl", hash = "sha256:0b1831115c97f0663cb77aa27d381237e73ad4f721391a9bfb2fe8bc25fa6e77", size = 38745, upload_time = "2025-08-12T05:53:02.885Z" }, + { url = "https://files.pythonhosted.org/packages/9a/1e/c4d4f3398ec073012c51d1c8d87f715f56765444e1a4b11e5180577b7e6e/wrapt-1.17.3-cp311-cp311-win_arm64.whl", hash = "sha256:5a7b3c1ee8265eb4c8f1b7d29943f195c00673f5ab60c192eba2d4a7eae5f46a", size = 36806, upload_time = "2025-08-12T05:52:53.368Z" }, + { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload_time = "2025-08-12T05:51:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload_time = "2025-08-12T05:51:35.906Z" }, + { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload_time = "2025-08-12T05:51:57.474Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload_time = "2025-08-12T05:52:34.784Z" }, + 
{ url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload_time = "2025-08-12T05:52:13.599Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload_time = "2025-08-12T05:52:14.56Z" }, + { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload_time = "2025-08-12T05:52:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload_time = "2025-08-12T05:53:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload_time = "2025-08-12T05:53:05.436Z" }, + { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload_time = "2025-08-12T05:52:54.367Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload_time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload_time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload_time = "2025-08-12T05:51:58.425Z" }, + { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload_time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload_time = "2025-08-12T05:52:15.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload_time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload_time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload_time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload_time = "2025-08-12T05:53:08.695Z" }, + { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload_time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload_time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload_time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload_time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload_time = "2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload_time = "2025-08-12T05:52:20.326Z" }, + { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload_time = "2025-08-12T05:52:21.581Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload_time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload_time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload_time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload_time = "2025-08-12T05:52:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload_time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload_time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload_time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload_time = "2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload_time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload_time = "2025-08-12T05:52:24.057Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload_time = "2025-08-12T05:52:45.976Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload_time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload_time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload_time = "2025-08-12T05:52:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload_time = "2025-08-12T05:53:20.674Z" }, +] + [[package]] name = "xxhash" version = "3.5.0"