diff --git a/Dockerfile b/Dockerfile index 85931528..e96272b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev # Must call from the root directory because uv does not add playwright to path RUN playwright install-deps chromium RUN playwright install chromium +# Download Spacy Model +RUN python -m spacy download en_core_web_sm # Copy project files COPY src ./src diff --git a/ENV.md b/ENV.md index 4085fcd6..c0df0c2d 100644 --- a/ENV.md +++ b/ENV.md @@ -53,7 +53,10 @@ The following flags are available: | `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | | `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | | `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | - +| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. | +| `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. | +| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. | +| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. 
| ## Foreign Data Wrapper (FDW) diff --git a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py index 846329ca..201d2448 100644 --- a/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py +++ b/alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py @@ -52,7 +52,7 @@ def downgrade() -> None: _remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME) _remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME) - _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) + # _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME) def _delete_duplicate_urls() -> None: op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)') diff --git a/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py new file mode 100644 index 00000000..de3069e2 --- /dev/null +++ b/alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py @@ -0,0 +1,254 @@ +"""Augment auto_agency_suggestions + +Revision ID: b741b65a1431 +Revises: 8a70ee509a74 +Create Date: 2025-08-19 08:03:12.106575 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import created_at_column, updated_at_column, id_column, url_id_column, switch_enum_type + +# revision identifiers, used by Alembic. 
+revision: str = 'b741b65a1431' +down_revision: Union[str, None] = '8a70ee509a74' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "automated_url_agency_suggestions" +NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "url_auto_agency_suggestions" + +OLD_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agencies" +NEW_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agency" + +AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.Enum( + "homepage_match", + "nlp_location_match", + "muckrock_match", + "ckan_match", + name="agency_auto_suggestion_method", +) + +FLAG_URL_VALIDATED_TABLE_NAME = "flag_url_validated" + +VALIDATED_URL_TYPE_ENUM = sa.Enum( + "data source", + "meta url", + "not relevant", + "individual record", + name="validated_url_type" +) + + + + +def upgrade() -> None: + op.rename_table(OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) + op.rename_table(OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME) + _alter_auto_agency_suggestions_table() + _create_flag_url_validated_table() + _add_urls_to_flag_url_validated_table() + _remove_validated_and_submitted_url_statuses() + _reset_agencies_sync_state() + + +def downgrade() -> None: + op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME) + _revert_auto_agency_suggestions_table() + op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME) + _revert_url_statuses() + _update_validated_and_submitted_url_statuses() + op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME) + _drop_validated_url_type_enum() + +def _reset_agencies_sync_state(): + op.execute( + """ + UPDATE agencies_sync_state + set + last_full_sync_at = null, + current_cutoff_date = null, + current_page = null + """ + ) + +def _remove_validated_and_submitted_url_statuses(): + switch_enum_type( + table_name="urls", + column_name="status", + enum_name="url_status", + 
new_enum_values=[ + 'ok', + 'duplicate', + 'error', + '404 not found', + ], + check_constraints_to_drop=['url_name_not_null_when_validated'], + conversion_mappings={ + 'validated': 'ok', + 'submitted': 'ok', + 'pending': 'ok', + 'not relevant': 'ok', + 'individual record': 'ok' + } + ) + +def _add_urls_to_flag_url_validated_table(): + op.execute(""" + INSERT INTO flag_url_validated (url_id, type) + SELECT + urls.id, + CASE urls.status::text + WHEN 'validated' THEN 'data source' + WHEN 'submitted' THEN 'data source' + ELSE urls.status::text + END::validated_url_type + FROM urls + WHERE urls.status in ('validated', 'submitted', 'individual record', 'not relevant')""") + +def _revert_url_statuses(): + switch_enum_type( + table_name="urls", + column_name="status", + enum_name="url_status", + new_enum_values=[ + 'pending', + 'validated', + 'submitted', + 'duplicate', + 'not relevant', + 'error', + '404 not found', + 'individual record' + ], + conversion_mappings={ + 'ok': 'pending', + } + ) + op.create_check_constraint( + "url_name_not_null_when_validated", + "urls", + "(name IS NOT NULL) OR (status <> 'validated'::url_status)" + ) + +def _update_validated_and_submitted_url_statuses(): + op.execute(""" + UPDATE urls + SET status = 'not relevant' + FROM flag_url_validated + WHERE urls.id = flag_url_validated.url_id + AND flag_url_validated.type = 'not relevant' + """) + + op.execute(""" + UPDATE urls + SET status = 'individual record' + FROM flag_url_validated + WHERE urls.id = flag_url_validated.url_id + AND flag_url_validated.type = 'individual record' + """) + + op.execute(""" + UPDATE urls + SET status = 'validated' + FROM flag_url_validated + left join url_data_source on flag_url_validated.url_id = url_data_source.url_id + WHERE urls.id = flag_url_validated.url_id + AND flag_url_validated.type = 'data source' + AND url_data_source.url_id is NULL + """) + + op.execute(""" + UPDATE urls + SET status = 'submitted' + FROM flag_url_validated + left join url_data_source on 
flag_url_validated.url_id = url_data_source.url_id + WHERE urls.id = flag_url_validated.url_id + AND flag_url_validated.type = 'data source' + AND url_data_source.url_id is not NULL + """) + + +def _create_flag_url_validated_table(): + op.create_table( + FLAG_URL_VALIDATED_TABLE_NAME, + id_column(), + url_id_column(), + sa.Column( + 'type', + VALIDATED_URL_TYPE_ENUM, + nullable=False, + ), + created_at_column(), + updated_at_column(), + sa.UniqueConstraint('url_id', name='uq_flag_url_validated_url_id') + ) + +def _drop_validated_url_type_enum(): + VALIDATED_URL_TYPE_ENUM.drop(op.get_bind()) + +def _alter_auto_agency_suggestions_table(): + AGENCY_AUTO_SUGGESTION_METHOD_ENUM.create(op.get_bind()) + # Created At + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + created_at_column() + ) + # Updated At + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + updated_at_column() + ) + # Method + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + sa.Column( + 'method', + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + nullable=True + ) + ) + # Confidence + op.add_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + sa.Column( + 'confidence', + sa.Float(), + server_default=sa.text('0.0'), + nullable=False + ) + ) + # Check constraint that confidence is between 0 and 1 + op.create_check_constraint( + "auto_url_agency_suggestions_check_confidence_between_0_and_1", + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + "confidence BETWEEN 0 AND 1" + ) + + +def _revert_auto_agency_suggestions_table(): + # Created At + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'created_at' + ) + # Updated At + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'updated_at' + ) + # Method + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'method' + ) + # Confidence + op.drop_column( + NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, + 'confidence' + ) + AGENCY_AUTO_SUGGESTION_METHOD_ENUM.drop(op.get_bind()) + diff --git 
a/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py new file mode 100644 index 00000000..39703fde --- /dev/null +++ b/alembic/versions/2025_08_31_1930-70baaee0dd79_overhaul_agency_identification.py @@ -0,0 +1,267 @@ +"""Overhaul agency identification + +Revision ID: 70baaee0dd79 +Revises: b741b65a1431 +Create Date: 2025-08-31 19:30:20.690369 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, url_id_column, created_at_column, agency_id_column, updated_at_column, \ + task_id_column + +# revision identifiers, used by Alembic. +revision: str = '70baaee0dd79' +down_revision: Union[str, None] = 'b741b65a1431' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +URL_HAS_AGENCY_SUGGESTIONS_VIEW_NAME: str = "url_has_agency_auto_suggestions_view" +URL_UNKNOWN_AGENCIES_VIEW_NAME: str = "url_unknown_agencies_view" + +URL_AUTO_AGENCY_SUBTASK_TABLE_NAME: str = "url_auto_agency_id_subtasks" +LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME: str = "agency_id_subtask_suggestions" + +META_URL_VIEW_NAME: str = "meta_url_view" +UNVALIDATED_URL_VIEW_NAME: str = "unvalidated_url_view" + +URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME: str = "url_auto_agency_suggestions" + +AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.dialects.postgresql.ENUM( + name="agency_auto_suggestion_method", + create_type=False +) + +SUBTASK_DETAIL_CODE_ENUM = sa.Enum( + 'no details', + 'retrieval error', + 'homepage-single agency', + 'homepage-multi agency', + name="agency_id_subtask_detail_code", +) + + + + + +def upgrade() -> None: + _create_url_auto_agency_subtask_table() + _create_url_unknown_agencies_view() + _create_meta_url_view() + _create_link_agency_id_subtask_agencies_table() + _drop_url_annotation_flags_view() + _create_new_url_annotation_flags_view() + 
_drop_url_auto_agency_suggestions_table() + _create_unvalidated_urls_view() + + +def downgrade() -> None: + _drop_url_unknown_agencies_view() + _create_url_auto_agency_suggestions_table() + _drop_url_annotation_flags_view() + _create_old_url_annotation_flags_view() + _drop_link_agency_id_subtask_agencies_table() + _drop_url_auto_agency_subtask_table() + _drop_meta_url_view() + SUBTASK_DETAIL_CODE_ENUM.drop(op.get_bind()) + _drop_unvalidated_urls_view() + +def _create_unvalidated_urls_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {UNVALIDATED_URL_VIEW_NAME} as + select + u.id as url_id + from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id + where + fuv.type is null + """) + +def _drop_unvalidated_urls_view(): + op.execute(f"DROP VIEW IF EXISTS {UNVALIDATED_URL_VIEW_NAME}") + + +def _drop_url_annotation_flags_view(): + op.execute(f"DROP VIEW IF EXISTS url_annotation_flags") + + +def _drop_meta_url_view(): + op.execute(f"DROP VIEW IF EXISTS {META_URL_VIEW_NAME}") + + +def _create_meta_url_view(): + op.execute(f""" + CREATE OR REPLACE VIEW {META_URL_VIEW_NAME} AS + SELECT + urls.id as url_id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' + """) + +def _drop_url_auto_agency_suggestions_table(): + op.drop_table(URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME) + + +def _create_new_url_annotation_flags_view(): + + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id as url_id, + EXISTS (SELECT 1 FROM public.auto_record_type_suggestions a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion, + EXISTS (SELECT 1 FROM public.auto_relevant_suggestions a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion, + EXISTS (SELECT 1 FROM public.{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} a WHERE a.url_id = u.id) AS has_auto_agency_suggestion, + EXISTS (SELECT 1 FROM public.user_record_type_suggestions a WHERE a.url_id = u.id) AS has_user_record_type_suggestion, + EXISTS (SELECT 1 
FROM public.user_relevant_suggestions a WHERE a.url_id = u.id) AS has_user_relevant_suggestion, + EXISTS (SELECT 1 FROM public.user_url_agency_suggestions a WHERE a.url_id = u.id) AS has_user_agency_suggestion, + EXISTS (SELECT 1 FROM public.link_urls_agency a WHERE a.url_id = u.id) AS has_confirmed_agency, + EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed + FROM urls u + ) + """ + ) + + +def _create_url_unknown_agencies_view(): + op.execute( + f""" + CREATE OR REPLACE VIEW {URL_UNKNOWN_AGENCIES_VIEW_NAME} AS + SELECT + u.id + FROM urls u + LEFT JOIN {URL_AUTO_AGENCY_SUBTASK_TABLE_NAME} uas ON u.id = uas.url_id + GROUP BY u.id + HAVING bool_or(uas.agencies_found) = false + """ + ) + + +def _create_url_auto_agency_subtask_table(): + op.create_table( + URL_AUTO_AGENCY_SUBTASK_TABLE_NAME, + id_column(), + task_id_column(), + url_id_column(), + sa.Column( + "type", + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + nullable=False + ), + sa.Column( + "agencies_found", + sa.Boolean(), + nullable=False + ), + sa.Column( + "detail", + SUBTASK_DETAIL_CODE_ENUM, + server_default=sa.text("'no details'"), + nullable=False + ), + created_at_column() + ) + + +def _create_link_agency_id_subtask_agencies_table(): + op.create_table( + LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME, + id_column(), + sa.Column( + "subtask_id", + sa.Integer(), + sa.ForeignKey( + f'{URL_AUTO_AGENCY_SUBTASK_TABLE_NAME}.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `url_auto_agency_subtask` table.' 
+ ), + sa.Column( + "confidence", + sa.Integer, + sa.CheckConstraint( + "confidence BETWEEN 0 and 100" + ), + nullable=False, + ), + agency_id_column(), + created_at_column() + ) + + +def _drop_link_agency_id_subtask_agencies_table(): + op.drop_table(LINK_AGENCY_ID_SUBTASK_AGENCIES_TABLE_NAME) + + +def _drop_url_auto_agency_subtask_table(): + op.drop_table(URL_AUTO_AGENCY_SUBTASK_TABLE_NAME) + + +def _create_url_auto_agency_suggestions_table(): + op.create_table( + URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME, + id_column(), + agency_id_column(), + url_id_column(), + sa.Column( + "is_unknown", + sa.Boolean(), + nullable=False + ), + created_at_column(), + updated_at_column(), + sa.Column( + 'method', + AGENCY_AUTO_SUGGESTION_METHOD_ENUM, + nullable=True + ), + sa.Column( + 'confidence', + sa.Float(), + server_default=sa.text('0.0'), + nullable=False + ), + sa.UniqueConstraint("agency_id", "url_id") + ) + + +def _drop_url_unknown_agencies_view(): + op.execute(f"DROP VIEW IF EXISTS {URL_UNKNOWN_AGENCIES_VIEW_NAME}") + +def _drop_url_annotation_flags_view(): + op.execute("DROP VIEW url_annotation_flags;") + + +def _create_old_url_annotation_flags_view(): + op.execute( + f""" + CREATE OR REPLACE VIEW url_annotation_flags AS + ( + SELECT u.id, + CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed + FROM urls u + LEFT 
JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.{URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME} auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + ) + """ + ) diff --git a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py index cd68a4b5..6ba6f7c9 100644 --- a/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py +++ b/alembic/versions/d7eb670edaf0_revise_agency_identification_logic.py @@ -118,7 +118,7 @@ def upgrade(): def downgrade(): # Drop constraints first op.drop_constraint("uq_confirmed_url_agency", "confirmed_url_agency", type_="unique") - op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") + # op.drop_constraint("uq_automated_url_agency_suggestions", "automated_url_agency_suggestions", type_="unique") op.drop_constraint("uq_user_url_agency_suggestions", "user_url_agency_suggestions", type_="unique") # Drop tables diff --git a/pyproject.toml b/pyproject.toml index 3eb1446d..afe4a89a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "marshmallow~=3.23.2", "openai~=1.60.1", "pdap-access-manager==0.3.6", + "pip>=25.2", "playwright~=1.49.1", "psycopg2-binary~=2.9.6", "psycopg[binary]~=3.1.20", @@ -31,6 +32,8 @@ dependencies = [ "pyjwt~=2.10.1", "python-dotenv~=1.0.1", "requests~=2.32.3", + "side-effects>=1.6.dev0", + "spacy>=3.8.7", "sqlalchemy~=2.0.36", "starlette~=0.45.3", "tqdm>=4.64.1", diff --git 
a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py index 9b3ffdeb..5a56cf32 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py +++ b/src/api/endpoints/annotate/_shared/queries/get_annotation_batch_info.py @@ -5,7 +5,7 @@ from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.collectors.enums import URLStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -42,7 +42,7 @@ async def run( ) common_where_clause = [ - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, LinkBatchURL.batch_id == self.batch_id, ] diff --git a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py index a6a5b69d..6eed4b07 100644 --- a/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py +++ b/src/api/endpoints/annotate/_shared/queries/get_next_url_for_user_annotation.py @@ -5,7 +5,8 @@ from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.db.client.types import UserSuggestionModel -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -32,6 +33,10 @@ async def run(self, session: AsyncSession): select( URL, ) + .outerjoin( + FlagURLValidated, + 
FlagURLValidated.url_id == URL.id + ) ) if self.batch_id is not None: @@ -43,7 +48,7 @@ async def run(self, session: AsyncSession): query = ( query - .where(URL.status == URLStatus.PENDING.value) + .where(FlagURLValidated.url_id.is_(None)) # URL must not have user suggestion .where( StatementComposer.user_suggestion_not_exists(self.user_suggestion_model_to_exclude) diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py deleted file mode 100644 index 1f202263..00000000 --- a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion.py +++ /dev/null @@ -1,55 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo -from src.core.enums import SuggestionType -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.queries.base.builder import QueryBuilderBase - - -class GetAgencySuggestionsQueryBuilder(QueryBuilderBase): - - def __init__( - self, - url_id: int - ): - super().__init__() - self.url_id = url_id - - async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]: - # Get relevant autosuggestions and agency info, if an associated agency exists - - statement = ( - select( - AutomatedUrlAgencySuggestion.agency_id, - AutomatedUrlAgencySuggestion.is_unknown, - Agency.name, - Agency.state, - Agency.county, - Agency.locality - ) - .join(Agency, isouter=True) - .where(AutomatedUrlAgencySuggestion.url_id == self.url_id) - ) - raw_autosuggestions = await session.execute(statement) - autosuggestions = raw_autosuggestions.all() - agency_suggestions = [] - for autosuggestion in autosuggestions: - agency_id = autosuggestion[0] - is_unknown = autosuggestion[1] - name = autosuggestion[2] - state = autosuggestion[3] - county = 
autosuggestion[4] - locality = autosuggestion[5] - agency_suggestions.append( - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.AUTO_SUGGESTION if not is_unknown else SuggestionType.UNKNOWN, - pdap_agency_id=agency_id, - agency_name=name, - state=state, - county=county, - locality=locality - ) - ) - return agency_suggestions \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/__init__.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py similarity index 100% rename from src/core/tasks/url/operators/agency_identification/queries/__init__.py rename to src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/__init__.py diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py new file mode 100644 index 00000000..a9a33e84 --- /dev/null +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/core.py @@ -0,0 +1,73 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.suggestions_with_highest_confidence import \ + SuggestionsWithHighestConfidenceCTE +from src.core.enums import SuggestionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class GetAgencySuggestionsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> list[GetNextURLForAgencyAgencyInfo]: + # Get relevant autosuggestions and agency info, if an associated agency exists + + cte = SuggestionsWithHighestConfidenceCTE() + + query 
= ( + select( + cte.agency_id, + cte.confidence, + Agency.name, + Agency.state, + Agency.county, + Agency.locality + ) + .outerjoin( + Agency, + Agency.agency_id == cte.agency_id + ) + .where( + cte.url_id == self.url_id + ) + ) + + raw_autosuggestions: Sequence[RowMapping] = await sh.mappings(session, query=query) + if len(raw_autosuggestions) == 0: + # Unknown agency + return [ + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.UNKNOWN, + ) + ] + + agency_suggestions: list[GetNextURLForAgencyAgencyInfo] = [] + for autosuggestion in raw_autosuggestions: + agency_id: int = autosuggestion["agency_id"] + name: str = autosuggestion["name"] + state: str | None = autosuggestion["state"] + county: str | None = autosuggestion["county"] + locality: str | None = autosuggestion["locality"] + agency_suggestions.append( + GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + pdap_agency_id=agency_id, + agency_name=name, + state=state, + county=county, + locality=locality + ) + ) + return agency_suggestions \ No newline at end of file diff --git a/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py new file mode 100644 index 00000000..6d389b11 --- /dev/null +++ b/src/api/endpoints/annotate/agency/get/queries/agency_suggestion_/suggestions_with_highest_confidence.py @@ -0,0 +1,62 @@ +from sqlalchemy import CTE, select, func, Column + +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion + +SUGGESTIONS_WITH_HIGHEST_CONFIDENCE_CTE: CTE = ( + select( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id, + func.max(AgencyIDSubtaskSuggestion.confidence) + ) + .select_from(URLAutoAgencyIDSubtask) + .join( + 
AgencyIDSubtaskSuggestion, + URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id + ) + .cte("suggestions_with_highest_confidence") +) + +class SuggestionsWithHighestConfidenceCTE: + + def __init__(self): + self._cte = ( + select( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id, + func.max(AgencyIDSubtaskSuggestion.confidence).label("confidence") + ) + .select_from(URLAutoAgencyIDSubtask) + .join( + AgencyIDSubtaskSuggestion, + URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id + ) + .where( + AgencyIDSubtaskSuggestion.agency_id.isnot(None) + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + AgencyIDSubtaskSuggestion.agency_id + ) + .cte("suggestions_with_highest_confidence") + ) + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.agency_id + + @property + def confidence(self) -> Column[float]: + return self._cte.columns.confidence \ No newline at end of file diff --git a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py index 70ae112a..e8fdc6b2 100644 --- a/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py +++ b/src/api/endpoints/annotate/agency/get/queries/next_for_annotation.py @@ -4,17 +4,17 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAnnotationResponse, \ GetNextURLForAgencyAnnotationInnerResponse -from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import 
GetAgencySuggestionsQueryBuilder from src.collectors.enums import URLStatus from src.core.enums import SuggestedStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder @@ -48,30 +48,20 @@ async def run( # Must not have confirmed agencies query = query.where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ) - - # Must not have been annotated by a user query = ( - query.join(UserUrlAgencySuggestion, isouter=True) - .where( - ~exists( - select(UserUrlAgencySuggestion). - where(UserUrlAgencySuggestion.url_id == URL.id). - correlate(URL) - ) + query.join( + URLAnnotationFlagsView, + URLAnnotationFlagsView.url_id == URL.id ) - # Must have extant autosuggestions - .join(AutomatedUrlAgencySuggestion, isouter=True) + # Must not have been annotated by a user .where( - exists( - select(AutomatedUrlAgencySuggestion). - where(AutomatedUrlAgencySuggestion.url_id == URL.id). 
- correlate(URL) - ) + URLAnnotationFlagsView.has_user_agency_suggestion.is_(False), + # Must have extant autosuggestions + URLAnnotationFlagsView.has_auto_agency_suggestion.is_(True) ) - # Must not have confirmed agencies .join(LinkURLAgency, isouter=True) .where( ~exists( diff --git a/src/api/endpoints/annotate/all/get/query.py b/src/api/endpoints/annotate/all/get/query.py index a2afafd9..05855578 100644 --- a/src/api/endpoints/annotate/all/get/query.py +++ b/src/api/endpoints/annotate/all/get/query.py @@ -3,14 +3,14 @@ from sqlalchemy.orm import selectinload from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder -from src.api.endpoints.annotate.agency.get.queries.agency_suggestion import GetAgencySuggestionsQueryBuilder +from src.api.endpoints.annotate.agency.get.queries.agency_suggestion_.core import GetAgencySuggestionsQueryBuilder from src.api.endpoints.annotate.all.get.dto import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.collectors.enums import URLStatus from src.db.dto_converter import DTOConverter from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion @@ -39,7 +39,7 @@ async def run( query .where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, StatementComposer.user_suggestion_not_exists(UserUrlAgencySuggestion), StatementComposer.user_suggestion_not_exists(UserRecordTypeSuggestion), StatementComposer.user_suggestion_not_exists(UserRelevantSuggestion), @@ -50,7 +50,7 @@ async def run( 
load_options = [ URL.html_content, - URL.automated_agency_suggestions, + URL.auto_agency_subtasks, URL.auto_relevant_suggestion, URL.auto_record_type_suggestion ] diff --git a/src/api/endpoints/batch/duplicates/query.py b/src/api/endpoints/batch/duplicates/query.py index 2d8edff9..b09b6e5d 100644 --- a/src/api/endpoints/batch/duplicates/query.py +++ b/src/api/endpoints/batch/duplicates/query.py @@ -5,7 +5,7 @@ from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.sqlalchemy import Duplicate -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/batch/urls/query.py b/src/api/endpoints/batch/urls/query.py index 6a88448f..391a265f 100644 --- a/src/api/endpoints/batch/urls/query.py +++ b/src/api/endpoints/batch/urls/query.py @@ -1,7 +1,7 @@ from sqlalchemy import Select from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 12b17ad3..73e3edb8 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -6,7 +6,7 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from 
src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -47,7 +47,7 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, - status=URLStatus.PENDING.value, + status=URLStatus.OK.value, record_type=entry.record_type.value if entry.record_type is not None else None, source=URLSource.MANUAL ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py b/src/api/endpoints/metrics/backlog/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/__init__.py rename to src/api/endpoints/metrics/backlog/__init__.py diff --git a/src/api/endpoints/metrics/backlog/query.py b/src/api/endpoints/metrics/backlog/query.py new file mode 100644 index 00000000..788ef424 --- /dev/null +++ b/src/api/endpoints/metrics/backlog/query.py @@ -0,0 +1,53 @@ +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO +from src.db.models.impl.backlog_snapshot import BacklogSnapshot +from src.db.queries.base.builder import QueryBuilderBase + + +class GetBacklogMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsBacklogResponseDTO: + month = func.date_trunc('month', BacklogSnapshot.created_at) + + # 1. 
Create a subquery that assigns row_number() partitioned by month + monthly_snapshot_subq = ( + select( + BacklogSnapshot.id, + BacklogSnapshot.created_at, + BacklogSnapshot.count_pending_total, + month.label("month_start"), + func.row_number() + .over( + partition_by=month, + order_by=BacklogSnapshot.created_at.desc() + ) + .label("row_number") + ) + .subquery() + ) + + # 2. Filter for the top (most recent) row in each month + stmt = ( + select( + monthly_snapshot_subq.c.month_start, + monthly_snapshot_subq.c.created_at, + monthly_snapshot_subq.c.count_pending_total + ) + .where(monthly_snapshot_subq.c.row_number == 1) + .order_by(monthly_snapshot_subq.c.month_start) + ) + + raw_result = await session.execute(stmt) + results = raw_result.all() + final_results = [] + for result in results: + final_results.append( + GetMetricsBacklogResponseInnerDTO( + month=result.month_start.strftime("%B %Y"), + count_pending_total=result.count_pending_total, + ) + ) + + return GetMetricsBacklogResponseDTO(entries=final_results) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query.py b/src/api/endpoints/metrics/batches/aggregated/query.py deleted file mode 100644 index e7de65fb..00000000 --- a/src/api/endpoints/metrics/batches/aggregated/query.py +++ /dev/null @@ -1,117 +0,0 @@ -from sqlalchemy import case, select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.sql.functions import coalesce - -from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ - GetMetricsBatchesAggregatedInnerResponseDTO -from src.collectors.enums import URLStatus, CollectorType -from src.core.enums import BatchStatus -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - 
-class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> GetMetricsBatchesAggregatedResponseDTO: - sc = StatementComposer - - # First, get all batches broken down by collector type and status - def batch_column(status: BatchStatus, label): - return sc.count_distinct( - case( - ( - Batch.status == status.value, - Batch.id - ) - ), - label=label - ) - - batch_count_subquery = select( - batch_column(BatchStatus.READY_TO_LABEL, label="done_count"), - batch_column(BatchStatus.ERROR, label="error_count"), - Batch.strategy, - ).group_by(Batch.strategy).subquery("batch_count") - - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) - - # Next, count urls - url_count_subquery = select( - Batch.strategy, - url_column(URLStatus.PENDING, label="pending_count"), - url_column(URLStatus.ERROR, label="error_count"), - url_column(URLStatus.VALIDATED, label="validated_count"), - url_column(URLStatus.SUBMITTED, label="submitted_count"), - url_column(URLStatus.NOT_RELEVANT, label="rejected_count"), - - ).join( - LinkBatchURL, - LinkBatchURL.url_id == URL.id - ).outerjoin( - Batch, Batch.id == LinkBatchURL.batch_id - ).group_by( - Batch.strategy - ).subquery("url_count") - - # Combine - query = select( - Batch.strategy, - batch_count_subquery.c.done_count.label("batch_done_count"), - batch_count_subquery.c.error_count.label("batch_error_count"), - coalesce(url_count_subquery.c.pending_count, 0).label("pending_count"), - coalesce(url_count_subquery.c.error_count, 0).label("error_count"), - coalesce(url_count_subquery.c.submitted_count, 0).label("submitted_count"), - coalesce(url_count_subquery.c.rejected_count, 0).label("rejected_count"), - coalesce(url_count_subquery.c.validated_count, 0).label("validated_count") - ).join( - batch_count_subquery, - Batch.strategy == batch_count_subquery.c.strategy - ).outerjoin( - 
url_count_subquery, - Batch.strategy == url_count_subquery.c.strategy - ) - raw_results = await session.execute(query) - results = raw_results.all() - d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} - for result in results: - d[CollectorType(result.strategy)] = GetMetricsBatchesAggregatedInnerResponseDTO( - count_successful_batches=result.batch_done_count, - count_failed_batches=result.batch_error_count, - count_urls=result.pending_count + result.submitted_count + - result.rejected_count + result.error_count + - result.validated_count, - count_urls_pending=result.pending_count, - count_urls_validated=result.validated_count, - count_urls_submitted=result.submitted_count, - count_urls_rejected=result.rejected_count, - count_urls_errors=result.error_count - ) - - total_batch_query = await session.execute( - select( - sc.count_distinct(Batch.id, label="count") - ) - ) - total_batch_count = total_batch_query.scalars().one_or_none() - if total_batch_count is None: - total_batch_count = 0 - - return GetMetricsBatchesAggregatedResponseDTO( - total_batches=total_batch_count, - by_strategy=d - ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/manager/queries/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/__init__.py diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/all_urls/__init__.py diff --git 
a/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py new file mode 100644 index 00000000..7eed215a --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/all_urls/query.py @@ -0,0 +1,28 @@ +from typing import Sequence + +from sqlalchemy import func, select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class CountAllURLsByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join(LinkBatchURL) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py similarity index 100% rename from tests/automated/integration/tasks/scheduled/impl/sync/data_sources/setup/models/url/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/batch_status_/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py new file mode 100644 index 00000000..f8587b68 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/query.py @@ 
-0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import CTE, select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class BatchStatusByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[BatchStatusCountByBatchStrategyResponseDTO]: + query = ( + select( + Batch.strategy, + Batch.status, + func.count(Batch.id).label("count") + ) + .group_by(Batch.strategy, Batch.status) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + results: list[BatchStatusCountByBatchStrategyResponseDTO] = [] + for mapping in mappings: + results.append( + BatchStatusCountByBatchStrategyResponseDTO( + strategy=CollectorType(mapping["strategy"]), + status=BatchStatus(mapping["status"]), + count=mapping["count"] + ) + ) + return results \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py new file mode 100644 index 00000000..79c1b2dd --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/batch_status_/response.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus + + +class BatchStatusCountByBatchStrategyResponseDTO(BaseModel): + strategy: CollectorType + status: BatchStatus + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py new 
file mode 100644 index 00000000..2642f002 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -0,0 +1,79 @@ +from sqlalchemy import case, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.sql.functions import coalesce, func + +from src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO, \ + GetMetricsBatchesAggregatedInnerResponseDTO +from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.requester import \ + GetBatchesAggregatedMetricsQueryRequester +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ + ValidatedURLCountByBatchStrategyQueryBuilder +from src.collectors.enums import URLStatus, CollectorType +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase +from src.db.statement_composer import StatementComposer + + +class GetBatchesAggregatedMetricsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> GetMetricsBatchesAggregatedResponseDTO: + + requester = 
GetBatchesAggregatedMetricsQueryRequester(session=session) + + url_error_count_dict: dict[CollectorType, int] = await requester.url_error_by_collector_strategy() + url_pending_count_dict: dict[CollectorType, int] = await requester.pending_url_count_by_collector_strategy() + url_submitted_count_dict: dict[CollectorType, int] = await requester.submitted_url_count_by_collector_strategy() + url_validated_count_dict: dict[CollectorType, int] = await requester.validated_url_count_by_collector_strategy() + url_rejected_count_dict: dict[CollectorType, int] = await requester.rejected_url_count_by_collector_strategy() + url_total_count_dict: dict[CollectorType, int] = await requester.url_count_by_collector_strategy() + batch_status_count_dict: dict[ + CollectorType, + dict[BatchStatus, int] + ] = await requester.batch_status_by_collector_strategy() + + + + + + d: dict[CollectorType, GetMetricsBatchesAggregatedInnerResponseDTO] = {} + for collector_type in CollectorType: + inner_response = GetMetricsBatchesAggregatedInnerResponseDTO( + count_successful_batches=batch_status_count_dict[collector_type][BatchStatus.READY_TO_LABEL], + count_failed_batches=batch_status_count_dict[collector_type][BatchStatus.ERROR], + count_urls=url_total_count_dict[collector_type], + count_urls_pending=url_pending_count_dict[collector_type], + count_urls_validated=url_validated_count_dict[collector_type], + count_urls_submitted=url_submitted_count_dict[collector_type], + count_urls_rejected=url_rejected_count_dict[collector_type], + count_urls_errors=url_error_count_dict[collector_type], + ) + d[collector_type] = inner_response + + total_batch_query = await session.execute( + select( + func.count(Batch.id, label="count") + ) + ) + total_batch_count = total_batch_query.scalars().one_or_none() + if total_batch_count is None: + total_batch_count = 0 + + return GetMetricsBatchesAggregatedResponseDTO( + total_batches=total_batch_count, + by_strategy=d + ) \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/agency_identification/happy_path/__init__.py rename to src/api/endpoints/metrics/batches/aggregated/query/models/__init__.py diff --git a/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py new file mode 100644 index 00000000..9ceb7781 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/models/strategy_count.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.collectors.enums import CollectorType + + +class CountByBatchStrategyResponse(BaseModel): + strategy: CollectorType + count: int \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py new file mode 100644 index 00000000..224d3bad --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/pending/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class 
PendingURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(LinkBatchURL.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where(FlagURLValidated.url_id.is_(None)) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py new file mode 100644 index 00000000..6c1d9e0f --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/rejected/query.py @@ -0,0 +1,39 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class RejectedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + 
Batch.strategy, + func.count(FlagURLValidated.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where(FlagURLValidated.type == URLValidatedType.NOT_RELEVANT) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py new file mode 100644 index 00000000..4a129dfb --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/convert.py @@ -0,0 +1,11 @@ +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType + + +def convert_strategy_counts_to_strategy_count_dict( + responses: list[CountByBatchStrategyResponse] +) -> dict[CollectorType, int]: + result: dict[CollectorType, int] = {collector_type: 0 for collector_type in CollectorType} + for response in responses: + result[response.strategy] = response.count + return result \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py new file mode 100644 index 00000000..ac4c6dfa --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/requester_/requester.py @@ -0,0 +1,75 @@ + +from src.api.endpoints.metrics.batches.aggregated.query.all_urls.query import CountAllURLsByBatchStrategyQueryBuilder +from 
src.api.endpoints.metrics.batches.aggregated.query.batch_status_.query import \ + BatchStatusByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.batch_status_.response import \ + BatchStatusCountByBatchStrategyResponseDTO +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.api.endpoints.metrics.batches.aggregated.query.pending.query import PendingURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.rejected.query import \ + RejectedURLCountByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.requester_.convert import \ + convert_strategy_counts_to_strategy_count_dict +from src.api.endpoints.metrics.batches.aggregated.query.submitted_.query import \ + CountSubmittedByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.url_error.query import URLErrorByBatchStrategyQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.validated_.query import \ + ValidatedURLCountByBatchStrategyQueryBuilder +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.queries.base.builder import QueryBuilderBase +from src.db.templates.requester import RequesterBase + + +class GetBatchesAggregatedMetricsQueryRequester(RequesterBase): + + async def _run_strategy_count_query_builder( + self, query_builder: type[QueryBuilderBase]) -> dict[CollectorType, int]: + responses: list[CountByBatchStrategyResponse] = \ + await query_builder().run(self.session) + + return convert_strategy_counts_to_strategy_count_dict(responses) + + async def url_error_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(URLErrorByBatchStrategyQueryBuilder) + + async def url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await 
self._run_strategy_count_query_builder(CountAllURLsByBatchStrategyQueryBuilder) + + async def submitted_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(CountSubmittedByBatchStrategyQueryBuilder) + + async def validated_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(ValidatedURLCountByBatchStrategyQueryBuilder) + + async def rejected_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(RejectedURLCountByBatchStrategyQueryBuilder) + + async def pending_url_count_by_collector_strategy(self) -> dict[CollectorType, int]: + return await self._run_strategy_count_query_builder(PendingURLCountByBatchStrategyQueryBuilder) + + async def batch_status_by_collector_strategy(self) -> dict[ + CollectorType, + dict[BatchStatus, int] + ]: + + responses: list[BatchStatusCountByBatchStrategyResponseDTO] = \ + await BatchStatusByBatchStrategyQueryBuilder().run(self.session) + + result: dict[CollectorType, dict[BatchStatus, int]] = { + collector_type: { + BatchStatus.ERROR: 0, + BatchStatus.READY_TO_LABEL: 0, + } + for collector_type in CollectorType + } + for response in responses: + if response.status not in ( + BatchStatus.ERROR, + BatchStatus.READY_TO_LABEL + ): + continue + result[response.strategy][response.status] = response.count + + return result + diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py new file mode 100644 index 00000000..ee8f8065 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -0,0 +1,45 @@ +from 
typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import CollectorType +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class CountSubmittedByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[ + CountByBatchStrategyResponse + ]: + query = ( + select( + Batch.strategy, + func.count(URLDataSource.id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + URLDataSource, + URLDataSource.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results: list[CountByBatchStrategyResponse] = [] + for mapping in mappings: + results.append( + CountByBatchStrategyResponse( + strategy=CollectorType(mapping["strategy"]), + count=mapping["count"] + ) + ) + return results diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py new file mode 100644 index 00000000..9bcc3a57 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -0,0 +1,34 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.collectors.enums import URLStatus +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class URLErrorByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse]: + query = ( + select( + Batch.strategy, + func.count(URL.id).label("count") + ) + .select_from(Batch) + .join(LinkBatchURL) + .join(URL) + .where(URL.status == URLStatus.ERROR) + .group_by(Batch.strategy, URL.status) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results + + diff --git a/src/api/endpoints/metrics/batches/aggregated/query/validated_/__init__.py b/src/api/endpoints/metrics/batches/aggregated/query/validated_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py new file mode 100644 index 00000000..155cbcb0 --- /dev/null +++ b/src/api/endpoints/metrics/batches/aggregated/query/validated_/query.py @@ -0,0 +1,38 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from 
src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class ValidatedURLCountByBatchStrategyQueryBuilder(QueryBuilderBase): + + async def run( + self, session: AsyncSession + ) -> list[CountByBatchStrategyResponse]: + + query = ( + select( + Batch.strategy, + func.count(FlagURLValidated.url_id).label("count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.strategy) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + results = [CountByBatchStrategyResponse(**mapping) for mapping in mappings] + return results diff --git a/src/api/endpoints/metrics/batches/breakdown/error/__init__.py b/src/api/endpoints/metrics/batches/breakdown/error/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py new file mode 100644 index 00000000..ed2ff44f --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py @@ -0,0 +1,25 @@ +from sqlalchemy import select, func, CTE, Column + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.models.impl.url.core.sqlalchemy import URL + +URL_ERROR_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("count_error") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + URL, + URL.id == LinkBatchURL.url_id + ) + .where(URL.status == URLStatus.ERROR) + .group_by(Batch.id) + .cte("error") +) diff --git a/src/api/endpoints/metrics/batches/breakdown/not_relevant/__init__.py 
b/src/api/endpoints/metrics/batches/breakdown/not_relevant/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py new file mode 100644 index 00000000..14403e86 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/not_relevant/cte_.py @@ -0,0 +1,27 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +NOT_RELEVANT_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(FlagURLValidated.url_id).label("count_rejected") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where( + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT + ) + .group_by(Batch.id) + .cte("not_relevant") +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/breakdown/pending/__init__.py b/src/api/endpoints/metrics/batches/breakdown/pending/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py b/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py new file mode 100644 index 00000000..bf09f345 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/pending/cte_.py @@ -0,0 +1,26 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from 
src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +PENDING_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("count_pending") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .where( + FlagURLValidated.url_id.is_(None) + ) + .group_by(Batch.id) + .cte("pending") +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/breakdown/query.py b/src/api/endpoints/metrics/batches/breakdown/query.py index 6fe0eb71..5847e309 100644 --- a/src/api/endpoints/metrics/batches/breakdown/query.py +++ b/src/api/endpoints/metrics/batches/breakdown/query.py @@ -1,13 +1,20 @@ -from sqlalchemy import select, case +from sqlalchemy import select, case, Column from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.sql.functions import coalesce from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO, \ GetMetricsBatchesBreakdownInnerResponseDTO +from src.api.endpoints.metrics.batches.breakdown.error.cte_ import URL_ERROR_CTE +from src.api.endpoints.metrics.batches.breakdown.not_relevant.cte_ import NOT_RELEVANT_CTE +from src.api.endpoints.metrics.batches.breakdown.pending.cte_ import PENDING_CTE +from src.api.endpoints.metrics.batches.breakdown.submitted.cte_ import SUBMITTED_CTE +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.api.endpoints.metrics.batches.breakdown.total.cte_ import TOTAL_CTE +from src.api.endpoints.metrics.batches.breakdown.validated.cte_ import VALIDATED_CTE from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy 
import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer @@ -32,28 +39,32 @@ async def run(self, session: AsyncSession) -> GetMetricsBatchesBreakdownResponse Batch.date_generated.label("created_at"), ) - def url_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) + all_ctes: list[BatchesBreakdownURLCTE] = [ + URL_ERROR_CTE, + NOT_RELEVANT_CTE, + PENDING_CTE, + SUBMITTED_CTE, + TOTAL_CTE, + VALIDATED_CTE + ] + + count_columns: list[Column] = [ + cte.count for cte in all_ctes + ] + count_query = select( - LinkBatchURL.batch_id, - sc.count_distinct(URL.id, label="count_total"), - url_column(URLStatus.PENDING, label="count_pending"), - url_column(URLStatus.SUBMITTED, label="count_submitted"), - url_column(URLStatus.NOT_RELEVANT, label="count_rejected"), - url_column(URLStatus.ERROR, label="count_error"), - url_column(URLStatus.VALIDATED, label="count_validated"), - ).join(URL, LinkBatchURL.url_id == URL.id).group_by( - LinkBatchURL.batch_id - ).subquery("url_count") + Batch.id.label("batch_id"), + *count_columns + ) + for cte in all_ctes: + count_query = count_query.outerjoin( + cte.query, + Batch.id == cte.batch_id + ) + + count_query = count_query.cte("url_count") + query = (select( main_query.c.strategy, diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/__init__.py b/src/api/endpoints/metrics/batches/breakdown/submitted/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py new file mode 100644 index 00000000..face1891 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py @@ -0,0 +1,23 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from 
src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource + +SUBMITTED_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(URLDataSource.id).label("count_submitted") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + URLDataSource, + URLDataSource.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.id) + .cte("submitted") +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/breakdown/templates/__init__.py b/src/api/endpoints/metrics/batches/breakdown/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py b/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py new file mode 100644 index 00000000..3fdd7521 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/templates/cte_.py @@ -0,0 +1,20 @@ +from sqlalchemy import Column +from sqlalchemy import CTE + + +class BatchesBreakdownURLCTE: + + def __init__(self, query: CTE): + self._query = query + + @property + def query(self) -> CTE: + return self._query + + @property + def batch_id(self) -> Column: + return self._query.columns[0] + + @property + def count(self) -> Column: + return self._query.columns[1] \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/breakdown/total/__init__.py b/src/api/endpoints/metrics/batches/breakdown/total/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/total/cte_.py b/src/api/endpoints/metrics/batches/breakdown/total/cte_.py new file mode 100644 index 00000000..33cf0c84 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/total/cte_.py @@ -0,0 +1,15 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from
src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +TOTAL_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("count_total") + ) + .join(LinkBatchURL) + .group_by(Batch.id) + .cte("total") +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/batches/breakdown/validated/__init__.py b/src/api/endpoints/metrics/batches/breakdown/validated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py b/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py new file mode 100644 index 00000000..b6ff5ef1 --- /dev/null +++ b/src/api/endpoints/metrics/batches/breakdown/validated/cte_.py @@ -0,0 +1,23 @@ +from sqlalchemy import select, func + +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +VALIDATED_CTE = BatchesBreakdownURLCTE( + select( + Batch.id, + func.count(FlagURLValidated.url_id).label("count_validated") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id + ) + .group_by(Batch.id) + .cte("validated") +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/__init__.py b/src/api/endpoints/metrics/urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/__init__.py b/src/api/endpoints/metrics/urls/aggregated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/query/__init__.py b/src/api/endpoints/metrics/urls/aggregated/query/__init__.py new file mode 100644 index 00000000..e69de29b 
diff --git a/src/api/endpoints/metrics/urls/aggregated/query/core.py b/src/api/endpoints/metrics/urls/aggregated/query/core.py new file mode 100644 index 00000000..57bc4211 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/core.py @@ -0,0 +1,54 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.all import ALL_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.error import ERROR_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.pending import PENDING_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.rejected import REJECTED_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.submitted import SUBMITTED_SUBQUERY +from src.api.endpoints.metrics.urls.aggregated.query.subqueries.validated import VALIDATED_SUBQUERY +from src.collectors.enums import URLStatus +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetURLsAggregatedMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsURLsAggregatedResponseDTO: + + # Oldest *pending* URL: status OK and not yet validated (mirrors PENDING_SUBQUERY). + oldest_pending_url_query = select( + URL.id, + URL.created_at + ).outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ).where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None) + ).order_by( + URL.created_at.asc() + ).limit(1) + + oldest_pending_url = await session.execute(oldest_pending_url_query) + oldest_pending_url = oldest_pending_url.one_or_none() + if oldest_pending_url is None: + oldest_pending_url_id = None + oldest_pending_created_at = None + else: + oldest_pending_url_id = oldest_pending_url.id + oldest_pending_created_at = oldest_pending_url.created_at + + return GetMetricsURLsAggregatedResponseDTO( + count_urls_total=await sh.scalar(session,
query=ALL_SUBQUERY), + count_urls_pending=await sh.scalar(session, query=PENDING_SUBQUERY), + count_urls_submitted=await sh.scalar(session, query=SUBMITTED_SUBQUERY), + count_urls_validated=await sh.scalar(session, query=VALIDATED_SUBQUERY), + count_urls_rejected=await sh.scalar(session, query=REJECTED_SUBQUERY), + count_urls_errors=await sh.scalar(session, query=ERROR_SUBQUERY), + oldest_pending_url_id=oldest_pending_url_id, + oldest_pending_url_created_at=oldest_pending_created_at, + ) diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/__init__.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py new file mode 100644 index 00000000..a2d09217 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/all.py @@ -0,0 +1,9 @@ +from sqlalchemy import select, func + +from src.db.models.impl.url.core.sqlalchemy import URL + +ALL_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py new file mode 100644 index 00000000..407b0e4b --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/error.py @@ -0,0 +1,11 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL + +ERROR_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .where(URL.status == URLStatus.ERROR) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py new file mode 100644 index 00000000..31d8e2b6 --- /dev/null +++ 
b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/pending.py @@ -0,0 +1,19 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +PENDING_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) + .where( + URL.status == URLStatus.OK, + FlagURLValidated.url_id.is_(None), + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py new file mode 100644 index 00000000..983554ab --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/rejected.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, func + +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +REJECTED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) + .where( + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT, + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py new file mode 100644 index 00000000..34be5e26 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/submitted.py @@ -0,0 +1,14 @@ +from sqlalchemy import func, select + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource + +SUBMITTED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + URLDataSource, + URL.id == URLDataSource.url_id, + ) +) 
\ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py new file mode 100644 index 00000000..fb771db6 --- /dev/null +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/validated.py @@ -0,0 +1,14 @@ +from sqlalchemy import select, func + +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL + +VALIDATED_SUBQUERY = ( + select( + func.count(URL.id).label("count") + ) + .join( + FlagURLValidated, + URL.id == FlagURLValidated.url_id, + ) +) \ No newline at end of file diff --git a/src/api/endpoints/metrics/urls/breakdown/__init__.py b/src/api/endpoints/metrics/urls/breakdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/breakdown/query/__init__.py b/src/api/endpoints/metrics/urls/breakdown/query/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py new file mode 100644 index 00000000..3fc52c3f --- /dev/null +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -0,0 +1,91 @@ +from typing import Any + +from sqlalchemy import select, case, literal, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseInnerDTO, \ + GetMetricsURLsBreakdownPendingResponseDTO +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from 
src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from src.db.queries.base.builder import QueryBuilderBase + + +class GetURLsBreakdownPendingMetricsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResponseDTO: + + flags = ( + select( + URL.id.label("url_id"), + case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_record_type_annotation" + ), + case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_relevant_annotation" + ), + case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + "has_user_agency_annotation" + ), + ) + .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) + .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) + .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + ).cte("flags") + + month = func.date_trunc('month', URL.created_at) + + # Build the query + query = ( + select( + month.label('month'), + func.count(URL.id).label('count_total'), + func.count( + case( + (flags.c.has_user_record_type_annotation == True, 1) + ) + ).label('user_record_type_count'), + func.count( + case( + (flags.c.has_user_relevant_annotation == True, 1) + ) + ).label('user_relevant_count'), + func.count( + case( + (flags.c.has_user_agency_annotation == True, 1) + ) + ).label('user_agency_count'), + ) + .outerjoin(flags, flags.c.url_id == URL.id) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + FlagURLValidated.url_id.is_(None), + URL.status == URLStatus.OK + ) + .group_by(month) + .order_by(month.asc()) + ) + + # Execute the query and return the results + results = await session.execute(query) + all_results = results.all() + final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] + + for result in all_results: + dto = 
GetMetricsURLsBreakdownPendingResponseInnerDTO( + month=result.month.strftime("%B %Y"), + count_pending_total=result.count_total, + count_pending_relevant_user=result.user_relevant_count, + count_pending_record_type_user=result.user_record_type_count, + count_pending_agency_user=result.user_agency_count, + ) + final_results.append(dto) + return GetMetricsURLsBreakdownPendingResponseDTO( + entries=final_results, + ) \ No newline at end of file diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index af810a2b..86c0212c 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -9,6 +9,8 @@ from src.collectors.enums import URLStatus from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -30,76 +32,38 @@ def __init__( async def run(self, session: AsyncSession) -> None: # Get URL + url = await self._get_url(session) - query = ( - Select(URL) - .where(URL.id == self.approval_info.url_id) - .options( - joinedload(URL.optional_data_source_metadata), - joinedload(URL.confirmed_agencies), - ) - ) - - url = await session.execute(query) - url = url.scalars().first() - - update_if_not_none( - url, - "record_type", - self.approval_info.record_type.value - if self.approval_info.record_type is not None else None, - required=True - ) + await self._optionally_update_record_type(url) # Get existing agency ids existing_agencies = url.confirmed_agencies or [] existing_agency_ids = [agency.agency_id for agency in existing_agencies] new_agency_ids 
= self.approval_info.agency_ids or [] - if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: - raise HTTPException( - status_code=HTTP_400_BAD_REQUEST, - detail="Must specify agency_id if URL does not already have a confirmed agency" - ) + await self._check_for_unspecified_agency_ids(existing_agency_ids, new_agency_ids) - # Get any existing agency ids that are not in the new agency ids - # If new agency ids are specified, overwrite existing - if len(new_agency_ids) != 0: - for existing_agency in existing_agencies: - if existing_agency.id not in new_agency_ids: - # If the existing agency id is not in the new agency ids, delete it - await session.delete(existing_agency) + await self._overwrite_existing_agencies(existing_agencies, new_agency_ids, session) # Add any new agency ids that are not in the existing agency ids - for new_agency_id in new_agency_ids: - if new_agency_id not in existing_agency_ids: - # Check if the new agency exists in the database - query = ( - select(Agency) - .where(Agency.agency_id == new_agency_id) - ) - existing_agency = await session.execute(query) - existing_agency = existing_agency.scalars().first() - if existing_agency is None: - # If not, create it - agency = Agency( - agency_id=new_agency_id, - name=PLACEHOLDER_AGENCY_NAME, - ) - session.add(agency) - - # If the new agency id is not in the existing agency ids, add it - confirmed_url_agency = LinkURLAgency( - url_id=self.approval_info.url_id, - agency_id=new_agency_id - ) - session.add(confirmed_url_agency) + await self._add_new_agencies(existing_agency_ids, new_agency_ids, session) - # If it does, do nothing + await self._add_validated_flag(session, url=url) - url.status = URLStatus.VALIDATED.value + await self._optionally_update_required_metadata(url) + await self._optionally_update_optional_metdata(url) + await self._add_approving_user(session) + async def _optionally_update_required_metadata(self, url: URL) -> None: update_if_not_none(url, "name", self.approval_info.name, 
required=True) update_if_not_none(url, "description", self.approval_info.description, required=False) + async def _add_approving_user(self, session: AsyncSession) -> None: + approving_user_url = ReviewingUserURL( + user_id=self.user_id, + url_id=self.approval_info.url_id + ) + session.add(approving_user_url) + + async def _optionally_update_optional_metdata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( @@ -124,10 +88,85 @@ async def run(self, session: AsyncSession) -> None: self.approval_info.supplying_entity ) - # Add approving user - approving_user_url = ReviewingUserURL( - user_id=self.user_id, - url_id=self.approval_info.url_id + async def _optionally_update_record_type(self, url: URL) -> None: + update_if_not_none( + url, + "record_type", + self.approval_info.record_type.value + if self.approval_info.record_type is not None else None, + required=True ) - session.add(approving_user_url) \ No newline at end of file + async def _get_url(self, session: AsyncSession) -> URL: + query = ( + Select(URL) + .where(URL.id == self.approval_info.url_id) + .options( + joinedload(URL.optional_data_source_metadata), + joinedload(URL.confirmed_agencies), + ) + ) + url = await session.execute(query) + url = url.scalars().first() + return url + + async def _check_for_unspecified_agency_ids( + self, + existing_agency_ids: list[int], + new_agency_ids: list[int] + ) -> None: + """ + raises: + HTTPException: If no agency ids are specified and no existing agency ids are found + """ + if len(existing_agency_ids) == 0 and len(new_agency_ids) == 0: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail="Must specify agency_id if URL does not already have a confirmed agency" + ) + + async def _overwrite_existing_agencies(self, existing_agencies, new_agency_ids, session): + # Get any existing agency ids that are not in the new agency ids + # If new 
agency ids are specified, overwrite existing + if len(new_agency_ids) != 0: + for existing_agency in existing_agencies: + if existing_agency.id not in new_agency_ids: + # If the existing agency id is not in the new agency ids, delete it + await session.delete(existing_agency) + + async def _add_new_agencies(self, existing_agency_ids, new_agency_ids, session): + for new_agency_id in new_agency_ids: + if new_agency_id in existing_agency_ids: + continue + # Check if the new agency exists in the database + query = ( + select(Agency) + .where(Agency.agency_id == new_agency_id) + ) + existing_agency = await session.execute(query) + existing_agency = existing_agency.scalars().first() + if existing_agency is None: + # If not, create it + agency = Agency( + agency_id=new_agency_id, + name=PLACEHOLDER_AGENCY_NAME, + ) + session.add(agency) + + # If the new agency id is not in the existing agency ids, add it + confirmed_url_agency = LinkURLAgency( + url_id=self.approval_info.url_id, + agency_id=new_agency_id + ) + session.add(confirmed_url_agency) + + async def _add_validated_flag( + self, + session: AsyncSession, + url: URL + ) -> None: + flag = FlagURLValidated( + url_id=url.id, + type=URLValidatedType.DATA_SOURCE + ) + session.add(flag) diff --git a/src/api/endpoints/review/next/convert.py b/src/api/endpoints/review/next/convert.py new file mode 100644 index 00000000..ca087895 --- /dev/null +++ b/src/api/endpoints/review/next/convert.py @@ -0,0 +1,108 @@ +from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo +from src.api.endpoints.review.next.dto import FinalReviewAnnotationAgencyInfo, FinalReviewAnnotationAgencyAutoInfo +from src.core.enums import SuggestionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from 
src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion + + +def convert_agency_info_to_final_review_annotation_agency_info( + subtasks: list[URLAutoAgencyIDSubtask], + confirmed_agencies: list[LinkURLAgency], + user_agency_suggestion: UserUrlAgencySuggestion +) -> FinalReviewAnnotationAgencyInfo: + + confirmed_agency_info: list[GetNextURLForAgencyAgencyInfo] = ( + _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies + ) + ) + + agency_auto_info: FinalReviewAnnotationAgencyAutoInfo = ( + _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks + ) + ) + + agency_user_info: GetNextURLForAgencyAgencyInfo | None = ( + _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_agency_suggestion + ) + ) + + return FinalReviewAnnotationAgencyInfo( + confirmed=confirmed_agency_info, + user=agency_user_info, + auto=agency_auto_info + ) + +def _convert_confirmed_agencies_to_final_review_annotation_agency_info( + confirmed_agencies: list[LinkURLAgency] +) -> list[GetNextURLForAgencyAgencyInfo]: + results: list[GetNextURLForAgencyAgencyInfo] = [] + for confirmed_agency in confirmed_agencies: + agency = confirmed_agency.agency + agency_info = _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.CONFIRMED, + agency=agency + ) + results.append(agency_info) + return results + +def _convert_user_url_agency_suggestion_to_final_review_annotation_agency_user_info( + user_url_agency_suggestion: UserUrlAgencySuggestion +) -> GetNextURLForAgencyAgencyInfo | None: + suggestion = user_url_agency_suggestion + if suggestion is None: + return None + if suggestion.is_new: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=SuggestionType.NEW_AGENCY, + ) + return _convert_agency_to_get_next_url_for_agency_agency_info( + 
suggestion_type=SuggestionType.USER_SUGGESTION, + agency=suggestion.agency + ) + +def _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type: SuggestionType, + agency: Agency | None +) -> GetNextURLForAgencyAgencyInfo: + if agency is None: + if suggestion_type == SuggestionType.UNKNOWN: + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + ) + raise ValueError("agency cannot be None for suggestion type other than unknown") + + return GetNextURLForAgencyAgencyInfo( + suggestion_type=suggestion_type, + pdap_agency_id=agency.agency_id, + agency_name=agency.name, + state=agency.state, + county=agency.county, + locality=agency.locality + ) + +def _convert_url_auto_agency_suggestions_to_final_review_annotation_agency_auto_info( + subtasks: list[URLAutoAgencyIDSubtask] +) -> FinalReviewAnnotationAgencyAutoInfo: + results: list[GetNextURLForAgencyAgencyInfo] = [] + count_agencies_not_found: int = 0 + for subtask in subtasks: + if not subtask.agencies_found: + count_agencies_not_found += 1 + continue + suggestions: list[AgencyIDSubtaskSuggestion] = subtask.suggestions + for suggestion in suggestions: + info: GetNextURLForAgencyAgencyInfo = _convert_agency_to_get_next_url_for_agency_agency_info( + suggestion_type=SuggestionType.AUTO_SUGGESTION, + agency=suggestion.agency + ) + results.append(info) + return FinalReviewAnnotationAgencyAutoInfo( + unknown=count_agencies_not_found == len(subtasks), + suggestions=results + ) diff --git a/src/api/endpoints/review/next/query.py b/src/api/endpoints/review/next/core.py similarity index 60% rename from src/api/endpoints/review/next/query.py rename to src/api/endpoints/review/next/core.py index 7cb4670b..1736a970 100644 --- a/src/api/endpoints/review/next/query.py +++ b/src/api/endpoints/review/next/core.py @@ -1,26 +1,28 @@ -from typing import Optional, Type - -from sqlalchemy import FromClause, select, and_, Select, desc, asc, func +from sqlalchemy import FromClause, select, Select, desc, 
asc, func, CTE from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload +from src.api.endpoints.review.next.convert import convert_agency_info_to_final_review_annotation_agency_info from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata, FinalReviewBatchInfo, \ GetNextURLForFinalReviewOuterResponse, GetNextURLForFinalReviewResponse, FinalReviewAnnotationInfo +from src.api.endpoints.review.next.extract import extract_html_content_infos, extract_optional_metadata +from src.api.endpoints.review.next.queries.count_reviewed import COUNT_REVIEWED_CTE +from src.api.endpoints.review.next.queries.eligible_urls import build_eligible_urls_cte +from src.api.endpoints.review.next.templates.count_cte import CountCTE from src.collectors.enums import URLStatus from src.core.tasks.url.operators.html.scraper.parser.util import convert_to_response_html_info from src.db.constants import USER_ANNOTATION_MODELS from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.exceptions import FailedQueryException -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import 
AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL = "total_distinct_annotation_count" @@ -42,7 +44,6 @@ def __init__(self, batch_id: int | None = None): ] # The below relationships are joined to entities that are joined to the URL self.double_join_relationships = [ - (URL.automated_agency_suggestions, AutomatedUrlAgencySuggestion.agency), (URL.user_agency_suggestion, UserUrlAgencySuggestion.agency), (URL.confirmed_agencies, LinkURLAgency.agency) ] @@ -60,58 +61,26 @@ def _get_where_exist_clauses( where_clauses.append(where_clause) return where_clauses - def _build_base_query( - self, - anno_exists_query: FromClause, - ) -> Select: - builder = self.anno_exists_builder - where_exist_clauses = self._get_where_exist_clauses( - builder.query - ) + def _build_base_query(self) -> Select: + eligible_urls: CTE = build_eligible_urls_cte(batch_id=self.batch_id) query = ( select( URL, - self._sum_exists_query(anno_exists_query, USER_ANNOTATION_MODELS) ) - .select_from(anno_exists_query) + .select_from( + eligible_urls + ) .join( URL, - URL.id == builder.url_id - ) - ) - if self.batch_id is not None: - query = ( - query.join( - LinkBatchURL - ) - .where( - LinkBatchURL.batch_id == self.batch_id - ) + URL.id == eligible_urls.c.url_id ) - - query = ( - query.where( - and_( - URL.status == URLStatus.PENDING.value, - *where_exist_clauses - ) + .where( + URL.status == URLStatus.OK.value ) ) return query - - def _sum_exists_query(self, query, models: list[Type[URLDependentMixin]]): - return sum( - [getattr(query.c, self.anno_exists_builder.get_exists_label(model)) for model in models] - ).label(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL) - - - async def _apply_batch_id_filter(self, url_query: Select, batch_id: int | None): - if batch_id is None: - return url_query - return url_query.where(URL.batch_id == batch_id) - async def _apply_options( self, url_query: 
Select @@ -124,49 +93,30 @@ async def _apply_options( *[ joinedload(primary).joinedload(secondary) for primary, secondary in self.double_join_relationships - ] - ) - - async def _apply_order_clause(self, url_query: Select): - return url_query.order_by( - desc(TOTAL_DISTINCT_ANNOTATION_COUNT_LABEL), - asc(URL.id) + ], + joinedload(URL.auto_agency_subtasks) + .joinedload(URLAutoAgencyIDSubtask.suggestions) + .joinedload(AgencyIDSubtaskSuggestion.agency) ) - async def _extract_html_content_infos(self, url: URL) -> list[URLHTMLContentInfo]: - html_content = url.html_content - html_content_infos = [ - URLHTMLContentInfo(**html_info.__dict__) - for html_info in html_content - ] - return html_content_infos - - async def _extract_optional_metadata(self, url: URL) -> FinalReviewOptionalMetadata: - if url.optional_data_source_metadata is None: - return FinalReviewOptionalMetadata() - return FinalReviewOptionalMetadata( - record_formats=url.optional_data_source_metadata.record_formats, - data_portal_type=url.optional_data_source_metadata.data_portal_type, - supplying_entity=url.optional_data_source_metadata.supplying_entity - ) async def get_batch_info(self, session: AsyncSession) -> FinalReviewBatchInfo | None: if self.batch_id is None: return None - count_reviewed_query = await self.get_count_reviewed_query() + count_reviewed_query: CountCTE = COUNT_REVIEWED_CTE count_ready_query = await self.get_count_ready_query() full_query = ( select( - func.coalesce(count_reviewed_query.c[self.count_label], 0).label("count_reviewed"), + func.coalesce(count_reviewed_query.count, 0).label("count_reviewed"), func.coalesce(count_ready_query.c[self.count_label], 0).label("count_ready_for_review") ) .select_from( count_ready_query.outerjoin( - count_reviewed_query, - count_reviewed_query.c.batch_id == count_ready_query.c.batch_id + count_reviewed_query.cte, + count_reviewed_query.batch_id == count_ready_query.c.batch_id ) ) ) @@ -175,6 +125,7 @@ async def get_batch_info(self, session:
AsyncSession) -> FinalReviewBatchInfo | return FinalReviewBatchInfo(**raw_result.mappings().one()) async def get_count_ready_query(self): + # TODO: Migrate to separate query builder builder = self.anno_exists_builder count_ready_query = ( select( @@ -189,7 +140,7 @@ async def get_count_ready_query(self): ) .where( LinkBatchURL.batch_id == self.batch_id, - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, *self._get_where_exist_clauses( builder.query ) @@ -199,31 +150,6 @@ async def get_count_ready_query(self): ) return count_ready_query - async def get_count_reviewed_query(self): - count_reviewed_query = ( - select( - Batch.id.label("batch_id"), - func.count(URL.id).label(self.count_label) - ) - .select_from(Batch) - .join(LinkBatchURL) - .outerjoin(URL, URL.id == LinkBatchURL.url_id) - .where( - URL.status.in_( - [ - URLStatus.VALIDATED.value, - URLStatus.NOT_RELEVANT.value, - URLStatus.SUBMITTED.value, - URLStatus.INDIVIDUAL_RECORD.value - ] - ), - LinkBatchURL.batch_id == self.batch_id - ) - .group_by(Batch.id) - .subquery("count_reviewed") - ) - return count_reviewed_query - async def run( self, session: AsyncSession @@ -251,8 +177,8 @@ async def run( result: URL = row[0] - html_content_infos = await self._extract_html_content_infos(result) - optional_metadata = await self._extract_optional_metadata(result) + html_content_infos: list[URLHTMLContentInfo] = await extract_html_content_infos(result) + optional_metadata: FinalReviewOptionalMetadata = await extract_optional_metadata(result) batch_info = await self.get_batch_info(session) try: @@ -272,8 +198,8 @@ async def run( user_suggestion=result.user_record_type_suggestion, auto_suggestion=result.auto_record_type_suggestion ), - agency=DTOConverter.final_review_annotation_agency_info( - automated_agency_suggestions=result.automated_agency_suggestions, + agency=convert_agency_info_to_final_review_annotation_agency_info( + subtasks=result.auto_agency_subtasks, 
user_agency_suggestion=result.user_agency_suggestion, confirmed_agencies=result.confirmed_agencies ) @@ -289,9 +215,7 @@ async def run( raise FailedQueryException(f"Failed to convert result for url id {result.id} to response") from e async def build_url_query(self): - anno_exists_query = self.anno_exists_builder.query - url_query = self._build_base_query(anno_exists_query) + url_query = self._build_base_query() url_query = await self._apply_options(url_query) - url_query = await self._apply_order_clause(url_query) return url_query diff --git a/src/api/endpoints/review/next/extract.py b/src/api/endpoints/review/next/extract.py new file mode 100644 index 00000000..aca642e0 --- /dev/null +++ b/src/api/endpoints/review/next/extract.py @@ -0,0 +1,23 @@ +from src.api.endpoints.review.next.dto import FinalReviewOptionalMetadata +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.core.sqlalchemy import URL + + +async def extract_html_content_infos( + url: URL +)-> list[URLHTMLContentInfo]: + html_content = url.html_content + html_content_infos = [ + URLHTMLContentInfo(**html_info.__dict__) + for html_info in html_content + ] + return html_content_infos + +async def extract_optional_metadata(url: URL) -> FinalReviewOptionalMetadata: + if url.optional_data_source_metadata is None: + return FinalReviewOptionalMetadata() + return FinalReviewOptionalMetadata( + record_formats=url.optional_data_source_metadata.record_formats, + data_portal_type=url.optional_data_source_metadata.data_portal_type, + supplying_entity=url.optional_data_source_metadata.supplying_entity + ) \ No newline at end of file diff --git a/src/api/endpoints/review/next/queries/__init__.py b/src/api/endpoints/review/next/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/queries/count_reviewed.py b/src/api/endpoints/review/next/queries/count_reviewed.py new file mode 100644 index 00000000..91349cb5 --- /dev/null +++ 
b/src/api/endpoints/review/next/queries/count_reviewed.py @@ -0,0 +1,18 @@ +from sqlalchemy import select, func + +from src.api.endpoints.review.next.templates.count_cte import CountCTE +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +COUNT_REVIEWED_CTE: CountCTE = CountCTE( + select( + Batch.id.label("batch_id"), + func.count(FlagURLValidated.url_id).label("count") + ) + .select_from(Batch) + .join(LinkBatchURL) + .outerjoin(FlagURLValidated, FlagURLValidated.url_id == LinkBatchURL.url_id) + .group_by(Batch.id) + .cte("count_reviewed") +) \ No newline at end of file diff --git a/src/api/endpoints/review/next/queries/eligible_urls.py b/src/api/endpoints/review/next/queries/eligible_urls.py new file mode 100644 index 00000000..bee5cea2 --- /dev/null +++ b/src/api/endpoints/review/next/queries/eligible_urls.py @@ -0,0 +1,35 @@ +from sqlalchemy import CTE, select, Select + +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView + +uafw = URLAnnotationFlagsView + +def build_eligible_urls_cte(batch_id: int | None = None) -> CTE: + query: Select = ( + select( + uafw.url_id, + ) + .where( + # uafw.has_auto_agency_suggestion.is_(True), + # uafw.has_auto_record_type_suggestion.is_(True), + # uafw.has_auto_relevant_suggestion.is_(True), + uafw.has_user_relevant_suggestion.is_(True), + uafw.has_user_agency_suggestion.is_(True), + uafw.has_user_record_type_suggestion.is_(True), + uafw.was_reviewed.is_(False) + ) + ) + + if batch_id is not None: + query = ( + query.join( + LinkBatchURL, + LinkBatchURL.url_id == uafw.url_id + ) + .where( + LinkBatchURL.batch_id == batch_id + ) + ) + + return query.cte("eligible_urls") diff --git a/src/api/endpoints/review/next/templates/__init__.py 
b/src/api/endpoints/review/next/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/review/next/templates/count_cte.py b/src/api/endpoints/review/next/templates/count_cte.py new file mode 100644 index 00000000..0abbbab4 --- /dev/null +++ b/src/api/endpoints/review/next/templates/count_cte.py @@ -0,0 +1,15 @@ +from sqlalchemy import CTE, Column + + +class CountCTE: + + def __init__(self, cte: CTE): + self.cte = cte + + @property + def batch_id(self) -> Column[int]: + return self.cte.c['batch_id'] + + @property + def count(self) -> Column[int]: + return self.cte.c['count'] \ No newline at end of file diff --git a/src/api/endpoints/review/reject/query.py b/src/api/endpoints/review/reject/query.py index 7d603fe1..c187a2a8 100644 --- a/src/api/endpoints/review/reject/query.py +++ b/src/api/endpoints/review/reject/query.py @@ -5,6 +5,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -33,19 +35,27 @@ async def run(self, session) -> None: url = await session.execute(query) url = url.scalars().first() + validation_type: URLValidatedType | None = None match self.rejection_reason: case RejectionReason.INDIVIDUAL_RECORD: - url.status = URLStatus.INDIVIDUAL_RECORD.value + validation_type = URLValidatedType.INDIVIDUAL_RECORD case RejectionReason.BROKEN_PAGE_404: url.status = URLStatus.NOT_FOUND.value case RejectionReason.NOT_RELEVANT: - url.status = URLStatus.NOT_RELEVANT.value + validation_type = URLValidatedType.NOT_RELEVANT case _: raise HTTPException( status_code=HTTP_400_BAD_REQUEST, detail="Invalid rejection reason" ) + if validation_type is 
not None: + flag_url_validated = FlagURLValidated( + url_id=self.url_id, + type=validation_type + ) + session.add(flag_url_validated) + # Add rejecting user rejecting_user_url = ReviewingUserURL( user_id=self.user_id, diff --git a/src/api/main.py b/src/api/main.py index b6679827..f17c147f 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -27,6 +27,10 @@ from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.manager import TaskManager +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ + SpacyModelType from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient @@ -83,6 +87,9 @@ async def lifespan(app: FastAPI): session=session, token=env_var_manager.hf_inference_api_key ), + nlp_processor=NLPProcessor( + model_type=SpacyModelType.EN_CORE_WEB_SM + ) ), ) async_collector_manager = AsyncCollectorManager( diff --git a/src/collectors/enums.py b/src/collectors/enums.py index 1732bd19..c357d6bf 100644 --- a/src/collectors/enums.py +++ b/src/collectors/enums.py @@ -11,11 +11,7 @@ class CollectorType(Enum): MANUAL = "manual" class URLStatus(Enum): - PENDING = "pending" - SUBMITTED = "submitted" - VALIDATED = "validated" + OK = "ok" ERROR = "error" DUPLICATE = "duplicate" - NOT_RELEVANT = "not relevant" NOT_FOUND = "404 not found" - INDIVIDUAL_RECORD = "individual record" diff --git a/src/collectors/impl/muckrock/api_interface/lookup_response.py b/src/collectors/impl/muckrock/api_interface/lookup_response.py index 47ea855b..d1fd9635 100644 --- a/src/collectors/impl/muckrock/api_interface/lookup_response.py +++ 
b/src/collectors/impl/muckrock/api_interface/lookup_response.py @@ -6,6 +6,6 @@ class AgencyLookupResponse(BaseModel): - name: Optional[str] + name: str | None type: AgencyLookupResponseType - error: Optional[str] = None + error: str | None = None diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index 96365107..af72a3aa 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/core.py b/src/core/core.py index c597a591..0938586a 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -35,7 +35,7 @@ from src.api.endpoints.task.dtos.get.tasks import GetTasksResponse from src.api.endpoints.url.get.dto import GetURLsResponseInfo from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo from src.db.enums import TaskType from src.collectors.manager import AsyncCollectorManager diff --git a/src/core/enums.py b/src/core/enums.py index c6f90c80..edc18425 100644 --- a/src/core/enums.py +++ b/src/core/enums.py @@ -16,6 +16,7 @@ class RecordType(Enum): """ All available URL record types """ + # Police and Public ACCIDENT_REPORTS = "Accident Reports" ARREST_RECORDS = "Arrest Records" CALLS_FOR_SERVICE = "Calls for Service" @@ -31,16 +32,22 @@ class RecordType(Enum): SURVEYS = "Surveys" USE_OF_FORCE_REPORTS = "Use of Force Reports" VEHICLE_PURSUITS = "Vehicle Pursuits" + + # Info About Officers COMPLAINTS_AND_MISCONDUCT = 
"Complaints & Misconduct" DAILY_ACTIVITY_LOGS = "Daily Activity Logs" TRAINING_AND_HIRING_INFO = "Training & Hiring Info" PERSONNEL_RECORDS = "Personnel Records" + + # Info About Agencies ANNUAL_AND_MONTHLY_REPORTS = "Annual & Monthly Reports" BUDGETS_AND_FINANCES = "Budgets & Finances" CONTACT_INFO_AND_AGENCY_META = "Contact Info & Agency Meta" GEOGRAPHIC = "Geographic" LIST_OF_DATA_SOURCES = "List of Data Sources" POLICIES_AND_CONTRACTS = "Policies & Contracts" + + # Agency-Published Resources CRIME_MAPS_AND_REPORTS = "Crime Maps & Reports" CRIME_STATISTICS = "Crime Statistics" MEDIA_BULLETINS = "Media Bulletins" @@ -48,9 +55,13 @@ class RecordType(Enum): RESOURCES = "Resources" SEX_OFFENDER_REGISTRY = "Sex Offender Registry" WANTED_PERSONS = "Wanted Persons" + + # Jails and Courts Specific BOOKING_REPORTS = "Booking Reports" COURT_CASES = "Court Cases" INCARCERATION_RECORDS = "Incarceration Records" + + # Other OTHER = "Other" diff --git a/src/core/exceptions.py b/src/core/exceptions.py index d4f9c4a8..a361a24d 100644 --- a/src/core/exceptions.py +++ b/src/core/exceptions.py @@ -14,3 +14,4 @@ class MatchAgencyError(Exception): class FailedValidationException(HTTPException): def __init__(self, detail: str): super().__init__(status_code=HTTPStatus.BAD_REQUEST, detail=detail) + diff --git a/src/core/helpers.py b/src/core/helpers.py deleted file mode 100644 index eeb951fe..00000000 --- a/src/core/helpers.py +++ /dev/null @@ -1,48 +0,0 @@ -from src.core.enums import SuggestionType -from src.core.exceptions import MatchAgencyError -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus - - -def process_match_agency_response_to_suggestions( - url_id: int, - match_agency_response: MatchAgencyResponse -) -> list[URLAgencySuggestionInfo]: - if match_agency_response.status == 
MatchAgencyResponseStatus.EXACT_MATCH: - match = match_agency_response.matches[0] - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=int(match.id), - agency_name=match.submitted_name, - state=match.state, - county=match.county, - ) - ] - if match_agency_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - ) - ] - - if match_agency_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise MatchAgencyError( - f"Unknown Match Agency Response Status: {match_agency_response.status}" - ) - - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=match.id, - agency_name=match.submitted_name, - state=match.state, - county=match.county, - locality=match.locality - ) - for match in match_agency_response.matches - ] diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 7f488594..6ddca6eb 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -50,8 +50,11 @@ async def handle_task_error(self, run_info: TaskOperatorRunInfo): # task_id=run_info.task_id, error=run_info.message ) + msg: str = f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error: {run_info.message}" + print(msg) self.discord_poster.post_to_discord( - message=f"Task {run_info.task_id} ({run_info.task_type.value}) failed with error.") + message=msg + ) async def link_urls_to_task(self, task_id: int, url_ids: list[int]): await self.adb_client.link_urls_to_task( diff --git a/src/core/tasks/scheduled/impl/huggingface/operator.py b/src/core/tasks/scheduled/impl/huggingface/operator.py index 7d5324f5..9bb7a85e 100644 --- a/src/core/tasks/scheduled/impl/huggingface/operator.py +++ b/src/core/tasks/scheduled/impl/huggingface/operator.py @@ -1,12 +1,19 @@ from itertools import count +from src.core.tasks.mixins.prereq import 
HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.huggingface.hub.client import HuggingFaceHubClient -class PushToHuggingFaceTaskOperator(ScheduledTaskOperatorBase): +class PushToHuggingFaceTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): @property def task_type(self) -> TaskType: @@ -20,21 +27,23 @@ def __init__( super().__init__(adb_client) self.hf_client = hf_client - async def inner_task_logic(self): - # Check if any valid urls have been updated - valid_urls_updated = await self.adb_client.check_valid_urls_updated() - print(f"Valid urls updated: {valid_urls_updated}") - if not valid_urls_updated: - print("No valid urls updated, skipping.") - return - + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + CheckValidURLsUpdatedQueryBuilder() + ) - # Otherwise, push to huggingface + async def inner_task_logic(self): + """Push raw data sources to huggingface.""" run_dt = await self.adb_client.get_current_database_time() for idx in count(start=1): - outputs = await self.adb_client.get_data_sources_raw_for_huggingface(page=idx) + outputs: list[GetForLoadingToHuggingFaceOutput] = await self._get_data_sources_raw_for_huggingface(page=idx) if len(outputs) == 0: break self.hf_client.push_data_sources_raw_to_hub(outputs, idx=idx) await self.adb_client.set_hugging_face_upload_state(run_dt.replace(tzinfo=None)) + + async def _get_data_sources_raw_for_huggingface(self, page: int) -> list[GetForLoadingToHuggingFaceOutput]: + return 
await self.adb_client.run_query_builder( + GetForLoadingToHuggingFaceQueryBuilder(page) + ) diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py index 23e0b0b6..25124c95 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py @@ -1,4 +1,5 @@ from datetime import datetime +from operator import or_ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -6,6 +7,7 @@ from src.collectors.enums import URLStatus from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.state.huggingface import HuggingFaceUploadState from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL @@ -34,14 +36,12 @@ async def has_valid_urls(self, last_upload_at: datetime | None) -> bool: URLCompressedHTML, URL.id == URLCompressedHTML.url_id ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ) .where( - URL.status.in_( - [ - URLStatus.VALIDATED, - URLStatus.NOT_RELEVANT.value, - URLStatus.SUBMITTED.value, - ] - ), + FlagURLValidated.url_id.isnot(None) ) ) if last_upload_at is not None: diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py index 9d5c4135..5ad96115 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/convert.py @@ -1,8 +1,7 @@ -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse -from src.core.tasks.scheduled.impl.huggingface.queries.get.mappings import 
FINE_COARSE_RECORD_TYPE_MAPPING, \ - OUTCOME_RELEVANCY_MAPPING +from src.core.tasks.scheduled.impl.huggingface.queries.get.mappings import FINE_COARSE_RECORD_TYPE_MAPPING +from src.db.models.impl.flag.url_validated.enums import URLValidatedType def convert_fine_to_coarse_record_type( @@ -10,7 +9,14 @@ def convert_fine_to_coarse_record_type( ) -> RecordTypeCoarse: return FINE_COARSE_RECORD_TYPE_MAPPING[fine_record_type] -def convert_url_status_to_relevant( - url_status: URLStatus + +def convert_validated_type_to_relevant( + validated_type: URLValidatedType ) -> bool: - return OUTCOME_RELEVANCY_MAPPING[url_status] \ No newline at end of file + match validated_type: + case URLValidatedType.NOT_RELEVANT: + return False + case URLValidatedType.DATA_SOURCE: + return True + case _: + raise ValueError(f"Disallowed validated type: {validated_type}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 662f7fbb..d58cbdf7 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -1,16 +1,18 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from src.collectors.enums import URLStatus -from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_url_status_to_relevant, \ - convert_fine_to_coarse_record_type +from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_fine_to_coarse_record_type, \ + convert_validated_type_to_relevant from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.db.client.helpers import add_standard_limit_and_offset -from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import 
URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.queries.base.builder import QueryBuilderBase from src.db.utils.compression import decompress_html -from src.db.helpers.session import session_helper as sh + class GetForLoadingToHuggingFaceQueryBuilder(QueryBuilderBase): @@ -22,29 +24,32 @@ def __init__(self, page: int): async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]: label_url_id = 'url_id' label_url = 'url' - label_url_status = 'url_status' label_record_type_fine = 'record_type_fine' label_html = 'html' + label_type = 'type' query = ( select( URL.id.label(label_url_id), URL.url.label(label_url), - URL.status.label(label_url_status), URL.record_type.label(label_record_type_fine), - URLCompressedHTML.compressed_html.label(label_html) + URLCompressedHTML.compressed_html.label(label_html), + FlagURLValidated.type.label(label_type) ) .join( URLCompressedHTML, URL.id == URLCompressedHTML.url_id ) + .outerjoin( + FlagURLValidated, + URL.id == FlagURLValidated.url_id + ) .where( - URL.status.in_([ - URLStatus.VALIDATED, - URLStatus.NOT_RELEVANT, - URLStatus.SUBMITTED - ]) + FlagURLValidated.type.in_( + (URLValidatedType.DATA_SOURCE, + URLValidatedType.NOT_RELEVANT) + ) ) ) query = add_standard_limit_and_offset(page=self.page, statement=query) @@ -57,7 +62,9 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut output = GetForLoadingToHuggingFaceOutput( url_id=result[label_url_id], url=result[label_url], - relevant=convert_url_status_to_relevant(result[label_url_status]), + relevant=convert_validated_type_to_relevant( + URLValidatedType(result[label_type]) + ), record_type_fine=result[label_record_type_fine], record_type_coarse=convert_fine_to_coarse_record_type( result[label_record_type_fine] diff --git 
a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py index ed4a7da2..0fd12b28 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/mappings.py @@ -47,9 +47,3 @@ RecordType.OTHER: RecordTypeCoarse.OTHER, None: RecordTypeCoarse.NOT_RELEVANT } - -OUTCOME_RELEVANCY_MAPPING = { - URLStatus.SUBMITTED: True, - URLStatus.VALIDATED: True, - URLStatus.NOT_RELEVANT: False -} \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/operator.py b/src/core/tasks/scheduled/impl/sync/agency/operator.py index db20acf1..6adff30b 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/operator.py +++ b/src/core/tasks/scheduled/impl/sync/agency/operator.py @@ -1,9 +1,11 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.core import UpsertAgenciesQueryBuilder from src.core.tasks.scheduled.impl.sync.check import check_max_sync_requests_not_exceeded from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo class SyncAgenciesTaskOperator(ScheduledTaskOperatorBase): @@ -21,17 +23,19 @@ def task_type(self) -> TaskType: # return TaskType.SYNC_AGENCIES async def inner_task_logic(self): - count_agencies_synced = 0 params = await self.adb_client.get_agencies_sync_parameters() if params.page is None: params.page = 1 response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count = 1 + count_agencies_synced = 0 + request_count = 0 while len(response.agencies) > 0: + await self.update_data(response.agencies) + 
count_agencies_synced += len(response.agencies) + request_count += 1 + check_max_sync_requests_not_exceeded(request_count) - await self.adb_client.upsert_agencies(response.agencies) params = AgencySyncParameters( page=params.page + 1, @@ -40,9 +44,13 @@ async def inner_task_logic(self): await self.adb_client.update_agencies_sync_progress(params.page) response = await self.pdap_client.sync_agencies(params) - count_agencies_synced += len(response.agencies) - request_count += 1 + await self.adb_client.mark_full_agencies_sync() print(f"Sync complete. Synced {count_agencies_synced} agencies") + async def update_data(self, agencies: list[AgenciesSyncResponseInnerInfo]): + # First, add new agencies + await self.adb_client.run_query_builder( + UpsertAgenciesQueryBuilder(agencies) + ) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py similarity index 97% rename from src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py rename to src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py index 61a0b104..4b944464 100644 --- a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert.py +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/convert.py @@ -17,4 +17,4 @@ def convert_agencies_sync_response_to_agencies_upsert( ds_last_updated_at=agency.updated_at ) ) - return results \ No newline at end of file + return results diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py new file mode 100644 index 00000000..fc909e48 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/core.py @@ -0,0 +1,30 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.core import UpdateAgencyURLLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.core import UpsertMetaUrlsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.convert import \ + convert_agencies_sync_response_to_agencies_upsert +from src.db.models.impl.agency.pydantic.upsert import AgencyUpsertModel +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + +from src.db.helpers.session import session_helper as sh + +class UpsertAgenciesQueryBuilder(QueryBuilderBase): + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + # Upsert Agencies + agency_upserts: list[AgencyUpsertModel] = convert_agencies_sync_response_to_agencies_upsert(self.sync_responses) + await sh.bulk_upsert(session=session, models=agency_upserts) + + # Add and update Meta URLs + meta_urls_query_builder = UpsertMetaUrlsQueryBuilder(self.sync_responses) + await meta_urls_query_builder.run(session=session) + + # Add and remove URL-Agency Links + update_url_links_query_builder = UpdateAgencyURLLinksQueryBuilder(self.sync_responses) + await update_url_links_query_builder.run(session=session) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py new file mode 100644 index 00000000..c05b55f1 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/extract.py @@ -0,0 +1,12 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_urls_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[str]: + url_set: set[str] = set() + for response in responses: + for url in 
response.meta_urls: + url_set.add(url) + + return list(url_set) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py new file mode 100644 index 00000000..5511ea65 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/build.py @@ -0,0 +1,23 @@ +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper + +def build_links_from_url_mappings_and_sync_responses( + url_mappings: list[URLMapping], + sync_responses: list[AgenciesSyncResponseInnerInfo], +) -> list[LinkURLAgencyPydantic]: + + links: list[LinkURLAgencyPydantic] = [] + + mapper = URLMapper(url_mappings) + for sync_response in sync_responses: + agency_id: int = sync_response.agency_id + for meta_url in sync_response.meta_urls: + url_id: int = mapper.get_id(meta_url) + link = LinkURLAgencyPydantic( + agency_id=agency_id, + url_id=url_id + ) + links.append(link) + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py new file mode 100644 index 00000000..37d63a03 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/core.py @@ -0,0 +1,50 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.build import \ + 
build_links_from_url_mappings_and_sync_responses +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.requester import UpdateAgencyURLLinksRequester +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.extract import \ + extract_agency_ids_from_agencies_sync_response +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpdateAgencyURLLinksQueryBuilder(QueryBuilderBase): + """Updates agency URL links.""" + + def __init__( + self, + sync_responses: list[AgenciesSyncResponseInnerInfo] + ): + super().__init__() + self._sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + # Get all existing links + requester = UpdateAgencyURLLinksRequester(session) + + # Build new links from sync responses and URL mappings + sync_urls: list[str] = extract_urls_from_agencies_sync_response(self._sync_responses) + url_mappings: list[URLMapping] = await requester.get_url_mappings(urls=sync_urls) + new_links: list[LinkURLAgencyPydantic] = build_links_from_url_mappings_and_sync_responses( + url_mappings=url_mappings, + sync_responses=self._sync_responses, + ) + + sync_agency_ids: list[int] = extract_agency_ids_from_agencies_sync_response(self._sync_responses) + old_links: list[LinkURLAgencyPydantic] = await requester.get_current_agency_url_links( + agency_ids=sync_agency_ids, + ) + + new_set: set[LinkURLAgencyPydantic] = set(new_links) + old_set: set[LinkURLAgencyPydantic] = set(old_links) + + links_to_add: list[LinkURLAgencyPydantic] = list(new_set - old_set) + links_to_remove: list[LinkURLAgencyPydantic] = list(old_set - new_set) + + await requester.add_agency_url_links(links=links_to_add) + await requester.remove_agency_url_links(links=links_to_remove) + diff --git 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py new file mode 100644 index 00000000..123bd0ba --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/filter.py @@ -0,0 +1,12 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.models.mappings import AgencyURLMappings + + +def filter_non_relevant_mappings( + mappings: list[AgencyURLMappings], + relevant_agency_ids: list[int] +) -> list[AgencyURLMappings]: + relevant_mappings: list[AgencyURLMappings] = [] + for mapping in mappings: + if mapping.agency_id in relevant_agency_ids: + relevant_mappings.append(mapping) + return relevant_mappings \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py new file mode 100644 index 00000000..9336deaa --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/links.py @@ -0,0 +1,46 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupMetaURLLinksQueryBuilder(QueryBuilderBase): + 
+ def __init__(self, agency_ids: list[int]): + super().__init__() + self.agency_ids: list[int] = agency_ids + + async def run(self, session: AsyncSession) -> list[LinkURLAgencyPydantic]: + + query = ( + select( + LinkURLAgency.url_id, + LinkURLAgency.agency_id + ) + .join( + URL, + LinkURLAgency.url_id == URL.id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLValidatedType.META_URL, + LinkURLAgency.agency_id.in_(self.agency_ids), + ) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + links: list[LinkURLAgencyPydantic] = [ + LinkURLAgencyPydantic(**mapping) for mapping in mappings + ] + return links \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py new file mode 100644 index 00000000..8b526447 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/lookup_/url.py @@ -0,0 +1,31 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.db.helpers.session import session_helper as sh + +class LookupURLQueryBuilder(QueryBuilderBase): + + def __init__(self, urls: list[str]): + super().__init__() + self.urls: list[str] = urls + + async def run(self, session: AsyncSession) -> list[URLMapping]: + query = ( + select( + URL.id.label("url_id"), + URL.url, + ) + .where( + URL.url.in_(self.urls), + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + urls: list[URLMapping] = [ + URLMapping(**mapping) for mapping in mappings + ] + return urls \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py new file mode 100644 index 00000000..0f3c9d69 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/models/mappings.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AgencyURLMappings(BaseModel): + agency_id: int + url_ids: list[int] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py new file mode 100644 index 00000000..96887dfa --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/links/requester.py @@ -0,0 +1,21 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.links import LookupMetaURLLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.links.lookup_.url import LookupURLQueryBuilder +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.db.templates.requester import RequesterBase + + +class UpdateAgencyURLLinksRequester(RequesterBase): + + async def get_url_mappings(self, urls: list[str]) -> list[URLMapping]: + return await LookupURLQueryBuilder(urls=urls).run(session=self.session) + + async def get_current_agency_url_links(self, agency_ids: list[int]) -> list[LinkURLAgencyPydantic]: + return await LookupMetaURLLinksQueryBuilder(agency_ids=agency_ids).run(session=self.session) + + async def add_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: + await 
sh.bulk_insert(self.session, models=links) + + async def remove_agency_url_links(self, links: list[LinkURLAgencyPydantic]) -> None: + await sh.bulk_delete(self.session, models=links) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py new file mode 100644 index 00000000..73761251 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/add/core.py @@ -0,0 +1,57 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.queries.base.builder import QueryBuilderBase + +from src.db.helpers.session import session_helper as sh + +class AddMetaURLsQueryBuilder(QueryBuilderBase): + + """Add Meta URLs to DB with: + - Record type set to CONTACT_INFO_AND_AGENCY_META + - Validation Flag added as META_URL + - Source set to DATA_SOURCES + """ + def __init__(self, urls: list[str]): + super().__init__() + self.urls = urls + + async def run(self, session: AsyncSession) -> list[URLMapping]: + # Add URLs + url_inserts: list[URLInsertModel] = [] + for url in self.urls: + url_inserts.append( + URLInsertModel( + url=url, + 
record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + source=URLSource.DATA_SOURCES + ) + ) + url_ids: list[int] = await sh.bulk_insert(session, models=url_inserts, return_ids=True) + + # Connect with URLs + mappings: list[URLMapping] = [ + URLMapping( + url=url, + url_id=url_id, + ) + for url, url_id in zip(self.urls, url_ids) + ] + + # Add Validation Flags + flag_inserts: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag_inserts.append( + FlagURLValidatedPydantic( + url_id=url_id, + type=URLValidatedType.META_URL + ) + ) + await sh.bulk_insert(session, models=flag_inserts) + + return mappings diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py new file mode 100644 index 00000000..8d3e8785 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/convert.py @@ -0,0 +1,27 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.db.dtos.url.mapping import URLMapping + + +def convert_to_update_meta_urls_params( + lookups: list[MetaURLLookupResponse] +) -> list[UpdateMetaURLsParams]: + return [ + UpdateMetaURLsParams( + url_id=lookup.url_id, + validation_type=lookup.validation_type, + record_type=lookup.record_type, + ) + for lookup in lookups + ] + +def convert_url_lookups_to_url_mappings( + lookups: list[MetaURLLookupResponse] +) -> list[URLMapping]: + return [ + URLMapping( + url_id=lookup.url_id, + url=lookup.url, + ) + for lookup in lookups + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py new file mode 100644 index 00000000..6f5c3593 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/core.py @@ -0,0 +1,33 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_urls_in_sync +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.requester import UpdateMetaURLsRequester +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +class UpsertMetaUrlsQueryBuilder(QueryBuilderBase): + """Add and update meta URLs for agencies.""" + + def __init__(self, sync_responses: list[AgenciesSyncResponseInnerInfo]): + super().__init__() + self.sync_responses = sync_responses + + async def run(self, session: AsyncSession) -> None: + + requester = UpdateMetaURLsRequester(session) + sync_urls: list[str] = extract_urls_from_agencies_sync_response(self.sync_responses) + + + lookup_responses: list[MetaURLLookupResponse] = \ + await requester.lookup_meta_urls(sync_urls) + await requester.add_new_urls_to_database(lookup_responses) + + filtered_lookup_responses: list[MetaURLLookupResponse] = \ + filter_urls_in_sync(self.sync_responses, lookup_responses=lookup_responses) + await requester.update_existing_urls(filtered_lookup_responses) + + + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py new file mode 100644 index 00000000..227f0edc --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/filter.py @@ -0,0 +1,37 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.extract import extract_urls_from_agencies_sync_response 
+from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def filter_urls_to_add( + lookup_responses: list[MetaURLLookupResponse] +) -> list[str]: + return [ + lookup_response.url + for lookup_response in lookup_responses + if not lookup_response.exists_in_db + ] + +def filter_existing_url_mappings( + lookup_responses: list[MetaURLLookupResponse] +) -> list[MetaURLLookupResponse]: + """Filter only URL mappings that already exist in the database.""" + return [ + lookup_response + for lookup_response in lookup_responses + if lookup_response.exists_in_db + ] + +def filter_urls_in_sync( + sync_responses: list[AgenciesSyncResponseInnerInfo], + lookup_responses: list[MetaURLLookupResponse] +) -> list[MetaURLLookupResponse]: + """Filter only URLs that are in sync responses.""" + sync_urls: set[str] = set( + extract_urls_from_agencies_sync_response(sync_responses) + ) + filtered_lookup_responses: list[MetaURLLookupResponse] = [] + for lookup_response in lookup_responses: + if lookup_response.url in sync_urls: + filtered_lookup_responses.append(lookup_response) + return filtered_lookup_responses \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py new file mode 100644 index 00000000..8a817bd4 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/core.py @@ -0,0 +1,66 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class LookupMetaURLsQueryBuilder(QueryBuilderBase): + """Lookup whether URLs exist in DB and are validated as meta URLs""" + + def __init__(self, urls: list[str]): + super().__init__() + self.urls: list[str] = urls + + async def run(self, session: AsyncSession) -> list[MetaURLLookupResponse]: + url_id_label: str = "url_id" + + query = ( + select( + URL.id.label(url_id_label), + URL.url, + URL.record_type, + FlagURLValidated.type + ) + .select_from( + URL + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + URL.url.in_(self.urls) + ) + ) + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + urls_in_db = set() + extant_lookup_responses: list[MetaURLLookupResponse] = [] + for mapping in mappings: + url = mapping["url"] + urls_in_db.add(url) + response = MetaURLLookupResponse( + url=url, + url_id=mapping[url_id_label], + record_type=mapping["record_type"], + validation_type=mapping["type"], + ) + extant_lookup_responses.append(response) + + urls_not_in_db = set(self.urls) - set(urls_in_db) + non_extant_lookup_responses = [ + MetaURLLookupResponse( + url=url, + url_id=None, + record_type=None, + validation_type=None, + ) for url in urls_not_in_db + ] + + return extant_lookup_responses + non_extant_lookup_responses diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py new file mode 100644 index 00000000..d054f645 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/extract.py @@ 
-0,0 +1,10 @@ +from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo + + +def extract_agency_ids_from_agencies_sync_response( + responses: list[AgenciesSyncResponseInnerInfo] +) -> list[int]: + return [ + response.agency_id + for response in responses + ] diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py new file mode 100644 index 00000000..ff2d668d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/lookup/response.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +class MetaURLLookupResponse(BaseModel): + url: str + url_id: int | None + record_type: RecordType | None + validation_type: URLValidatedType | None + + @property + def exists_in_db(self) -> bool: + return self.url_id is not None + + @property + def is_meta_url(self) -> bool: + return self.record_type == RecordType.CONTACT_INFO_AND_AGENCY_META + + @property + def is_validated(self) -> bool: + return self.validation_type is not None diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py new file mode 100644 index 00000000..0a3e3c76 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/requester.py @@ -0,0 +1,48 @@ +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.add.core import AddMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.convert import \ + convert_to_update_meta_urls_params, convert_url_lookups_to_url_mappings +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.filter import filter_existing_url_mappings, \ + filter_urls_to_add +from 
src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.core import LookupMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.lookup.response import MetaURLLookupResponse +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.core import UpdateMetaURLsQueryBuilder +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.db.dtos.url.mapping import URLMapping +from src.db.templates.requester import RequesterBase + + +class UpdateMetaURLsRequester(RequesterBase): + + async def lookup_meta_urls( + self, + urls: list[str] + ) -> list[MetaURLLookupResponse]: + return await LookupMetaURLsQueryBuilder( + urls + ).run(self.session) + + async def add_new_urls_to_database( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[URLMapping]: + if len(lookup_responses) == 0: + return [] + urls_to_add: list[str] = filter_urls_to_add(lookup_responses) + if len(urls_to_add) == 0: + return [] + return await AddMetaURLsQueryBuilder(urls_to_add).run(self.session) + + async def update_existing_urls( + self, + lookup_responses: list[MetaURLLookupResponse] + ) -> list[URLMapping]: + existing_url_lookups: list[MetaURLLookupResponse] = ( + filter_existing_url_mappings(lookup_responses) + ) + params: list[UpdateMetaURLsParams] = \ + convert_to_update_meta_urls_params(existing_url_lookups) + await UpdateMetaURLsQueryBuilder(params).run(self.session) + existing_url_mappings: list[URLMapping] = \ + convert_url_lookups_to_url_mappings(existing_url_lookups) + return existing_url_mappings + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py new file mode 100644 index 00000000..1e479652 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/core.py @@ -0,0 +1,39 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.filter import \ + filter_urls_with_non_meta_record_type, filter_urls_with_non_meta_url_validation_flag, \ + filter_urls_without_validation_flag +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.requester import \ + UpdateMetaURLsUpdateURLAndValidationFlagsRequester +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateMetaURLsQueryBuilder(QueryBuilderBase): + """Update meta URLs in DB + + Meta URLs should be given a validation status as a Meta URL + and have their record type updated to CONTACT_INFO_AND_AGENCY_META + """ + + def __init__( + self, + params: list[UpdateMetaURLsParams] + ): + super().__init__() + self.params = params + + async def run( + self, + session: AsyncSession + ) -> None: + requester = UpdateMetaURLsUpdateURLAndValidationFlagsRequester(session) + + urls_with_non_meta_record_type: list[int] = filter_urls_with_non_meta_record_type(self.params) + await requester.update_urls(urls_with_non_meta_record_type) + + urls_without_validation_flag: list[int] = filter_urls_without_validation_flag(self.params) + await requester.add_validation_flags(urls_without_validation_flag) + + urls_with_non_meta_url_validation_flag: list[int] = filter_urls_with_non_meta_url_validation_flag(self.params) + await requester.update_validation_flags(urls_with_non_meta_url_validation_flag) diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py 
b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py new file mode 100644 index 00000000..b0c32a7e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/filter.py @@ -0,0 +1,37 @@ +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync.agency.queries.upsert.meta_urls.update.params import UpdateMetaURLsParams +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +def filter_urls_with_non_meta_record_type( + params: list[UpdateMetaURLsParams] +) -> list[int]: + url_ids: list[int] = [] + for param in params: + if param.record_type is None: + url_ids.append(param.url_id) + if param.record_type != RecordType.CONTACT_INFO_AND_AGENCY_META: + url_ids.append(param.url_id) + + return url_ids + +def filter_urls_without_validation_flag( + params: list[UpdateMetaURLsParams] +) -> list[int]: + url_ids: list[int] = [] + for param in params: + if param.validation_type is None: + url_ids.append(param.url_id) + return url_ids + +def filter_urls_with_non_meta_url_validation_flag( + params: list[UpdateMetaURLsParams] +) -> list[int]: + url_ids: list[int] = [] + for param in params: + if param.validation_type is None: + continue + if param.validation_type != URLValidatedType.META_URL: + url_ids.append(param.url_id) + + return url_ids \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py new file mode 100644 index 00000000..cb74a378 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/params.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType + + +class UpdateMetaURLsParams(BaseModel): + validation_type: URLValidatedType | None + url_id: int + record_type: 
RecordType | None + diff --git a/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py new file mode 100644 index 00000000..175b1bbf --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync/agency/queries/upsert/meta_urls/update/requester.py @@ -0,0 +1,53 @@ +from sqlalchemy import update + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.templates.requester import RequesterBase + +from src.db.helpers.session import session_helper as sh + +class UpdateMetaURLsUpdateURLAndValidationFlagsRequester(RequesterBase): + + async def update_validation_flags(self, url_ids: list[int]) -> None: + """Set validation flag for URLs to Meta URL""" + query = ( + update( + FlagURLValidated + ) + .where( + FlagURLValidated.url_id.in_(url_ids) + ) + .values( + type=URLValidatedType.META_URL + ) + ) + await self.session.execute(query) + + async def add_validation_flags(self, url_ids: list[int]) -> None: + inserts: list[FlagURLValidatedPydantic] = [] + for url_id in url_ids: + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=URLValidatedType.META_URL, + ) + inserts.append(flag) + + await sh.bulk_insert(self.session, models=inserts) + + async def update_urls(self, url_ids: list[int]) -> None: + """Update URLs and set record type to Contact Info and Agency Meta""" + query = ( + update( + URL + ) + .values( + record_type=RecordType.CONTACT_INFO_AND_AGENCY_META, + ) + .where( + URL.id.in_(url_ids) + ) + ) + await self.session.execute(query) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py 
b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py index 6222d1fd..93c1cbc9 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/core.py @@ -1,13 +1,88 @@ +from collections import defaultdict + +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.query import URLAgencyLinkUpdateQueryBuilder -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyForDataSourcesSyncParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.convert import convert_to_link_url_agency_models +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import UpdateLinkURLAgencyParams +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class URLAgencyLinkUpdateQueryBuilder(QueryBuilderBase): + """Given a set of URL-Agency links, remove all non-matching non-Meta URL links and add new ones.""" + + + def __init__(self, models: list[UpdateLinkURLAgencyParams]): + super().__init__() + self.models = models + self._new_links: dict[int, list[int]] = { + model.url_id: model.new_agency_ids + for model in self.models + } + self._existing_links: dict[int, list[int]] = defaultdict(list) + self.existing_url_ids: set[int] = { + model.url_id for model in self.models + } + + async def _get_existing_links(self, session: AsyncSession) -> None: + """Get existing non-meta URL agency links for provided URL 
IDs. + + Modifies: + self._existing_links + """ + query = ( + select(LinkURLAgency) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkURLAgency.url_id, + ) + .where( + LinkURLAgency.url_id.in_( + self.existing_url_ids + ), + FlagURLValidated.type != URLValidatedType.META_URL + ) + ) + links = await session.scalars(query) + for link in links: + self._existing_links[link.url_id].append(link.agency_id) + + async def _update_links(self, session: AsyncSession) -> None: + # Remove all existing links not in new links + links_to_delete: list[LinkURLAgencyPydantic] = [] + links_to_insert: list[LinkURLAgencyPydantic] = [] + + for url_id in self.existing_url_ids: + new_agency_ids = self._new_links.get(url_id, []) + existing_agency_ids = self._existing_links.get(url_id, []) + # IDs to delete are existing agency ids that are not new agency ids + ids_to_delete = set(existing_agency_ids) - set(new_agency_ids) + # IDs to insert are new agency ids that are not existing agency ids + ids_to_insert = set(new_agency_ids) - set(existing_agency_ids) + + links_to_delete.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_delete) + ) + ) + links_to_insert.extend( + convert_to_link_url_agency_models( + url_id=url_id, + agency_ids=list(ids_to_insert) + ) + ) + + await sh.bulk_delete(session=session, models=links_to_delete) + await sh.bulk_insert(session=session, models=links_to_insert) + + async def run(self, session: AsyncSession) -> None: + await self._get_existing_links(session=session) + await self._update_links(session=session) -async def update_agency_links( - session: AsyncSession, - params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] -) -> None: - """Overwrite existing url_agency links with new ones, if applicable.""" - query = URLAgencyLinkUpdateQueryBuilder(params) - await query.run(session) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/agency/params.py 
class UpdateLinkURLAgencyParams(BaseModel):
    """Per-URL payload for reconciling url_agency links during a data-sources sync."""
    # URL whose agency links are being reconciled.
    url_id: int
    # Desired end-state agency ids for this URL.
    new_agency_ids: list[int]
    # Agency ids reported as currently linked upstream.
    old_agency_ids: list[int]
def convert_url_sync_info_to_url_mappings(
    url_sync_info: URLDataSyncInfo
) -> URLMapping:
    """Project one URL's sync info down to a plain URL<->id mapping."""
    return URLMapping(
        url_id=url_sync_info.url_id,
        url=url_sync_info.url,
    )


def convert_approval_status_to_validated_type(
    approval_status: ApprovalStatus
) -> URLValidatedType:
    """Map a data-source approval status to its validated-flag type.

    Raises:
        ValueError: for statuses with no validated-flag equivalent.
    """
    if approval_status == ApprovalStatus.APPROVED:
        return URLValidatedType.DATA_SOURCE
    if approval_status == ApprovalStatus.REJECTED:
        return URLValidatedType.NOT_RELEVANT
    raise ValueError(f"Invalid approval status: {approval_status}")
src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.util.url_mapper import URLMapper @final @@ -50,24 +54,36 @@ async def run(self, session: AsyncSession) -> None: """ self._session = session - lookup_results = await self._lookup_urls() - lookups_existing_urls = filter_for_urls_with_ids(lookup_results) + lookup_results: list[LookupURLForDataSourcesSyncResponse] = await self._lookup_urls() + + # Update existing url and associated metadata + lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse] = filter_for_urls_with_ids(lookup_results) await self._update_existing_urls(lookups_existing_urls) await self._update_agency_link(lookups_existing_urls) - mappings_without_data_sources = get_mappings_for_urls_without_data_sources(lookup_results) - await self._add_new_data_sources(mappings_without_data_sources) + existing_url_mappings: list[URLMapping] = [ + convert_url_sync_info_to_url_mappings(lookup.url_info) + for lookup in lookups_existing_urls + ] - extant_urls = {lookup.url_info.url for lookup in lookups_existing_urls} - urls_to_add = list(self.urls - extant_urls) - if len(urls_to_add) == 0: - return - url_mappings = await self._add_new_urls(urls_to_add) - await self._add_new_data_sources(url_mappings) - await self._insert_agency_link(url_mappings) - - async def _lookup_urls(self): - lookup_results = await self.requester.lookup_urls(list(self.urls)) - return lookup_results + # Add new URLs and associated metadata + mappings_without_data_sources: list[URLMapping] = get_mappings_for_urls_without_data_sources(lookup_results) + await self._add_new_data_sources(mappings_without_data_sources) + extant_urls: set[str] = {lookup.url_info.url for lookup in lookups_existing_urls} + urls_to_add: list[str] = list(self.urls - extant_urls) + if len(urls_to_add) != 0: + new_url_mappings: list[URLMapping] = await self._add_new_urls(urls_to_add) + await self._add_new_data_sources(new_url_mappings) + await 
self._insert_agency_link(new_url_mappings) + else: + new_url_mappings: list[URLMapping] = [] + + # Upsert validated flags + all_url_mappings: list[URLMapping] = existing_url_mappings + new_url_mappings + mapper = URLMapper(all_url_mappings) + await self._upsert_validated_flags(mapper) + + async def _lookup_urls(self) -> list[LookupURLForDataSourcesSyncResponse]: + return await self.requester.lookup_urls(list(self.urls)) async def _insert_agency_link(self, url_mappings: list[URLMapping]): link_url_agency_insert_params = self.param_manager.insert_agency_link( @@ -81,16 +97,19 @@ async def _update_agency_link(self, lookups_existing_urls: list[LookupURLForData ) await self.requester.update_agency_links(link_url_agency_update_params) - async def _add_new_data_sources(self, url_mappings: list[URLMapping]): + async def _add_new_data_sources(self, url_mappings: list[URLMapping]) -> None: url_ds_insert_params = self.param_manager.add_new_data_sources(url_mappings) await self.requester.add_new_data_sources(url_ds_insert_params) - async def _add_new_urls(self, urls: list[str]): + async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: url_insert_params: list[InsertURLForDataSourcesSyncParams] = self.param_manager.add_new_urls(urls) url_mappings = await self.requester.add_new_urls(url_insert_params) return url_mappings - async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]): + async def _update_existing_urls(self, lookups_existing_urls: list[LookupURLForDataSourcesSyncResponse]) -> None: update_params = self.param_manager.update_existing_urls(lookups_existing_urls) await self.requester.update_existing_urls(update_params) + async def _upsert_validated_flags(self, url_mapper: URLMapper) -> None: + flags: list[FlagURLValidatedPydantic] = self.param_manager.upsert_validated_flags(url_mapper) + await self.requester.upsert_validated_flags(flags) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py index 3240e409..168f2511 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/helpers/convert.py @@ -23,13 +23,13 @@ def convert_to_source_collector_url_status( match ds_approval_status: case ApprovalStatus.APPROVED: - return URLStatus.SUBMITTED + return URLStatus.OK case ApprovalStatus.REJECTED: return URLStatus.NOT_RELEVANT case ApprovalStatus.NEEDS_IDENTIFICATION: - return URLStatus.PENDING + return URLStatus.OK case ApprovalStatus.PENDING: - return URLStatus.PENDING + return URLStatus.OK case _: raise NotImplementedError(f"Logic not implemented for this approval status: {ds_approval_status}") diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py index 7ca8ebad..e0a7225f 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/param_manager.py @@ -1,5 +1,7 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams + UpdateLinkURLAgencyParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.convert import \ + convert_approval_status_to_validated_type from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.helpers.convert import convert_to_url_update_params, \ convert_to_url_insert_params from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.mapper import URLSyncInfoMapper @@ -10,8 +12,14 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.update.params import \ UpdateURLForDataSourcesSyncParams from 
src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo +from src.external.pdap.enums import ApprovalStatus +from src.util.url_mapper import URLMapper class UpsertURLsFromDataSourcesParamManager: @@ -53,12 +61,12 @@ def add_new_urls( def update_agency_link( self, lookup_results: list[LookupURLForDataSourcesSyncResponse] - ) -> list[UpdateLinkURLAgencyForDataSourcesSyncParams]: + ) -> list[UpdateLinkURLAgencyParams]: results = [] for lookup_result in lookup_results: url_info = lookup_result.url_info sync_info = self._mapper.get(url_info.url) - update_params = UpdateLinkURLAgencyForDataSourcesSyncParams( + update_params = UpdateLinkURLAgencyParams( url_id=url_info.url_id, new_agency_ids=sync_info.agency_ids, old_agency_ids=url_info.agency_ids @@ -98,4 +106,21 @@ def add_new_data_sources( ) return results + def upsert_validated_flags( + self, + mapper: URLMapper + ) -> list[FlagURLValidatedPydantic]: + urls: list[str] = mapper.get_all_urls() + flags: list[FlagURLValidatedPydantic] = [] + for url in urls: + url_id: int = mapper.get_id(url) + sync_info: DataSourcesSyncResponseInnerInfo = self._mapper.get(url) + approval_status: ApprovalStatus = sync_info.approval_status + validated_type: URLValidatedType = convert_approval_status_to_validated_type(approval_status) + flag = FlagURLValidatedPydantic( + url_id=url_id, + type=validated_type + ) + flags.append(flag) + return flags \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py 
b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py index 08b5df22..eaae3a17 100644 --- a/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py +++ b/src/core/tasks/scheduled/impl/sync/data_sources/queries/upsert/requester.py @@ -1,8 +1,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.params import \ - UpdateLinkURLAgencyForDataSourcesSyncParams -from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.query import \ + UpdateLinkURLAgencyParams +from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.agency.core import \ URLAgencyLinkUpdateQueryBuilder from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.url.insert.params import \ InsertURLForDataSourcesSyncParams @@ -14,6 +14,7 @@ UpdateURLForDataSourcesSyncParams from src.db.dtos.url.mapping import URLMapping from src.db.helpers.session import session_helper as sh +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic @@ -71,8 +72,11 @@ async def add_new_agency_links( async def update_agency_links( self, - params: list[UpdateLinkURLAgencyForDataSourcesSyncParams] + params: list[UpdateLinkURLAgencyParams] ) -> None: """Overwrite existing url_agency links with new ones, if applicable.""" query = URLAgencyLinkUpdateQueryBuilder(params) - await query.run(self.session) \ No newline at end of file + await query.run(self.session) + + async def upsert_validated_flags(self, flags: list[FlagURLValidatedPydantic]) -> None: + await sh.bulk_upsert(self.session, models=flags) \ No newline at end of file diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 83c3b100..76c707ea 100644 --- a/src/core/tasks/scheduled/loader.py +++ 
b/src/core/tasks/scheduled/loader.py @@ -77,6 +77,14 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("SYNC_DATA_SOURCES_TASK_FLAG", default=True) ), + ScheduledTaskEntry( + operator=SyncAgenciesTaskOperator( + adb_client=self.async_core.adb_client, + pdap_client=self.pdap_client + ), + interval=IntervalEnum.DAILY, + enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) + ), ScheduledTaskEntry( operator=RunURLTasksTaskOperator(async_core=self.async_core), interval=IntervalEnum.HOURLY, @@ -88,14 +96,6 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval=IntervalEnum.DAILY, enabled=self.env.bool("POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG", default=True) ), - ScheduledTaskEntry( - operator=SyncAgenciesTaskOperator( - adb_client=self.async_core.adb_client, - pdap_client=self.pdap_client - ), - interval=IntervalEnum.DAILY, - enabled=self.env.bool("SYNC_AGENCIES_TASK_FLAG", default=True) - ), ScheduledTaskEntry( operator=PushToHuggingFaceTaskOperator( adb_client=self.async_core.adb_client, diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index e97e0f8e..87cb5a27 100644 --- a/src/core/tasks/scheduled/manager.py +++ b/src/core/tasks/scheduled/manager.py @@ -1,6 +1,3 @@ -from apscheduler.job import Job -from apscheduler.schedulers.asyncio import AsyncIOScheduler - from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.mixins.link_urls import LinkURLsMixin @@ -25,13 +22,13 @@ def __init__( self._loader = loader self._registry = registry - # Main objects - self.scheduler = AsyncIOScheduler() - async def setup(self): self._registry.start_scheduler() await self.add_scheduled_tasks() + await self._registry.report_next_scheduled_task() + + async def add_scheduled_tasks(self): """ @@ -39,15 +36,19 @@ async def add_scheduled_tasks(self): self._registry """ entries: list[ScheduledTaskEntry] = 
await self._loader.load_entries() - for idx, entry in enumerate(entries): + enabled_entries: list[ScheduledTaskEntry] = [] + for entry in entries: if not entry.enabled: print(f"{entry.operator.task_type.value} is disabled. Skipping add to scheduler.") continue + enabled_entries.append(entry) + initial_lag: int = 1 + for idx, entry in enumerate(enabled_entries): await self._registry.add_job( func=self.run_task, entry=entry, - minute_lag=idx + minute_lag=idx + initial_lag ) def shutdown(self): @@ -68,3 +69,4 @@ async def run_task(self, operator: ScheduledTaskOperatorBase): operator: ScheduledTaskOperatorBase raise Exception(f"Task {operator.task_type.value} has not been linked to any URLs but is designated as a link task") await self._handler.handle_outcome(run_info) + await self._registry.report_next_scheduled_task() diff --git a/src/core/tasks/scheduled/registry/core.py b/src/core/tasks/scheduled/registry/core.py index a1928504..a622346c 100644 --- a/src/core/tasks/scheduled/registry/core.py +++ b/src/core/tasks/scheduled/registry/core.py @@ -6,6 +6,7 @@ from apscheduler.triggers.interval import IntervalTrigger from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry +from src.core.tasks.scheduled.registry.format import format_job_datetime from src.db.enums import TaskType @@ -29,8 +30,9 @@ async def add_job( Modifies: self._jobs """ - self._jobs[entry.operator.task_type] = self.scheduler.add_job( - func, + job: Job = self.scheduler.add_job( + id=entry.operator.task_type.value, + func=func, trigger=IntervalTrigger( minutes=entry.interval.value, start_date=datetime.now() + timedelta(minutes=minute_lag) @@ -38,6 +40,10 @@ async def add_job( misfire_grace_time=60, kwargs={"operator": entry.operator} ) + run_time_str: str = format_job_datetime(job.next_run_time) + print(f"Adding {job.id} task to scheduler. 
" + + f"First run at {run_time_str}") + self._jobs[entry.operator.task_type] = job def start_scheduler(self) -> None: """ @@ -48,4 +54,16 @@ def start_scheduler(self) -> None: def shutdown_scheduler(self) -> None: if self.scheduler.running: - self.scheduler.shutdown() \ No newline at end of file + self.scheduler.shutdown() + + async def report_next_scheduled_task(self): + jobs: list[Job] = self.scheduler.get_jobs() + if len(jobs) == 0: + print("No scheduled tasks found.") + return + + jobs_sorted: list[Job] = sorted(jobs, key=lambda job: job.next_run_time) + next_job: Job = jobs_sorted[0] + + run_time_str: str = format_job_datetime(next_job.next_run_time) + print(f"Next scheduled task: {run_time_str} ({next_job.id})") \ No newline at end of file diff --git a/src/core/tasks/scheduled/registry/format.py b/src/core/tasks/scheduled/registry/format.py new file mode 100644 index 00000000..23eea364 --- /dev/null +++ b/src/core/tasks/scheduled/registry/format.py @@ -0,0 +1,7 @@ +from datetime import datetime + +def format_job_datetime(dt: datetime) -> str: + date_str: str = dt.strftime("%Y-%m-%d") + format_24: str = dt.strftime("%H:%M:%S") + format_12: str = dt.strftime("%I:%M:%S %p") + return f"{date_str} {format_24} ({format_12})" \ No newline at end of file diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index 45f750af..600ea1d2 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -7,6 +7,8 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader from 
src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator @@ -20,7 +22,6 @@ from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient -from src.external.internet_archives.client import InternetArchivesClient from src.external.pdap.client import PDAPClient from src.external.url_request.core import URLRequestInterface @@ -35,11 +36,13 @@ def __init__( pdap_client: PDAPClient, muckrock_api_interface: MuckrockAPIInterface, hf_inference_client: HuggingFaceInferenceClient, + nlp_processor: NLPProcessor ): # Dependencies self.adb_client = adb_client self.url_request_interface = url_request_interface self.html_parser = html_parser + self.nlp_processor = nlp_processor self.env = Env() # External clients and interfaces @@ -79,7 +82,9 @@ async def _get_agency_identification_task_operator(self) -> URLTaskEntry: adb_client=self.adb_client, loader=AgencyIdentificationSubtaskLoader( pdap_client=self.pdap_client, - muckrock_api_interface=self.muckrock_api_interface + muckrock_api_interface=self.muckrock_api_interface, + adb_client=self.adb_client, + nlp_processor=self.nlp_processor ) ) return URLTaskEntry( diff --git a/src/core/tasks/url/manager.py b/src/core/tasks/url/manager.py index 399da5b0..7fc6b4e3 100644 --- a/src/core/tasks/url/manager.py +++ b/src/core/tasks/url/manager.py @@ -56,8 +56,7 @@ async def _run_task(self, entry: URLTaskEntry) -> None: print(message) await self.handler.post_to_discord(message=message) break - task_id = await self.handler.initiate_task_in_db(task_type=operator.task_type) - run_info: TaskOperatorRunInfo = await operator.run_task(task_id) + run_info: TaskOperatorRunInfo = await operator.run_task() await self.conclude_task(run_info) if run_info.outcome == TaskOperatorOutcome.ERROR: break 
diff --git a/src/core/tasks/url/operators/agency_identification/core.py b/src/core/tasks/url/operators/agency_identification/core.py index 8ac1f632..92ece84e 100644 --- a/src/core/tasks/url/operators/agency_identification/core.py +++ b/src/core/tasks/url/operators/agency_identification/core.py @@ -1,17 +1,21 @@ -from src.collectors.enums import CollectorType -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.output import GetAgencySuggestionsOutput -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase +from src.core.tasks.mixins.link_urls import LinkURLsMixin +from src.core.tasks.url.operators.agency_identification.exceptions import SubtaskError +from src.core.tasks.url.operators.agency_identification.subtasks.flags.core import SubtaskFlagger from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader +from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.core import \ + AgencyIDSubtaskSurveyQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType -from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType -class AgencyIdentificationTaskOperator(URLTaskOperatorBase): +class AgencyIdentificationTaskOperator( + URLTaskOperatorBase, 
    async def meets_task_prerequisites(self) -> bool:
        """Return True when at least one allowed subtask has pending work.

        Surveys the database for the next runnable subtask among those
        enabled via environment flags.

        Modifies:
            - self._subtask
        """
        flagger = SubtaskFlagger()
        allowed_subtasks: list[AutoAgencyIDSubtaskType] = flagger.get_allowed_subtasks()

        next_subtask: AutoAgencyIDSubtaskType | None = \
            await self.adb_client.run_query_builder(
                AgencyIDSubtaskSurveyQueryBuilder(
                    allowed_subtasks=allowed_subtasks
                )
            )
        self._subtask = next_subtask
        if next_subtask is None:
            return False
        return True

    async def load_subtask(
        self,
        subtask_type: AutoAgencyIDSubtaskType
    ) -> AgencyIDSubtaskOperatorBase:
        """Build the operator for the given subtask type (tagged with this task's id)."""
        return await self.loader.load_subtask(subtask_type, task_id=self.task_id)

    @staticmethod
    async def run_subtask(
        subtask_operator: AgencyIDSubtaskOperatorBase,
    ) -> AgencyIDSubtaskRunInfo:
        """Execute one subtask and return its run outcome."""
        return await subtask_operator.run()

    async def inner_task_logic(self) -> None:
        """Run the subtask selected by meets_task_prerequisites and record its URLs.

        Raises:
            SubtaskError: when the subtask run reports a failure.
        """
        # self._subtask is set by meets_task_prerequisites(), which the base
        # operator runs first — presumably guaranteed non-None here; verify in base class.
        subtask_operator: AgencyIDSubtaskOperatorBase = await self.load_subtask(self._subtask)
        print(f"Running Subtask: {self._subtask.value}")
        run_info: AgencyIDSubtaskRunInfo = await self.run_subtask(subtask_operator)
        await self.link_urls_to_task(run_info.linked_url_ids)
        if not run_info.is_success:
            raise SubtaskError(run_info.error)
class SubtaskError(Exception):
    """Raised when an agency-identification subtask run reports a failure."""
    pass
-1,38 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus, CollectorType -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.link.batch_url import LinkBatchURL -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import StatementComposer - - -class GetPendingURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[AgencyIdentificationTDO]: - - statement = ( - select( - URL.id, - URL.collector_metadata, - Batch.strategy - ) - .select_from(URL) - .where(URL.status == URLStatus.PENDING.value) - .outerjoin(LinkBatchURL) - .outerjoin(Batch) - ) - statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - statement = statement.limit(100) - raw_results = await session.execute(statement) - return [ - AgencyIdentificationTDO( - url_id=raw_result[0], - collector_metadata=raw_result[1], - collector_type=CollectorType(raw_result[2]) if raw_result[2] is not None else None - ) - for raw_result in raw_results - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py b/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py deleted file mode 100644 index e8a0e8ce..00000000 --- a/src/core/tasks/url/operators/agency_identification/queries/has_urls_without_agency_suggestions.py +++ /dev/null @@ -1,27 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.statement_composer import 
StatementComposer - - -class HasURLsWithoutAgencySuggestionsQueryBuilder(QueryBuilderBase): - - async def run( - self, - session: AsyncSession - ) -> bool: - statement = ( - select( - URL.id - ).where( - URL.status == URLStatus.PENDING.value - ) - ) - - statement = StatementComposer.exclude_urls_with_agency_suggestions(statement) - raw_result = await session.execute(statement) - result = raw_result.all() - return len(result) != 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py new file mode 100644 index 00000000..95c9e704 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -0,0 +1,54 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap.enums import MatchAgencyResponseStatus + +def convert_match_agency_response_to_subtask_data( + url_id: int, + response: MatchAgencyResponse, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int +): + suggestions: list[AgencySuggestion] = \ + _convert_match_agency_response_to_suggestions( + response + ) + agencies_found: bool = len(suggestions) > 0 + subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( + url_id=url_id, + type=subtask_type, + agencies_found=agencies_found, + task_id=task_id + ) + return AutoAgencyIDSubtaskData( + pydantic_model=subtask_pydantic, + suggestions=suggestions + ) + +def 
_convert_match_agency_response_to_suggestions( + match_response: MatchAgencyResponse, +) -> list[AgencySuggestion]: + if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: + match_info: MatchAgencyInfo = match_response.matches[0] + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=100 + ) + ] + if match_response.status == MatchAgencyResponseStatus.NO_MATCH: + return [] + if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: + raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}") + total_confidence: int = 100 + confidence_per_match: int = total_confidence // len(match_response.matches) + return [ + AgencySuggestion( + agency_id=int(match_info.id), + confidence=confidence_per_match + ) + for match_info in match_response.matches + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py new file mode 100644 index 00000000..41997322 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/flags/core.py @@ -0,0 +1,26 @@ + +from environs import Env + +from src.core.tasks.url.operators.agency_identification.subtasks.flags.mappings import SUBTASK_TO_ENV_FLAG +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +class SubtaskFlagger: + """ + Manages flags allowing and disallowing subtasks + """ + def __init__(self): + self.env = Env() + + def _get_subtask_flag(self, subtask_type: AutoAgencyIDSubtaskType) -> bool: + return self.env.bool( + SUBTASK_TO_ENV_FLAG[subtask_type], + default=True + ) + + def get_allowed_subtasks(self) -> list[AutoAgencyIDSubtaskType]: + return [ + subtask_type 
+ for subtask_type, flag in SUBTASK_TO_ENV_FLAG.items() + if self._get_subtask_flag(subtask_type) + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py new file mode 100644 index 00000000..d6997423 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/flags/mappings.py @@ -0,0 +1,8 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +SUBTASK_TO_ENV_FLAG: dict[AutoAgencyIDSubtaskType, str] = { + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: "AGENCY_ID_HOMEPAGE_MATCH_FLAG", + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: "AGENCY_ID_NLP_LOCATION_MATCH_FLAG", + AutoAgencyIDSubtaskType.CKAN: "AGENCY_ID_CKAN_FLAG", + AutoAgencyIDSubtaskType.MUCKROCK: "AGENCY_ID_MUCKROCK_FLAG" +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py deleted file mode 100644 index 96f98f30..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/base.py +++ /dev/null @@ -1,16 +0,0 @@ -import abc -from abc import ABC -from typing import Optional - -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo - - -class AgencyIdentificationSubtaskBase(ABC): - - @abc.abstractmethod - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - raise NotImplementedError diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py deleted file mode 100644 index 15dddf6f..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import final - -from 
typing_extensions import override - -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class CKANAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - pdap_client: PDAPClient - ): - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - agency_name = collector_metadata["agency_name"] - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py new file mode 100644 index 00000000..d1af5391 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -0,0 +1,54 @@ +from typing import final + +from typing_extensions import override + +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ + GetCKANAgencyIDSubtaskParamsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ + AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class CKANAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[CKANAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + agency_name: str = param.collector_metadata["agency_name"] + response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_name + ) + subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=response, + subtask_type=AutoAgencyIDSubtaskType.CKAN, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + async def _get_params(self) -> list[CKANAgencyIDSubtaskParams]: + return await self.adb_client.run_query_builder( + GetCKANAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py new file 
mode 100644 index 00000000..ce4b7ce1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CKANAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py new file mode 100644 index 00000000..90e965e7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/query.py @@ -0,0 +1,46 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetCKANAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[CKANAgencyIDSubtaskParams]: + container = EligibleContainer() + query = ( + select( + container.url_id, + URL.collector_metadata + ) + .join( + URL, + URL.id == container.url_id, + ) + .where( + container.ckan, + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await sh.mappings(session, query=query) + return [ + CKANAgencyIDSubtaskParams( + url_id=mapping["id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py new file mode 100644 index 00000000..f4ba913e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/convert.py @@ -0,0 +1,47 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode, AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +def convert_params_to_subtask_entries( + params: list[GetHomepageMatchParams], + task_id: int +) -> list[URLAutoAgencyIDSubtaskPydantic]: + url_id_to_detail_code: dict[int, SubtaskDetailCode] = {} + for param in params: + url_id_to_detail_code[param.url_id] = param.detail_code + + results: list[URLAutoAgencyIDSubtaskPydantic] = [] + for url_id, detail_code in url_id_to_detail_code.items(): + result = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + agencies_found=True, + detail=detail_code, + ) + results.append(result) + return results + +def convert_subtask_mappings_and_params_to_suggestions( + mappings: list[SubtaskURLMapping], + params: list[GetHomepageMatchParams] +) -> list[AgencyIDSubtaskSuggestionPydantic]: + 
url_id_to_subtask_id: dict[int, int] = { + mapping.url_id: mapping.subtask_id + for mapping in mappings + } + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] + for param in params: + subtask_id = url_id_to_subtask_id.get(param.url_id) + suggestion = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=param.agency_id, + confidence=param.confidence, + ) + suggestions.append(suggestion) + return suggestions \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py new file mode 100644 index 00000000..f335cb3a --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/core.py @@ -0,0 +1,63 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.convert import \ + convert_params_to_subtask_entries, convert_subtask_mappings_and_params_to_suggestions +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.mapping import \ + SubtaskURLMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.get import \ + GetHomepageMatchSubtaskURLsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +class HomepageMatchSubtaskOperator( + AgencyIDSubtaskOperatorBase, +): + + async def inner_logic(self) -> None: + # Get Params + params: list[GetHomepageMatchParams] = \ + await self.adb_client.run_query_builder( + 
GetHomepageMatchSubtaskURLsQueryBuilder() + ) + + # Insert Subtask Entries + subtask_entries: list[URLAutoAgencyIDSubtaskPydantic] = convert_params_to_subtask_entries( + params=params, + task_id=self.task_id + ) + subtask_mappings: list[SubtaskURLMapping] = await self.insert_subtask_entries( + entries=subtask_entries + ) + + # Link URLs + url_ids: list[int] = [mapping.url_id for mapping in subtask_mappings] + self.linked_urls = url_ids + + # Insert Entries + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = convert_subtask_mappings_and_params_to_suggestions( + mappings=subtask_mappings, + params=params + ) + await self.adb_client.bulk_insert( + models=suggestions, + ) + + + async def insert_subtask_entries( + self, + entries: list[URLAutoAgencyIDSubtaskPydantic] + ) -> list[SubtaskURLMapping]: + subtask_ids: list[int] = await self.adb_client.bulk_insert( + models=entries, + return_ids=True + ) + mappings: list[SubtaskURLMapping] = [] + for subtask_id, entry in zip(subtask_ids, entries): + mapping = SubtaskURLMapping( + url_id=entry.url_id, + subtask_id=subtask_id, + ) + mappings.append(mapping) + return mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py new file mode 100644 index 00000000..6c65f9ad --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/entry.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + + +class GetHomepageMatchParams(BaseModel): + url_id: int + agency_id: int + 
confidence: int = Field(..., ge=0, le=100) + detail_code: SubtaskDetailCode \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py new file mode 100644 index 00000000..2e4d2fbb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/models/mapping.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class SubtaskURLMapping(BaseModel): + url_id: int + subtask_id: int \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py new file mode 100644 index 00000000..d90dfed6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/consolidated.py @@ -0,0 +1,28 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.count_agency_per_url import \ + COUNT_AGENCY_PER_URL_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root_agencies import \ + META_ROOT_URLS_WITH_AGENCIES +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.unvalidated_urls_with_root import \ + UNVALIDATED_URLS_WITH_ROOT + +CONSOLIDATED_CTE: CTE = ( + select( + UNVALIDATED_URLS_WITH_ROOT.c.url_id, + META_ROOT_URLS_WITH_AGENCIES.c.agency_id, + COUNT_AGENCY_PER_URL_CTE.c.agency_count, + ) + .join( + COUNT_AGENCY_PER_URL_CTE, + COUNT_AGENCY_PER_URL_CTE.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id + ) + .join( + META_ROOT_URLS_WITH_AGENCIES, + META_ROOT_URLS_WITH_AGENCIES.c.root_url_id == UNVALIDATED_URLS_WITH_ROOT.c.root_url_id + ) + .where( + COUNT_AGENCY_PER_URL_CTE.c.agency_count >= 1 + ) + .cte("consolidated") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py new file mode 100644 index 00000000..774787b7 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/count_agency_per_url.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, func, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +COUNT_AGENCY_PER_URL_CTE: CTE = ( + select( + META_ROOT_URLS_CTE.c.root_url_id, + func.count(LinkURLAgency.agency_id).label("agency_count") + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id + ) + .group_by( + META_ROOT_URLS_CTE.c.root_url_id + ) + .cte("count_agency_per_url") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py new file mode 100644 index 00000000..63b6b417 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root.py @@ -0,0 +1,23 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.views.meta_url import MetaURL + +META_ROOT_URLS_CTE: CTE = ( + select( + MetaURL.url_id.label("meta_url_id"), + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + MetaURL.url_id == LinkURLRootURL.url_id + ) + # Must be a Whitelisted Root URL + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) + .cte("meta_root_urls") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py new file mode 100644 index 00000000..86b14ee4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/meta_urls_with_root_agencies.py @@ -0,0 +1,20 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.meta_urls_with_root import \ + META_ROOT_URLS_CTE +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency + +META_ROOT_URLS_WITH_AGENCIES: CTE = ( + select( + META_ROOT_URLS_CTE.c.meta_url_id, + META_ROOT_URLS_CTE.c.root_url_id, + LinkURLAgency.agency_id + ) + .join( + LinkURLAgency, + META_ROOT_URLS_CTE.c.meta_url_id == LinkURLAgency.url_id 
+ ) + .cte( + "meta_root_urls_with_agencies" + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py new file mode 100644 index 00000000..edf9e601 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/multi_agency_case.py @@ -0,0 +1,17 @@ +from sqlalchemy import CTE, select, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +MULTI_AGENCY_CASE_QUERY = ( + select( + CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + (literal(100) / CONSOLIDATED_CTE.c.agency_count).label("confidence"), + literal(SubtaskDetailCode.HOMEPAGE_MULTI_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count > 1 + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py new file mode 100644 index 00000000..5778ecb6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/single_agency_case.py @@ -0,0 +1,17 @@ +from sqlalchemy import select, CTE, literal + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode + +SINGLE_AGENCY_CASE_QUERY = ( + select( + CONSOLIDATED_CTE.c.url_id, + CONSOLIDATED_CTE.c.agency_id, + literal(95).label("confidence"), + 
literal(SubtaskDetailCode.HOMEPAGE_SINGLE_AGENCY.value).label("detail_code") + ) + .where( + CONSOLIDATED_CTE.c.agency_count == 1 + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py new file mode 100644 index 00000000..46702833 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/unvalidated_urls_with_root.py @@ -0,0 +1,22 @@ +from sqlalchemy import CTE, select + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.whitelisted_root_urls import \ + WHITELISTED_ROOT_URLS_CTE +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.views.unvalidated_url import UnvalidatedURL + +UNVALIDATED_URLS_WITH_ROOT: CTE = ( + select( + UnvalidatedURL.url_id, + LinkURLRootURL.root_url_id + ) + .join( + LinkURLRootURL, + UnvalidatedURL.url_id == LinkURLRootURL.url_id + ) + .join( + WHITELISTED_ROOT_URLS_CTE, + WHITELISTED_ROOT_URLS_CTE.c.id == LinkURLRootURL.root_url_id + ) + .cte("unvalidated_urls_with_root") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py new file mode 100644 index 00000000..1af8f46c --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -0,0 +1,47 @@ +from sqlalchemy import CTE, select, func + +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from 
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.sqlalchemy import URL + +WHITELISTED_ROOT_URLS_CTE: CTE = ( + select( + URL.id + ) + .join( + FlagRootURL, + URL.id == FlagRootURL.url_id + ) + # Must be linked to other URLs + .join( + LinkURLRootURL, + URL.id == LinkURLRootURL.root_url_id + ) + # Those URLs must be meta URLS + .join( + FlagURLValidated, + FlagURLValidated.url_id == LinkURLRootURL.url_id + ) + # Get the Agency URLs for those URLs + .join( + LinkURLAgency, + LinkURLAgency.url_id == LinkURLRootURL.url_id + ) + .where( + # The connected URLs must be Meta URLs + FlagURLValidated.type == URLValidatedType.META_URL, + # Root URL can't be "https://catalog.data.gov" + URL.url != "https://catalog.data.gov" + ) + .group_by( + URL.id + ) + # Must have no more than two agencies connected + .having( + func.count(LinkURLAgency.agency_id) <= 2 + ) + .cte("whitelisted_root_urls") +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py new file mode 100644 index 00000000..10619531 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/get.py @@ -0,0 +1,35 @@ +from typing import Sequence + +from sqlalchemy import Select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.models.entry import \ + GetHomepageMatchParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.multi_agency_case import \ + MULTI_AGENCY_CASE_QUERY +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.single_agency_case import \ + SINGLE_AGENCY_CASE_QUERY +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode +from src.db.queries.base.builder import QueryBuilderBase + + +class GetHomepageMatchSubtaskURLsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[GetHomepageMatchParams]: + + query: Select = SINGLE_AGENCY_CASE_QUERY.union(MULTI_AGENCY_CASE_QUERY) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + + results: list[GetHomepageMatchParams] = [] + for mapping in mappings: + response = GetHomepageMatchParams( + url_id=mapping["url_id"], + agency_id=mapping["agency_id"], + confidence=mapping["confidence"], + detail_code=SubtaskDetailCode(mapping["detail_code"]), + ) + results.append(response) + + return results \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py deleted file mode 100644 index 633d84ac..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse -from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.exceptions import MuckrockAPIError -from src.core.helpers import process_match_agency_response_to_suggestions -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from 
src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse - -@final -class MuckrockAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - - def __init__( - self, - muckrock_api_interface: MuckrockAPIInterface, - pdap_client: PDAPClient - ): - self.muckrock_api_interface = muckrock_api_interface - self.pdap_client = pdap_client - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - muckrock_agency_id = collector_metadata["agency"] - agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( - muckrock_agency_id=muckrock_agency_id - ) - if agency_lookup_response.type != AgencyLookupResponseType.FOUND: - raise MuckrockAPIError( - f"Failed to lookup muckrock agency: {muckrock_agency_id}:" - f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" - ) - - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name - ) - return process_match_agency_response_to_suggestions( - url_id=url_id, - match_agency_response=match_agency_response - ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py new file mode 100644 index 00000000..4fa92c2e --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -0,0 +1,93 @@ +from typing import final + +from typing_extensions import override + +from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from 
src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse +from src.collectors.impl.muckrock.enums import AgencyLookupResponseType +from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ + convert_match_agency_response_to_subtask_data +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ + GetMuckrockAgencyIDSubtaskParamsQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse + + +@final +class MuckrockAgencyIDSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + muckrock_api_interface: MuckrockAPIInterface, + pdap_client: PDAPClient + ): + super().__init__(adb_client, task_id=task_id) + self.muckrock_api_interface = muckrock_api_interface + self.pdap_client = pdap_client + + @override + async def inner_logic(self) -> None: + params: list[MuckrockAgencyIDSubtaskParams] = await self._get_params() + self.linked_urls = [param.url_id for param in params] + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + for param in params: + muckrock_agency_id: int = param.collector_metadata["agency"] + agency_lookup_response: AgencyLookupResponse = await self.muckrock_api_interface.lookup_agency( + 
muckrock_agency_id=muckrock_agency_id + ) + if agency_lookup_response.type != AgencyLookupResponseType.FOUND: + data: AutoAgencyIDSubtaskData = await self._error_subtask_data( + url_id=param.url_id, + muckrock_agency_id=muckrock_agency_id, + agency_lookup_response=agency_lookup_response + ) + subtask_data_list.append(data) + continue + match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( + name=agency_lookup_response.name + ) + subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + url_id=param.url_id, + response=match_agency_response, + subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, + task_id=self.task_id + ) + subtask_data_list.append(subtask_data) + + await self._upload_subtask_data(subtask_data_list) + + + async def _error_subtask_data( + self, + url_id: int, + muckrock_agency_id: int, + agency_lookup_response: AgencyLookupResponse + ) -> AutoAgencyIDSubtaskData: + pydantic_model = URLAutoAgencyIDSubtaskPydantic( + task_id=self.task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.MUCKROCK, + agencies_found=False, + detail=SubtaskDetailCode.RETRIEVAL_ERROR + ) + error: str = f"Failed to lookup muckrock agency: {muckrock_agency_id}:" + \ + f" {agency_lookup_response.type.value}: {agency_lookup_response.error}" + return AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=[], + error=error + ) + + async def _get_params(self) -> list[MuckrockAgencyIDSubtaskParams]: + return await self.adb_client.run_query_builder( + GetMuckrockAgencyIDSubtaskParamsQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py new file mode 100644 index 00000000..6010f022 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/params.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class 
MuckrockAgencyIDSubtaskParams(BaseModel): + url_id: int + collector_metadata: dict \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py new file mode 100644 index 00000000..6f575b4f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/query.py @@ -0,0 +1,49 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ + MuckrockAgencyIDSubtaskParams +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetMuckrockAgencyIDSubtaskParamsQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[MuckrockAgencyIDSubtaskParams]: + container = EligibleContainer() + + query = ( + select( + container.url_id, + URL.collector_metadata + ) + .join( + URL, + URL.id == container.url_id, + ) + .where( + container.muckrock, + ) + .limit(500) + ) + + results: Sequence[RowMapping] = await sh.mappings(session, query=query) + return [ + MuckrockAgencyIDSubtaskParams( + url_id=mapping["url_id"], + collector_metadata=mapping["collector_metadata"], + ) + for mapping in results + ] + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py new file mode 100644 index 00000000..b8b4ce4d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/constants.py @@ -0,0 +1,4 @@ + + +ITERATIONS_PER_SUBTASK = 2 +NUMBER_OF_ENTRIES_PER_ITERATION = 20 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py new file mode 100644 index 00000000..0c172e5d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/core.py @@ -0,0 +1,57 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + ITERATIONS_PER_SUBTASK +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.core import \ + AgencyIDSubtaskInternalProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.query import \ + GetNLPLocationMatchSubtaskInputQueryBuilder +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import 
AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +class NLPLocationMatchSubtaskOperator(AgencyIDSubtaskOperatorBase): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int, + pdap_client: PDAPClient, + processor: NLPProcessor + ) -> None: + super().__init__(adb_client, task_id=task_id) + self.processor = AgencyIDSubtaskInternalProcessor( + nlp_processor=processor, + pdap_client=pdap_client, + task_id=task_id, + ) + + async def inner_logic(self) -> None: + for iteration in range(ITERATIONS_PER_SUBTASK): + inputs: list[NLPLocationMatchSubtaskInput] = await self._get_from_db() + if len(inputs) == 0: + break + await self.run_subtask_iteration(inputs) + + async def run_subtask_iteration(self, inputs: list[NLPLocationMatchSubtaskInput]) -> None: + self.linked_urls.extend([input_.url_id for input_ in inputs]) + subtask_data_list: list[AutoAgencyIDSubtaskData] = await self._process_inputs(inputs) + + await self._upload_subtask_data(subtask_data_list) + + async def _process_inputs( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[AutoAgencyIDSubtaskData]: + return await self.processor.process( + inputs=inputs, + ) + + async def _get_from_db(self) -> list[NLPLocationMatchSubtaskInput]: + return await self.adb_client.run_query_builder( + GetNLPLocationMatchSubtaskInputQueryBuilder(), + ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py new file mode 100644 index 00000000..398c1504 --- /dev/null +++ 
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/models/input.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class NLPLocationMatchSubtaskInput(BaseModel): + url_id: int + html: str \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py new file mode 100644 index 00000000..cc16da9f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/constants.py @@ -0,0 +1,3 @@ + + +MAX_NLP_CONFIDENCE: int = 90 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py new file mode 100644 index 00000000..103580da --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/convert.py @@ -0,0 +1,162 @@ +from math import ceil + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.constants import \ + MAX_NLP_CONFIDENCE +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.counter import \ + RequestCounter +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +def convert_nlp_response_to_search_agency_by_location_params( + nlp_response: NLPLocationMatchResponse, + counter: RequestCounter +) -> list[SearchAgencyByLocationParams]: + params: list[SearchAgencyByLocationParams] = [] + for location in nlp_response.locations: + if nlp_response.us_state is None: + raise ValueError("US State is None; cannot convert NLP response to search agency by location params") + request_id: int = counter.next() + param = SearchAgencyByLocationParams( + request_id=request_id, + query=location, + iso=nlp_response.us_state.iso, + ) + params.append(param) + + return params + + + +def convert_search_agency_responses_to_subtask_data_list( + mappings: list[URLToSearchResponseMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + + # First, extract agency suggestions for URL + for 
mapping in mappings: + url_id: int = mapping.url_id + search_responses: list[SearchAgencyByLocationResponse] = mapping.search_responses + suggestions: list[AgencySuggestion] = _convert_search_agency_response_to_agency_suggestions( + search_responses + ) + pydantic_model: URLAutoAgencyIDSubtaskPydantic = convert_search_agency_response_to_subtask_pydantic( + url_id=url_id, + task_id=task_id + ) + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=pydantic_model, + suggestions=suggestions + ) + subtask_data_list.append(subtask_data) + + return subtask_data_list + + +def _convert_search_agency_response_to_agency_suggestions( + responses: list[SearchAgencyByLocationResponse], +) -> list[AgencySuggestion]: + suggestions: list[AgencySuggestion] = [] + for response in responses: + for result in response.results: + agency_id: int = result.agency_id + similarity: float = result.similarity + confidence: int = min(ceil(similarity * 100), MAX_NLP_CONFIDENCE) + suggestion: AgencySuggestion = AgencySuggestion( + agency_id=agency_id, + confidence=confidence, + ) + suggestions.append(suggestion) + return suggestions + +def convert_url_ids_to_empty_subtask_data_list( + url_ids: list[int], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + results: list[AutoAgencyIDSubtaskData] = [] + for url_id in url_ids: + subtask_data = AutoAgencyIDSubtaskData( + pydantic_model=URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=False + ), + suggestions=[] + ) + results.append(subtask_data) + + return results + + + +def convert_empty_url_search_param_mappings_to_subtask_data_list( + mappings: list[URLToSearchParamsMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + url_ids: list[int] = [] + for mapping in mappings: + url_ids.append(mapping.url_id) + + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) + +def 
convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings: list[URLToNLPResponseMapping], + task_id: int +) -> list[AutoAgencyIDSubtaskData]: + url_ids: list[int] = [] + for mapping in mappings: + url_ids.append(mapping.url_id) + + return convert_url_ids_to_empty_subtask_data_list( + url_ids=url_ids, + task_id=task_id + ) + + +def convert_search_agency_response_to_subtask_pydantic( + url_id: int, + task_id: int +) -> URLAutoAgencyIDSubtaskPydantic: + + return URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + url_id=url_id, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + agencies_found=True + ) + + +def convert_urls_to_search_params( + url_to_nlp_mappings: list[URLToNLPResponseMapping] +) -> list[URLToSearchParamsMapping]: + url_to_search_params_mappings: list[URLToSearchParamsMapping] = [] + counter = RequestCounter() + for mapping in url_to_nlp_mappings: + search_params: list[SearchAgencyByLocationParams] = \ + convert_nlp_response_to_search_agency_by_location_params( + counter=counter, + nlp_response=mapping.nlp_response, + ) + mapping = URLToSearchParamsMapping( + url_id=mapping.url_id, + search_params=search_params, + ) + url_to_search_params_mappings.append(mapping) + return url_to_search_params_mappings diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py new file mode 100644 index 00000000..1e349318 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/core.py @@ -0,0 +1,143 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.convert import \ + 
convert_search_agency_responses_to_subtask_data_list, \ + convert_invalid_url_nlp_mappings_to_subtask_data_list, convert_urls_to_search_params +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.filter import \ + filter_valid_and_invalid_nlp_responses, filter_top_n_suggestions +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.mapper import \ + URLRequestIDMapper +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_response import \ + URLToSearchResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.preprocess import \ + preprocess_html +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.external.pdap.client import PDAPClient +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class AgencyIDSubtaskInternalProcessor: + + def 
__init__( + self, + nlp_processor: NLPProcessor, + pdap_client: PDAPClient, + task_id: int, + ): + self._nlp_processor = nlp_processor + self._pdap_client = pdap_client + self._task_id = task_id + + async def process( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[AutoAgencyIDSubtaskData]: + subtask_data_list: list[AutoAgencyIDSubtaskData] = [] + + url_to_nlp_mappings: list[URLToNLPResponseMapping] = \ + self._match_urls_to_nlp_responses(inputs) + + # Filter out valid and invalid NLP responses + nlp_response_subsets: NLPResponseSubsets = \ + filter_valid_and_invalid_nlp_responses(url_to_nlp_mappings) + + # For invalid responses, convert to subtask data with empty agencies + subtask_data_no_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_invalid_url_nlp_mappings_to_subtask_data_list( + mappings=nlp_response_subsets.invalid, + task_id=self._task_id, + ) + subtask_data_list.extend(subtask_data_no_agency_list) + + # For valid responses, convert to search param mappings + url_to_search_params_mappings: list[URLToSearchParamsMapping] = \ + convert_urls_to_search_params(nlp_response_subsets.valid) + + + response_mappings: list[URLToSearchResponseMapping] = \ + await self._get_pdap_info(url_to_search_params_mappings) + + subtask_data_list_agency_list: list[AutoAgencyIDSubtaskData] = \ + convert_search_agency_responses_to_subtask_data_list( + mappings=response_mappings, + task_id=self._task_id, + ) + + filter_top_n_suggestions(subtask_data_list_agency_list) + + subtask_data_list.extend(subtask_data_list_agency_list) + + return subtask_data_list + + def _match_urls_to_nlp_responses( + self, + inputs: list[NLPLocationMatchSubtaskInput] + ) -> list[URLToNLPResponseMapping]: + url_to_nlp_mappings: list[URLToNLPResponseMapping] = [] + for input_ in inputs: + nlp_response: NLPLocationMatchResponse = self._get_location_match(input_.html) + mapping = URLToNLPResponseMapping( + url_id=input_.url_id, + nlp_response=nlp_response, + ) + 
url_to_nlp_mappings.append(mapping) + return url_to_nlp_mappings + + def _get_location_match( + self, + html: str + ) -> NLPLocationMatchResponse: + preprocessed_html: str = preprocess_html(html) + return self._nlp_processor.parse_for_locations(preprocessed_html) + + async def _get_pdap_info( + self, + mappings: list[URLToSearchParamsMapping] + ) -> list[URLToSearchResponseMapping]: + if len(mappings) == 0: + return [] + params: list[SearchAgencyByLocationParams] = [] + # Map request IDs to URL IDs for later use + mapper = URLRequestIDMapper() + for mapping in mappings: + for search_param in mapping.search_params: + mapper.add_mapping( + request_id=search_param.request_id, + url_id=mapping.url_id, + ) + params.append(search_param) + + url_id_to_search_responses: dict[int, list[SearchAgencyByLocationResponse]] = defaultdict(list) + + responses: list[SearchAgencyByLocationResponse] = await self._pdap_client.search_agency_by_location(params) + # Map responses to URL IDs via request IDs + for response in responses: + request_id: int = response.request_id + url_id: int = mapper.get_url_id_by_request_id(request_id) + url_id_to_search_responses[url_id].append(response) + + # Reconcile URL IDs to search responses + response_mappings: list[URLToSearchResponseMapping] = [] + for url_id, responses in url_id_to_search_responses.items(): + mapping = URLToSearchResponseMapping( + url_id=url_id, + search_responses=responses, + ) + response_mappings.append(mapping) + + return response_mappings + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py new file mode 100644 index 00000000..12e9e048 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/counter.py @@ -0,0 +1,11 @@ + + + +class RequestCounter: + + def __init__(self): + self._counter: int = 
0 + + def next(self) -> int: + self._counter += 1 + return self._counter \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py new file mode 100644 index 00000000..053f4fb5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/extract.py @@ -0,0 +1,12 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_search_params import \ + URLToSearchParamsMapping +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +def _extract_all_search_params( + url_to_search_params_mappings: list[URLToSearchParamsMapping] +) -> list[SearchAgencyByLocationParams]: + all_search_params: list[SearchAgencyByLocationParams] = [] + for mapping in url_to_search_params_mappings: + all_search_params.extend(mapping.search_params) + return all_search_params diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py new file mode 100644 index 00000000..ff8b2de5 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/filter.py @@ -0,0 +1,59 @@ +from collections import defaultdict + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.subsets.nlp_responses import \ + NLPResponseSubsets +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion + + +def filter_valid_and_invalid_nlp_responses( + mappings: list[URLToNLPResponseMapping] +) -> NLPResponseSubsets: + valid: list[URLToNLPResponseMapping] = [] + invalid: list[URLToNLPResponseMapping] = [] + for mapping in mappings: + nlp_response: NLPLocationMatchResponse = mapping.nlp_response + if nlp_response.valid: + valid.append(mapping) + else: + invalid.append(mapping) + return NLPResponseSubsets( + valid=valid, + invalid=invalid, + ) + +def filter_top_n_suggestions( + subtask_data_list: list[AutoAgencyIDSubtaskData], + n: int = 5 +) -> None: + """Filters out all but the top N suggestions for each URL. + + Modifies: + - AutoAgencyIDSubtaskData.suggestions + """ + for subtask_data in subtask_data_list: + # Eliminate agency ID duplicates; + agency_to_suggestions: dict[int, list[AgencySuggestion]] = defaultdict(list) + for suggestion in subtask_data.suggestions: + agency_to_suggestions[suggestion.agency_id].append(suggestion) + + # in the case of a tie, keep the suggestion with the highest confidence + deduped_suggestions: list[AgencySuggestion] = [] + for agency_suggestions in agency_to_suggestions.values(): + agency_suggestions.sort( + key=lambda x: x.confidence, + reverse=True # Descending order + ) + deduped_suggestions.append(agency_suggestions[0]) + + # Sort suggestions by confidence and keep top N + suggestions_sorted: list[AgencySuggestion] = sorted( + deduped_suggestions, + key=lambda x: x.confidence, + reverse=True # Descending order + ) + subtask_data.suggestions = suggestions_sorted[:n] diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py new file mode 100644 index 00000000..8192dbb6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/mapper.py @@ -0,0 +1,10 @@ +class URLRequestIDMapper: + + def __init__(self): + self._request_id_to_url_id_mapper: dict[int, int] = {} + + def add_mapping(self, request_id: int, url_id: int) -> None: + self._request_id_to_url_id_mapper[request_id] = url_id + + def get_url_id_by_request_id(self, request_id: int) -> int: + return self._request_id_to_url_id_mapper[request_id] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py new file mode 100644 index 00000000..7bb7e701 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_nlp_response.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from 
src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse + + +class URLToNLPResponseMapping(BaseModel): + url_id: int + nlp_response: NLPLocationMatchResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py new file mode 100644 index 00000000..5ab9deac --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_params.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams + + +class URLToSearchParamsMapping(BaseModel): + url_id: int + search_params: list[SearchAgencyByLocationParams] + + @property + def is_empty(self) -> bool: + return len(self.search_params) == 0 \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py new file mode 100644 index 00000000..9a88b89d --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/mappings/url_id_search_response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse + + +class URLToSearchResponseMapping(BaseModel): + url_id: int + search_responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py new file mode 100644 index 00000000..22fdcf98 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/models/subsets/nlp_responses.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.models.mappings.url_id_nlp_response import \ + URLToNLPResponseMapping + + +class NLPResponseSubsets(BaseModel): + valid: list[URLToNLPResponseMapping] + invalid: list[URLToNLPResponseMapping] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py new file mode 100644 index 00000000..ef60e038 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/check.py @@ -0,0 +1,9 @@ +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.mappings import \ + US_STATE_ISO_TO_NAME, US_NAME_TO_STATE_ISO 
# Number of top candidate locations to retain from entity scanning.
TOP_N_LOCATIONS_COUNT: int = 5

# Characters that indicate a candidate location string is malformed
# (e.g. leftover markup or attribute fragments) and should be discarded.
INVALID_LOCATION_CHARACTERS: set[str] = {
    "=",
    "\\",
    "/",
    "'",
    # BUG FIX: this entry was written as "\"," -- a two-character string
    # consisting of a double quote plus the separating comma, which had
    # slipped inside the literal. The intended entry is the double quote.
    '"',
}

# State ISOs that commonly coincide with ordinary English words,
# which therefore cannot be used in simple text scanning.
INVALID_SCAN_ISOS: set[str] = {
    "IN",
    "OR",
}
None) + + if iso is None: + return None + + return USState( + name=name, + iso=iso + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py new file mode 100644 index 00000000..8e723aa6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/core.py @@ -0,0 +1,88 @@ +from collections import Counter + +import spacy +from spacy import Language +from spacy.tokens import Doc + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.check import \ + is_name_us_state, is_iso_us_state +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \ + INVALID_LOCATION_CHARACTERS, INVALID_SCAN_ISOS +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.convert import \ + convert_us_state_name_to_us_state, convert_us_state_iso_to_us_state +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.enums import \ + SpacyModelType +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.extract import \ + extract_most_common_us_state, extract_top_n_locations +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.response import \ + NLPLocationMatchResponse +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \ + USState + + +class NLPProcessor: + + def __init__( + self, + model_type: SpacyModelType = SpacyModelType.EN_CORE_WEB_SM + ): + self._model_type: SpacyModelType = model_type + self._model: Language | None = None + + def 
lazy_load_model(self) -> Language: + if self._model is None: + self._model = spacy.load(self._model_type.value, disable=['parser']) + return self._model + + + def parse_for_locations(self, html: str) -> NLPLocationMatchResponse: + model: Language = self.lazy_load_model() + doc: Doc = model(html) + us_state_counter: Counter[USState] = Counter() + location_counter: Counter[str] = Counter() + + # Scan over tokens + for token in doc: + upper_token: str = token.text.upper() + # Disregard certain ISOs that align with common words + if upper_token in INVALID_SCAN_ISOS: + continue + if not is_iso_us_state(upper_token): + continue + + us_state: USState | None = convert_us_state_iso_to_us_state(upper_token) + if us_state is not None: + us_state_counter[us_state] += 1 + + + # Scan over entities using spacy + for ent in doc.ents: + if ent.label_ != "GPE": # Geopolitical Entity + continue + text: str = ent.text + if any(char in text for char in INVALID_LOCATION_CHARACTERS): + continue + if is_name_us_state(text): + us_state: USState | None = convert_us_state_name_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + if is_iso_us_state(text): + us_state: USState | None = convert_us_state_iso_to_us_state(text) + if us_state is not None: + us_state_counter[us_state] += 1 + continue + location_counter[text] += 1 + + # Get most common US State if exists + most_common_us_state: USState | None = extract_most_common_us_state(us_state_counter) + + top_n_locations: list[str] = extract_top_n_locations(location_counter) + + return NLPLocationMatchResponse( + us_state=most_common_us_state, + locations=top_n_locations + ) + + + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/enums.py new file mode 100644 index 00000000..9d1b987b --- /dev/null +++ 
from collections import Counter

from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.constants import \
    TOP_N_LOCATIONS_COUNT
from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \
    USState


def extract_most_common_us_state(
    us_state_counter: Counter[USState]
) -> USState | None:
    """Return the single most frequent US state, or None if the counter is empty."""
    # Explicit emptiness check replaces the previous exception-driven
    # `most_common(1)[0][0]` + `except IndexError` pattern.
    most_common: list[tuple[USState, int]] = us_state_counter.most_common(1)
    return most_common[0][0] if most_common else None


def extract_top_n_locations(
    location_counter: Counter[str]
) -> list[str]:
    """Return up to TOP_N_LOCATIONS_COUNT location strings, most frequent first."""
    # Comprehension replaces the previous manual append loop.
    return [
        location
        for location, _ in location_counter.most_common(TOP_N_LOCATIONS_COUNT)
    ]
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/mappings.py @@ -0,0 +1,59 @@ + + +US_STATE_ISO_TO_NAME: dict[str, str] = { + 'AL': 'Alabama', + 'AK': 'Alaska', + 'AZ': 'Arizona', + 'AR': 'Arkansas', + 'CA': 'California', + 'CO': 'Colorado', + 'CT': 'Connecticut', + 'DE': 'Delaware', + 'FL': 'Florida', + 'GA': 'Georgia', + 'HI': 'Hawaii', + 'ID': 'Idaho', + 'IL': 'Illinois', + 'IN': 'Indiana', + 'IA': 'Iowa', + 'KS': 'Kansas', + 'KY': 'Kentucky', + 'LA': 'Louisiana', + 'ME': 'Maine', + 'MD': 'Maryland', + 'MA': 'Massachusetts', + 'MI': 'Michigan', + 'MN': 'Minnesota', + 'MS': 'Mississippi', + 'MO': 'Missouri', + 'MT': 'Montana', + 'NE': 'Nebraska', + 'NV': 'Nevada', + 'NH': 'New Hampshire', + 'NJ': 'New Jersey', + 'NM': 'New Mexico', + 'NY': 'New York', + 'NC': 'North Carolina', + 'ND': 'North Dakota', + 'OH': 'Ohio', + 'OK': 'Oklahoma', + 'OR': 'Oregon', + 'PA': 'Pennsylvania', + 'RI': 'Rhode Island', + 'SC': 'South Carolina', + 'SD': 'South Dakota', + 'TN': 'Tennessee', + 'TX': 'Texas', + 'UT': 'Utah', + 'VT': 'Vermont', + 'VA': 'Virginia', + 'WA': 'Washington', + 'WV': 'West Virginia', + 'WI': 'Wisconsin', + 'WY': 'Wyoming', + 'DC': 'District of Columbia', +} + +US_NAME_TO_STATE_ISO: dict[str, str] = { + name: iso for iso, name in US_STATE_ISO_TO_NAME.items() +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/models/params.py new file mode 100644 index 00000000..79378612 --- /dev/null +++ 
from pydantic import BaseModel

from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.models.us_state import \
    USState


class NLPLocationMatchResponse(BaseModel):
    """Result of NLP location scanning: candidate locations plus the dominant US state."""

    locations: list[str]
    us_state: USState | None

    @property
    def valid(self) -> bool:
        """True only when both a US state and at least one location were found."""
        return self.us_state is not None and len(self.locations) > 0
b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py new file mode 100644 index 00000000..da20f4f4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/processor/nlp/preprocess.py @@ -0,0 +1,20 @@ +import re + +import unicodedata +from bs4 import BeautifulSoup + + +def preprocess_html(raw_html: str) -> str: + """Preprocess HTML to extract text content.""" + soup = BeautifulSoup(raw_html, 'lxml') + + # Remove scripts, styles, and other non-textual elements + for tag in soup(['script','style','noscript','iframe','canvas','svg','header','footer','nav','aside']): + tag.decompose() + # Extract text + text = soup.get_text(separator=' ') + # Normalize text and collapse whitespace + text = unicodedata.normalize('NFKC', text) + text = re.sub(r'[ \t\u00A0]+', ' ', text) + text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) + return text.strip() \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py new file mode 100644 index 00000000..32311bd1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/nlp_location_match_/query.py @@ -0,0 +1,49 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.constants import \ + NUMBER_OF_ENTRIES_PER_ITERATION +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.models.input import \ + NLPLocationMatchSubtaskInput +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.helpers.session import session_helper as sh +from 
src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.queries.base.builder import QueryBuilderBase +from src.db.utils.compression import decompress_html + + +class GetNLPLocationMatchSubtaskInputQueryBuilder(QueryBuilderBase): + + async def run( + self, + session: AsyncSession + ) -> list[NLPLocationMatchSubtaskInput]: + container = EligibleContainer() + query = ( + select( + container.url_id, + URLCompressedHTML.compressed_html + ) + .join( + URLCompressedHTML, + URLCompressedHTML.url_id == container.url_id, + ) + .where( + container.nlp_location, + ) + .limit(NUMBER_OF_ENTRIES_PER_ITERATION) + ) + + mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) + inputs: list[NLPLocationMatchSubtaskInput] = [ + NLPLocationMatchSubtaskInput( + url_id=mapping["id"], + html=decompress_html(mapping["compressed_html"]), + ) + for mapping in mappings + ] + return inputs + diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py deleted file mode 100644 index 7ffd57bc..00000000 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/unknown.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing_extensions import override, final - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase - -@final -class UnknownAgencyIdentificationSubtask(AgencyIdentificationSubtaskBase): - """A subtask that returns an unknown suggestion. - - Used in cases where the agency cannot be reliably inferred from the source. 
- """ - - @override - async def run( - self, - url_id: int, - collector_metadata: dict | None = None - ) -> list[URLAgencySuggestionInfo]: - return [ - URLAgencySuggestionInfo( - url_id=url_id, - suggestion_type=SuggestionType.UNKNOWN, - pdap_agency_id=None, - agency_name=None, - state=None, - county=None, - locality=None - ) - ] diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py index 6ef84149..5dab9608 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/loader.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/loader.py @@ -1,10 +1,16 @@ -from src.collectors.enums import CollectorType from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface -from src.core.tasks.url.operators.agency_identification.subtasks.impl.base import AgencyIdentificationSubtaskBase -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan import CKANAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock import \ - MuckrockAgencyIdentificationSubtask -from src.core.tasks.url.operators.agency_identification.subtasks.impl.unknown import UnknownAgencyIdentificationSubtask +from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.core import \ + HomepageMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import \ + MuckrockAgencyIDSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.core import \ + NLPLocationMatchSubtaskOperator +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor +from 
src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient @@ -14,35 +20,58 @@ class AgencyIdentificationSubtaskLoader: def __init__( self, pdap_client: PDAPClient, - muckrock_api_interface: MuckrockAPIInterface + muckrock_api_interface: MuckrockAPIInterface, + adb_client: AsyncDatabaseClient, + nlp_processor: NLPProcessor ): - self.pdap_client = pdap_client - self.muckrock_api_interface = muckrock_api_interface + self._pdap_client = pdap_client + self._muckrock_api_interface = muckrock_api_interface + self._nlp_processor = nlp_processor + self.adb_client = adb_client - async def _load_muckrock_subtask(self) -> MuckrockAgencyIdentificationSubtask: - return MuckrockAgencyIdentificationSubtask( - muckrock_api_interface=self.muckrock_api_interface, - pdap_client=self.pdap_client + def _load_muckrock_subtask(self, task_id: int) -> MuckrockAgencyIDSubtaskOperator: + return MuckrockAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + muckrock_api_interface=self._muckrock_api_interface, + pdap_client=self._pdap_client ) - async def _load_ckan_subtask(self) -> CKANAgencyIdentificationSubtask: - return CKANAgencyIdentificationSubtask( - pdap_client=self.pdap_client + def _load_ckan_subtask(self, task_id: int) -> CKANAgencyIDSubtaskOperator: + return CKANAgencyIDSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + pdap_client=self._pdap_client ) - async def load_subtask(self, collector_type: CollectorType) -> AgencyIdentificationSubtaskBase: + def _load_homepage_match_subtask(self, task_id: int) -> HomepageMatchSubtaskOperator: + return HomepageMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + ) + + def _load_nlp_location_match_subtask(self, task_id: int) -> 
NLPLocationMatchSubtaskOperator: + return NLPLocationMatchSubtaskOperator( + task_id=task_id, + adb_client=self.adb_client, + pdap_client=self._pdap_client, + processor=self._nlp_processor + ) + + + async def load_subtask( + self, + subtask_type: AutoAgencyIDSubtaskType, + task_id: int + ) -> AgencyIDSubtaskOperatorBase: """Get subtask based on collector type.""" - match collector_type: - case CollectorType.MUCKROCK_SIMPLE_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_COUNTY_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.MUCKROCK_ALL_SEARCH: - return await self._load_muckrock_subtask() - case CollectorType.AUTO_GOOGLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.COMMON_CRAWLER: - return UnknownAgencyIdentificationSubtask() - case CollectorType.CKAN: - return await self._load_ckan_subtask() - return UnknownAgencyIdentificationSubtask() \ No newline at end of file + match subtask_type: + case AutoAgencyIDSubtaskType.MUCKROCK: + return self._load_muckrock_subtask(task_id) + case AutoAgencyIDSubtaskType.CKAN: + return self._load_ckan_subtask(task_id) + case AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH: + return self._load_nlp_location_match_subtask(task_id) + case AutoAgencyIDSubtaskType.HOMEPAGE_MATCH: + return self._load_homepage_match_subtask(task_id) + raise ValueError(f"Unknown subtask type: {subtask_type}") diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py new file mode 100644 index 00000000..524830e3 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/models/run_info.py @@ -0,0 +1,14 @@ +from pydantic import 
from pydantic import BaseModel


class AgencyIDSubtaskRunInfo(BaseModel):
    """Outcome of a single agency-ID subtask run."""

    # Populated with an error description when the run failed.
    error: str | None = None
    # URL ids linked during the run; None when the run produced no link data.
    linked_url_ids: list[int] | None = None

    @property
    def is_success(self) -> bool:
        """True when the run completed without error."""
        return self.error is None

    @property
    def has_linked_urls(self) -> bool:
        """True when at least one URL id was linked.

        BUG FIX: previously called len() directly on the field, which
        raised TypeError whenever linked_url_ids was left at its None default.
        """
        if self.linked_url_ids is None:
            return False
        return len(self.linked_url_ids) > 0
a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py new file mode 100644 index 00000000..749332e6 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/constants.py @@ -0,0 +1,14 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +# Determines priority of subtasks, all else being equal. +SUBTASK_HIERARCHY: list[AutoAgencyIDSubtaskType] = [ + AutoAgencyIDSubtaskType.CKAN, + AutoAgencyIDSubtaskType.MUCKROCK, + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH +] + +SUBTASK_HIERARCHY_MAPPING: dict[AutoAgencyIDSubtaskType, int] = { + subtask: idx + for idx, subtask in enumerate(SUBTASK_HIERARCHY) +} \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py new file mode 100644 index 00000000..2b81d2de --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/core.py @@ -0,0 +1,77 @@ +from collections import Counter + +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.constants import SUBTASK_HIERARCHY, \ + 
class AgencyIDSubtaskSurveyQueryBuilder(QueryBuilderBase):
    """
    Survey applicable URLs to determine next subtask to run

    URLs are "inapplicable" if they have any of the following properties:
    - Are validated via FlagURLValidated model
    - Have at least one annotation with agency suggestion with confidence >= 95
    - Have all possible subtasks completed

    Returns the subtask with the most applicable URLs
    (or None if no subtasks have applicable URLs)
    """

    def __init__(
        self,
        allowed_subtasks: list[AutoAgencyIDSubtaskType]
    ):
        super().__init__()
        self._allowed_subtasks = allowed_subtasks

    async def run(self, session: AsyncSession) -> AutoAgencyIDSubtaskType | None:
        results: RowMapping = await sh.mapping(session, ELIGIBLE_COUNTS_QUERY)
        counts: Counter[str] = Counter(results)

        allowed_counts: Counter[str] = await self._filter_allowed_counts(counts)
        if len(allowed_counts) == 0:
            return None
        max_count: int = max(allowed_counts.values())
        if max_count == 0:
            return None
        subtasks_with_max_count: list[str] = [
            subtask for subtask, count in allowed_counts.items()
            if count == max_count
        ]
        subtasks_as_enum_list: list[AutoAgencyIDSubtaskType] = [
            AutoAgencyIDSubtaskType(subtask)
            for subtask in subtasks_with_max_count
        ]
        # BUG FIX: previously sorted by hierarchy index with reverse=True and
        # took element [0], which returned the LARGEST index -- i.e. the
        # LOWEST-priority subtask, contradicting both the comment here and
        # SUBTASK_HIERARCHY (which lists highest priority first, so a smaller
        # index means higher priority). Return the highest-priority subtask.
        return min(
            subtasks_as_enum_list,
            key=lambda subtask: SUBTASK_HIERARCHY_MAPPING[subtask],
        )

    async def _filter_allowed_counts(self, counts: Counter[str]) -> Counter[str]:
        # Drop counts for subtask types the caller did not enable.
        return Counter(
            {
                subtask: count
                for subtask, count in counts.items()
                if AutoAgencyIDSubtaskType(subtask) in self._allowed_subtasks
            }
        )
src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.homepage import \ + HOMEPAGE_SUBTASK_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.muckrock import \ + MUCKROCK_SUBTASK_CONTAINER +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.impl.nlp_location import \ + NLP_LOCATION_CONTAINER +from src.db.models.impl.url.core.sqlalchemy import URL + +class EligibleContainer: + + def __init__(self): + self._cte = ( + select( + URL.id, + CKAN_SUBTASK_CONTAINER.eligible_query.label("ckan"), + MUCKROCK_SUBTASK_CONTAINER.eligible_query.label("muckrock"), + HOMEPAGE_SUBTASK_CONTAINER.eligible_query.label("homepage"), + NLP_LOCATION_CONTAINER.eligible_query.label("nlp_location"), + ) + .where( + HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER.not_exists_query, + VALIDATED_EXISTS_CONTAINER.not_exists_query, + ) + .cte("eligible") + ) + + @property + def cte(self) -> CTE: + return self._cte + + @property + def url_id(self) -> Column[int]: + return self._cte.c['id'] + + @property + def ckan(self) -> Column[bool]: + return self._cte.c['ckan'] + + @property + def muckrock(self) -> Column[bool]: + return self._cte.c['muckrock'] + + @property + def homepage(self) -> Column[bool]: + return self._cte.c['homepage'] + + @property + def nlp_location(self) -> Column[bool]: + return self._cte.c['nlp_location'] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/container.py 
from sqlalchemy import CTE, Column, ColumnElement, exists

from src.db.models.impl.url.core.sqlalchemy import URL


class ExistsCTEContainer:
    """
    Base class for CTEs that determine validity for each subtask.

    Wraps a single-column CTE of URL ids; callers use `not_exists_query`
    to exclude URLs that appear in the wrapped CTE.
    """

    def __init__(
        self,
        cte: CTE,
    ) -> None:
        self._cte = cte

    @property
    def cte(self) -> CTE:
        """The wrapped single-column CTE."""
        return self._cte

    @property
    def url_id(self) -> Column[int]:
        """The CTE's sole column, holding URL ids."""
        return self._cte.columns[0]

    @property
    def not_exists_query(self) -> ColumnElement[bool]:
        """Boolean predicate: no row in the CTE matches the outer URL.id."""
        predicate = exists().where(self.url_id == URL.id)
        return ~predicate
+from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion + +cte = ( + select( + URL.id + ) + .join( + URLAutoAgencyIDSubtask, + URLAutoAgencyIDSubtask.url_id == URL.id, + ) + .join( + AgencyIDSubtaskSuggestion, + AgencyIDSubtaskSuggestion.subtask_id == URLAutoAgencyIDSubtask.id, + ) + .where( + AgencyIDSubtaskSuggestion.confidence >= 95, + ) + .cte("high_confidence_annotations_exists") +) + +HIGH_CONFIDENCE_ANNOTATIONS_EXISTS_CONTAINER = ExistsCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py new file mode 100644 index 00000000..f515c1d1 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/exists/impl/validated.py @@ -0,0 +1,16 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.exists.container import \ + ExistsCTEContainer +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated + +cte = ( + select( + FlagURLValidated.url_id + ) + .cte("validated_exists") +) + +VALIDATED_EXISTS_CONTAINER = ExistsCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py new file mode 100644 index 00000000..9782e4fd --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/container.py @@ -0,0 +1,40 @@ +from sqlalchemy import CTE, ColumnElement, Column, Select, exists, func + +from src.db.models.impl.url.core.sqlalchemy import URL + + +class SubtaskCTEContainer: + """ + CTE for URLs eligible for a given subtask. + A successful left join on this indicates the URL is eligible for the subtask. + A true value for `subtask_entry_exists` indicates + a subtask entry for the URL already exists + """ + + def __init__( + self, + cte: CTE, + ) -> None: + self._cte=cte + + @property + def cte(self) -> CTE: + return self._cte + + @property + def entry_exists(self) -> ColumnElement[bool]: + return self.cte.c['subtask_entry_exists'] + + @property + def url_id(self) -> Column[int]: + return self.cte.c['id'] + + @property + def eligible_query(self) -> ColumnElement[int]: + return ( + exists() + .where( + self.url_id == URL.id, + self.entry_exists.is_(False), + ) + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py new file mode 100644 index 00000000..b06442ea --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/helpers.py @@ -0,0 +1,18 @@ +from sqlalchemy import ColumnElement, exists + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask + + +def get_exists_subtask_query( + subtask_type: AutoAgencyIDSubtaskType, +) -> 
ColumnElement[bool]: + return ( + exists() + .where( + URLAutoAgencyIDSubtask.url_id == URL.id, + URLAutoAgencyIDSubtask.type == subtask_type, + ) + .label("subtask_entry_exists") + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py new file mode 100644 index 00000000..b1b70cdb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/ckan.py @@ -0,0 +1,37 @@ +from sqlalchemy import select + +from src.collectors.enums import CollectorType +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.CKAN, + ), + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy == CollectorType.CKAN.value, + + ) + .cte("ckan_eligible") +) + +CKAN_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git 
a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py new file mode 100644 index 00000000..4d75b4e0 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/homepage.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, exists + +from src.core.tasks.url.operators.agency_identification.subtasks.impl.homepage_match_.queries.ctes.consolidated import \ + CONSOLIDATED_CTE +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +VALID_URL_FLAG = ( + exists() + .where( + URL.id == CONSOLIDATED_CTE.c.url_id, + ) +) + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + ) + ) + .where( + VALID_URL_FLAG, + ) + .cte("homepage_eligible") +) + +HOMEPAGE_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py new file mode 100644 index 00000000..1f059e86 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/muckrock.py @@ -0,0 +1,40 @@ +from sqlalchemy import select + +from src.collectors.enums import CollectorType +from 
src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.MUCKROCK, + ) + ) + .join( + LinkBatchURL, + LinkBatchURL.url_id == URL.id, + ) + .join( + Batch, + Batch.id == LinkBatchURL.batch_id, + ) + .where( + Batch.strategy.in_( + (CollectorType.MUCKROCK_ALL_SEARCH.value, + CollectorType.MUCKROCK_COUNTY_SEARCH.value, + CollectorType.MUCKROCK_SIMPLE_SEARCH.value,) + ), + ) + .cte("muckrock_eligible") +) + +MUCKROCK_SUBTASK_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py new file mode 100644 index 00000000..40533809 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -0,0 +1,26 @@ +from sqlalchemy import select + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.helpers import \ + get_exists_subtask_query +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.subtask.container import \ + SubtaskCTEContainer +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from 
src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + +cte = ( + select( + URL.id, + get_exists_subtask_query( + AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH + ) + ) + .join( + URLCompressedHTML + ) + .cte("nlp_location_eligible") +) + +NLP_LOCATION_CONTAINER = SubtaskCTEContainer( + cte, +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py new file mode 100644 index 00000000..96a322cb --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/eligible_counts.py @@ -0,0 +1,25 @@ +from sqlalchemy import select, ColumnElement, Integer, func + +from src.core.tasks.url.operators.agency_identification.subtasks.queries.survey.queries.ctes.eligible import \ + EligibleContainer +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType + + +def sum_count(col: ColumnElement[bool], subtask_type: AutoAgencyIDSubtaskType) -> ColumnElement[int]: + return func.coalesce( + func.sum( + col.cast(Integer) + ), + 0, + ).label(subtask_type.value) + +container = EligibleContainer() + +ELIGIBLE_COUNTS_QUERY = ( + select( + sum_count(container.ckan, AutoAgencyIDSubtaskType.CKAN), + sum_count(container.muckrock, AutoAgencyIDSubtaskType.MUCKROCK), + sum_count(container.homepage, AutoAgencyIDSubtaskType.HOMEPAGE_MATCH), + sum_count(container.nlp_location, AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH), + ) +) \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py 
b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py new file mode 100644 index 00000000..02ae76a4 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/output.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class AgencyIDSubtaskOutputBase(BaseModel): + pass \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py new file mode 100644 index 00000000..b366747f --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/postprocessor.py @@ -0,0 +1,26 @@ +from abc import ABC, abstractmethod + +from src.core.tasks.url.operators.agency_identification.subtasks.templates.output import AgencyIDSubtaskOutputBase +from src.db.client.async_ import AsyncDatabaseClient + + +class SubtaskPostprocessorBase(ABC): + """ + An optional class which takes + the output of the subtask along with the subtask id + and adds additional information to the database. 
+ """ + + def __init__( + self, + subtask_id: int, + subtask_output: AgencyIDSubtaskOutputBase, + adb_client: AsyncDatabaseClient + ): + self.subtask_id = subtask_id + self.subtask_output = subtask_output + self.adb_client = adb_client + + @abstractmethod + async def run(self) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py new file mode 100644 index 00000000..4085b6dd --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/templates/subtask.py @@ -0,0 +1,82 @@ +import abc +import traceback +from abc import ABC + +from src.core.tasks.url.operators.agency_identification.subtasks.models.run_info import AgencyIDSubtaskRunInfo +from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic + + +class AgencyIDSubtaskOperatorBase(ABC): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + task_id: int + ) -> None: + self.adb_client: AsyncDatabaseClient = adb_client + self.task_id: int = task_id + self.linked_urls: list[int] = [] + + async def run(self) -> AgencyIDSubtaskRunInfo: + try: + await self.inner_logic() + except Exception as e: + # Get stack trace + stack_trace: str = traceback.format_exc() + return AgencyIDSubtaskRunInfo( + error=f"{type(e).__name__}: {str(e)}: {stack_trace}", + linked_url_ids=self.linked_urls + ) + return AgencyIDSubtaskRunInfo( + linked_url_ids=self.linked_urls + ) + + @abc.abstractmethod + async def inner_logic(self) -> 
AgencyIDSubtaskRunInfo: + raise NotImplementedError + + async def _upload_subtask_data( + self, + subtask_data_list: list[AutoAgencyIDSubtaskData] + ) -> None: + + subtask_models: list[URLAutoAgencyIDSubtaskPydantic] = [ + subtask_data.pydantic_model + for subtask_data in subtask_data_list + ] + subtask_ids: list[int] = await self.adb_client.bulk_insert( + models=subtask_models, + return_ids=True + ) + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] + for subtask_id, subtask_info in zip(subtask_ids, subtask_data_list): + for suggestion in subtask_info.suggestions: + suggestion_pydantic = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=suggestion.agency_id, + confidence=suggestion.confidence, + ) + suggestions.append(suggestion_pydantic) + + await self.adb_client.bulk_insert( + models=suggestions, + ) + + error_infos: list[URLErrorPydanticInfo] = [] + for subtask_info in subtask_data_list: + if not subtask_info.has_error: + continue + error_info = URLErrorPydanticInfo( + url_id=subtask_info.url_id, + error=subtask_info.error, + task_id=self.task_id, + ) + error_infos.append(error_info) + + await self.adb_client.bulk_insert( + models=error_infos, + ) diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py index b3ba90ec..384cb5c4 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get_tdos.py @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[URLRelevantTDO]: .join(URLCompressedHTML) .outerjoin(AutoRelevantSuggestion) .where( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, AutoRelevantSuggestion.id.is_(None), ) ) diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py index 6c22c731..19b32b5d 100644 --- 
a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ b/src/core/tasks/url/operators/submit_approved/queries/get.py @@ -4,6 +4,8 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -29,7 +31,8 @@ async def _process_results(self, urls): async def _build_query(): query = ( select(URL) - .where(URL.status == URLStatus.VALIDATED.value) + .join(FlagURLValidated, FlagURLValidated.url_id == URL.id) + .where(FlagURLValidated.type == URLValidatedType.DATA_SOURCE) .options( selectinload(URL.optional_data_source_metadata), selectinload(URL.confirmed_agencies), diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py index abd94d20..5a3ff464 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py @@ -2,6 +2,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase @@ -11,7 +13,13 @@ class HasValidatedURLsQueryBuilder(QueryBuilderBase): async def run(self, session: AsyncSession) -> bool: query = ( select(URL) - .where(URL.status == URLStatus.VALIDATED.value) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + FlagURLValidated.type == 
URLValidatedType.DATA_SOURCE + ) ) urls = await session.execute(query) urls = urls.scalars().all() diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py index d2563335..4ebfef56 100644 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py @@ -19,14 +19,6 @@ async def run(self, session: AsyncSession): url_id = info.url_id data_source_id = info.data_source_id - query = ( - update(URL) - .where(URL.id == url_id) - .values( - status=URLStatus.SUBMITTED.value - ) - ) - url_data_source_object = URLDataSource( url_id=url_id, data_source_id=data_source_id @@ -35,4 +27,3 @@ async def run(self, session: AsyncSession): url_data_source_object.created_at = info.submitted_at session.add(url_data_source_object) - await session.execute(query) \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 3b994f86..19cbc3f5 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -3,7 +3,7 @@ from operator import or_ from typing import Optional, Type, Any, List, Sequence -from sqlalchemy import select, exists, func, case, Select, and_, update, delete, literal, Row +from sqlalchemy import select, exists, func, Select, and_, update, delete, Row from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker from sqlalchemy.orm import selectinload, QueryableAttribute @@ -26,21 +26,23 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO from src.api.endpoints.collector.manual.query import UploadManualBatchQueryBuilder +from src.api.endpoints.metrics.backlog.query import GetBacklogMetricsQueryBuilder from 
src.api.endpoints.metrics.batches.aggregated.dto import GetMetricsBatchesAggregatedResponseDTO -from src.api.endpoints.metrics.batches.aggregated.query import GetBatchesAggregatedMetricsQueryBuilder +from src.api.endpoints.metrics.batches.aggregated.query.core import GetBatchesAggregatedMetricsQueryBuilder from src.api.endpoints.metrics.batches.breakdown.dto import GetMetricsBatchesBreakdownResponseDTO from src.api.endpoints.metrics.batches.breakdown.query import GetBatchesBreakdownMetricsQueryBuilder -from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO +from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO from src.api.endpoints.metrics.dtos.get.urls.aggregated.core import GetMetricsURLsAggregatedResponseDTO -from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO, \ - GetMetricsURLsBreakdownPendingResponseInnerDTO +from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO, \ GetMetricsURLsBreakdownSubmittedInnerDTO +from src.api.endpoints.metrics.urls.aggregated.query.core import GetURLsAggregatedMetricsQueryBuilder +from src.api.endpoints.metrics.urls.breakdown.query.core import GetURLsBreakdownPendingMetricsQueryBuilder from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.api.endpoints.review.approve.query_.core import ApproveURLQueryBuilder from src.api.endpoints.review.enums import RejectionReason +from src.api.endpoints.review.next.core import GetNextURLForFinalReviewQueryBuilder from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse -from src.api.endpoints.review.next.query import GetNextURLForFinalReviewQueryBuilder from src.api.endpoints.review.reject.query import 
RejectURLQueryBuilder from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo @@ -50,19 +52,14 @@ from src.api.endpoints.url.get.query import GetURLsQueryBuilder from src.collectors.enums import URLStatus, CollectorType from src.collectors.queries.insert.urls.query import InsertURLsQueryBuilder -from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus +from src.core.enums import BatchStatus, RecordType, SuggestedStatus from src.core.env_var_manager import EnvVarManager -from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder -from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder -from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.impl.huggingface.queries.state import SetHuggingFaceUploadStateQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.agency.queries.get_sync_params import GetAgenciesSyncParametersQueryBuilder from src.core.tasks.scheduled.impl.sync.agency.queries.mark_full_sync import get_mark_full_agencies_sync_query from src.core.tasks.scheduled.impl.sync.agency.queries.update_sync_progress import \ get_update_agencies_sync_progress_query -from src.core.tasks.scheduled.impl.sync.agency.queries.upsert import \ - convert_agencies_sync_response_to_agencies_upsert from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.queries.get_sync_params import \ GetDataSourcesSyncParametersQueryBuilder @@ -72,11 +69,6 @@ from src.core.tasks.scheduled.impl.sync.data_sources.queries.upsert.core import \ UpsertURLsFromDataSourcesQueryBuilder from src.core.tasks.url.operators.agency_identification.dtos.suggestion 
import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.dtos.tdo import AgencyIdentificationTDO -from src.core.tasks.url.operators.agency_identification.queries.get_pending_urls_without_agency_suggestions import \ - GetPendingURLsWithoutAgencySuggestionsQueryBuilder -from src.core.tasks.url.operators.agency_identification.queries.has_urls_without_agency_suggestions import \ - HasURLsWithoutAgencySuggestionsQueryBuilder from src.core.tasks.url.operators.auto_relevant.models.tdo import URLRelevantTDO from src.core.tasks.url.operators.auto_relevant.queries.get_tdos import GetAutoRelevantTDOsQueryBuilder from src.core.tasks.url.operators.html.queries.get import \ @@ -106,9 +98,10 @@ from src.db.helpers.session import session_helper as sh from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.backlog_snapshot import BacklogSnapshot -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.pydantic.info import DuplicateInfo +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.task_url import LinkTaskURL from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.log.pydantic.info import LogInfo @@ -126,7 +119,6 @@ from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.probed_for_404 import URLProbedFor404 -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import 
UserRecordTypeSuggestion @@ -145,7 +137,6 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html -from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo from src.external.pdap.dtos.sync.data_sources import DataSourcesSyncResponseInnerInfo @@ -546,7 +537,7 @@ async def get_urls_with_html_data_and_without_models( ): statement = (select(URL) .options(selectinload(URL.html_content)) - .where(URL.status == URLStatus.PENDING.value)) + .where(URL.status == URLStatus.OK.value)) statement = self.statement_composer.exclude_urls_with_extant_model( statement=statement, model=model @@ -575,7 +566,7 @@ async def has_urls_with_html_data_and_without_models( ) -> bool: statement = (select(URL) .join(URLCompressedHTML) - .where(URL.status == URLStatus.PENDING.value)) + .where(URL.status == URLStatus.OK.value)) # Exclude URLs with auto suggested record types statement = self.statement_composer.exclude_urls_with_extant_model( statement=statement, @@ -614,9 +605,11 @@ async def get_urls( page: int, errors: bool ) -> GetURLsResponseInfo: - return await self.run_query_builder(GetURLsQueryBuilder( - page=page, errors=errors - )) + return await self.run_query_builder( + GetURLsQueryBuilder( + page=page, errors=errors + ) + ) @session_manager async def initiate_task( @@ -657,7 +650,12 @@ async def get_html_content_info(self, url_id: int) -> list[URLHTMLContentInfo]: return await self.run_query_builder(GetHTMLContentInfoQueryBuilder(url_id)) @session_manager - async def link_urls_to_task(self, session: AsyncSession, task_id: int, url_ids: list[int]): + async def link_urls_to_task( + self, + session: AsyncSession, + task_id: int, + url_ids: list[int] + ) -> None: for url_id in url_ids: link = LinkTaskURL( url_id=url_id, @@ -720,24 +718,19 @@ async def get_tasks( tasks=final_results ) - async def 
has_urls_without_agency_suggestions(self) -> bool: - return await self.run_query_builder(HasURLsWithoutAgencySuggestionsQueryBuilder()) - async def get_urls_without_agency_suggestions( - self - ) -> list[AgencyIdentificationTDO]: - """Retrieve URLs without confirmed or suggested agencies.""" - return await self.run_query_builder(GetPendingURLsWithoutAgencySuggestionsQueryBuilder()) async def get_next_url_agency_for_annotation( self, user_id: int, batch_id: int | None ) -> GetNextURLForAgencyAnnotationResponse: - return await self.run_query_builder(builder=GetNextURLAgencyForAnnotationQueryBuilder( - user_id=user_id, - batch_id=batch_id - )) + return await self.run_query_builder( + builder=GetNextURLAgencyForAnnotationQueryBuilder( + user_id=user_id, + batch_id=batch_id + ) + ) @session_manager async def upsert_new_agencies( @@ -773,20 +766,6 @@ async def add_confirmed_agency_url_links( ) session.add(confirmed_agency) - @session_manager - async def add_agency_auto_suggestions( - self, - session: AsyncSession, - suggestions: list[URLAgencySuggestionInfo] - ): - for suggestion in suggestions: - url_agency_suggestion = AutomatedUrlAgencySuggestion( - url_id=suggestion.url_id, - agency_id=suggestion.pdap_agency_id, - is_unknown=suggestion.suggestion_type == SuggestionType.UNKNOWN - ) - session.add(url_agency_suggestion) - @session_manager async def add_agency_manual_suggestion( self, @@ -842,10 +821,12 @@ async def approve_url( approval_info: FinalReviewApprovalInfo, user_id: int, ) -> None: - await self.run_query_builder(ApproveURLQueryBuilder( - user_id=user_id, - approval_info=approval_info - )) + await self.run_query_builder( + ApproveURLQueryBuilder( + user_id=user_id, + approval_info=approval_info + ) + ) async def reject_url( self, @@ -853,11 +834,13 @@ async def reject_url( user_id: int, rejection_reason: RejectionReason ) -> None: - await self.run_query_builder(RejectURLQueryBuilder( - url_id=url_id, - user_id=user_id, - rejection_reason=rejection_reason - )) + 
await self.run_query_builder( + RejectURLQueryBuilder( + url_id=url_id, + user_id=user_id, + rejection_reason=rejection_reason + ) + ) @session_manager async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary]: @@ -873,10 +856,12 @@ async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> list[URLInfo]: """Retrieve all URLs associated with a batch.""" - return await self.run_query_builder(GetURLsByBatchQueryBuilder( - batch_id=batch_id, - page=page - )) + return await self.run_query_builder( + GetURLsByBatchQueryBuilder( + batch_id=batch_id, + page=page + ) + ) @session_manager async def insert_logs( @@ -926,8 +911,6 @@ async def insert_urls( ) return await self.run_query_builder(builder) - - @session_manager async def update_batch_post_collection( self, @@ -960,10 +943,12 @@ async def mark_urls_as_submitted(self, infos: list[SubmittedURLInfo]): await self.run_query_builder(MarkURLsAsSubmittedQueryBuilder(infos)) async def get_duplicates_by_batch_id(self, batch_id: int, page: int) -> list[DuplicateInfo]: - return await self.run_query_builder(GetDuplicatesByBatchIDQueryBuilder( - batch_id=batch_id, - page=page - )) + return await self.run_query_builder( + GetDuplicatesByBatchIDQueryBuilder( + batch_id=batch_id, + page=page + ) + ) @session_manager async def get_batch_summaries( @@ -1048,10 +1033,12 @@ async def upload_manual_batch( user_id: int, dto: ManualBatchInputDTO ) -> ManualBatchResponseDTO: - return await self.run_query_builder(UploadManualBatchQueryBuilder( - user_id=user_id, - dto=dto - )) + return await self.run_query_builder( + UploadManualBatchQueryBuilder( + user_id=user_id, + dto=dto + ) + ) @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: @@ -1114,183 +1101,16 @@ async def get_urls_breakdown_submitted_metrics( entries=final_results ) - @session_manager - async def 
get_urls_aggregated_metrics( - self, - session: AsyncSession - ) -> GetMetricsURLsAggregatedResponseDTO: - sc = StatementComposer + async def get_urls_aggregated_metrics(self) -> GetMetricsURLsAggregatedResponseDTO: + return await self.run_query_builder(GetURLsAggregatedMetricsQueryBuilder()) - oldest_pending_url_query = select( - URL.id, - URL.created_at - ).where( - URL.status == URLStatus.PENDING.value - ).order_by( - URL.created_at.asc() - ).limit(1) - - oldest_pending_url = await session.execute(oldest_pending_url_query) - oldest_pending_url = oldest_pending_url.one_or_none() - if oldest_pending_url is None: - oldest_pending_url_id = None - oldest_pending_created_at = None - else: - oldest_pending_url_id = oldest_pending_url.id - oldest_pending_created_at = oldest_pending_url.created_at - - def case_column(status: URLStatus, label): - return sc.count_distinct( - case( - ( - URL.status == status.value, - URL.id - ) - ), - label=label - ) - - count_query = select( - sc.count_distinct(URL.id, label="count"), - case_column(URLStatus.PENDING, label="count_pending"), - case_column(URLStatus.SUBMITTED, label="count_submitted"), - case_column(URLStatus.VALIDATED, label="count_validated"), - case_column(URLStatus.NOT_RELEVANT, label="count_rejected"), - case_column(URLStatus.ERROR, label="count_error"), - ) - raw_results = await session.execute(count_query) - results = raw_results.all() - - return GetMetricsURLsAggregatedResponseDTO( - count_urls_total=results[0].count, - count_urls_pending=results[0].count_pending, - count_urls_submitted=results[0].count_submitted, - count_urls_validated=results[0].count_validated, - count_urls_rejected=results[0].count_rejected, - count_urls_errors=results[0].count_error, - oldest_pending_url_id=oldest_pending_url_id, - oldest_pending_url_created_at=oldest_pending_created_at, - ) + async def get_urls_breakdown_pending_metrics(self) -> GetMetricsURLsBreakdownPendingResponseDTO: + return await 
self.run_query_builder(GetURLsBreakdownPendingMetricsQueryBuilder()) - @session_manager - async def get_urls_breakdown_pending_metrics( - self, - session: AsyncSession - ) -> GetMetricsURLsBreakdownPendingResponseDTO: - sc = StatementComposer - - flags = ( - select( - URL.id.label("url_id"), - case((UserRecordTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_record_type_annotation" - ), - case((UserRelevantSuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_relevant_annotation" - ), - case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( - "has_user_agency_annotation" - ), - ) - .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) - .outerjoin(UserRelevantSuggestion, URL.id == UserRelevantSuggestion.url_id) - .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) - ).cte("flags") - - month = func.date_trunc('month', URL.created_at) - - # Build the query - query = ( - select( - month.label('month'), - func.count(URL.id).label('count_total'), - func.count( - case( - (flags.c.has_user_record_type_annotation == True, 1) - ) - ).label('user_record_type_count'), - func.count( - case( - (flags.c.has_user_relevant_annotation == True, 1) - ) - ).label('user_relevant_count'), - func.count( - case( - (flags.c.has_user_agency_annotation == True, 1) - ) - ).label('user_agency_count'), - ) - .outerjoin(flags, flags.c.url_id == URL.id) - .where(URL.status == URLStatus.PENDING.value) - .group_by(month) - .order_by(month.asc()) - ) - - # Execute the query and return the results - results = await session.execute(query) - all_results = results.all() - final_results: list[GetMetricsURLsBreakdownPendingResponseInnerDTO] = [] - - for result in all_results: - dto = GetMetricsURLsBreakdownPendingResponseInnerDTO( - month=result.month.strftime("%B %Y"), - count_pending_total=result.count_total, - 
count_pending_relevant_user=result.user_relevant_count, - count_pending_record_type_user=result.user_record_type_count, - count_pending_agency_user=result.user_agency_count, - ) - final_results.append(dto) - return GetMetricsURLsBreakdownPendingResponseDTO( - entries=final_results, - ) - - @session_manager async def get_backlog_metrics( self, - session: AsyncSession ) -> GetMetricsBacklogResponseDTO: - month = func.date_trunc('month', BacklogSnapshot.created_at) - - # 1. Create a subquery that assigns row_number() partitioned by month - monthly_snapshot_subq = ( - select( - BacklogSnapshot.id, - BacklogSnapshot.created_at, - BacklogSnapshot.count_pending_total, - month.label("month_start"), - func.row_number() - .over( - partition_by=month, - order_by=BacklogSnapshot.created_at.desc() - ) - .label("row_number") - ) - .subquery() - ) - - # 2. Filter for the top (most recent) row in each month - stmt = ( - select( - monthly_snapshot_subq.c.month_start, - monthly_snapshot_subq.c.created_at, - monthly_snapshot_subq.c.count_pending_total - ) - .where(monthly_snapshot_subq.c.row_number == 1) - .order_by(monthly_snapshot_subq.c.month_start) - ) - - raw_result = await session.execute(stmt) - results = raw_result.all() - final_results = [] - for result in results: - final_results.append( - GetMetricsBacklogResponseInnerDTO( - month=result.month_start.strftime("%B %Y"), - count_pending_total=result.count_pending_total, - ) - ) - - return GetMetricsBacklogResponseDTO(entries=final_results) + return await self.run_query_builder(GetBacklogMetricsQueryBuilder()) @session_manager async def populate_backlog_snapshot( @@ -1300,10 +1120,15 @@ async def populate_backlog_snapshot( ): sc = StatementComposer # Get count of pending URLs - query = select( - sc.count_distinct(URL.id, label="count") - ).where( - URL.status == URLStatus.PENDING.value + query = ( + select( + sc.count_distinct(URL.id, label="count") + ) + .outerjoin(FlagURLValidated, URL.id == FlagURLValidated.url_id) + 
.where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None), + ) ) raw_result = await session.execute(query) @@ -1355,7 +1180,7 @@ async def has_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi URLProbedFor404 ).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, or_( URLProbedFor404.id == None, URLProbedFor404.last_probed_at < month_ago @@ -1378,7 +1203,7 @@ async def get_pending_urls_not_recently_probed_for_404(self, session: AsyncSessi URLProbedFor404 ).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, or_( URLProbedFor404.id == None, URLProbedFor404.last_probed_at < month_ago @@ -1404,14 +1229,6 @@ async def get_data_sources_sync_parameters(self) -> DataSourcesSyncParameters: GetDataSourcesSyncParametersQueryBuilder() ) - async def upsert_agencies( - self, - agencies: list[AgenciesSyncResponseInnerInfo] - ) -> None: - await self.bulk_upsert( - models=convert_agencies_sync_response_to_agencies_upsert(agencies) - ) - async def upsert_urls_from_data_sources( self, data_sources: list[DataSourcesSyncResponseInnerInfo] @@ -1463,21 +1280,11 @@ async def add_raw_html( ) session.add(compressed_html) - async def get_data_sources_raw_for_huggingface(self, page: int) -> list[GetForLoadingToHuggingFaceOutput]: - return await self.run_query_builder( - GetForLoadingToHuggingFaceQueryBuilder(page) - ) - async def set_hugging_face_upload_state(self, dt: datetime) -> None: await self.run_query_builder( SetHuggingFaceUploadStateQueryBuilder(dt=dt) ) - async def check_valid_urls_updated(self) -> bool: - return await self.run_query_builder( - CheckValidURLsUpdatedQueryBuilder() - ) - async def get_current_database_time(self) -> datetime: return await self.scalar(select(func.now())) diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 03a45d3b..04ecc892 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,5 +1,5 @@ from functools import 
wraps -from typing import Optional, List +from typing import List from sqlalchemy import create_engine, update, Select from sqlalchemy.exc import IntegrityError @@ -7,12 +7,12 @@ from src.collectors.enums import URLStatus from src.db.config_manager import ConfigManager -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.log.pydantic.info import LogInfo from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.templates_.base import Base from src.db.models.impl.duplicate.sqlalchemy import Duplicate @@ -58,6 +58,11 @@ def wrapper(self, *args, **kwargs): return wrapper + @session_manager + def add_all(self, session: Session, objects: list[Base]): + session.add_all(objects) + session.commit() + @session_manager def insert_batch(self, session: Session, batch_info: BatchInfo) -> int: """Insert a new batch into the database and return its ID.""" @@ -221,14 +226,6 @@ def mark_urls_as_submitted( url_id = info.url_id data_source_id = info.data_source_id - query = ( - update(URL) - .where(URL.id == url_id) - .values( - status=URLStatus.SUBMITTED.value - ) - ) - url_data_source_object = URLDataSource( url_id=url_id, data_source_id=data_source_id @@ -237,7 +234,6 @@ def mark_urls_as_submitted( url_data_source_object.created_at = info.submitted_at session.add(url_data_source_object) - session.execute(query) if __name__ == "__main__": client = DatabaseClient() diff --git a/src/db/client/types.py b/src/db/client/types.py index efdfdc72..02c0e39b 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,9 +1,5 @@ -from 
src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion UserSuggestionModel = UserRelevantSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion -AutoSuggestionModel = AutoRelevantSuggestion or AutoRecordTypeSuggestion or AutomatedUrlAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index 505a6e58..f2cdefb1 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,23 +1,11 @@ -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" STANDARD_ROW_LIMIT = 100 -ALL_ANNOTATION_MODELS = [ - AutoRecordTypeSuggestion, - AutoRelevantSuggestion, - AutomatedUrlAgencySuggestion, - UserRelevantSuggestion, - UserRecordTypeSuggestion, - UserUrlAgencySuggestion -] - USER_ANNOTATION_MODELS = [ UserRelevantSuggestion, UserRecordTypeSuggestion, diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index 979a3b51..b19b834d 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -1,21 +1,18 @@ -from typing import Optional - from 
src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo from src.api.endpoints.review.next.dto import FinalReviewAnnotationRelevantInfo, FinalReviewAnnotationRecordTypeInfo, \ - FinalReviewAnnotationAgencyAutoInfo, FinalReviewAnnotationAgencyInfo + FinalReviewAnnotationAgencyInfo from src.core.enums import RecordType, SuggestionType from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.core.tasks.url.operators.html.scraper.parser.mapping import ENUM_TO_ATTRIBUTE_MAPPING from src.db.dtos.url.html_content import URLHTMLContentInfo -from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.with_html import URLWithHTML from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion -from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion @@ -65,111 +62,6 @@ def final_review_annotation_record_type_info( user=user_value ) - @staticmethod - def 
final_review_annotation_agency_auto_info( - automated_agency_suggestions: list[AutomatedUrlAgencySuggestion] - ) -> FinalReviewAnnotationAgencyAutoInfo: - - if len(automated_agency_suggestions) == 0: - return FinalReviewAnnotationAgencyAutoInfo( - unknown=True, - suggestions=[] - ) - - if len(automated_agency_suggestions) == 1: - suggestion = automated_agency_suggestions[0] - unknown = suggestion.is_unknown - else: - unknown = False - - if unknown: - return FinalReviewAnnotationAgencyAutoInfo( - unknown=True, - suggestions=[ - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.UNKNOWN, - ) - ] - ) - - return FinalReviewAnnotationAgencyAutoInfo( - unknown=unknown, - suggestions=[ - GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.AUTO_SUGGESTION, - pdap_agency_id=suggestion.agency_id, - agency_name=suggestion.agency.name, - state=suggestion.agency.state, - county=suggestion.agency.county, - locality=suggestion.agency.locality - ) for suggestion in automated_agency_suggestions - ] - ) - - @staticmethod - def user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_url_agency_suggestion: UserUrlAgencySuggestion - ) -> GetNextURLForAgencyAgencyInfo | None: - suggestion = user_url_agency_suggestion - if suggestion is None: - return None - if suggestion.is_new: - return GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.NEW_AGENCY, - ) - return GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.USER_SUGGESTION, - pdap_agency_id=suggestion.agency_id, - agency_name=suggestion.agency.name, - state=suggestion.agency.state, - county=suggestion.agency.county, - locality=suggestion.agency.locality - ) - - - @staticmethod - def confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies: list[LinkURLAgency] - ) -> list[GetNextURLForAgencyAgencyInfo]: - results = [] - for confirmed_agency in confirmed_agencies: - agency = confirmed_agency.agency - agency_info = 
GetNextURLForAgencyAgencyInfo( - suggestion_type=SuggestionType.CONFIRMED, - pdap_agency_id=agency.agency_id, - agency_name=agency.name, - state=agency.state, - county=agency.county, - locality=agency.locality - ) - results.append(agency_info) - return results - - - @staticmethod - def final_review_annotation_agency_info( - automated_agency_suggestions: list[AutomatedUrlAgencySuggestion], - confirmed_agencies: list[LinkURLAgency], - user_agency_suggestion: UserUrlAgencySuggestion - ): - - confirmed_agency_info = DTOConverter.confirmed_agencies_to_final_review_annotation_agency_info( - confirmed_agencies - ) - - agency_auto_info = DTOConverter.final_review_annotation_agency_auto_info( - automated_agency_suggestions - ) - - agency_user_info = DTOConverter.user_url_agency_suggestion_to_final_review_annotation_agency_user_info( - user_agency_suggestion - ) - - return FinalReviewAnnotationAgencyInfo( - confirmed=confirmed_agency_info, - user=agency_user_info, - auto=agency_auto_info - ) @staticmethod diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping.py index 18fc5be2..d48a4649 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping.py @@ -1,7 +1,9 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class URLMapping(BaseModel): """Mapping between url and url_id.""" + model_config = ConfigDict(frozen=True) # <- makes it immutable & hashable + url: str url_id: int diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index a616664f..aebf236f 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -11,8 +11,8 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.db.helpers.session.parser import BulkActionParser -from src.db.models.templates_.with_id import WithIDBase from src.db.models.templates_.base import Base +from src.db.models.templates_.with_id import WithIDBase from src.db.templates.markers.bulk.delete import 
BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.update import BulkUpdatableModel @@ -51,21 +51,27 @@ async def has_results(session: AsyncSession, query: sa.Select) -> bool: async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], -): +) -> None: if len(models) == 0: return + # Parse models to get sa_model and id_field parser = BulkActionParser(models) + # Create base insert query query = pg_insert(parser.sa_model) - upsert_mappings = [upsert_model.model_dump() for upsert_model in models] + upsert_mappings: list[dict[str, Any]] = [ + upsert_model.model_dump() for upsert_model in models + ] + # Set all non-id fields to the values in the upsert mapping set_ = {} for k, v in upsert_mappings[0].items(): if k == parser.id_field: continue set_[k] = getattr(query.excluded, k) + # Add upsert logic to update on conflict query = query.on_conflict_do_update( index_elements=[parser.id_field], set_=set_ @@ -216,4 +222,3 @@ async def bulk_update( ) await session.execute(stmt) - diff --git a/src/db/models/exceptions.py b/src/db/models/exceptions.py new file mode 100644 index 00000000..491aa9a4 --- /dev/null +++ b/src/db/models/exceptions.py @@ -0,0 +1,4 @@ + + +class WriteToViewError(Exception): + pass \ No newline at end of file diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index 556bde88..032dc397 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -25,6 +25,6 @@ class Agency( locality = Column(String, nullable=True) # Relationships - automated_suggestions = relationship("AutomatedUrlAgencySuggestion", back_populates="agency") + automated_suggestions = relationship("AgencyIDSubtaskSuggestion") user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") diff --git 
a/src/db/models/impl/batch/pydantic/__init__.py b/src/db/models/impl/batch/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/batch/pydantic.py b/src/db/models/impl/batch/pydantic/info.py similarity index 100% rename from src/db/models/impl/batch/pydantic.py rename to src/db/models/impl/batch/pydantic/info.py diff --git a/src/db/models/impl/batch/pydantic/insert.py b/src/db/models/impl/batch/pydantic/insert.py new file mode 100644 index 00000000..882ab371 --- /dev/null +++ b/src/db/models/impl/batch/pydantic/insert.py @@ -0,0 +1,17 @@ +from datetime import datetime + +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class BatchInsertModel(BulkInsertableModel): + strategy: str + status: BatchStatus + parameters: dict + user_id: int + date_generated: datetime + + @classmethod + def sa_model(cls) -> type[Batch]: + return Batch \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/__init__.py b/src/db/models/impl/flag/url_validated/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/flag/url_validated/enums.py b/src/db/models/impl/flag/url_validated/enums.py new file mode 100644 index 00000000..fe74b84c --- /dev/null +++ b/src/db/models/impl/flag/url_validated/enums.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class URLValidatedType(Enum): + DATA_SOURCE = "data source" + META_URL = "meta url" + NOT_RELEVANT = "not relevant" + INDIVIDUAL_RECORD = "individual record" \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/pydantic.py b/src/db/models/impl/flag/url_validated/pydantic.py new file mode 100644 index 00000000..197c05a0 --- /dev/null +++ b/src/db/models/impl/flag/url_validated/pydantic.py @@ -0,0 +1,22 @@ +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from 
src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.templates.markers.bulk.insert import BulkInsertableModel +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + +type_ = type + +class FlagURLValidatedPydantic( + BulkInsertableModel, + BulkUpsertableModel +): + + url_id: int + type: URLValidatedType + + @classmethod + def sa_model(cls) -> type_[FlagURLValidated]: + return FlagURLValidated + + @classmethod + def id_field(cls) -> str: + return "url_id" \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py new file mode 100644 index 00000000..f6d4e770 --- /dev/null +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -0,0 +1,25 @@ +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagURLValidated( + URLDependentMixin, + CreatedAtMixin, + UpdatedAtMixin, + Base, +): + __tablename__ = "flag_url_validated" + __table_args__ = ( + PrimaryKeyConstraint( + 'url_id', + ), + ) + + type = enum_column( + enum_type=URLValidatedType, + name="validated_url_type", + ) diff --git a/src/db/models/impl/link/batch_url/__init__.py b/src/db/models/impl/link/batch_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/link/batch_url/pydantic.py b/src/db/models/impl/link/batch_url/pydantic.py new file mode 100644 index 00000000..143c57ce --- /dev/null +++ b/src/db/models/impl/link/batch_url/pydantic.py @@ -0,0 +1,11 @@ +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkBatchURLPydantic(BulkInsertableModel): + batch_id: int + url_id: int + + 
@classmethod + def sa_model(cls) -> type[LinkBatchURL]: + return LinkBatchURL \ No newline at end of file diff --git a/src/db/models/impl/link/batch_url.py b/src/db/models/impl/link/batch_url/sqlalchemy.py similarity index 79% rename from src/db/models/impl/link/batch_url.py rename to src/db/models/impl/link/batch_url/sqlalchemy.py index 8fb8f42e..951ac539 100644 --- a/src/db/models/impl/link/batch_url.py +++ b/src/db/models/impl/link/batch_url/sqlalchemy.py @@ -13,5 +13,3 @@ class LinkBatchURL( ): __tablename__ = "link_batch_urls" - url = relationship('URL', overlaps="batch") - batch = relationship('Batch', overlaps="url") \ No newline at end of file diff --git a/src/db/models/impl/link/url_agency/pydantic.py b/src/db/models/impl/link/url_agency/pydantic.py index 77522a64..fe9194de 100644 --- a/src/db/models/impl/link/url_agency/pydantic.py +++ b/src/db/models/impl/link/url_agency/pydantic.py @@ -1,3 +1,5 @@ +from pydantic import ConfigDict + from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.templates.markers.bulk.delete import BulkDeletableModel from src.db.templates.markers.bulk.insert import BulkInsertableModel @@ -7,6 +9,8 @@ class LinkURLAgencyPydantic( BulkDeletableModel, BulkInsertableModel ): + model_config = ConfigDict(frozen=True) + url_id: int agency_id: int diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index f8d72065..875fa25f 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -7,7 +7,7 @@ class LinkURLAgency(URLDependentMixin, WithIDBase): - __tablename__ = "link_urls_agencies" + __tablename__ = "link_urls_agency" agency_id: Mapped[int] = get_agency_id_foreign_column() diff --git a/src/db/models/impl/url/core/pydantic/info.py b/src/db/models/impl/url/core/pydantic/info.py index 07df21fe..0985b3fc 100644 --- a/src/db/models/impl/url/core/pydantic/info.py +++ 
b/src/db/models/impl/url/core/pydantic/info.py @@ -12,7 +12,7 @@ class URLInfo(BaseModel): batch_id: int | None= None url: str collector_metadata: dict | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK updated_at: datetime.datetime | None = None created_at: datetime.datetime | None = None name: str | None = None diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index b893e9fa..18743f1b 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -16,6 +16,6 @@ def sa_model(cls) -> type[Base]: url: str collector_metadata: dict | None = None name: str | None = None - status: URLStatus = URLStatus.PENDING + status: URLStatus = URLStatus.OK record_type: RecordType | None = None source: URLSource \ No newline at end of file diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index b9c38732..2001f9ed 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -40,7 +40,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): "Batch", secondary="link_batch_urls", back_populates="urls", - uselist=False + uselist=False, ) duplicates = relationship("Duplicate", back_populates="original_url") html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") @@ -50,8 +50,9 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): secondary="link_task_urls", back_populates="urls", ) - automated_agency_suggestions = relationship( - "AutomatedUrlAgencySuggestion", back_populates="url") + auto_agency_subtasks = relationship( + "URLAutoAgencyIDSubtask" + ) user_agency_suggestion = relationship( "UserUrlAgencySuggestion", uselist=False, back_populates="url") auto_record_type_suggestion = relationship( diff --git a/src/db/models/impl/url/suggestion/agency/auto.py 
b/src/db/models/impl/url/suggestion/agency/auto.py deleted file mode 100644 index 5ecfdf0a..00000000 --- a/src/db/models/impl/url/suggestion/agency/auto.py +++ /dev/null @@ -1,20 +0,0 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint -from sqlalchemy.orm import relationship - -from src.db.models.helpers import get_agency_id_foreign_column -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase - - -class AutomatedUrlAgencySuggestion(URLDependentMixin, WithIDBase): - __tablename__ = "automated_url_agency_suggestions" - - agency_id = get_agency_id_foreign_column(nullable=True) - is_unknown = Column(Boolean, nullable=True) - - agency = relationship("Agency", back_populates="automated_suggestions") - url = relationship("URL", back_populates="automated_agency_suggestions") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", name="uq_automated_url_agency_suggestions"), - ) diff --git a/src/db/models/impl/url/suggestion/agency/subtask/__init__.py b/src/db/models/impl/url/suggestion/agency/subtask/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/subtask/enum.py b/src/db/models/impl/url/suggestion/agency/subtask/enum.py new file mode 100644 index 00000000..f3ee7c3f --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/enum.py @@ -0,0 +1,14 @@ +from enum import Enum + + +class AutoAgencyIDSubtaskType(Enum): + HOMEPAGE_MATCH = "homepage_match" + NLP_LOCATION_MATCH = "nlp_location_match" + MUCKROCK = "muckrock_match" + CKAN = "ckan_match" + +class SubtaskDetailCode(Enum): + NO_DETAILS = "no details" + RETRIEVAL_ERROR = "retrieval error" + HOMEPAGE_SINGLE_AGENCY = "homepage-single agency" + HOMEPAGE_MULTI_AGENCY = "homepage-multi agency" \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py new file mode 100644 index 
00000000..f2e9be57 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/pydantic.py @@ -0,0 +1,17 @@ +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + +type_alias = type + +class URLAutoAgencyIDSubtaskPydantic(BulkInsertableModel): + task_id: int + url_id: int + type: AutoAgencyIDSubtaskType + agencies_found: bool + detail: SubtaskDetailCode = SubtaskDetailCode.NO_DETAILS + + @classmethod + def sa_model(cls) -> type_alias[Base]: + return URLAutoAgencyIDSubtask \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py new file mode 100644 index 00000000..89371498 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -0,0 +1,35 @@ +from sqlalchemy.orm import relationship + +from src.db.models.helpers import enum_column +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, TaskDependentMixin +from src.db.models.templates_.with_id import WithIDBase + +import sqlalchemy as sa + +class URLAutoAgencyIDSubtask( + WithIDBase, + URLDependentMixin, + TaskDependentMixin, + CreatedAtMixin +): + + __tablename__ = "url_auto_agency_id_subtasks" + + type = enum_column( + AutoAgencyIDSubtaskType, + name="agency_auto_suggestion_method" + ) + agencies_found = sa.Column( + sa.Boolean(), + nullable=False + ) + detail = enum_column( + SubtaskDetailCode, + name="agency_id_subtask_detail_code", + ) + + suggestions = relationship( + "AgencyIDSubtaskSuggestion", + cascade="all, delete-orphan" + ) \ No newline at end of file diff --git 
a/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py b/src/db/models/impl/url/suggestion/agency/suggestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py new file mode 100644 index 00000000..5a0fd2b8 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/suggestion/pydantic.py @@ -0,0 +1,16 @@ +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class AgencyIDSubtaskSuggestionPydantic( + BulkInsertableModel, +): + subtask_id: int + agency_id: int + confidence: int + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return AgencyIDSubtaskSuggestion \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py new file mode 100644 index 00000000..de6ee029 --- /dev/null +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -0,0 +1,28 @@ +import sqlalchemy as sa +from sqlalchemy.orm import relationship + +from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin +from src.db.models.templates_.with_id import WithIDBase + + +class AgencyIDSubtaskSuggestion( + WithIDBase, + CreatedAtMixin, + AgencyDependentMixin, +): + __tablename__ = "agency_id_subtask_suggestions" + + subtask_id = sa.Column( + sa.Integer, + sa.ForeignKey("url_auto_agency_id_subtasks.id"), + nullable=False + ) + confidence = sa.Column( + sa.Integer, + sa.CheckConstraint( + "confidence BETWEEN 0 and 100" + ), + nullable=False, + ) + + agency = relationship("Agency", viewonly=True) \ No newline at end of file diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 
541e5d09..d0dbbcab 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -1,5 +1,8 @@ -from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP +from typing import ClassVar +from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event + +from src.db.models.exceptions import WriteToViewError from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT @@ -58,3 +61,17 @@ class UpdatedAtMixin: server_default=CURRENT_TIME_SERVER_DEFAULT, onupdate=CURRENT_TIME_SERVER_DEFAULT ) + +class ViewMixin: + """Attach to any mapped class that represents a DB view.""" + __is_view__: ClassVar[bool] = True + + @classmethod + def __declare_last__(cls) -> None: + # Block writes on this mapped class + for evt in ("before_insert", "before_update", "before_delete"): + event.listen(cls, evt, cls._block_write) + + @staticmethod + def _block_write(mapper, connection, target): + raise WriteToViewError(f"{type(target).__name__} is a read-only view.") diff --git a/src/db/models/views/__init__.py b/src/db/models/views/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py new file mode 100644 index 00000000..bc963e11 --- /dev/null +++ b/src/db/models/views/meta_url.py @@ -0,0 +1,26 @@ +""" + CREATE OR REPLACE VIEW meta_url_view AS + SELECT + urls.id + FROM urls + INNER JOIN flag_url_validated fuv on fuv.url_id = urls.id + where fuv.type = 'meta url' +""" + +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class MetaURL( + Base, + ViewMixin, + URLDependentMixin, +): + + __tablename__ = "meta_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py new file mode 100644 index 00000000..767ee960 --- 
/dev/null +++ b/src/db/models/views/unvalidated_url.py @@ -0,0 +1,27 @@ +""" +select + u.id as url_id +from + urls u + left join flag_url_validated fuv + on fuv.url_id = u.id +where + fuv.type is null +""" +from sqlalchemy import PrimaryKeyConstraint + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class UnvalidatedURL( + Base, + ViewMixin, + URLDependentMixin, +): + + __tablename__ = "unvalidated_url_view" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) \ No newline at end of file diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py new file mode 100644 index 00000000..7289020f --- /dev/null +++ b/src/db/models/views/url_annotations_flags.py @@ -0,0 +1,49 @@ +""" +CREATE OR REPLACE VIEW url_annotation_flags AS +( +SELECT u.id, + CASE WHEN arts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_record_type_suggestion, + CASE WHEN ars.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_relevant_suggestion, + CASE WHEN auas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_auto_agency_suggestion, + CASE WHEN urts.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_record_type_suggestion, + CASE WHEN urs.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_relevant_suggestion, + CASE WHEN uuas.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_user_agency_suggestion, + CASE WHEN cua.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS has_confirmed_agency, + CASE WHEN ruu.url_id IS NOT NULL THEN TRUE ELSE FALSE END AS was_reviewed +FROM urls u + LEFT JOIN public.auto_record_type_suggestions arts ON u.id = arts.url_id + LEFT JOIN public.auto_relevant_suggestions ars ON u.id = ars.url_id + LEFT JOIN public.{URL_AUTO_AGENCY_SUGGESTIONS_TABLE_NAME} auas ON u.id = auas.url_id + LEFT JOIN public.user_record_type_suggestions urts ON u.id = urts.url_id + LEFT JOIN public.user_relevant_suggestions urs ON u.id = 
urs.url_id + LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id + LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id + LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + ) +""" + +from sqlalchemy import PrimaryKeyConstraint, Column, Boolean + +from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class URLAnnotationFlagsView( + Base, + ViewMixin, + URLDependentMixin +): + __tablename__ = "url_annotation_flags" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + {"info": "view"} + ) + + has_auto_record_type_suggestion = Column(Boolean, nullable=False) + has_auto_relevant_suggestion = Column(Boolean, nullable=False) + has_auto_agency_suggestion = Column(Boolean, nullable=False) + has_user_record_type_suggestion = Column(Boolean, nullable=False) + has_user_relevant_suggestion = Column(Boolean, nullable=False) + has_user_agency_suggestion = Column(Boolean, nullable=False) + has_confirmed_agency = Column(Boolean, nullable=False) + was_reviewed = Column(Boolean, nullable=False) \ No newline at end of file diff --git a/src/db/queries/implementations/core/common/annotation_exists_/__init__.py b/src/db/queries/implementations/core/common/annotation_exists_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py new file mode 100644 index 00000000..ead32bc0 --- /dev/null +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -0,0 +1,15 @@ +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import 
UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion + +ALL_ANNOTATION_MODELS = [ + AutoRecordTypeSuggestion, + AutoRelevantSuggestion, + URLAutoAgencyIDSubtask, + UserRelevantSuggestion, + UserRecordTypeSuggestion, + UserUrlAgencySuggestion +] diff --git a/src/db/queries/implementations/core/common/annotation_exists.py b/src/db/queries/implementations/core/common/annotation_exists_/core.py similarity index 79% rename from src/db/queries/implementations/core/common/annotation_exists.py rename to src/db/queries/implementations/core/common/annotation_exists_/core.py index f8dfa654..53e8bcf6 100644 --- a/src/db/queries/implementations/core/common/annotation_exists.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/core.py @@ -17,7 +17,8 @@ from sqlalchemy import case, func, Select, select from src.collectors.enums import URLStatus -from src.db.constants import ALL_ANNOTATION_MODELS +from src.db.queries.implementations.core.common.annotation_exists_.constants import ALL_ANNOTATION_MODELS +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase @@ -29,7 +30,7 @@ class AnnotationExistsCTEQueryBuilder(QueryBuilderBase): def url_id(self): return self.query.c.url_id - def get_exists_label(self, model: Type[URLDependentMixin]): + def get_exists_label(self, model: Type[URLDependentMixin]) -> str: return f"{model.__name__}_exists" def get_all(self) -> list[Any]: @@ -67,6 +68,13 @@ async def build(self) -> Any: *annotation_exists_cases_all ) anno_exists_query = await self._outer_join_models(anno_exists_query) - anno_exists_query = anno_exists_query.where(URL.status == URLStatus.PENDING.value) + anno_exists_query = 
anno_exists_query.outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + anno_exists_query = anno_exists_query.where( + URL.status == URLStatus.OK.value, + FlagURLValidated.url_id.is_(None) + ) anno_exists_query = anno_exists_query.group_by(URL.id).cte("annotations_exist") self.query = anno_exists_query diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py index f9bb2ef8..86983b5c 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/builder.py @@ -9,6 +9,7 @@ from src.core.enums import BatchStatus from src.db.models.impl.batch.sqlalchemy import Batch from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.core.get.recent_batch_summaries.pending_url.cte import PENDING_URL_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.builder import URLCountsCTEQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -24,9 +25,9 @@ def __init__( batch_id: int | None = None, ): super().__init__() + self.has_pending_urls = has_pending_urls self.url_counts_cte = URLCountsCTEQueryBuilder( page=page, - has_pending_urls=has_pending_urls, collector_type=collector_type, status=status, batch_id=batch_id, @@ -49,6 +50,14 @@ async def run(self, session: AsyncSession) -> list[BatchSummary]: builder.query, builder.get(count_labels.batch_id) == Batch.id, ) + if self.has_pending_urls is not None: + query = query.join( + PENDING_URL_CTE, + PENDING_URL_CTE.c.batch_id == Batch.id, + ).where( + PENDING_URL_CTE.c.has_pending_urls == self.has_pending_urls + ) + raw_results = await session.execute(query) summaries: list[BatchSummary] = [] diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py 
b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py new file mode 100644 index 00000000..a0722229 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/pending_url/cte.py @@ -0,0 +1,30 @@ +from sqlalchemy import select, func, case, and_ + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL + +PENDING_URL_CTE = ( + select( + Batch.id.label("batch_id"), + case( + ( + and_( + func.count(LinkBatchURL.url_id) > func.count(FlagURLValidated.url_id), + ) + , True), + else_=False + ).label("has_pending_urls") + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == LinkBatchURL.url_id, + ) + .group_by( + Batch.id + ).cte("has_pending_urls") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 72a33336..634cf419 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -1,15 +1,22 @@ -from typing import Optional - from sqlalchemy import Select, case, Label, and_, exists -from sqlalchemy.sql.functions import count, coalesce +from sqlalchemy.sql.functions import count, coalesce, func from src.collectors.enums import URLStatus, CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.link.batch_url import LinkBatchURL +from 
src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.helpers import add_page_offset +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.all import ALL_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.duplicate import DUPLICATE_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.error import ERROR_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.not_relevant import NOT_RELEVANT_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.pending import PENDING_CTE +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte.submitted import SUBMITTED_CTE from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.labels import URLCountsLabels @@ -18,14 +25,12 @@ class URLCountsCTEQueryBuilder(QueryBuilderBase): def __init__( self, page: int = 1, - has_pending_urls: bool | None = None, collector_type: CollectorType | None = None, status: BatchStatus | None = None, batch_id: int | None = None ): super().__init__(URLCountsLabels()) self.page = page - self.has_pending_urls = has_pending_urls self.collector_type = collector_type self.status = status self.batch_id = batch_id @@ -33,31 +38,31 @@ def __init__( def get_core_query(self): labels: URLCountsLabels = self.labels - return ( + query = ( Select( Batch.id.label(labels.batch_id), - coalesce(count(URL.id), 0).label(labels.total), - self.count_case_url_status(URLStatus.PENDING, 
labels.pending), - self.count_case_url_status(URLStatus.SUBMITTED, labels.submitted), - self.count_case_url_status(URLStatus.NOT_RELEVANT, labels.not_relevant), - self.count_case_url_status(URLStatus.ERROR, labels.error), - self.count_case_url_status(URLStatus.DUPLICATE, labels.duplicate), + func.coalesce(DUPLICATE_CTE.count, 0).label(labels.duplicate), + func.coalesce(SUBMITTED_CTE.count, 0).label(labels.submitted), + func.coalesce(PENDING_CTE.count, 0).label(labels.pending), + func.coalesce(ALL_CTE.count, 0).label(labels.total), + func.coalesce(NOT_RELEVANT_CTE.count, 0).label(labels.not_relevant), + func.coalesce(ERROR_CTE.count, 0).label(labels.error), ) .select_from(Batch) - .outerjoin(LinkBatchURL) - .outerjoin( - URL - ) ) + for cte in [DUPLICATE_CTE, SUBMITTED_CTE, PENDING_CTE, ALL_CTE, NOT_RELEVANT_CTE, ERROR_CTE]: + query = query.outerjoin( + cte.cte, + Batch.id == cte.batch_id + ) + return query def build(self): query = self.get_core_query() - query = self.apply_pending_urls_filter(query) query = self.apply_collector_type_filter(query) query = self.apply_status_filter(query) query = self.apply_batch_id_filter(query) - query = query.group_by(Batch.id) query = add_page_offset(query, page=self.page) query = query.order_by(Batch.id) self.query = query.cte("url_counts") @@ -67,23 +72,6 @@ def apply_batch_id_filter(self, query: Select): return query return query.where(Batch.id == self.batch_id) - def apply_pending_urls_filter(self, query: Select): - if self.has_pending_urls is None: - return query - pending_url_subquery = ( - exists( - Select(URL).join(LinkBatchURL).where( - and_( - LinkBatchURL.batch_id == Batch.id, - URL.status == URLStatus.PENDING.value - ) - ) - ) - ).correlate(Batch) - if self.has_pending_urls: - return query.where(pending_url_subquery) - return query.where(~pending_url_subquery) - def apply_collector_type_filter(self, query: Select): if self.collector_type is None: return query @@ -93,18 +81,3 @@ def apply_status_filter(self, query: 
Select): if self.status is None: return query return query.where(Batch.status == self.status.value) - - @staticmethod - def count_case_url_status( - url_status: URLStatus, - label: str - ) -> Label: - return ( - coalesce( - count( - case( - (URL.status == url_status.value, 1) - ) - ) - , 0).label(label) - ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py new file mode 100644 index 00000000..5cab51cf --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/all.py @@ -0,0 +1,20 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ALL_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(LinkBatchURL.url_id).label("total_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .group_by( + Batch.id + ).cte("total_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py new file mode 100644 index 00000000..906dd49c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/duplicate.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from 
src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +DUPLICATE_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("duplicate_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.DUPLICATE + ) + .group_by( + Batch.id + ).cte("duplicate_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py new file mode 100644 index 00000000..b74020c4 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -0,0 +1,29 @@ +from sqlalchemy import select, func + +from src.collectors.enums import URLStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +ERROR_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("error_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .where( + URL.status == URLStatus.ERROR + ) + .group_by( + Batch.id + ).cte("error_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py new file mode 100644 index 00000000..e84f597b --- /dev/null +++ 
b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/not_relevant.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +NOT_RELEVANT_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("not_relevant_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLValidatedType.NOT_RELEVANT + ) + .group_by( + Batch.id + ).cte("not_relevant_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py new file mode 100644 index 00000000..b7e4594c --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/pending.py @@ -0,0 +1,33 @@ +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +PENDING_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("pending_count") + ) + .join( + 
LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type.is_(None) + ) + .group_by( + Batch.id + ).cte("pending_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py new file mode 100644 index 00000000..5ab305cc --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py @@ -0,0 +1,32 @@ + + +from sqlalchemy import select, func + +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ + URLCountsCTEContainer + +SUBMITTED_CTE = URLCountsCTEContainer( + select( + Batch.id, + func.count(URL.id).label("submitted_count") + ) + .join( + LinkBatchURL, + LinkBatchURL.batch_id == Batch.id, + ) + .join( + URL, + URL.id == LinkBatchURL.url_id, + ) + .join( + URLDataSource, + URLDataSource.url_id == URL.id, + ) + .group_by( + Batch.id + ).cte("submitted_count") +) \ No newline at end of file diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py new file mode 100644 index 00000000..7f769c76 --- /dev/null +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte_container.py @@ -0,0 +1,18 @@ +from sqlalchemy import CTE, Column + + +class URLCountsCTEContainer: + + def __init__( + self, + cte: CTE + ): + self.cte = cte + + @property 
+ def batch_id(self) -> Column: + return self.cte.columns[0] + + @property + def count(self) -> Column: + return self.cte.columns[1] diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 269dfced..5d69be2a 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -11,7 +11,7 @@ from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.common.annotation_exists import AnnotationExistsCTEQueryBuilder +from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder class PendingAnnotationExistsCTEQueryBuilder(AnnotationExistsCTEQueryBuilder): @@ -44,7 +44,7 @@ async def build(self) -> Any: URL.id == self.url_id ) .where( - URL.status == URLStatus.PENDING.value + URL.status == URLStatus.OK.value ).cte("pending") ) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 45a281de..8e172733 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -2,22 +2,19 @@ from typing import Any from sqlalchemy import Select, select, exists, func, Subquery, and_, not_, ColumnElement -from sqlalchemy.orm import aliased, selectinload +from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus from src.core.enums import BatchStatus from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.link.task_url import LinkTaskURL -from 
src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.task.core import Task -from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo -from src.db.models.impl.url.suggestion.agency.auto import AutomatedUrlAgencySuggestion from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -75,28 +72,11 @@ def simple_count_subquery(model, attribute: str, label: str) -> Subquery: func.count(attr_value).label(label) ).group_by(attr_value).subquery() - @staticmethod - def exclude_urls_with_agency_suggestions( - statement: Select - ): - # Aliases for clarity - AutomatedSuggestion = aliased(AutomatedUrlAgencySuggestion) - - # Exclude if automated suggestions exist - statement = statement.where( - ~exists().where(AutomatedSuggestion.url_id == URL.id) - ) - # Exclude if confirmed agencies exist - statement = statement.where( - ~exists().where(LinkURLAgency.url_id == URL.id) - ) - return statement - @staticmethod def pending_urls_missing_miscellaneous_metadata_query() -> Select: query = select(URL).where( and_( - URL.status == URLStatus.PENDING.value, + URL.status == URLStatus.OK.value, URL.name == None, URL.description == None, URLOptionalDataSourceMetadata.url_id == None diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py new file mode 100644 index 00000000..b56af87f --- /dev/null +++ b/src/db/templates/requester.py @@ -0,0 +1,20 @@ +""" +A requester is a class that contains a session and provides methods for +performing database operations. 
+""" +from abc import ABC + +from sqlalchemy.ext.asyncio import AsyncSession + +import src.db.helpers.session.session_helper as sh +from src.db.queries.base.builder import QueryBuilderBase + + +class RequesterBase(ABC): + + def __init__(self, session: AsyncSession): + self.session = session + self.session_helper = sh + + async def run_query_builder(self, query_builder: QueryBuilderBase): + return await query_builder.run(session=self.session) \ No newline at end of file diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index ee357ad4..0e0d5a39 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,10 +1,14 @@ -from typing import Optional +from datetime import date +from typing import Any -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType +from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo from src.core.tasks.scheduled.impl.sync.agency.dtos.parameters import AgencySyncParameters from src.core.tasks.scheduled.impl.sync.data_sources.params import DataSourcesSyncParameters from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo +from src.external.pdap.dtos.search_agency_by_location.params import SearchAgencyByLocationParams +from src.external.pdap.dtos.search_agency_by_location.response import SearchAgencyByLocationResponse, \ + SearchAgencyByLocationOuterResponse from src.external.pdap.dtos.sync.agencies import AgenciesSyncResponseInnerInfo, AgenciesSyncResponseInfo from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @@ -21,6 +25,38 @@ def __init__( ): self.access_manager = access_manager + async def search_agency_by_location( + self, + params: list[SearchAgencyByLocationParams] + ) -> list[SearchAgencyByLocationResponse]: + request_url: str = self.access_manager.build_url( + 
namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, + subdomains=["agencies", "search", "location"] + ) + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + + json_params: list[dict[str, Any]] = [ + param.model_dump(mode='json') + for param in params + ] + + request_info = RequestInfo( + type_=RequestType.POST, + url=request_url, + headers=headers, + json_={ + "requests": json_params + } + ) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + + outer_response = SearchAgencyByLocationOuterResponse( + **response_info.data + ) + + return outer_response.responses + async def match_agency( self, name: str, @@ -31,13 +67,13 @@ async def match_agency( """ Returns agencies, if any, that match or partially match the search criteria """ - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.MATCH, subdomains=["agency"] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" request_info = RequestInfo( type_=RequestType.POST, url=url, @@ -49,15 +85,15 @@ async def match_agency( "locality": locality } ) - response_info = await self.access_manager.make_request(request_info) - matches = [] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + matches: list[MatchAgencyInfo] = [] for agency in response_info.data["agencies"]: mai = MatchAgencyInfo( id=agency['id'], submitted_name=agency['name'] ) if len(agency['locations']) > 0: - first_location = agency['locations'][0] + first_location: dict[str, Any] = agency['locations'][0] mai.state = first_location['state'] mai.county = first_location['county'] mai.locality = first_location['locality'] @@ -75,7 +111,7 @@ async def is_url_duplicate( """ Check if a URL is unique. 
Returns duplicate info otherwise """ - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.CHECK, subdomains=["unique-url"] ) @@ -86,9 +122,11 @@ async def is_url_duplicate( "url": url_to_check } ) - response_info = await self.access_manager.make_request(request_info) - duplicates = [UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"]] - is_duplicate = (len(duplicates) != 0) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + duplicates: list[UniqueURLDuplicateInfo] = [ + UniqueURLDuplicateInfo(**entry) for entry in response_info.data["duplicates"] + ] + is_duplicate: bool = (len(duplicates) != 0) return is_duplicate async def submit_urls( @@ -105,11 +143,11 @@ async def submit_urls( ) # Build url-id dictionary - url_id_dict = {} + url_id_dict: dict[str, int] = {} for tdo in tdos: url_id_dict[tdo.url] = tdo.url_id - data_sources_json = [] + data_sources_json: list[dict[str, Any]] = [] for tdo in tdos: data_sources_json.append( { @@ -125,7 +163,7 @@ async def submit_urls( } ) - headers = await self.access_manager.jwt_header() + headers: dict[str, str] = await self.access_manager.jwt_header() request_info = RequestInfo( type_=RequestType.POST, url=request_url, @@ -134,12 +172,12 @@ async def submit_urls( "data_sources": data_sources_json } ) - response_info = await self.access_manager.make_request(request_info) - data_sources_response_json = response_info.data["data_sources"] + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + data_sources_response_json: list[dict[str, Any]] = response_info.data["data_sources"] - results = [] + results: list[SubmittedURLInfo] = [] for data_source in data_sources_response_json: - url = data_source["url"] + url: str = data_source["url"] response_object = SubmittedURLInfo( url_id=url_id_dict[url], data_source_id=data_source["data_source_id"], @@ -153,25 +191,28 @@ async def 
sync_agencies( self, params: AgencySyncParameters ) -> AgenciesSyncResponseInfo: - url =self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "agencies", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + request_params: dict[str, Any] = { + "page": params.page + } + if params.cutoff_date is not None: + params["updated_at"]: date = params.cutoff_date + request_info = RequestInfo( type_=RequestType.GET, url=url, headers=headers, - params={ - "page": params.page, - "updated_at": params.cutoff_date - } + params=request_params ) - response_info = await self.access_manager.make_request(request_info) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) return AgenciesSyncResponseInfo( agencies=[ AgenciesSyncResponseInnerInfo(**entry) @@ -183,18 +224,18 @@ async def sync_data_sources( self, params: DataSourcesSyncParameters ) -> DataSourcesSyncResponseInfo: - url = self.access_manager.build_url( + url: str = self.access_manager.build_url( namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, subdomains=[ "data-sources", "sync" ] ) - headers = await self.access_manager.jwt_header() - headers['Content-Type'] = "application/json" - params_dict = {"page": params.page} + headers: dict[str, str] = await self.access_manager.jwt_header() + headers['Content-Type']: str = "application/json" + params_dict: dict[str, Any] = {"page": params.page} if params.cutoff_date is not None: - params_dict["updated_at"] = params.cutoff_date + params_dict["updated_at"]: date = params.cutoff_date request_info = RequestInfo( type_=RequestType.GET, @@ -202,10 +243,10 @@ async def sync_data_sources( headers=headers, params=params_dict ) - response_info = await self.access_manager.make_request(request_info) + 
response_info: ResponseInfo = await self.access_manager.make_request(request_info) return DataSourcesSyncResponseInfo( data_sources=[ DataSourcesSyncResponseInnerInfo(**entry) for entry in response_info.data["data_sources"] ] - ) \ No newline at end of file + ) diff --git a/src/external/pdap/dtos/search_agency_by_location/__init__.py b/src/external/pdap/dtos/search_agency_by_location/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/dtos/search_agency_by_location/params.py b/src/external/pdap/dtos/search_agency_by_location/params.py new file mode 100644 index 00000000..ca5a6213 --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/params.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel, Field + + +class SearchAgencyByLocationParams(BaseModel): + request_id: int + query: str + iso: str = Field( + description="US State ISO Code", + max_length=2, + + ) \ No newline at end of file diff --git a/src/external/pdap/dtos/search_agency_by_location/response.py b/src/external/pdap/dtos/search_agency_by_location/response.py new file mode 100644 index 00000000..92242b5a --- /dev/null +++ b/src/external/pdap/dtos/search_agency_by_location/response.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + +class SearchAgencyByLocationAgencyInfo(BaseModel): + agency_id: int + similarity: float = Field(ge=0, le=1) + +class SearchAgencyByLocationResponse(BaseModel): + request_id: int + results: list[SearchAgencyByLocationAgencyInfo] = Field(min_length=1) + +class SearchAgencyByLocationOuterResponse(BaseModel): + responses: list[SearchAgencyByLocationResponse] \ No newline at end of file diff --git a/src/external/pdap/dtos/sync/agencies.py b/src/external/pdap/dtos/sync/agencies.py index 99483107..7e569a81 100644 --- a/src/external/pdap/dtos/sync/agencies.py +++ b/src/external/pdap/dtos/sync/agencies.py @@ -3,6 +3,8 @@ from pydantic import BaseModel + + class AgenciesSyncResponseInnerInfo(BaseModel): display_name: str 
agency_id: int @@ -10,6 +12,7 @@ class AgenciesSyncResponseInnerInfo(BaseModel): county_name: str | None locality_name: str | None updated_at: datetime.datetime + meta_urls: list[str] = [] class AgenciesSyncResponseInfo(BaseModel): agencies: list[AgenciesSyncResponseInnerInfo] diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index 47a24cac..9df2be52 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -8,6 +8,7 @@ def switch_enum_type( new_enum_values, drop_old_enum=True, check_constraints_to_drop: list[str] = None, + conversion_mappings: dict[str, str] = None ): """ Switches an ENUM type in a PostgreSQL column by: @@ -21,6 +22,8 @@ def switch_enum_type( :param enum_name: Name of the ENUM type in PostgreSQL. :param new_enum_values: List of new ENUM values. :param drop_old_enum: Whether to drop the old ENUM type. + :param check_constraints_to_drop: List of check constraints to drop before switching the ENUM type. + :param conversion_mappings: Dictionary of old values to new values for the ENUM type. """ # 1. 
Drop check constraints that reference the enum @@ -38,7 +41,21 @@ def switch_enum_type( new_enum_type.create(op.get_bind()) # Alter the column type to use the new enum type - op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + if conversion_mappings is None: + op.execute(f'ALTER TABLE "{table_name}" ALTER COLUMN "{column_name}" TYPE "{enum_name}" USING "{column_name}"::text::{enum_name}') + if conversion_mappings is not None: + case_when: str = "" + for old_value, new_value in conversion_mappings.items(): + case_when += f"WHEN '{old_value}' THEN '{new_value}'\n" + + op.execute(f""" + ALTER TABLE "{table_name}" + ALTER COLUMN "{column_name}" TYPE "{enum_name}" + USING CASE {column_name}::text + {case_when} + ELSE "{column_name}"::text + END::{enum_name}; + """) # Drop the old enum type if drop_old_enum: @@ -86,6 +103,18 @@ def updated_at_column() -> sa.Column: comment='The last time the row was updated.' ) +def task_id_column() -> sa.Column: + return sa.Column( + 'task_id', + sa.Integer(), + sa.ForeignKey( + 'tasks.id', + ondelete='CASCADE' + ), + nullable=False, + comment='A foreign key to the `tasks` table.' + ) + def url_id_column(name: str = 'url_id') -> sa.Column: return sa.Column( name, @@ -108,4 +137,16 @@ def batch_id_column(nullable=False) -> sa.Column: ), nullable=nullable, comment='A foreign key to the `batches` table.' + ) + +def agency_id_column(nullable=False) -> sa.Column: + return sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey( + 'agencies.agency_id', + ondelete='CASCADE' + ), + nullable=nullable, + comment='A foreign key to the `agencies` table.' 
) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/__init__.py b/tests/automated/integration/api/annotate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/agency/__init__.py b/tests/automated/integration/api/annotate/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py new file mode 100644 index 00000000..65b20b0c --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions.py @@ -0,0 +1,46 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that two agency_suggestions exist + assert len(next_annotation.agency_suggestions) 
== 2 + + for agency_suggestion in next_annotation.agency_suggestions: + assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION + assert agency_suggestion.pdap_agency_id is not None + assert agency_suggestion.agency_name is not None + assert agency_suggestion.state is not None + assert agency_suggestion.county is not None + assert agency_suggestion.locality is not None diff --git a/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py new file mode 100644 index 00000000..5bcb4569 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_multiple_auto_suggestions_no_html.py @@ -0,0 +1,35 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): + """ + Test Scenario: Multiple Auto Suggestions + A URL has multiple Agency Auto Suggestion and has not been annotated by the User + The user should receive all of the auto suggestions with full detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=False + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=2, + suggestion_type=SuggestionType.AUTO_SUGGESTION + ) + + # User requests next annotation + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is not present + assert next_annotation.html_info.description == "" + assert next_annotation.html_info.title == "" diff --git 
a/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py new file mode 100644 index 00000000..a3ecae79 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_other_user_annotation.py @@ -0,0 +1,44 @@ +import pytest + +from tests.automated.integration.api.conftest import MOCK_USER_ID +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_other_user_annotation(api_test_helper): + """ + Test Scenario: Other User Annotation + A URL has been annotated by another User + Our user should still receive this URL to annotate + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + # Test that another user can insert a suggestion + await ath.db_data_creator.manual_suggestion( + user_id=MOCK_USER_ID + 1, + url_id=url_ids[0], + ) + + # After this, text that our user does not receive this URL + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py new 
file mode 100644 index 00000000..e38421e1 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_single_confirmed_agency.py @@ -0,0 +1,22 @@ +import pytest + +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_confirmed_agency(api_test_helper): + """ + Test Scenario: Single Confirmed Agency + A URL has a single Confirmed Agency and has not been annotated by the User + The user should not receive this URL to annotate + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.confirmed_suggestions( + url_ids=buci.url_ids, + ) + response = await ath.request_validator.get_next_agency_annotation() + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py b/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py new file mode 100644 index 00000000..f911bba5 --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_single_unknown_auto_suggestions.py @@ -0,0 +1,45 @@ +import pytest + +from src.core.enums import SuggestionType +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): + """ + Test Scenario: Single Unknown Auto Suggestion + A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User + The user should receive a single Unknown Auto Suggestion lacking other detail + """ + ath = api_test_helper + buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1, + with_html_content=True + ) + await ath.db_data_creator.auto_suggestions( + url_ids=buci.url_ids, + num_suggestions=1, + 
suggestion_type=SuggestionType.UNKNOWN + ) + response = await ath.request_validator.get_next_agency_annotation() + + assert response.next_annotation + next_annotation = response.next_annotation + # Check that url_id matches the one we inserted + assert next_annotation.url_info.url_id == buci.url_ids[0] + + # Check that html data is present + assert next_annotation.html_info.description != "" + assert next_annotation.html_info.title != "" + + # Check that one agency_suggestion exists + assert len(next_annotation.agency_suggestions) == 1 + + agency_suggestion = next_annotation.agency_suggestions[0] + + assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN + assert agency_suggestion.pdap_agency_id is None + assert agency_suggestion.agency_name is None + assert agency_suggestion.state is None + assert agency_suggestion.county is None + assert agency_suggestion.locality is None diff --git a/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py new file mode 100644 index 00000000..91049daa --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_and_get_next.py @@ -0,0 +1,42 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_and_get_next(api_test_helper): + """ + Test Scenario: Submit and Get Next (no other URL available) + A URL has been annotated by our User, and no other valid URLs have not been annotated + Our user should not receive another URL to annotate + Until another relevant URL is added + """ + ath = api_test_helper + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=2 + ) + url_ids 
= setup_info.url_ids + + # User should submit an annotation and receive the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + + ) + assert response.next_annotation is not None + + # User should submit this annotation and receive none for the next + response = await ath.request_validator.post_agency_annotation_and_get_next( + url_id=url_ids[1], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=False + ) + ) + assert response.next_annotation is None diff --git a/tests/automated/integration/api/annotate/agency/test_submit_new.py b/tests/automated/integration/api/annotate/agency/test_submit_new.py new file mode 100644 index 00000000..e82c767f --- /dev/null +++ b/tests/automated/integration/api/annotate/agency/test_submit_new.py @@ -0,0 +1,38 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency +from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo + + +@pytest.mark.asyncio +async def test_annotate_agency_submit_new(api_test_helper): + """ + Test Scenario: Submit New + Our user receives an annotation and marks it as `NEW` + This should complete successfully + And within the database the annotation should be marked as `NEW` + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( + db_data_creator=ath.db_data_creator, + url_count=1 + ) + url_ids = setup_info.url_ids + + # User should submit an annotation and mark it as New + response = await ath.request_validator.post_agency_annotation_and_get_next( + 
url_id=url_ids[0], + agency_annotation_post_info=URLAgencyAnnotationPostInfo( + suggested_agency=await ath.db_data_creator.agency(), + is_new=True + ) + ) + assert response.next_annotation is None + + # Within database, the annotation should be marked as `NEW` + all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_manual_suggestions) == 1 + assert all_manual_suggestions[0].is_new diff --git a/tests/automated/integration/api/annotate/all/__init__.py b/tests/automated/integration/api/annotate/all/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py new file mode 100644 index 00000000..5003f08f --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -0,0 +1,88 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all(api_test_helper): + """ + Test the happy path workflow for the all-annotations endpoint + The user should be able to get a valid URL (filtering on batch id if needed), + submit a full annotation, and receive another URL + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 
= await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_2 = setup_info_2.url_mapping + + # First, get a valid URL to annotate + get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() + + # Apply the second batch id as a filter and see that a different URL is returned + get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( + batch_id=setup_info_2.batch_id + ) + + assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id + + # Annotate the first and submit + agency_id = await ath.db_data_creator.agency() + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=False, + suggested_agency=agency_id + ) + ) + ) + assert post_response_1.next_annotation is not None + + # Confirm the second is received + assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id + + # Upon submitting the second, confirm that no more URLs are returned through either POST or GET + post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_2.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + ) + ) + assert post_response_2.next_annotation is None + + get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() + assert get_response_3.next_annotation is None + + + # Check that all annotations are present in the database + + # Should be two relevance annotations, one True and one False + all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(all_relevance_suggestions) == 2 + assert 
all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value + assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value + + # Should be one agency + all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + assert len(all_agency_suggestions) == 1 + assert all_agency_suggestions[0].is_new == False + assert all_agency_suggestions[0].agency_id == agency_id + + # Should be one record type + all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(all_record_type_suggestions) == 1 + assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value diff --git a/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py new file mode 100644 index 00000000..a11c43a3 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_post_batch_filtering.py @@ -0,0 +1,41 @@ +import pytest + +from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all_post_batch_filtering(api_test_helper): + """ + Batch filtering should also work when posting annotations + """ + ath = api_test_helper + adb_client = ath.adb_client() + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + setup_info_2 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + setup_info_3 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + 
) + url_mapping_3 = setup_info_3.url_mapping + + # Submit the first annotation, using the third batch id, and receive the third URL + post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + batch_id=setup_info_3.batch_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS, + agency=URLAgencyAnnotationPostInfo( + is_new=True + ) + ) + ) + + assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id diff --git a/tests/automated/integration/api/annotate/all/test_validation_error.py b/tests/automated/integration/api/annotate/all/test_validation_error.py new file mode 100644 index 00000000..b805a435 --- /dev/null +++ b/tests/automated/integration/api/annotate/all/test_validation_error.py @@ -0,0 +1,27 @@ +import pytest + +from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo +from src.core.enums import SuggestedStatus, RecordType +from src.core.exceptions import FailedValidationException +from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review + + +@pytest.mark.asyncio +async def test_annotate_all_validation_error(api_test_helper): + """ + Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response + """ + ath = api_test_helper + setup_info_1 = await setup_for_get_next_url_for_final_review( + db_data_creator=ath.db_data_creator, include_user_annotations=False + ) + url_mapping_1 = setup_info_1.url_mapping + + with pytest.raises(FailedValidationException) as e: + response = await ath.request_validator.post_all_annotations_and_get_next( + url_id=url_mapping_1.url_id, + all_annotations_post_info=AllAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT, + record_type=RecordType.ACCIDENT_REPORTS + ) + ) diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py new 
file mode 100644 index 00000000..39cfedab --- /dev/null +++ b/tests/automated/integration/api/annotate/helpers.py @@ -0,0 +1,22 @@ +from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo +from src.db.dtos.url.mapping import URLMapping + + +def check_url_mappings_match( + map_1: URLMapping, + map_2: URLMapping +): + assert map_1.url_id == map_2.url_id + assert map_2.url == map_2.url + + +def check_html_info_not_empty( + html_info: ResponseHTMLInfo +): + assert not html_info_empty(html_info) + + +def html_info_empty( + html_info: ResponseHTMLInfo +) -> bool: + return html_info.description == "" and html_info.title == "" diff --git a/tests/automated/integration/api/annotate/record_type/__init__.py b/tests/automated/integration/api/annotate/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/record_type/test_record_type.py b/tests/automated/integration/api/annotate/record_type/test_record_type.py new file mode 100644 index 00000000..5e6d8917 --- /dev/null +++ b/tests/automated/integration/api/annotate/record_type/test_record_type.py @@ -0,0 +1,166 @@ +from http import HTTPStatus + +import pytest +from fastapi import HTTPException + +from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo +from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo +from src.core.enums import RecordType +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def 
test_annotate_record_type(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct record type is returned + assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS + + # Annotate with value 'Personnel Records' and get next URL + request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.PERSONNEL_RECORDS + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match(inner_info_2.url_info, url_2) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + 
record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value + + # If user submits annotation for same URL, the URL should be overwritten + + request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.BOOKING_REPORTS + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.record_type == RecordType.BOOKING_REPORTS.value + + +@pytest.mark.asyncio +async def test_annotate_record_type_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_record_type_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Annotate with different user (default is 1) and get conflict error + try: + response = await ath.request_validator.post_record_type_annotation_and_get_next( + url_id=creation_info.url_ids[0], + 
record_type_annotation_post_info=RecordTypeAnnotationPostInfo( + record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS + ) + ) + except HTTPException as e: + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_record_type_no_html_info(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add record type attribute with value `Accident Reports` to 1st URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_1.url_id, + record_type=RecordType.ACCIDENT_REPORTS + ) + + # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL + await ath.db_data_creator.auto_record_type_suggestions( + url_id=url_2.url_id, + record_type=RecordType.DISPATCH_RECORDINGS + ) + + # Call `GET` `/annotate/record-type` and receive next URL + request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/relevancy/__init__.py b/tests/automated/integration/api/annotate/relevancy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/annotate/relevancy/test_relevancy.py b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py new file mode 100644 index 00000000..387d68c0 --- /dev/null +++ b/tests/automated/integration/api/annotate/relevancy/test_relevancy.py @@ -0,0 +1,213 @@ 
+from http import HTTPStatus + +import pytest +from fastapi import HTTPException + +from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo +from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo +from src.core.enums import SuggestedStatus +from src.core.error_manager.enums import ErrorTypes +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion +from tests.automated.integration.api.annotate.helpers import check_url_mappings_match, check_html_info_not_empty, \ + html_info_empty +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + + +@pytest.mark.asyncio +async def test_annotate_relevancy(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Add HTML data to both + await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + check_html_info_not_empty(inner_info_1.html_info) + + # Validate that the correct relevant value is returned + assert inner_info_1.annotation.is_relevant is True + + # A second user should see the same URL + + + # Annotate 
with value 'False' and get next URL + request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + + inner_info_2 = request_info_2.next_annotation + + check_url_mappings_match( + inner_info_2.url_info, + url_2 + ) + check_html_info_not_empty(inner_info_2.html_info) + + request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_2.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_3.next_annotation is None + + # Get all URL annotations. Confirm they exist for user + adb_client = ath.adb_client() + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + result_1 = results[0] + result_2 = results[1] + + assert result_1.url_id == inner_info_1.url_info.url_id + assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value + + assert result_2.url_id == inner_info_2.url_info.url_id + assert result_2.suggested_status == SuggestedStatus.RELEVANT.value + + # If user submits annotation for same URL, the URL should be overwritten + request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( + url_id=inner_info_1.url_info.url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.RELEVANT + ) + ) + + assert request_info_4.next_annotation is None + + results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) + assert len(results) == 2 + + for result in results: + if result.url_id == inner_info_1.url_info.url_id: + assert result.suggested_status == 
SuggestedStatus.RELEVANT.value + + +async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): + response = ath.request_validator.post_relevance_annotation_and_get_next( + url_id=url_id, + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=annotation + ) + ) + + assert response.next_annotation is None + + results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) + assert len(results) == 1 + assert results[0].suggested_status == annotation.value + + +@pytest.mark.asyncio +async def test_annotate_relevancy_broken_page(api_test_helper): + ath = api_test_helper + + creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + annotation=SuggestedStatus.BROKEN_PAGE_404 + ) + + +@pytest.mark.asyncio +async def test_annotate_relevancy_individual_record(api_test_helper): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await post_and_validate_relevancy_annotation( + ath, + url_id=creation_info.url_ids[0], + annotation=SuggestedStatus.INDIVIDUAL_RECORD + ) + + +@pytest.mark.asyncio +async def test_annotate_relevancy_already_annotated_by_different_user( + api_test_helper +): + ath = api_test_helper + + creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( + url_count=1 + ) + + await ath.db_data_creator.user_relevant_suggestion( + url_id=creation_info.url_ids[0], + user_id=2, + suggested_status=SuggestedStatus.RELEVANT + ) + + # Annotate with different user (default is 1) and get conflict error + try: + response = await ath.request_validator.post_relevance_annotation_and_get_next( + url_id=creation_info.url_ids[0], + relevance_annotation_post_info=RelevanceAnnotationPostInfo( + suggested_status=SuggestedStatus.NOT_RELEVANT + ) + ) + except 
HTTPException as e: + assert e.status_code == HTTPStatus.CONFLICT + assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value + assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" + + +@pytest.mark.asyncio +async def test_annotate_relevancy_no_html(api_test_helper): + ath = api_test_helper + + batch_id = ath.db_data_creator.batch() + + # Create 2 URLs with outcome `pending` + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) + + url_1 = iui.url_mappings[0] + url_2 = iui.url_mappings[1] + + # Add `Relevancy` attribute with value `True` to 1st URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_1.url_id, + relevant=True + ) + + # Add 'Relevancy' attribute with value `False` to 2nd URL + await ath.db_data_creator.auto_relevant_suggestions( + url_id=url_2.url_id, + relevant=False + ) + + # Call `GET` `/annotate/relevance` and receive next URL + request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() + inner_info_1 = request_info_1.next_annotation + + check_url_mappings_match(inner_info_1.url_info, url_1) + assert html_info_empty(inner_info_1.html_info) diff --git a/tests/automated/integration/api/annotate/test_.py b/tests/automated/integration/api/annotate/test_.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/__init__.py b/tests/automated/integration/api/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/__init__.py b/tests/automated/integration/api/batch/summaries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py new file mode 100644 index 00000000..d91e1a8c --- /dev/null +++ 
b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -0,0 +1,95 @@ +import pytest + +from src.core.enums import BatchStatus +from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters + + +@pytest.mark.asyncio +async def test_get_batch_summaries(api_test_helper): + ath = api_test_helper + + batch_params = [ + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=1, + status=URLCreationEnum.OK + ), + TestURLCreationParameters( + count=2, + status=URLCreationEnum.SUBMITTED + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=4, + status=URLCreationEnum.NOT_RELEVANT + ), + TestURLCreationParameters( + count=3, + status=URLCreationEnum.ERROR + ) + ] + ), + TestBatchCreationParameters( + urls=[ + TestURLCreationParameters( + count=7, + status=URLCreationEnum.DUPLICATE + ), + TestURLCreationParameters( + count=1, + status=URLCreationEnum.SUBMITTED + ) + ] + ) + ] + + batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) + batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) + batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) + + batch_1_id = batch_1_creation_info.batch_id + batch_2_id = batch_2_creation_info.batch_id + batch_3_id = batch_3_creation_info.batch_id + + + response = ath.request_validator.get_batch_statuses() + results = response.results + + assert len(results) == 3 + + result_1 = results[0] + assert result_1.id == batch_1_id + assert result_1.status == BatchStatus.READY_TO_LABEL + counts_1 = result_1.url_counts + assert counts_1.total == 3 + assert counts_1.pending == 1 + assert counts_1.submitted == 2 + assert counts_1.not_relevant == 0 + assert counts_1.duplicate == 0 + assert counts_1.errored == 0 + + result_2 
= results[1] + assert result_2.id == batch_2_id + counts_2 = result_2.url_counts + assert counts_2.total == 7 + assert counts_2.not_relevant == 4 + assert counts_2.errored == 3 + assert counts_2.pending == 3 + assert counts_2.submitted == 0 + assert counts_2.duplicate == 0 + + result_3 = results[2] + assert result_3.id == batch_3_id + counts_3 = result_3.url_counts + assert counts_3.total == 8 + assert counts_3.not_relevant == 0 + assert counts_3.errored == 0 + assert counts_3.pending == 7 + assert counts_3.submitted == 1 + assert counts_3.duplicate == 7 diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py new file mode 100644 index 00000000..7fdc96b1 --- /dev/null +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -0,0 +1,75 @@ +import pytest + +from src.collectors.enums import CollectorType +from src.core.enums import BatchStatus +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_get_batch_summaries_pending_url_filter(api_test_helper): + ath = api_test_helper + dbdc: DBDataCreator = ath.db_data_creator + + # Add an errored out batch + batch_error: int = await dbdc.create_batch(status=BatchStatus.ERROR) + + # Add a batch with pending urls + batch_pending = await ath.db_data_creator.batch_and_urls( + strategy=CollectorType.EXAMPLE, + url_count=2, + batch_status=BatchStatus.READY_TO_LABEL, + with_html_content=True, + url_status=URLCreationEnum.OK + ) + + # Add a batch with submitted URLs + batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] + 
await dbdc.create_batch_url_links( + batch_id=batch_submitted, + url_ids=submitted_url_ids + ) + + # Add an aborted batch + batch_aborted: int = await dbdc.create_batch(status=BatchStatus.ABORTED) + + # Add a batch with validated URLs + batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) + validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( + count=2 + ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] + await dbdc.create_batch_url_links( + batch_id=batch_validated, + url_ids=validated_url_ids + ) + + # Test filter for pending URLs and only retrieve the second batch + pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=True + ) + + assert len(pending_urls_results.results) == 1 + assert pending_urls_results.results[0].id == batch_pending.batch_id + + # Test filter without pending URLs and retrieve the other four batches + no_pending_urls_results = ath.request_validator.get_batch_statuses( + has_pending_urls=False + ) + + assert len(no_pending_urls_results.results) == 4 + for result in no_pending_urls_results.results: + assert result.id in [ + batch_error, + batch_submitted, + batch_validated, + batch_aborted + ] + + # Test no filter for pending URLs and retrieve all batches + no_filter_results = ath.request_validator.get_batch_statuses() + + assert len(no_filter_results.results) == 5 diff --git a/tests/automated/integration/api/batch/test_batch.py b/tests/automated/integration/api/batch/test_batch.py new file mode 100644 index 00000000..86f35cfc --- /dev/null +++ b/tests/automated/integration/api/batch/test_batch.py @@ -0,0 +1,64 @@ +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.dtos.url.insert import InsertURLsInfo +from src.collectors.impl.example.dtos.input import ExampleInputDTO +from src.core.enums import BatchStatus + + +def test_abort_batch(api_test_helper): + ath = api_test_helper + + dto = 
ExampleInputDTO( + sleep_time=1 + ) + + batch_id = ath.request_validator.example_collector(dto=dto)["batch_id"] + + response = ath.request_validator.abort_batch(batch_id=batch_id) + + assert response.message == "Batch aborted." + + bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) + + assert bi.status == BatchStatus.ABORTED + +def test_get_batch_urls(api_test_helper): + + # Insert batch and urls into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) + assert len(response.urls) == 100 + # Check that the first url corresponds to the first url inserted + assert response.urls[0].url == iui.url_mappings[0].url + # Check that the last url corresponds to the 100th url inserted + assert response.urls[-1].url == iui.url_mappings[99].url + + + # Check that a more limited set of urls exist + response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) + assert len(response.urls) == 1 + # Check that this url corresponds to the last url inserted + assert response.urls[0].url == iui.url_mappings[-1].url + +def test_get_duplicate_urls(api_test_helper): + + # Insert batch and url into database + ath = api_test_helper + batch_id = ath.db_data_creator.batch() + iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) + # Get a list of all url ids + url_ids = [url.url_id for url in iui.url_mappings] + + # Create a second batch which will be associated with the duplicates + dup_batch_id = ath.db_data_creator.batch() + + # Insert duplicate urls into database + ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) + + response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) + assert len(response.duplicates) == 100 + + response = 
ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) + assert len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/example_collector/test_happy_path.py b/tests/automated/integration/api/example_collector/test_happy_path.py index bbb52789..d580f546 100644 --- a/tests/automated/integration/api/example_collector/test_happy_path.py +++ b/tests/automated/integration/api/example_collector/test_happy_path.py @@ -6,7 +6,7 @@ from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from src.collectors.impl.example.dtos.input import ExampleInputDTO from src.collectors.enums import CollectorType from src.core.logger import AsyncCoreLogger diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 084762b9..4b7b4f75 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -2,44 +2,65 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ + create_batch_url_links, 
create_validated_flags +from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio -async def test_get_batches_aggregated_metrics(api_test_helper): +async def test_get_batches_aggregated_metrics( + api_test_helper, + wiped_database +): ath = api_test_helper + adb_client: AsyncDatabaseClient = ath.adb_client() # Create successful batches with URLs of different statuses - all_params = [] for i in range(3): - params = TestBatchCreationParameters( + batch_id = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ) - ] ) - all_params.append(params) - + url_mappings_error: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + url_mappings_ok: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.OK, + count=11, + ) + url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id, + url_ids=url_ids_all, + ) + urls_submitted: list[int] = url_ids_all[:2] + urls_not_relevant: list[int] = url_ids_all[2:5] + urls_validated: list[int] = url_ids_all[5:10] + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_validated + urls_submitted, + validation_type=URLValidatedType.DATA_SOURCE, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=urls_not_relevant, + validation_type=URLValidatedType.NOT_RELEVANT, + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=urls_submitted, + ) + 
all_params = [] # Create failed batches for i in range(2): params = TestBatchCreationParameters( @@ -66,8 +87,8 @@ async def test_get_batches_aggregated_metrics(api_test_helper): assert inner_dto_manual.count_urls == 45 assert inner_dto_manual.count_successful_batches == 3 assert inner_dto_manual.count_failed_batches == 0 - assert inner_dto_manual.count_urls_pending == 3 + assert inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 assert inner_dto_manual.count_urls_errors == 12 - assert inner_dto_manual.count_urls_validated == 15 + assert inner_dto_manual.count_urls_validated == 30 diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index 0cce8740..0657c66f 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,79 +1,102 @@ +from datetime import datetime, timedelta + import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ + create_url_data_sources @pytest.mark.asyncio async def test_get_batches_breakdown_metrics(api_test_helper): # Create a different batch for each month, with different URLs - today = pendulum.parse('2021-01-01') + today = datetime.now() ath = api_test_helper + adb_client: AsyncDatabaseClient = ath.adb_client() - 
batch_1_params = TestBatchCreationParameters( + batch_id_1 = await create_batch( + adb_client=adb_client, strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) - batch_2_params = TestBatchCreationParameters( - strategy=CollectorType.EXAMPLE, - outcome=BatchStatus.ERROR, - created_at=today.subtract(weeks=1), + url_mappings_1: list[URLMapping] = await create_urls( + adb_client=adb_client, + count=3, + ) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] + await create_batch_url_links(adb_client=adb_client, batch_id=batch_id_1, url_ids=url_ids_1) + await create_validated_flags( + adb_client=adb_client, + url_ids=url_ids_1[:2], + validation_type=URLValidatedType.DATA_SOURCE + ) + await create_url_data_sources( + adb_client=adb_client, + url_ids=url_ids_1[:2], ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) - batch_3_params = TestBatchCreationParameters( + + batch_id_2 = await create_batch( + adb_client=adb_client, + status=BatchStatus.ERROR, + date_generated=today - timedelta(days=7), + ) + + batch_id_3 = await create_batch( + adb_client=adb_client, strategy=CollectorType.AUTO_GOOGLER, - created_at=today.subtract(weeks=2), - urls=[ - TestURLCreationParameters( - count=3, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=4, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + date_generated=today - timedelta(days=14) ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + error_url_mappings: list[URLMapping] = await create_urls( + adb_client=adb_client, + status=URLStatus.ERROR, + count=4, + ) + error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] + validated_url_mappings: list[URLMapping] = await 
create_urls( + adb_client=adb_client, + count=8, + ) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[:3], + validation_type=URLValidatedType.NOT_RELEVANT, + ) + await create_validated_flags( + adb_client=adb_client, + url_ids=validated_url_ids[4:9], + validation_type=URLValidatedType.DATA_SOURCE, + ) + await create_batch_url_links( + adb_client=adb_client, + batch_id=batch_id_3, + url_ids=error_url_ids + validated_url_ids, + ) + dto_1 = await ath.request_validator.get_batches_breakdown_metrics( page=1 ) assert len(dto_1.batches) == 3 dto_batch_1 = dto_1.batches[2] - assert dto_batch_1.batch_id == batch_1.batch_id + assert dto_batch_1.batch_id == batch_id_1 assert dto_batch_1.strategy == CollectorType.MANUAL assert dto_batch_1.status == BatchStatus.READY_TO_LABEL - assert pendulum.instance(dto_batch_1.created_at) > today assert dto_batch_1.count_url_total == 3 assert dto_batch_1.count_url_pending == 1 assert dto_batch_1.count_url_submitted == 2 assert dto_batch_1.count_url_rejected == 0 assert dto_batch_1.count_url_error == 0 - assert dto_batch_1.count_url_validated == 0 + assert dto_batch_1.count_url_validated == 2 dto_batch_2 = dto_1.batches[1] - assert dto_batch_2.batch_id == batch_2.batch_id + assert dto_batch_2.batch_id == batch_id_2 assert dto_batch_2.status == BatchStatus.ERROR assert dto_batch_2.strategy == CollectorType.EXAMPLE - assert pendulum.instance(dto_batch_2.created_at) == today.subtract(weeks=1) assert dto_batch_2.count_url_total == 0 assert dto_batch_2.count_url_submitted == 0 assert dto_batch_2.count_url_pending == 0 @@ -82,16 +105,15 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_2.count_url_validated == 0 dto_batch_3 = dto_1.batches[0] - assert dto_batch_3.batch_id == batch_3.batch_id + assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == 
BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert pendulum.instance(dto_batch_3.created_at) == today.subtract(weeks=2) assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 0 + assert dto_batch_3.count_url_pending == 5 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 assert dto_batch_3.count_url_error == 4 - assert dto_batch_3.count_url_validated == 5 + assert dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( page=2 diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index a6807a23..e48db202 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -3,9 +3,13 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import SuggestedStatus +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio @@ -14,29 +18,22 @@ async def test_get_backlog_metrics(api_test_helper): ath = api_test_helper adb_client = ath.adb_client() + ddc: DBDataCreator = ath.db_data_creator # Populate the backlog table and test that backlog metrics returned on a monthly basis # Ensure that multiple days in each month are added to the backlog table, with different values - - batch_1_params = TestBatchCreationParameters( - strategy=CollectorType.MANUAL, - urls=[ - 
TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] + batch_1_id: int = await ddc.create_batch() + url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] + await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) + submitted_url_ids_1: list[int] = url_ids_1[:2] + await ddc.create_validated_flags( + url_ids=submitted_url_ids_1, + validation_type=URLValidatedType.DATA_SOURCE ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + await ddc.create_url_data_sources(url_ids=submitted_url_ids_1) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=3).naive() @@ -46,23 +43,20 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=2, days=3).naive() ) - batch_2_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - ] + batch_2_id: int = await ddc.create_batch() + not_relevant_url_mappings_2: list[URLMapping] = await ddc.create_urls(count=6) + not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] + await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) + await ddc.create_validated_flags( + url_ids=not_relevant_url_ids_2[:4], + validation_type=URLValidatedType.NOT_RELEVANT + ) + error_url_mappings_2: list[URLMapping] = await ddc.create_urls( + status=URLStatus.ERROR, + count=2 ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping 
in error_url_mappings_2] + await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() @@ -72,23 +66,15 @@ async def test_get_backlog_metrics(api_test_helper): dt=today.subtract(months=1, days=4).naive() ) - batch_3_params = TestBatchCreationParameters( - strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.PENDING, - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.NOT_RELEVANT - ) - ), - TestURLCreationParameters( - count=5, - status=URLStatus.VALIDATED - ), - ] + batch_3_id: int = await ddc.create_batch() + url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] + await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) + await ddc.create_validated_flags( + url_ids=url_ids_3[:5], + validation_type=URLValidatedType.DATA_SOURCE ) - batch_3 = await ath.db_data_creator.batch_v2(batch_3_params) + await adb_client.populate_backlog_snapshot( dt=today.subtract(months=1).naive() @@ -100,5 +86,5 @@ async def test_get_backlog_metrics(api_test_helper): # Test that the count closest to the beginning of the month is returned for each month assert dto.entries[0].count_pending_total == 1 - assert dto.entries[1].count_pending_total == 5 - assert dto.entries[2].count_pending_total == 12 + assert dto.entries[1].count_pending_total == 3 + assert dto.entries[2].count_pending_total == 10 diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index c8957952..08c52845 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,75 +1,70 @@ +from datetime import datetime, timedelta, timezone + import pendulum import pytest 
from src.collectors.enums import CollectorType, URLStatus +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_get_urls_aggregated_metrics(api_test_helper): ath = api_test_helper - today = pendulum.parse('2021-01-01') + today = datetime.now() + + ddc: DBDataCreator = ath.db_data_creator batch_0_params = TestBatchCreationParameters( strategy=CollectorType.MANUAL, - created_at=today.subtract(days=1), + created_at=today - timedelta(days=1), urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, ), ] ) - batch_0 = await ath.db_data_creator.batch_v2(batch_0_params) - oldest_url_id = batch_0.urls_by_status[URLStatus.PENDING].url_mappings[0].url_id - + batch_0: int = await ddc.create_batch( + strategy=CollectorType.MANUAL, + date_generated=today - timedelta(days=1) + ) + url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + oldest_url_id: int = url_mappings_0[0].url_id - batch_1_params = TestBatchCreationParameters( + batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ), - ] ) - batch_1 = await ath.db_data_creator.batch_v2(batch_1_params) + url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] + await 
ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) - batch_2_params = TestBatchCreationParameters( + batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.PENDING, - ), - TestURLCreationParameters( - count=2, - status=URLStatus.ERROR - ), - TestURLCreationParameters( - count=1, - status=URLStatus.VALIDATED - ), - TestURLCreationParameters( - count=5, - status=URLStatus.NOT_RELEVANT - ), - ] ) - batch_2 = await ath.db_data_creator.batch_v2(batch_2_params) + url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) + url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLValidatedType.DATA_SOURCE) + url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLValidatedType.NOT_RELEVANT) + url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] + url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] + await ddc.create_batch_url_links( + url_ids=url_ids_2_validated + url_ids_2_not_relevant, + batch_id=batch_2 + ) + + dto = await ath.request_validator.get_urls_aggregated_metrics() assert dto.oldest_pending_url_id == oldest_url_id - assert dto.oldest_pending_url_created_at == today.subtract(days=1).in_timezone('UTC').naive() - assert dto.count_urls_pending == 6 assert dto.count_urls_rejected == 5 assert dto.count_urls_errors == 2 - assert dto.count_urls_validated == 1 + assert dto.count_urls_validated == 8 assert dto.count_urls_submitted == 2 assert dto.count_urls_total == 16 diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py 
b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index e81d6ec7..02f1aae2 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -6,6 +6,7 @@ from src.core.enums import SuggestedStatus, RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -27,14 +28,14 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.NOT_RELEVANT ) ), TestURLCreationParameters( count=2, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), ] ) @@ -44,7 +45,7 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.CALLS_FOR_SERVICE @@ -60,15 +61,15 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), TestURLCreationParameters( count=4, - status=URLStatus.ERROR + status=URLCreationEnum.ERROR ), TestURLCreationParameters( count=5, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.INCARCERATION_RECORDS, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py 
b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index 71e00e51..cbd30f8b 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -3,6 +3,7 @@ from src.collectors.enums import CollectorType, URLStatus from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @@ -18,11 +19,11 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=1, - status=URLStatus.PENDING + status=URLCreationEnum.OK ), TestURLCreationParameters( count=2, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), ] ) @@ -32,7 +33,7 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ) ], created_at=today.subtract(weeks=1), @@ -44,15 +45,15 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): urls=[ TestURLCreationParameters( count=3, - status=URLStatus.SUBMITTED + status=URLCreationEnum.SUBMITTED ), TestURLCreationParameters( count=4, - status=URLStatus.ERROR + status=URLCreationEnum.ERROR ), TestURLCreationParameters( count=5, - status=URLStatus.VALIDATED + status=URLCreationEnum.VALIDATED ), ] ) diff --git a/tests/automated/integration/api/review/conftest.py b/tests/automated/integration/api/review/conftest.py index e4345821..59d76930 100644 --- a/tests/automated/integration/api/review/conftest.py +++ b/tests/automated/integration/api/review/conftest.py @@ -5,32 +5,18 @@ from src.core.enums import SuggestedStatus, RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from 
tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters @pytest_asyncio.fixture async def batch_url_creation_info(db_data_creator): - simple_parameter_statuses = [ - URLStatus.VALIDATED, - URLStatus.SUBMITTED, - URLStatus.INDIVIDUAL_RECORD, - URLStatus.NOT_RELEVANT, - URLStatus.ERROR, - URLStatus.DUPLICATE, - URLStatus.NOT_FOUND - ] - simple_parameters = [ - TestURLCreationParameters( - status=status - ) for status in simple_parameter_statuses - ] parameters = TestBatchCreationParameters( urls=[ - *simple_parameters, TestURLCreationParameters( count=2, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, annotation_info=AnnotationInfo( user_relevant=SuggestedStatus.RELEVANT, user_record_type=RecordType.ARREST_RECORDS, diff --git a/tests/automated/integration/api/review/rejection/test_individual_record.py b/tests/automated/integration/api/review/rejection/test_individual_record.py index 6e81d378..33addd91 100644 --- a/tests/automated/integration/api/review/rejection/test_individual_record.py +++ b/tests/automated/integration/api/review/rejection/test_individual_record.py @@ -2,14 +2,21 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test +from tests.helpers.api_test_helper import APITestHelper @pytest.mark.asyncio -async def test_rejection_individual_record(api_test_helper): +async def test_rejection_individual_record(api_test_helper: APITestHelper): await run_rejection_test( api_test_helper, rejection_reason=RejectionReason.INDIVIDUAL_RECORD, - 
url_status=URLStatus.INDIVIDUAL_RECORD + url_status=URLStatus.OK ) + # Get FlagURLValidated and confirm Individual Record + flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] + assert flag.type == URLValidatedType.INDIVIDUAL_RECORD + diff --git a/tests/automated/integration/api/review/rejection/test_not_relevant.py b/tests/automated/integration/api/review/rejection/test_not_relevant.py index 1ad2847f..03ee72d3 100644 --- a/tests/automated/integration/api/review/rejection/test_not_relevant.py +++ b/tests/automated/integration/api/review/rejection/test_not_relevant.py @@ -2,6 +2,8 @@ from src.api.endpoints.review.enums import RejectionReason from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from tests.automated.integration.api.review.rejection.helpers import run_rejection_test @@ -10,5 +12,9 @@ async def test_rejection_not_relevant(api_test_helper): await run_rejection_test( api_test_helper, rejection_reason=RejectionReason.NOT_RELEVANT, - url_status=URLStatus.NOT_RELEVANT + url_status=URLStatus.OK ) + + # Get FlagURLValidated and confirm Not Relevant + flag: FlagURLValidated = (await api_test_helper.adb_client().get_all(FlagURLValidated))[0] + assert flag.type == URLValidatedType.NOT_RELEVANT \ No newline at end of file diff --git a/tests/automated/integration/api/review/test_approve_and_get_next_source.py b/tests/automated/integration/api/review/test_approve_and_get_next_source.py index bfa126b1..69cf13d2 100644 --- a/tests/automated/integration/api/review/test_approve_and_get_next_source.py +++ b/tests/automated/integration/api/review/test_approve_and_get_next_source.py @@ -6,6 +6,8 @@ from src.core.enums import RecordType from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.url_validated.enums 
import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -55,7 +57,7 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): url = urls[0] assert url.id == url_mapping.url_id assert url.record_type == RecordType.ARREST_RECORDS - assert url.status == URLStatus.VALIDATED + assert url.status == URLStatus.OK assert url.name == "New Test Name" assert url.description == "New Test Description" @@ -76,3 +78,8 @@ async def test_approve_and_get_next_source_for_review(api_test_helper): for agency in agencies: if agency.agency_id == additional_agency: assert agency.name == PLACEHOLDER_AGENCY_NAME + + # Confirm presence of FlagURLValidated + flag_url_validated = await adb_client.get_all(FlagURLValidated) + assert len(flag_url_validated) == 1 + assert flag_url_validated[0].type == URLValidatedType.DATA_SOURCE \ No newline at end of file diff --git a/tests/automated/integration/api/review/test_batch_filtering.py b/tests/automated/integration/api/review/test_batch_filtering.py index 2e8aa63c..481f7e90 100644 --- a/tests/automated/integration/api/review/test_batch_filtering.py +++ b/tests/automated/integration/api/review/test_batch_filtering.py @@ -1,21 +1,37 @@ import pytest +from src.collectors.enums import URLStatus +from src.db.dtos.url.mapping import URLMapping +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo + @pytest.mark.asyncio async def test_batch_filtering( - batch_url_creation_info, + batch_url_creation_info: BatchURLCreationInfo, api_test_helper ): ath = api_test_helper rv = ath.request_validator + dbdc: DBDataCreator = ath.db_data_creator + + batch_id: int = 
batch_url_creation_info.batch_id + + validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls(count=4) + validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] + await dbdc.create_batch_url_links( + url_ids=validated_url_ids, + batch_id=batch_id + ) + # Receive null batch info if batch id not provided outer_result_no_batch_info = await rv.review_next_source() assert outer_result_no_batch_info.next_source.batch_info is None # Get batch info if batch id is provided outer_result = await ath.request_validator.review_next_source( - batch_id=batch_url_creation_info.batch_id + batch_id=batch_id ) assert outer_result.remaining == 2 batch_info = outer_result.next_source.batch_info diff --git a/tests/automated/integration/api/test_annotate.py b/tests/automated/integration/api/test_annotate.py deleted file mode 100644 index 51688765..00000000 --- a/tests/automated/integration/api/test_annotate.py +++ /dev/null @@ -1,756 +0,0 @@ -from http import HTTPStatus - -import pytest -from fastapi import HTTPException - -from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.api.endpoints.annotate.all.post.dto import AllAnnotationPostInfo -from src.api.endpoints.annotate.dtos.record_type.post import RecordTypeAnnotationPostInfo -from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseOuterInfo -from src.api.endpoints.annotate.relevance.get.dto import GetNextRelevanceAnnotationResponseOuterInfo -from src.api.endpoints.annotate.relevance.post.dto import RelevanceAnnotationPostInfo -from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion -from src.core.error_manager.enums import ErrorTypes -from src.core.enums import RecordType, 
SuggestionType, SuggestedStatus -from src.core.exceptions import FailedValidationException -from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserRelevantSuggestion -from tests.helpers.setup.annotate_agency.model import AnnotateAgencySetupInfo -from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review -from tests.helpers.setup.annotate_agency.core import setup_for_annotate_agency -from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo -from tests.automated.integration.api.conftest import MOCK_USER_ID - -def check_url_mappings_match( - map_1: URLMapping, - map_2: URLMapping -): - assert map_1.url_id == map_2.url_id - assert map_2.url == map_2.url - -def check_html_info_not_empty( - html_info: ResponseHTMLInfo -): - assert not html_info_empty(html_info) - -def html_info_empty( - html_info: ResponseHTMLInfo -) -> bool: - return html_info.description == "" and html_info.title == "" - -@pytest.mark.asyncio -async def test_annotate_relevancy(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - 
inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct relevant value is returned - assert inner_info_1.annotation.is_relevant is True - - # A second user should see the same URL - - - # Annotate with value 'False' and get next URL - request_info_2: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match( - inner_info_2.url_info, - url_2 - ) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. 
Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.suggested_status == SuggestedStatus.NOT_RELEVANT.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.suggested_status == SuggestedStatus.RELEVANT.value - - # If user submits annotation for same URL, the URL should be overwritten - request_info_4: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.post_relevance_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert results[0].suggested_status == SuggestedStatus.RELEVANT.value - -async def post_and_validate_relevancy_annotation(ath, url_id, annotation: SuggestedStatus): - response = ath.request_validator.post_relevance_annotation_and_get_next( - url_id=url_id, - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=annotation - ) - ) - - assert response.next_annotation is None - - results: list[UserRelevantSuggestion] = await ath.adb_client().get_all(UserRelevantSuggestion) - assert len(results) == 1 - assert results[0].suggested_status == annotation.value - -@pytest.mark.asyncio -async def test_annotate_relevancy_broken_page(api_test_helper): - ath = api_test_helper - - creation_info = await ath.db_data_creator.batch_and_urls(url_count=1, with_html_content=False) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - 
annotation=SuggestedStatus.BROKEN_PAGE_404 - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_individual_record(api_test_helper): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await post_and_validate_relevancy_annotation( - ath, - url_id=creation_info.url_ids[0], - annotation=SuggestedStatus.INDIVIDUAL_RECORD - ) - -@pytest.mark.asyncio -async def test_annotate_relevancy_already_annotated_by_different_user( - api_test_helper -): - ath = api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_relevant_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - suggested_status=SuggestedStatus.RELEVANT - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_relevance_annotation_and_get_next( - url_id=creation_info.url_ids[0], - relevance_annotation_post_info=RelevanceAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RELEVANCE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_relevancy_no_html(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add `Relevancy` attribute with value `True` to 1st URL - await ath.db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, - relevant=True - ) - - # Add 'Relevancy' attribute with value `False` to 2nd URL - await 
ath.db_data_creator.auto_relevant_suggestions( - url_id=url_2.url_id, - relevant=False - ) - - # Call `GET` `/annotate/relevance` and receive next URL - request_info_1: GetNextRelevanceAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_relevance_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio -async def test_annotate_record_type(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Add HTML data to both - await ath.db_data_creator.html_data([url_1.url_id, url_2.url_id]) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - check_html_info_not_empty(inner_info_1.html_info) - - # Validate that the correct record type is returned - assert inner_info_1.suggested_record_type == RecordType.ACCIDENT_REPORTS - - # Annotate with value 'Personnel Records' and get next URL - request_info_2: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - 
url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.PERSONNEL_RECORDS - ) - ) - - inner_info_2 = request_info_2.next_annotation - - check_url_mappings_match(inner_info_2.url_info, url_2) - check_html_info_not_empty(inner_info_2.html_info) - - request_info_3: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_2.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - - assert request_info_3.next_annotation is None - - # Get all URL annotations. Confirm they exist for user - adb_client = ath.adb_client() - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - result_1 = results[0] - result_2 = results[1] - - assert result_1.url_id == inner_info_1.url_info.url_id - assert result_1.record_type == RecordType.PERSONNEL_RECORDS.value - - assert result_2.url_id == inner_info_2.url_info.url_id - assert result_2.record_type == RecordType.ANNUAL_AND_MONTHLY_REPORTS.value - - # If user submits annotation for same URL, the URL should be overwritten - - request_info_4: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.post_record_type_annotation_and_get_next( - url_id=inner_info_1.url_info.url_id, - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.BOOKING_REPORTS - ) - ) - - assert request_info_4.next_annotation is None - - results: list[UserRecordTypeSuggestion] = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(results) == 2 - - for result in results: - if result.url_id == inner_info_1.url_info.url_id: - assert result.record_type == RecordType.BOOKING_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_record_type_already_annotated_by_different_user( - api_test_helper -): - ath = 
api_test_helper - - creation_info: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1 - ) - - await ath.db_data_creator.user_record_type_suggestion( - url_id=creation_info.url_ids[0], - user_id=2, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Annotate with different user (default is 1) and get conflict error - try: - response = await ath.request_validator.post_record_type_annotation_and_get_next( - url_id=creation_info.url_ids[0], - record_type_annotation_post_info=RecordTypeAnnotationPostInfo( - record_type=RecordType.ANNUAL_AND_MONTHLY_REPORTS - ) - ) - except HTTPException as e: - assert e.status_code == HTTPStatus.CONFLICT - assert e.detail["detail"]["code"] == ErrorTypes.ANNOTATION_EXISTS.value - assert e.detail["detail"]["message"] == f"Annotation of type RECORD_TYPE already exists for url {creation_info.url_ids[0]}" - - -@pytest.mark.asyncio -async def test_annotate_record_type_no_html_info(api_test_helper): - ath = api_test_helper - - batch_id = ath.db_data_creator.batch() - - # Create 2 URLs with outcome `pending` - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=2) - - url_1 = iui.url_mappings[0] - url_2 = iui.url_mappings[1] - - # Add record type attribute with value `Accident Reports` to 1st URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_1.url_id, - record_type=RecordType.ACCIDENT_REPORTS - ) - - # Add 'Record Type' attribute with value `Dispatch Recordings` to 2nd URL - await ath.db_data_creator.auto_record_type_suggestions( - url_id=url_2.url_id, - record_type=RecordType.DISPATCH_RECORDINGS - ) - - # Call `GET` `/annotate/record-type` and receive next URL - request_info_1: GetNextRecordTypeAnnotationResponseOuterInfo = api_test_helper.request_validator.get_next_record_type_annotation() - inner_info_1 = request_info_1.next_annotation - - check_url_mappings_match(inner_info_1.url_info, url_1) - assert html_info_empty(inner_info_1.html_info) - -@pytest.mark.asyncio 
-async def test_annotate_agency_multiple_auto_suggestions(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that two agency_suggestions exist - assert len(next_annotation.agency_suggestions) == 2 - - for agency_suggestion in next_annotation.agency_suggestions: - assert agency_suggestion.suggestion_type == SuggestionType.AUTO_SUGGESTION - assert agency_suggestion.pdap_agency_id is not None - assert agency_suggestion.agency_name is not None - assert agency_suggestion.state is not None - assert agency_suggestion.county is not None - assert agency_suggestion.locality is not None - - -@pytest.mark.asyncio -async def test_annotate_agency_multiple_auto_suggestions_no_html(api_test_helper): - """ - Test Scenario: Multiple Auto Suggestions - A URL has multiple Agency Auto Suggestion and has not been annotated by the User - The user should receive all of the auto suggestions with full detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=False - ) - await ath.db_data_creator.auto_suggestions( - 
url_ids=buci.url_ids, - num_suggestions=2, - suggestion_type=SuggestionType.AUTO_SUGGESTION - ) - - # User requests next annotation - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is not present - assert next_annotation.html_info.description == "" - assert next_annotation.html_info.title == "" - -@pytest.mark.asyncio -async def test_annotate_agency_single_unknown_auto_suggestion(api_test_helper): - """ - Test Scenario: Single Unknown Auto Suggestion - A URL has a single Unknown Agency Auto Suggestion and has not been annotated by the User - The user should receive a single Unknown Auto Suggestion lacking other detail - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.auto_suggestions( - url_ids=buci.url_ids, - num_suggestions=1, - suggestion_type=SuggestionType.UNKNOWN - ) - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == buci.url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - agency_suggestion = next_annotation.agency_suggestions[0] - - assert agency_suggestion.suggestion_type == SuggestionType.UNKNOWN - assert agency_suggestion.pdap_agency_id is None - assert agency_suggestion.agency_name is None - assert agency_suggestion.state is None - assert agency_suggestion.county is None - assert agency_suggestion.locality is 
None - - -@pytest.mark.asyncio -async def test_annotate_agency_single_confirmed_agency(api_test_helper): - """ - Test Scenario: Single Confirmed Agency - A URL has a single Confirmed Agency and has not been annotated by the User - The user should not receive this URL to annotate - """ - ath = api_test_helper - buci: BatchURLCreationInfo = await ath.db_data_creator.batch_and_urls( - url_count=1, - with_html_content=True - ) - await ath.db_data_creator.confirmed_suggestions( - url_ids=buci.url_ids, - ) - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_other_user_annotation(api_test_helper): - """ - Test Scenario: Other User Annotation - A URL has been annotated by another User - Our user should still receive this URL to annotate - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - response = await ath.request_validator.get_next_agency_annotation() - - assert response.next_annotation - next_annotation = response.next_annotation - # Check that url_id matches the one we inserted - assert next_annotation.url_info.url_id == url_ids[0] - - # Check that html data is present - assert next_annotation.html_info.description != "" - assert next_annotation.html_info.title != "" - - # Check that one agency_suggestion exists - assert len(next_annotation.agency_suggestions) == 1 - - # Test that another user can insert a suggestion - await ath.db_data_creator.manual_suggestion( - user_id=MOCK_USER_ID + 1, - url_id=url_ids[0], - ) - - # After this, text that our user does not receive this URL - response = await ath.request_validator.get_next_agency_annotation() - assert response.next_annotation is None - -@pytest.mark.asyncio -async def test_annotate_agency_submit_and_get_next(api_test_helper): - """ - Test Scenario: Submit and 
Get Next (no other URL available) - A URL has been annotated by our User, and no other valid URLs have not been annotated - Our user should not receive another URL to annotate - Until another relevant URL is added - """ - ath = api_test_helper - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=2 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and receive the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - - ) - assert response.next_annotation is not None - - # User should submit this annotation and receive none for the next - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[1], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=False - ) - ) - assert response.next_annotation is None - - -@pytest.mark.asyncio -async def test_annotate_agency_submit_new(api_test_helper): - """ - Test Scenario: Submit New - Our user receives an annotation and marks it as `NEW` - This should complete successfully - And within the database the annotation should be marked as `NEW` - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info: AnnotateAgencySetupInfo = await setup_for_annotate_agency( - db_data_creator=ath.db_data_creator, - url_count=1 - ) - url_ids = setup_info.url_ids - - # User should submit an annotation and mark it as New - response = await ath.request_validator.post_agency_annotation_and_get_next( - url_id=url_ids[0], - agency_annotation_post_info=URLAgencyAnnotationPostInfo( - suggested_agency=await ath.db_data_creator.agency(), - is_new=True - ) - ) - assert response.next_annotation is None - - # Within database, the annotation should be marked as `NEW` 
- all_manual_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_manual_suggestions) == 1 - assert all_manual_suggestions[0].is_new - -@pytest.mark.asyncio -async def test_annotate_all(api_test_helper): - """ - Test the happy path workflow for the all-annotations endpoint - The user should be able to get a valid URL (filtering on batch id if needed), - submit a full annotation, and receive another URL - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_2 = setup_info_2.url_mapping - - # First, get a valid URL to annotate - get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() - - # Apply the second batch id as a filter and see that a different URL is returned - get_response_2 = await ath.request_validator.get_next_url_for_all_annotations( - batch_id=setup_info_2.batch_id - ) - - assert get_response_1.next_annotation.url_info.url_id != get_response_2.next_annotation.url_info.url_id - - # Annotate the first and submit - agency_id = await ath.db_data_creator.agency() - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=False, - suggested_agency=agency_id - ) - ) - ) - assert post_response_1.next_annotation is not None - - # Confirm the second is received - assert post_response_1.next_annotation.url_info.url_id == url_mapping_2.url_id - - # Upon submitting the second, confirm that no more URLs are returned through either POST or GET - 
post_response_2 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_2.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - ) - ) - assert post_response_2.next_annotation is None - - get_response_3 = await ath.request_validator.get_next_url_for_all_annotations() - assert get_response_3.next_annotation is None - - - # Check that all annotations are present in the database - - # Should be two relevance annotations, one True and one False - all_relevance_suggestions: list[UserRelevantSuggestion] = await adb_client.get_all(UserRelevantSuggestion) - assert len(all_relevance_suggestions) == 2 - assert all_relevance_suggestions[0].suggested_status == SuggestedStatus.RELEVANT.value - assert all_relevance_suggestions[1].suggested_status == SuggestedStatus.NOT_RELEVANT.value - - # Should be one agency - all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) - assert len(all_agency_suggestions) == 1 - assert all_agency_suggestions[0].is_new == False - assert all_agency_suggestions[0].agency_id == agency_id - - # Should be one record type - all_record_type_suggestions = await adb_client.get_all(UserRecordTypeSuggestion) - assert len(all_record_type_suggestions) == 1 - assert all_record_type_suggestions[0].record_type == RecordType.ACCIDENT_REPORTS.value - -@pytest.mark.asyncio -async def test_annotate_all_post_batch_filtering(api_test_helper): - """ - Batch filtering should also work when posting annotations - """ - ath = api_test_helper - adb_client = ath.adb_client() - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - setup_info_2 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - setup_info_3 = await setup_for_get_next_url_for_final_review( - 
db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_3 = setup_info_3.url_mapping - - # Submit the first annotation, using the third batch id, and receive the third URL - post_response_1 = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - batch_id=setup_info_3.batch_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS, - agency=URLAgencyAnnotationPostInfo( - is_new=True - ) - ) - ) - - assert post_response_1.next_annotation.url_info.url_id == url_mapping_3.url_id - - -@pytest.mark.asyncio -async def test_annotate_all_validation_error(api_test_helper): - """ - Validation errors in the PostInfo DTO should result in a 400 BAD REQUEST response - """ - ath = api_test_helper - setup_info_1 = await setup_for_get_next_url_for_final_review( - db_data_creator=ath.db_data_creator, include_user_annotations=False - ) - url_mapping_1 = setup_info_1.url_mapping - - with pytest.raises(FailedValidationException) as e: - response = await ath.request_validator.post_all_annotations_and_get_next( - url_id=url_mapping_1.url_id, - all_annotations_post_info=AllAnnotationPostInfo( - suggested_status=SuggestedStatus.NOT_RELEVANT, - record_type=RecordType.ACCIDENT_REPORTS - ) - ) diff --git a/tests/automated/integration/api/test_batch.py b/tests/automated/integration/api/test_batch.py deleted file mode 100644 index 4dd21a49..00000000 --- a/tests/automated/integration/api/test_batch.py +++ /dev/null @@ -1,237 +0,0 @@ -import pytest - -from src.db.models.impl.batch.pydantic import BatchInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.impl.example.dtos.input import ExampleInputDTO -from src.collectors.enums import CollectorType, URLStatus -from src.core.enums import BatchStatus -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from 
tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters - - -@pytest.mark.asyncio -async def test_get_batch_summaries(api_test_helper): - ath = api_test_helper - - batch_params = [ - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=1, - status=URLStatus.PENDING - ), - TestURLCreationParameters( - count=2, - status=URLStatus.SUBMITTED - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=4, - status=URLStatus.NOT_RELEVANT - ), - TestURLCreationParameters( - count=3, - status=URLStatus.ERROR - ) - ] - ), - TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - count=7, - status=URLStatus.DUPLICATE - ), - TestURLCreationParameters( - count=1, - status=URLStatus.SUBMITTED - ) - ] - ) - ] - - batch_1_creation_info = await ath.db_data_creator.batch_v2(batch_params[0]) - batch_2_creation_info = await ath.db_data_creator.batch_v2(batch_params[1]) - batch_3_creation_info = await ath.db_data_creator.batch_v2(batch_params[2]) - - batch_1_id = batch_1_creation_info.batch_id - batch_2_id = batch_2_creation_info.batch_id - batch_3_id = batch_3_creation_info.batch_id - - - response = ath.request_validator.get_batch_statuses() - results = response.results - - assert len(results) == 3 - - result_1 = results[0] - assert result_1.id == batch_1_id - assert result_1.status == BatchStatus.READY_TO_LABEL - counts_1 = result_1.url_counts - assert counts_1.total == 3 - assert counts_1.pending == 1 - assert counts_1.submitted == 2 - assert counts_1.not_relevant == 0 - assert counts_1.duplicate == 0 - assert counts_1.errored == 0 - - result_2 = results[1] - assert result_2.id == batch_2_id - counts_2 = result_2.url_counts - assert counts_2.total == 7 - assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 0 - assert counts_2.submitted == 0 - assert counts_2.duplicate == 0 - - result_3 = results[2] - assert result_3.id == batch_3_id - counts_3 = 
result_3.url_counts - assert counts_3.total == 8 - assert counts_3.not_relevant == 0 - assert counts_3.errored == 0 - assert counts_3.pending == 0 - assert counts_3.submitted == 1 - assert counts_3.duplicate == 7 - - - - - - -@pytest.mark.asyncio -async def test_get_batch_summaries_pending_url_filter(api_test_helper): - ath = api_test_helper - - # Add an errored out batch - batch_error = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.ERROR - ) - - # Add a batch with pending urls - batch_pending = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.PENDING - ) - - # Add a batch with submitted URLs - batch_submitted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.SUBMITTED - ) - - # Add an aborted batch - batch_aborted = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.ABORTED - ) - - # Add a batch with validated URLs - batch_validated = await ath.db_data_creator.batch_and_urls( - strategy=CollectorType.EXAMPLE, - url_count=2, - batch_status=BatchStatus.READY_TO_LABEL, - with_html_content=True, - url_status=URLStatus.VALIDATED - ) - - # Test filter for pending URLs and only retrieve the second batch - pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=True - ) - - assert len(pending_urls_results.results) == 1 - assert pending_urls_results.results[0].id == batch_pending.batch_id - - # Test filter without pending URLs and retrieve the other four batches - no_pending_urls_results = ath.request_validator.get_batch_statuses( - has_pending_urls=False - ) - - assert len(no_pending_urls_results.results) == 4 - for result in 
no_pending_urls_results.results: - assert result.id in [ - batch_error.batch_id, - batch_submitted.batch_id, - batch_validated.batch_id, - batch_aborted.batch_id - ] - - # Test no filter for pending URLs and retrieve all batches - no_filter_results = ath.request_validator.get_batch_statuses() - - assert len(no_filter_results.results) == 5 - - - - -def test_abort_batch(api_test_helper): - ath = api_test_helper - - dto = ExampleInputDTO( - sleep_time=1 - ) - - batch_id = ath.request_validator.example_collector(dto=dto)["batch_id"] - - response = ath.request_validator.abort_batch(batch_id=batch_id) - - assert response.message == "Batch aborted." - - bi: BatchInfo = ath.request_validator.get_batch_info(batch_id=batch_id) - - assert bi.status == BatchStatus.ABORTED - -def test_get_batch_urls(api_test_helper): - - # Insert batch and urls into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=1) - assert len(response.urls) == 100 - # Check that the first url corresponds to the first url inserted - assert response.urls[0].url == iui.url_mappings[0].url - # Check that the last url corresponds to the 100th url inserted - assert response.urls[-1].url == iui.url_mappings[99].url - - - # Check that a more limited set of urls exist - response = ath.request_validator.get_batch_urls(batch_id=batch_id, page=2) - assert len(response.urls) == 1 - # Check that this url corresponds to the last url inserted - assert response.urls[0].url == iui.url_mappings[-1].url - -def test_get_duplicate_urls(api_test_helper): - - # Insert batch and url into database - ath = api_test_helper - batch_id = ath.db_data_creator.batch() - iui: InsertURLsInfo = ath.db_data_creator.urls(batch_id=batch_id, url_count=101) - # Get a list of all url ids - url_ids = [url.url_id for url in iui.url_mappings] - - # Create a second batch 
which will be associated with the duplicates - dup_batch_id = ath.db_data_creator.batch() - - # Insert duplicate urls into database - ath.db_data_creator.duplicate_urls(duplicate_batch_id=dup_batch_id, url_ids=url_ids) - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=1) - assert len(response.duplicates) == 100 - - response = ath.request_validator.get_batch_url_duplicates(batch_id=dup_batch_id, page=2) - assert len(response.duplicates) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9b3fb326..1d2e595d 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -2,7 +2,7 @@ import pytest from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch diff --git a/tests/automated/integration/core/async_/run_task/test_break_loop.py b/tests/automated/integration/core/async_/run_task/test_break_loop.py index 0d8a9bc2..71b5704f 100644 --- a/tests/automated/integration/core/async_/run_task/test_break_loop.py +++ b/tests/automated/integration/core/async_/run_task/test_break_loop.py @@ -21,9 +21,9 @@ async def test_run_task_break_loop(db_data_creator: DBDataCreator): and an alert should be sent to discord """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, outcome=TaskOperatorOutcome.SUCCESS, task_type=TaskType.HTML ) diff --git 
a/tests/automated/integration/core/async_/run_task/test_prereq_met.py b/tests/automated/integration/core/async_/run_task/test_prereq_met.py index a7724a45..e5425fd9 100644 --- a/tests/automated/integration/core/async_/run_task/test_prereq_met.py +++ b/tests/automated/integration/core/async_/run_task/test_prereq_met.py @@ -18,12 +18,11 @@ async def test_run_task_prereq_met(db_data_creator: DBDataCreator): """ When a task pre-requisite is met, the task should be run - And a task entry should be created in the database """ - async def run_task(self, task_id: int) -> TaskOperatorRunInfo: + async def run_task(self) -> TaskOperatorRunInfo: return TaskOperatorRunInfo( - task_id=task_id, + task_id=1, task_type=TaskType.HTML, outcome=TaskOperatorOutcome.SUCCESS, ) @@ -48,9 +47,4 @@ async def run_task(self, task_id: int) -> TaskOperatorRunInfo: # There should be two calls to meets_task_prerequisites mock_operator.meets_task_prerequisites.assert_has_calls([call(), call()]) - results = await db_data_creator.adb_client.get_all(Task) - - assert len(results) == 1 - assert results[0].task_status == BatchStatus.IN_PROCESS.value - core.task_manager.conclude_task.assert_called_once() diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index 2a7f9569..62f215fb 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -3,6 +3,7 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from src.collectors.enums import URLStatus from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata @@ -42,10 +43,16 @@ async def 
test_approve_url_basic(db_data_creator: DBDataCreator): url = urls[0] assert url.id == url_mapping.url_id assert url.record_type == RecordType.ARREST_RECORDS - assert url.status == URLStatus.VALIDATED + assert url.status == URLStatus.OK assert url.name == "Test Name" assert url.description == "Test Description" + # Confirm presence of validated flag + validated_flags: list[FlagURLValidated] = await adb_client.get_all(FlagURLValidated) + assert len(validated_flags) == 1 + assert validated_flags[0].url_id == url_mapping.url_id + + confirmed_agency: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) assert len(confirmed_agency) == 1 assert confirmed_agency[0].url_id == url_mapping.url_id diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py deleted file mode 100644 index 72430fec..00000000 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_new_agency.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.core.enums import SuggestedStatus, RecordType, SuggestionType -from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo -from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters -from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters -from tests.helpers.data_creator.core import DBDataCreator - - -@pytest.mark.asyncio -async def test_get_next_url_for_final_review_new_agency(db_data_creator: DBDataCreator): - """ - Test that a URL with a new agency is properly returned - """ - - # Apply batch v2 - parameters = TestBatchCreationParameters( - urls=[ - TestURLCreationParameters( - annotation_info=AnnotationInfo( - user_relevant=SuggestedStatus.RELEVANT, - user_agency=URLAgencyAnnotationPostInfo( - is_new=True - ), 
- user_record_type=RecordType.ARREST_RECORDS - ) - ) - ] - ) - creation_info = await db_data_creator.batch_v2(parameters) - outer_result = await db_data_creator.adb_client.get_next_url_for_final_review( - batch_id=None - ) - result = outer_result.next_source - - assert result is not None - user_suggestion = result.annotations.agency.user - assert user_suggestion.suggestion_type == SuggestionType.NEW_AGENCY - assert user_suggestion.pdap_agency_id is None - assert user_suggestion.agency_name is None diff --git a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py index 7e68ada4..72706aaf 100644 --- a/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py +++ b/tests/automated/integration/db/client/get_next_url_for_final_review/test_only_confirmed_urls.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -14,7 +15,7 @@ async def test_get_next_url_for_final_review_only_confirmed_urls(db_data_creator url_mapping = db_data_creator.urls( batch_id=batch_id, url_count=1, - outcome=URLStatus.SUBMITTED + outcome=URLCreationEnum.SUBMITTED ).url_mappings[0] result = await db_data_creator.adb_client.get_next_url_for_final_review( diff --git a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py index 95e40847..ab5acd59 100644 --- a/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py +++ b/tests/automated/integration/db/client/get_next_url_for_user_relevance_annotation/test_validated.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import 
URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.setup.annotation.core import setup_for_get_next_url_for_annotation from tests.helpers.data_creator.core import DBDataCreator @@ -12,19 +13,12 @@ async def test_get_next_url_for_user_relevance_annotation_validated( """ A validated URL should not turn up in get_next_url_for_user_annotation """ - - setup_info = await setup_for_get_next_url_for_annotation( - db_data_creator=db_data_creator, - url_count=1, - outcome=URLStatus.VALIDATED - ) - - - url_1 = setup_info.insert_urls_info.url_mappings[0] + dbdc = db_data_creator + url_1: int = (await dbdc.create_validated_urls())[0].url_id # Add `Relevancy` attribute with value `True` await db_data_creator.auto_relevant_suggestions( - url_id=url_1.url_id, + url_id=url_1, relevant=True ) diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index 78578c6b..f2d73f00 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -1,8 +1,8 @@ import pytest from src.core.enums import BatchStatus -from src.db.models.impl.batch.pydantic import BatchInfo -from src.db.models.impl.link.batch_url import LinkBatchURL +from src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py new file mode 100644 index 00000000..81bef537 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/check.py @@ -0,0 +1,30 @@ +from unittest.mock import AsyncMock + +from 
src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput + + +def check_results_called( + operator: PushToHuggingFaceTaskOperator, + expected_outputs: list[GetForLoadingToHuggingFaceOutput] +) -> None: + mock_hf_client: AsyncMock = operator.hf_client + mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub + outputs: list[GetForLoadingToHuggingFaceOutput] = mock_push.call_args.args[0] + outputs = sorted(outputs, key=lambda x: x.url_id) + expected_outputs = sorted(expected_outputs, key=lambda x: x.url_id) + for output, expected_output in zip(outputs, expected_outputs): + assert output.url_id == expected_output.url_id + assert output.url == expected_output.url + assert output.relevant == expected_output.relevant, f"Expected {expected_output.relevant}, got {output.relevant}" + assert output.record_type_fine == expected_output.record_type_fine + assert output.record_type_coarse == expected_output.record_type_coarse + assert output.html == expected_output.html + + +def check_not_called( + operator: PushToHuggingFaceTaskOperator, +) -> None: + mock_hf_client: AsyncMock = operator.hf_client + mock_push: AsyncMock = mock_hf_client.push_data_sources_raw_to_hub + mock_push.assert_not_called() \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py index 64a16f9f..e7a9a69b 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/data.py @@ -1,71 +1,30 @@ -from src.collectors.enums import URLStatus from src.core.enums import RecordType from src.core.tasks.scheduled.impl.huggingface.queries.get.enums import RecordTypeCoarse -from 
tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.entry \ - import TestPushToHuggingFaceURLSetupEntry as Entry -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.output import \ - TestPushToHuggingFaceURLSetupExpectedOutput as Output -from tests.automated.integration.tasks.scheduled.impl.huggingface.setup.models.input import \ - TestPushToHuggingFaceURLSetupEntryInput as Input +from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput + + +def get_test_url(i: int) -> str: + return f"www.testPushToHuggingFaceURLSetupEntry.com/{i}" + +def get_test_html(i: int) -> str: + return f"
This is an example of HTML content.
+ + + """ def setup_url_to_response_info( ) -> dict[str, URLResponseInfo]: @@ -37,7 +49,7 @@ def setup_url_to_response_info( response_info = URLResponseInfo( success=_get_success(entry), status=get_http_status(entry), - html=generate_test_html() if _get_success(entry) else None, + html=_generate_test_html() if _get_success(entry) else None, content_type=_get_content_type(entry), exception=None if _get_success(entry) else "Error" ) diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index e9495ad4..5615392c 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -11,7 +11,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://happy-path.com/pending", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -66,7 +66,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://not-200-path.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -83,7 +83,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="https://no-web-metadata.com/submitted", - status=URLStatus.PENDING + status=URLStatus.OK ), web_metadata_info=None, expected_result=ExpectedResult( diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py index 404f00e1..e788fff1 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_error.py @@ -1,15 +1,19 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from 
tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_url_probe_task_error( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 500 error response (or any other error), @@ -28,15 +32,20 @@ async def test_url_probe_task_error( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.SUBMITTED) + url_id: int = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.DATA_SOURCE) + await db_data_creator.create_url_data_sources([url_id]) + assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.SUBMITTED + expected_status=URLStatus.OK ) + + await check_manager.check_web_metadata( url_id=url_id, status_code=500, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py index 97937c15..7fc54da4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_not_found.py @@ -1,15 +1,18 @@ import pytest from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType from tests.automated.integration.tasks.url.impl.asserts import 
assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.data_creator.core import DBDataCreator @pytest.mark.asyncio async def test_url_probe_task_not_found( setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager + check_manager: TestURLProbeCheckManager, + db_data_creator: DBDataCreator ): """ If a URL returns a 404 error response, @@ -29,14 +32,15 @@ async def test_url_probe_task_not_found( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.NOT_RELEVANT) + url_id = await setup_manager.setup_url(URLStatus.OK) + await db_data_creator.create_validated_flags([url_id], validation_type=URLValidatedType.NOT_RELEVANT) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.NOT_RELEVANT + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py index a02f1ba4..ecaec084 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_ok.py @@ -28,14 +28,14 @@ async def test_url_probe_task_no_redirect_ok( ) ) assert not await operator.meets_task_prerequisites() - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await setup_manager.setup_url(URLStatus.OK) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) assert not await 
operator.meets_task_prerequisites() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index 0c1da5fd..cfd1f68f 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -31,8 +31,8 @@ async def test_two_urls( ] ) assert not await operator.meets_task_prerequisites() - url_id_1 = await setup_manager.setup_url(URLStatus.PENDING, url=url_1) - url_id_2 = await setup_manager.setup_url(URLStatus.NOT_RELEVANT, url=url_2) + url_id_1 = await setup_manager.setup_url(URLStatus.OK, url=url_1) + url_id_2 = await setup_manager.setup_url(URLStatus.OK, url=url_2) assert await operator.meets_task_prerequisites() run_info = await operator.run_task() assert_task_ran_without_error(run_info) diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py index 88098b16..df695021 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/dest_new/test_dest_ok.py @@ -28,12 +28,12 @@ async def test_url_probe_task_redirect_dest_new_ok( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.PENDING) + source_url_id = await setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, @@ -45,7 +45,7 @@ async def 
test_url_probe_task_redirect_dest_new_ok( dest_url_id = await check_manager.check_redirect(source_url_id) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index 0744f3b9..b52dce6b 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -29,8 +29,8 @@ async def test_url_probe_task_redirect_dest_exists_in_db( dest_error=None ) ) - source_url_id = await setup_manager.setup_url(URLStatus.INDIVIDUAL_RECORD) - dest_url_id = await setup_manager.setup_url(URLStatus.PENDING, url=TEST_DEST_URL) + source_url_id = await setup_manager.setup_url(URLStatus.OK) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) # Add web metadata for destination URL, to prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, @@ -44,11 +44,11 @@ async def test_url_probe_task_redirect_dest_exists_in_db( assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id, - expected_status=URLStatus.INDIVIDUAL_RECORD + expected_status=URLStatus.OK ) await check_manager.check_url( url_id=dest_url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=source_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py index ed9c38ac..5a66af3d 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py +++ 
b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py @@ -27,11 +27,11 @@ async def test_url_probe_task_redirect_infinite( redirect_url=TEST_URL ) ) - url_id = await setup_manager.setup_url(URLStatus.PENDING) + url_id = await setup_manager.setup_url(URLStatus.OK) run_info = await operator.run_task() await check_manager.check_url( url_id=url_id, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_web_metadata( url_id=url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index 267d9015..f0e113ff 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -34,17 +34,17 @@ async def test_url_probe_task_redirect_two_urls_same_dest( ), ] ) - source_url_id_1 = await setup_manager.setup_url(URLStatus.PENDING) - source_url_id_2 = await setup_manager.setup_url(URLStatus.PENDING, url="https://example.com/2") + source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) + source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( url_id=source_url_id_1, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) await check_manager.check_url( url_id=source_url_id_2, - expected_status=URLStatus.PENDING + expected_status=URLStatus.OK ) redirect_url_id_1 = await check_manager.check_redirect( source_url_id=source_url_id_1 diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py index 7d56ddcf..f992fbb6 100644 --- 
a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py @@ -16,9 +16,9 @@ @pytest.mark.asyncio async def test_submit_approved_url_task( - db_data_creator, - mock_pdap_client: PDAPClient, - monkeypatch + db_data_creator, + mock_pdap_client: PDAPClient, + monkeypatch ): """ The submit_approved_url_task should submit @@ -37,7 +37,7 @@ async def test_submit_approved_url_task( # Create URLs with status 'validated' in database and all requisite URL values # Ensure they have optional metadata as well - urls = await setup_validated_urls(db_data_creator) + urls: list[str] = await setup_validated_urls(db_data_creator) mock_make_request(mock_pdap_client, urls) # Check Task Operator does meet pre-requisites @@ -50,14 +50,14 @@ async def test_submit_approved_url_task( assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message # Get URLs - urls = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") - url_1 = urls[0] - url_2 = urls[1] - url_3 = urls[2] + urls: list[URL] = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") + url_1: URL = urls[0] + url_2: URL = urls[1] + url_3: URL = urls[2] # Check URLs have been marked as 'submitted' - assert url_1.status == URLStatus.SUBMITTED - assert url_2.status == URLStatus.SUBMITTED + assert url_1.status == URLStatus.OK + assert url_2.status == URLStatus.OK assert url_3.status == URLStatus.ERROR # Get URL Data Source Links diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py new file mode 100644 index 00000000..5f927159 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py @@ -0,0 +1,42 @@ +import pytest + +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from 
src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.external.pdap.client import PDAPClient +from tests.helpers.asserts import assert_task_run_success + + +@pytest.mark.asyncio +async def test_validated_meta_url_not_included( + db_data_creator, + mock_pdap_client: PDAPClient, + monkeypatch +): + """ + If a validated Meta URL is included in the database + This should not be included in the submit approved task + """ + + # Get Task Operator + operator = SubmitApprovedURLTaskOperator( + adb_client=db_data_creator.adb_client, + pdap_client=mock_pdap_client + ) + + dbdc = db_data_creator + url_1: int = (await dbdc.create_validated_urls( + validation_type=URLValidatedType.META_URL + ))[0].url_id + + # Test task operator does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + run_info: TaskOperatorRunInfo = await operator.run_task() + assert_task_run_success(run_info) + + # Confirm entry not included in database + ds_urls: list[URLDataSource] = await dbdc.adb_client.get_all(URLDataSource) + assert len(ds_urls) == 0 diff --git a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py index 630f7f4e..e55ad9ad 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_404_probe.py +++ b/tests/automated/integration/tasks/url/impl/test_url_404_probe.py @@ -12,6 +12,7 @@ from src.collectors.enums import URLStatus from src.core.tasks.url.enums import TaskOperatorOutcome from src.external.url_request.dtos.url_response import URLResponseInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.batch_creation_parameters.url_creation_parameters 
import TestURLCreationParameters from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters @@ -19,6 +20,7 @@ @pytest.mark.asyncio async def test_url_404_probe_task( + wiped_database, db_data_creator: DBDataCreator ): @@ -84,12 +86,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn urls=[ TestURLCreationParameters( count=3, - status=URLStatus.PENDING, + status=URLCreationEnum.OK, with_html_content=True ), TestURLCreationParameters( count=1, - status=URLStatus.ERROR, + status=URLCreationEnum.ERROR, with_html_content=False ), ] @@ -104,12 +106,12 @@ async def mock_make_simple_requests(self, urls: list[str]) -> list[URLResponseIn assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - pending_url_mappings = creation_info.urls_by_status[URLStatus.PENDING].url_mappings + pending_url_mappings = creation_info.urls_by_status[URLCreationEnum.OK].url_mappings url_id_success = pending_url_mappings[0].url_id url_id_404 = pending_url_mappings[1].url_id url_id_error = pending_url_mappings[2].url_id - url_id_initial_error = creation_info.urls_by_status[URLStatus.ERROR].url_mappings[0].url_id + url_id_initial_error = creation_info.urls_by_status[URLCreationEnum.ERROR].url_mappings[0].url_id # Check that URLProbedFor404 has been appropriately populated probed_for_404_objects: list[URLProbedFor404] = await db_data_creator.adb_client.get_all(URLProbedFor404) @@ -128,9 +130,9 @@ def find_url(url_id: int) -> URL: return url raise Exception(f"URL with id {url_id} not found") - assert find_url(url_id_success).status == URLStatus.PENDING + assert find_url(url_id_success).status == URLStatus.OK assert find_url(url_id_404).status == URLStatus.NOT_FOUND - assert find_url(url_id_error).status == URLStatus.PENDING + assert find_url(url_id_error).status == URLStatus.OK assert find_url(url_id_initial_error).status == URLStatus.ERROR # Check that meets_task_prerequisites now returns False diff --git 
a/tests/automated/integration/tasks/url/loader/conftest.py b/tests/automated/integration/tasks/url/loader/conftest.py index 045236f9..52a17b5e 100644 --- a/tests/automated/integration/tasks/url/loader/conftest.py +++ b/tests/automated/integration/tasks/url/loader/conftest.py @@ -4,10 +4,11 @@ from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.core.tasks.url.loader import URLTaskOperatorLoader +from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor.nlp.core import \ + NLPProcessor from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.inference.client import HuggingFaceInferenceClient -from src.external.internet_archives.client import InternetArchivesClient from src.external.pdap.client import PDAPClient from src.external.url_request.core import URLRequestInterface @@ -22,4 +23,5 @@ def loader() -> URLTaskOperatorLoader: pdap_client=AsyncMock(spec=PDAPClient), muckrock_api_interface=AsyncMock(spec=MuckrockAPIInterface), hf_inference_client=AsyncMock(spec=HuggingFaceInferenceClient), + nlp_processor=AsyncMock(spec=NLPProcessor) ) \ No newline at end of file diff --git a/tests/helpers/batch_creation_parameters/core.py b/tests/helpers/batch_creation_parameters/core.py index dfc33644..4562cbdf 100644 --- a/tests/helpers/batch_creation_parameters/core.py +++ b/tests/helpers/batch_creation_parameters/core.py @@ -9,10 +9,10 @@ class TestBatchCreationParameters(BaseModel): - created_at: Optional[datetime.datetime] = None + created_at: datetime.datetime | None = None outcome: BatchStatus = BatchStatus.READY_TO_LABEL strategy: CollectorType = CollectorType.EXAMPLE - urls: Optional[list[TestURLCreationParameters]] = None + urls: list[TestURLCreationParameters] | None = None @model_validator(mode='after') def validate_urls(self): diff --git 
a/tests/helpers/batch_creation_parameters/enums.py b/tests/helpers/batch_creation_parameters/enums.py new file mode 100644 index 00000000..d61a2793 --- /dev/null +++ b/tests/helpers/batch_creation_parameters/enums.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class URLCreationEnum(Enum): + OK = "ok" + SUBMITTED = "submitted" + VALIDATED = "validated" + ERROR = "error" + NOT_RELEVANT = "not_relevant" + DUPLICATE = "duplicate" + NOT_FOUND = "not_found" \ No newline at end of file diff --git a/tests/helpers/batch_creation_parameters/url_creation_parameters.py b/tests/helpers/batch_creation_parameters/url_creation_parameters.py index 2e30cca0..701a239b 100644 --- a/tests/helpers/batch_creation_parameters/url_creation_parameters.py +++ b/tests/helpers/batch_creation_parameters/url_creation_parameters.py @@ -1,23 +1,26 @@ from pydantic import BaseModel, model_validator from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo -from src.collectors.enums import URLStatus from src.core.enums import RecordType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class TestURLCreationParameters(BaseModel): count: int = 1 - status: URLStatus = URLStatus.PENDING + status: URLCreationEnum = URLCreationEnum.OK with_html_content: bool = False annotation_info: AnnotationInfo = AnnotationInfo() @model_validator(mode='after') def validate_annotation_info(self): - if self.status == URLStatus.NOT_RELEVANT: + if self.status == URLCreationEnum.NOT_RELEVANT: self.annotation_info.final_review_approved = False return self - if self.status != URLStatus.VALIDATED: + if self.status not in ( + URLCreationEnum.SUBMITTED, + URLCreationEnum.VALIDATED + ): return self # Assume is validated diff --git a/tests/helpers/counter.py b/tests/helpers/counter.py new file mode 100644 index 00000000..8d9de1a0 --- /dev/null +++ b/tests/helpers/counter.py @@ -0,0 +1,7 @@ + +from 
itertools import count + +COUNTER = count(1) + +def next_int() -> int: + return next(COUNTER) \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/annotate.py b/tests/helpers/data_creator/commands/impl/annotate.py index 5f341326..1f549615 100644 --- a/tests/helpers/data_creator/commands/impl/annotate.py +++ b/tests/helpers/data_creator/commands/impl/annotate.py @@ -7,7 +7,7 @@ from src.core.enums import SuggestionType from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase -from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand +from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core import AgencyAutoSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand diff --git a/tests/helpers/data_creator/commands/impl/batch.py b/tests/helpers/data_creator/commands/impl/batch.py index 69583a45..6871661d 100644 --- a/tests/helpers/data_creator/commands/impl/batch.py +++ b/tests/helpers/data_creator/commands/impl/batch.py @@ -3,7 +3,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.models.impl.batch.pydantic import BatchInfo +from src.db.models.impl.batch.pydantic.info import BatchInfo from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py deleted file mode 100644 index 96743df8..00000000 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency.py +++ /dev/null 
@@ -1,46 +0,0 @@ -from typing import final - -from typing_extensions import override - -from src.core.enums import SuggestionType -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase -from tests.helpers.data_creator.commands.impl.agency import AgencyCommand - -@final -class AgencyAutoSuggestionsCommand(DBDataCreatorCommandBase): - - def __init__( - self, - url_id: int, - count: int, - suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION - ): - super().__init__() - if suggestion_type == SuggestionType.UNKNOWN: - count = 1 # Can only be one auto suggestion if unknown - self.url_id = url_id - self.count = count - self.suggestion_type = suggestion_type - - @override - async def run(self) -> None: - suggestions = [] - for _ in range(self.count): - if self.suggestion_type == SuggestionType.UNKNOWN: - pdap_agency_id = None - else: - pdap_agency_id = await self.run_command(AgencyCommand()) - suggestion = URLAgencySuggestionInfo( - url_id=self.url_id, - suggestion_type=self.suggestion_type, - pdap_agency_id=pdap_agency_id, - state="Test State", - county="Test County", - locality="Test Locality" - ) - suggestions.append(suggestion) - - await self.adb_client.add_agency_auto_suggestions( - suggestions=suggestions - ) \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py new file mode 100644 index 00000000..fe54c6f9 --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/agency_/core.py @@ -0,0 +1,78 @@ +from typing import final + +from typing_extensions import override + 
+from src.core.enums import SuggestionType +from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo +from src.db.enums import TaskType +from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic +from src.db.models.impl.url.suggestion.agency.suggestion.pydantic import AgencyIDSubtaskSuggestionPydantic +from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase +from tests.helpers.data_creator.commands.impl.agency import AgencyCommand + +@final +class AgencyAutoSuggestionsCommand(DBDataCreatorCommandBase): + + def __init__( + self, + url_id: int, + count: int, + suggestion_type: SuggestionType = SuggestionType.AUTO_SUGGESTION, + subtask_type: AutoAgencyIDSubtaskType = AutoAgencyIDSubtaskType.HOMEPAGE_MATCH, + confidence: int = 50 + ): + super().__init__() + if suggestion_type == SuggestionType.UNKNOWN: + count = 1 # Can only be one auto suggestion if unknown + agencies_found = False + else: + agencies_found = True + self.url_id = url_id + self.count = count + self.suggestion_type = suggestion_type + self.subtask_type = subtask_type + self.confidence = confidence + self.agencies_found = agencies_found + + @override + async def run(self) -> None: + task_id: int = await self.add_task() + subtask_id: int = await self.create_subtask(task_id) + if not self.agencies_found: + return + + suggestions: list[AgencyIDSubtaskSuggestionPydantic] = [] + for _ in range(self.count): + pdap_agency_id: int = await self.run_command(AgencyCommand()) + + suggestion = AgencyIDSubtaskSuggestionPydantic( + subtask_id=subtask_id, + agency_id=pdap_agency_id, + confidence=self.confidence, + ) + suggestions.append(suggestion) + + await self.adb_client.bulk_insert( + models=suggestions, + ) + + async def add_task(self) -> int: + task_id: int = await self.adb_client.initiate_task( + 
task_type=TaskType.AGENCY_IDENTIFICATION, + ) + return task_id + + async def create_subtask(self, task_id: int) -> int: + obj: URLAutoAgencyIDSubtaskPydantic = URLAutoAgencyIDSubtaskPydantic( + task_id=task_id, + type=self.subtask_type, + url_id=self.url_id, + agencies_found=self.agencies_found, + ) + subtask_id: int = (await self.adb_client.bulk_insert( + models=[obj], + return_ids=True + ))[0] + return subtask_id + diff --git a/tests/helpers/data_creator/commands/impl/urls_/__init__.py b/tests/helpers/data_creator/commands/impl/urls_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py new file mode 100644 index 00000000..d76edfe5 --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -0,0 +1,36 @@ +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum + + +def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) -> URLStatus: + match url_creation_enum: + case URLCreationEnum.OK: + return URLStatus.OK + case URLCreationEnum.SUBMITTED: + return URLStatus.OK + case URLCreationEnum.VALIDATED: + return URLStatus.OK + case URLCreationEnum.NOT_RELEVANT: + return URLStatus.OK + case URLCreationEnum.ERROR: + return URLStatus.ERROR + case URLCreationEnum.DUPLICATE: + return URLStatus.DUPLICATE + case URLCreationEnum.NOT_FOUND: + return URLStatus.NOT_FOUND + case _: + raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") + +def convert_url_creation_enum_to_validated_type( + url_creation_enum: URLCreationEnum +) -> URLValidatedType: + match url_creation_enum: + case URLCreationEnum.SUBMITTED: + return URLValidatedType.DATA_SOURCE + case URLCreationEnum.VALIDATED: + return URLValidatedType.DATA_SOURCE + case URLCreationEnum.NOT_RELEVANT: + return 
URLValidatedType.NOT_RELEVANT + case _: + raise ValueError(f"Unknown URLCreationEnum: {url_creation_enum}") \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/urls.py b/tests/helpers/data_creator/commands/impl/urls_/query.py similarity index 79% rename from tests/helpers/data_creator/commands/impl/urls.py rename to tests/helpers/data_creator/commands/impl/urls_/query.py index ee9ef954..7587abfb 100644 --- a/tests/helpers/data_creator/commands/impl/urls.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,11 +1,12 @@ from datetime import datetime -from src.collectors.enums import URLStatus from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase +from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_url_status from tests.helpers.simple_test_data_functions import generate_test_urls @@ -16,7 +17,7 @@ def __init__( batch_id: int | None, url_count: int, collector_metadata: dict | None = None, - status: URLStatus = URLStatus.PENDING, + status: URLCreationEnum = URLCreationEnum.OK, created_at: datetime | None = None ): super().__init__() @@ -36,8 +37,11 @@ def run_sync(self) -> InsertURLsInfo: url_infos.append( URLInfo( url=url, - status=self.status, - name="Test Name" if self.status == URLStatus.VALIDATED else None, + status=convert_url_creation_enum_to_url_status(self.status), + name="Test Name" if self.status in ( + URLCreationEnum.VALIDATED, + URLCreationEnum.SUBMITTED, + ) else None, collector_metadata=self.collector_metadata, created_at=self.created_at, source=URLSource.COLLECTOR @@ -50,7 +54,7 @@ def run_sync(self) -> InsertURLsInfo: ) # If 
outcome is submitted, also add entry to DataSourceURL - if self.status == URLStatus.SUBMITTED: + if self.status == URLCreationEnum.SUBMITTED: submitted_url_infos = [] for url_id in url_insert_info.url_ids: submitted_url_info = SubmittedURLInfo( diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/core.py b/tests/helpers/data_creator/commands/impl/urls_v2/core.py index c80dc447..f7042720 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/core.py +++ b/tests/helpers/data_creator/commands/impl/urls_v2/core.py @@ -1,14 +1,16 @@ from datetime import datetime -from src.collectors.enums import URLStatus from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.annotate import AnnotateCommand from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand -from tests.helpers.data_creator.commands.impl.urls import URLsDBDataCreatorCommand +from tests.helpers.data_creator.commands.impl.urls_.convert import convert_url_creation_enum_to_validated_type +from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response -from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 +from tests.helpers.data_creator.generate import generate_validated_flags from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo @@ -26,7 +28,7 @@ def __init__( self.created_at = created_at async def run(self) -> URLsV2Response: - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, 
URLCreationInfo] = {} urls_by_order: list[URLCreationInfo] = [] # Create urls for url_parameters in self.parameters: diff --git a/tests/helpers/data_creator/commands/impl/urls_v2/response.py b/tests/helpers/data_creator/commands/impl/urls_v2/response.py index db19328e..74aa8e20 100644 --- a/tests/helpers/data_creator/commands/impl/urls_v2/response.py +++ b/tests/helpers/data_creator/commands/impl/urls_v2/response.py @@ -1,9 +1,10 @@ from pydantic import BaseModel from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo class URLsV2Response(BaseModel): - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {} urls_by_order: list[URLCreationInfo] = [] \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index 096bad32..57ee3576 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -5,8 +5,15 @@ from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.dtos.url.insert import InsertURLsInfo +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.error_info.pydantic import URLErrorPydanticInfo 
from src.db.client.sync import DatabaseClient from src.db.enums import TaskType @@ -14,26 +21,31 @@ from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO from src.core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.batch_creation_parameters.url_creation_parameters import TestURLCreationParameters +from tests.helpers.counter import next_int from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase from tests.helpers.data_creator.commands.impl.agency import AgencyCommand from tests.helpers.data_creator.commands.impl.batch import DBDataCreatorBatchCommand from tests.helpers.data_creator.commands.impl.batch_v2 import BatchV2Command from tests.helpers.data_creator.commands.impl.html_data import HTMLDataCreatorCommand from tests.helpers.data_creator.commands.impl.suggestion.agency_confirmed import AgencyConfirmedSuggestionCommand -from tests.helpers.data_creator.commands.impl.suggestion.auto.agency import AgencyAutoSuggestionsCommand +from tests.helpers.data_creator.commands.impl.suggestion.auto.agency_.core import AgencyAutoSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.record_type import AutoRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.auto.relevant import AutoRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.agency import AgencyUserSuggestionsCommand from tests.helpers.data_creator.commands.impl.suggestion.user.record_type import UserRecordTypeSuggestionCommand from tests.helpers.data_creator.commands.impl.suggestion.user.relevant import UserRelevantSuggestionCommand from tests.helpers.data_creator.commands.impl.url_metadata import URLMetadataCommand -from tests.helpers.data_creator.commands.impl.urls import 
URLsDBDataCreatorCommand +from tests.helpers.data_creator.commands.impl.urls_.query import URLsDBDataCreatorCommand from tests.helpers.data_creator.commands.impl.urls_v2.core import URLsV2Command from tests.helpers.data_creator.commands.impl.urls_v2.response import URLsV2Response +from tests.helpers.data_creator.create import create_urls, create_batch, create_batch_url_links, create_validated_flags, \ + create_url_data_sources from tests.helpers.data_creator.models.clients import DBDataCreatorClientContainer from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo from tests.helpers.data_creator.models.creation_info.batch.v2 import BatchURLCreationInfoV2 +from tests.helpers.simple_test_data_functions import generate_test_name class DBDataCreator: @@ -105,7 +117,7 @@ async def batch_and_urls( url_count: int = 3, with_html_content: bool = False, batch_status: BatchStatus = BatchStatus.READY_TO_LABEL, - url_status: URLStatus = URLStatus.PENDING + url_status: URLCreationEnum = URLCreationEnum.OK ) -> BatchURLCreationInfo: batch_id = self.batch( strategy=strategy, @@ -194,23 +206,14 @@ async def auto_suggestions( raise ValueError(f"suggestion_type must be one of {allowed_suggestion_types}") if suggestion_type == SuggestionType.UNKNOWN and num_suggestions > 1: raise ValueError("num_suggestions must be 1 when suggestion_type is unknown") - + for url_id in url_ids: - suggestions = [] - for i in range(num_suggestions): - if suggestion_type == SuggestionType.UNKNOWN: - agency_id = None - else: - agency_id = await self.agency() - suggestion = URLAgencySuggestionInfo( + await self.run_command( + AgencyAutoSuggestionsCommand( url_id=url_id, - suggestion_type=suggestion_type, - pdap_agency_id=agency_id + count=num_suggestions, + suggestion_type=suggestion_type ) - suggestions.append(suggestion) - - await self.adb_client.add_agency_auto_suggestions( - suggestions=suggestions ) async def confirmed_suggestions(self, url_ids: list[int]): @@ -239,7 
+242,7 @@ def urls( batch_id: int, url_count: int, collector_metadata: dict | None = None, - outcome: URLStatus = URLStatus.PENDING, + outcome: URLCreationEnum = URLCreationEnum.OK, created_at: datetime | None = None ) -> InsertURLsInfo: command = URLsDBDataCreatorCommand( @@ -259,7 +262,7 @@ async def url_miscellaneous_metadata( record_formats: Optional[list[str]] = None, data_portal_type: Optional[str] = "Test Data Portal Type", supplying_entity: Optional[str] = "Test Supplying Entity" - ): + ) -> None: if record_formats is None: record_formats = ["Test Record Format", "Test Record Format 2"] @@ -277,7 +280,11 @@ async def url_miscellaneous_metadata( await self.adb_client.add_miscellaneous_metadata([tdo]) - def duplicate_urls(self, duplicate_batch_id: int, url_ids: list[int]): + def duplicate_urls( + self, + duplicate_batch_id: int, + url_ids: list[int] + ) -> None: """ Create duplicates for all given url ids, and associate them with the given batch @@ -302,7 +309,7 @@ async def error_info( self, url_ids: list[int], task_id: Optional[int] = None - ): + ) -> None: if task_id is None: task_id = await self.task() error_infos = [] @@ -368,3 +375,173 @@ async def url_metadata( status_code=status_code ) ) + + async def create_validated_urls( + self, + record_type: RecordType = RecordType.RESOURCES, + validation_type: URLValidatedType = URLValidatedType.DATA_SOURCE, + count: int = 1 + ) -> list[URLMapping]: + url_mappings: list[URLMapping] = await self.create_urls( + record_type=record_type, + count=count + ) + url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] + await self.create_validated_flags( + url_ids=url_ids, + validation_type=validation_type + ) + return url_mappings + + async def create_submitted_urls( + self, + record_type: RecordType = RecordType.RESOURCES, + count: int = 1 + ) -> list[URLMapping]: + url_mappings: list[URLMapping] = await self.create_urls( + record_type=record_type, + count=count + ) + url_ids: list[int] = 
[url_mapping.url_id for url_mapping in url_mappings] + await self.create_validated_flags( + url_ids=url_ids, + validation_type=URLValidatedType.DATA_SOURCE + ) + await self.create_url_data_sources(url_ids=url_ids) + return url_mappings + + + async def create_urls( + self, + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, + count: int = 1, + batch_id: int | None = None + ) -> list[URLMapping]: + + url_mappings: list[URLMapping] = await create_urls( + adb_client=self.adb_client, + status=status, + source=source, + record_type=record_type, + collector_metadata=collector_metadata, + count=count + ) + url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] + if batch_id is not None: + await self.create_batch_url_links( + url_ids=url_ids, + batch_id=batch_id + ) + return url_mappings + + async def create_batch( + self, + status: BatchStatus = BatchStatus.READY_TO_LABEL, + strategy: CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), + ) -> int: + return await create_batch( + adb_client=self.adb_client, + status=status, + strategy=strategy, + date_generated=date_generated + ) + + async def create_batch_url_links( + self, + url_ids: list[int], + batch_id: int, + ) -> None: + await create_batch_url_links( + adb_client=self.adb_client, + url_ids=url_ids, + batch_id=batch_id + ) + + async def create_validated_flags( + self, + url_ids: list[int], + validation_type: URLValidatedType, + ) -> None: + await create_validated_flags( + adb_client=self.adb_client, + url_ids=url_ids, + validation_type=validation_type + ) + + async def create_url_data_sources( + self, + url_ids: list[int], + ) -> None: + await create_url_data_sources( + adb_client=self.adb_client, + url_ids=url_ids + ) + + async def create_url_agency_links( + self, + url_ids: list[int], + agency_ids: list[int], + ) -> None: + links: 
list[LinkURLAgency] = [] + for url_id in url_ids: + for agency_id in agency_ids: + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id, + ) + links.append(link) + await self.adb_client.add_all(links) + + async def create_agency(self, agency_id: int = 1) -> None: + agency = Agency( + agency_id=agency_id, + name=generate_test_name(agency_id), + state=None, + county=None, + locality=None + ) + await self.adb_client.add_all([agency]) + + async def create_agencies(self, count: int = 3) -> list[int]: + agencies: list[Agency] = [] + agency_ids: list[int] = [] + for _ in range(count): + agency_id = next_int() + agency = Agency( + agency_id=agency_id, + name=generate_test_name(agency_id), + state=None, + county=None, + locality=None + ) + agencies.append(agency) + agency_ids.append(agency_id) + await self.adb_client.add_all(agencies) + return agency_ids + + async def flag_as_root(self, url_ids: list[int]) -> None: + flag_root_urls: list[FlagRootURL] = [ + FlagRootURL(url_id=url_id) for url_id in url_ids + ] + await self.adb_client.add_all(flag_root_urls) + + async def link_urls_to_root(self, url_ids: list[int], root_url_id: int) -> None: + links: list[LinkURLRootURL] = [ + LinkURLRootURL(url_id=url_id, root_url_id=root_url_id) for url_id in url_ids + ] + await self.adb_client.add_all(links) + + async def link_urls_to_agencies(self, url_ids: list[int], agency_ids: list[int]) -> None: + assert len(url_ids) == len(agency_ids) + links: list[LinkURLAgency] = [] + for url_id, agency_id in zip(url_ids, agency_ids): + link = LinkURLAgency( + url_id=url_id, + agency_id=agency_id + ) + links.append(link) + await self.adb_client.add_all(links) \ No newline at end of file diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py new file mode 100644 index 00000000..83b2e3f5 --- /dev/null +++ b/tests/helpers/data_creator/create.py @@ -0,0 +1,75 @@ +from datetime import datetime + +from src.collectors.enums import CollectorType, URLStatus +from 
src.core.enums import BatchStatus, RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping import URLMapping +from src.db.models.impl.batch.pydantic.insert import BatchInsertModel +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ + generate_url_data_sources, generate_batch_url_links + + +async def create_batch( + adb_client: AsyncDatabaseClient, + status: BatchStatus = BatchStatus.READY_TO_LABEL, + strategy: CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), +) -> int: + batch: BatchInsertModel = generate_batch(status=status, strategy=strategy, date_generated=date_generated) + return (await adb_client.bulk_insert([batch], return_ids=True))[0] + +async def create_urls( + adb_client: AsyncDatabaseClient, + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, + count: int = 1 +) -> list[URLMapping]: + urls: list[URLInsertModel] = generate_urls( + status=status, + source=source, + record_type=record_type, + collector_metadata=collector_metadata, + count=count, + ) + url_ids = await adb_client.bulk_insert(urls, return_ids=True) + return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] + +async def create_validated_flags( + adb_client: AsyncDatabaseClient, + url_ids: list[int], + validation_type: URLValidatedType, +) -> None: + validated_flags: 
list[FlagURLValidatedPydantic] = generate_validated_flags( + url_ids=url_ids, + validation_type=validation_type, + ) + await adb_client.bulk_insert(validated_flags) + +async def create_url_data_sources( + adb_client: AsyncDatabaseClient, + url_ids: list[int], +) -> None: + url_data_sources: list[URLDataSourcePydantic] = generate_url_data_sources( + url_ids=url_ids, + ) + await adb_client.bulk_insert(url_data_sources) + +async def create_batch_url_links( + adb_client: AsyncDatabaseClient, + url_ids: list[int], + batch_id: int, +) -> None: + batch_url_links: list[LinkBatchURLPydantic] = generate_batch_url_links( + url_ids=url_ids, + batch_id=batch_id, + ) + await adb_client.bulk_insert(batch_url_links) + diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py new file mode 100644 index 00000000..5dabc016 --- /dev/null +++ b/tests/helpers/data_creator/generate.py @@ -0,0 +1,82 @@ +from datetime import datetime + +from src.collectors.enums import URLStatus, CollectorType +from src.core.enums import BatchStatus, RecordType +from src.db.models.impl.batch.pydantic.insert import BatchInsertModel +from src.db.models.impl.flag.url_validated.enums import URLValidatedType +from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.pydantic import LinkBatchURLPydantic +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.pydantic.insert import URLInsertModel +from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic +from tests.helpers.counter import next_int + + +def generate_batch( + status: BatchStatus, + strategy: CollectorType = CollectorType.EXAMPLE, + date_generated: datetime = datetime.now(), +) -> BatchInsertModel: + return BatchInsertModel( + strategy=strategy.value, + status=status, + parameters={}, + user_id=1, + 
date_generated=date_generated, + ) + +def generate_batch_url_links( + url_ids: list[int], + batch_id: int +) -> list[LinkBatchURLPydantic]: + return [ + LinkBatchURLPydantic( + url_id=url_id, + batch_id=batch_id, + ) + for url_id in url_ids + ] + +def generate_urls( + status: URLStatus = URLStatus.OK, + source: URLSource = URLSource.COLLECTOR, + record_type: RecordType | None = RecordType.RESOURCES, + collector_metadata: dict | None = None, + count: int = 1 +) -> list[URLInsertModel]: + results: list[URLInsertModel] = [] + for i in range(count): + val: int = next_int() + results.append(URLInsertModel( + url=f"http://example.com/{val}", + status=status, + source=source, + name=f"Example {val}", + collector_metadata=collector_metadata, + record_type=record_type, + )) + return results + +def generate_validated_flags( + url_ids: list[int], + validation_type: URLValidatedType, +) -> list[FlagURLValidatedPydantic]: + return [ + FlagURLValidatedPydantic( + url_id=url_id, + type=validation_type, + ) + for url_id in url_ids + ] + +def generate_url_data_sources( + url_ids: list[int], +) -> list[URLDataSourcePydantic]: + return [ + URLDataSourcePydantic( + url_id=url_id, + data_source_id=url_id, + ) + for url_id in url_ids + ] \ No newline at end of file diff --git a/tests/helpers/data_creator/insert.py b/tests/helpers/data_creator/insert.py new file mode 100644 index 00000000..06b207e3 --- /dev/null +++ b/tests/helpers/data_creator/insert.py @@ -0,0 +1,10 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +async def bulk_insert_all( + adb_client: AsyncDatabaseClient, + lists_of_models: list[list[BulkInsertableModel]], +): + for list_of_models in lists_of_models: + await adb_client.bulk_insert(list_of_models) \ No newline at end of file diff --git a/tests/helpers/data_creator/models/creation_info/batch/v2.py b/tests/helpers/data_creator/models/creation_info/batch/v2.py index 3e6ed74a..52d7e37d 
100644 --- a/tests/helpers/data_creator/models/creation_info/batch/v2.py +++ b/tests/helpers/data_creator/models/creation_info/batch/v2.py @@ -1,12 +1,12 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.models.creation_info.url import URLCreationInfo class BatchURLCreationInfoV2(BaseModel): batch_id: int - urls_by_status: dict[URLStatus, URLCreationInfo] = {} + urls_by_status: dict[URLCreationEnum, URLCreationInfo] = {} @property def url_ids(self) -> list[int]: diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py index 082769e7..16c45a0a 100644 --- a/tests/helpers/data_creator/models/creation_info/url.py +++ b/tests/helpers/data_creator/models/creation_info/url.py @@ -5,11 +5,12 @@ from src.collectors.enums import URLStatus from src.db.dtos.url.mapping import URLMapping from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class URLCreationInfo(BaseModel): url_mappings: list[URLMapping] - outcome: URLStatus + outcome: URLCreationEnum annotation_info: Optional[AnnotationInfo] = None @property diff --git a/tests/helpers/setup/annotation/core.py b/tests/helpers/setup/annotation/core.py index ff5105cd..70123cb9 100644 --- a/tests/helpers/setup/annotation/core.py +++ b/tests/helpers/setup/annotation/core.py @@ -1,4 +1,5 @@ from src.collectors.enums import URLStatus +from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.setup.annotation.model import AnnotationSetupInfo @@ -6,7 +7,7 @@ async def setup_for_get_next_url_for_annotation( db_data_creator: DBDataCreator, url_count: int, - outcome: URLStatus = URLStatus.PENDING + outcome: URLCreationEnum = URLCreationEnum.OK ) 
-> AnnotationSetupInfo: batch_id = db_data_creator.batch() insert_urls_info = db_data_creator.urls( diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index 6c4a3498..58b1ae49 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -37,7 +37,7 @@ async def add_agency_suggestion() -> int: ) return agency_id - async def add_record_type_suggestion(record_type: RecordType): + async def add_record_type_suggestion(record_type: RecordType) -> None: await db_data_creator.user_record_type_suggestion( url_id=url_mapping.url_id, record_type=record_type diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index 630d0f71..e81c266d 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -8,5 +8,7 @@ def wipe_database(connection_string: str) -> None: engine = create_engine(connection_string) with engine.connect() as connection: for table in reversed(Base.metadata.sorted_tables): + if table.info == "view": + continue connection.execute(table.delete()) connection.commit() diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index df455e0e..7c42fd8d 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -13,16 +13,15 @@ def generate_test_urls(count: int) -> list[str]: return results -def generate_test_html() -> str: - return """ - - - -This is an example of HTML content.
- - - """ \ No newline at end of file + +def generate_test_url(i: int) -> str: + return f"https://test.com/{i}" + +def generate_test_name(i: int) -> str: + return f"Test Name {i}" + +def generate_test_description(i: int) -> str: + return f"Test description {i}" + +def generate_test_html(i: int) -> str: + return f"