diff --git a/ENV.md b/ENV.md index b957bc11..386dbdae 100644 --- a/ENV.md +++ b/ENV.md @@ -57,19 +57,30 @@ Note that some tasks/subtasks are themselves enabled by other tasks. ### Scheduled Task Flags -| Flag | Description | -|-------------------------------------|-------------------------------------------------------------------------------| -| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | -| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | -| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | -| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | -| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | -| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | -| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | -| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). | -| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | -| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | -| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| Flag | Description | +|--------------------------------------------|-------------------------------------------------------------------------------| +| `SCHEDULED_TASKS_FLAG` | All scheduled tasks. Disabling disables all other scheduled tasks. | +| `PUSH_TO_HUGGING_FACE_TASK_FLAG` | Pushes data to HuggingFace. | +| `POPULATE_BACKLOG_SNAPSHOT_TASK_FLAG` | Populates the backlog snapshot. | +| `DELETE_OLD_LOGS_TASK_FLAG` | Deletes old logs. | +| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. | +| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. | +| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. | +| `MARK_TASK_NEVER_COMPLETED_TASK_FLAG` | Marks tasks that were started but never completed (usually due to a restart). 
| +| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. | +| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. | +| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. | +| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. | +| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App. | +| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App. | +| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App. | +| `DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG` | Adds new data sources to the Data Sources App. | +| `DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG` | Updates existing data sources in the Data Sources App. | +| `DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG` | Deletes data sources in the Data Sources App. | +| `DS_APP_SYNC_META_URL_ADD_TASK_FLAG` | Adds new meta URLs to the Data Sources App. | +| `DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG` | Updates existing meta URLs in the Data Sources App. | +| `DS_APP_SYNC_META_URL_DELETE_TASK_FLAG` | Deletes meta URLs in the Data Sources App. | +| `INTEGRITY_MONITOR_TASK_FLAG` | Runs integrity checks. | ### URL Task Flags @@ -81,7 +92,6 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_HTML_TASK_FLAG` | URL HTML scraping task. | | `URL_RECORD_TYPE_TASK_FLAG` | Automatically assigns Record Types to URLs. | | `URL_AGENCY_IDENTIFICATION_TASK_FLAG` | Automatically assigns and suggests Agencies for URLs. | -| `URL_SUBMIT_APPROVED_TASK_FLAG` | Submits approved URLs to the Data Sources App. | | `URL_MISC_METADATA_TASK_FLAG` | Adds misc metadata to URLs. | | `URL_AUTO_RELEVANCE_TASK_FLAG` | Automatically assigns Relevances to URLs. | | `URL_PROBE_TASK_FLAG` | Probes URLs for web metadata. | @@ -90,7 +100,6 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. 
| | `URL_AUTO_NAME_TASK_FLAG` | Automatically names URLs. | | `URL_SUSPEND_TASK_FLAG` | Suspends URLs meeting suspension criteria. | -| `URL_SUBMIT_META_URLS_TASK_FLAG` | Submits meta URLs to the Data Sources App. | ### Agency ID Subtasks diff --git a/README.md b/README.md index ae2263dc..56e8182d 100644 --- a/README.md +++ b/README.md @@ -156,3 +156,71 @@ if it detects any missing docstrings or type hints in files that you have modifi These will *not* block any Pull request, but exist primarily as advisory comments to encourage good coding standards. Note that `python_checks.yml` will only function on pull requests made from within the repo, not from a forked repo. + +# Syncing to Data Sources App + +The Source Manager (SM) is part of a two app system, with the other app being the Data Sources (DS) App. + + +## Add, Update, and Delete + +These are the core synchronization actions. + +In order to propagate changes to DS, we synchronize additions, updates, and deletions of the following entities: +- Agencies +- Data Sources +- Meta URLs + +Each action for each entity occurs through a separate task. At the moment, there are nine tasks total. + +Each task gathers requisite information from the SM database and sends a request to one of nine corresponding endpoints in the DS API. + +Each DS endpoint follows the following format: + +```text +/v3/sync/{entity}/{action} +``` + +Synchronizations are designed to occur on an hourly basis. + +Here is a high-level description of how each action works: + +### Add + +Adds the given entities to DS. + +These are denoted with the `/{entity}/add` path in the DS API. + +When an entity is added, it returns a unique DS ID that is mapped to the internal SM database ID via the DS app link tables. + +For an entity to be added, it must meet preconditions which are distinct for each entity: +- Agencies: Must have an agency entry in the database and be linked to a location. 
+- Data Sources: Must be a URL that has been internally validated as a data source and linked to an agency. +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. + +### Update + +Updates the given entities in DS. + +These are denoted with the `/{entity}/update` path in the DS API. + +These consist of submitting the updated entities (in full) to the requisite endpoint, and updating the local app link to indicate that the update occurred. All updates are designed to be full overwrites of the entity. + +For an entity to be updated, it must meet preconditions which are distinct for each entity: +- Agencies: Must have either an agency row updated or an agency/location link updated or deleted. +- Data Sources: One of the following must be updated: + - The URL table + - The record type table + - The optional data sources metadata table + - The agency link table (either an addition or deletion) +- Meta URLs: Must be a URL that has been internally validated as a meta URL and linked to an agency. Either the URL table or the agency link table (addition or deletion) must be updated. + +### Delete + +Deletes the given entities from DS. + +These are denoted with the `/{entity}/delete` path in the DS API. + +This consists of submitting a set of DS IDs to the requisite endpoint, and removing the associated DS app link entry in the SM database. + +When an entity with a corresponding DS App Link is deleted from the Source Manager, the core data is removed but a deletion flag is appended to the DS App Link entry, indicating that the entry is not yet removed from the DS App. The deletion task uses this flag to identify entities to be deleted, submits the deletion request to the DS API, and removes both the flag and the DS App Link. 
\ No newline at end of file diff --git a/alembic/Jenkinsfile b/alembic/Jenkinsfile new file mode 100644 index 00000000..b5a330c7 --- /dev/null +++ b/alembic/Jenkinsfile @@ -0,0 +1,30 @@ +pipeline { + agent { + dockerfile { + filename 'Dockerfile' + args '-e POSTGRES_USER=POSTGRES_USER -e POSTGRES_PASSWORD=POSTGRES_PASSWORD -e POSTGRES_DB=POSTGRES_DB -e POSTGRES_HOST=POSTGRES_HOST -e POSTGRES_PORT=POSTGRES_PORT' + } + } + + stages { + stage('Migrate using Alembic') { + steps { + echo 'Building..' + sh 'python apply_migrations.py' + } + } + } + post { + failure { + script { + def payload = """{ + "content": "🚨 Build Failed: ${env.JOB_NAME} #${env.BUILD_NUMBER}" + }""" + + sh """ + curl -X POST -H "Content-Type: application/json" -d '${payload}' ${env.WEBHOOK_URL} + """ + } + } + } +} \ No newline at end of file diff --git a/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py new file mode 100644 index 00000000..aa73e268 --- /dev/null +++ b/alembic/versions/2025_10_14_1105-a8f36f185694_add_url_scheme_column.py @@ -0,0 +1,338 @@ +"""Add url scheme column + +Revision ID: a8f36f185694 +Revises: 7aace6587d1a +Create Date: 2025-10-14 11:05:28.686940 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'a8f36f185694' +down_revision: Union[str, None] = '7aace6587d1a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + + + +def upgrade() -> None: + _update_foreign_key_constraints() + + _delete_duplicate_urls() + _add_column() + _populate_column() + _remove_schemes_from_url_column() + _add_check_constraint_to_url_column() + +def _update_foreign_key_constraints(): + # URL Optional Data Source Metadata + op.execute(""" + ALTER TABLE url_optional_data_source_metadata + DROP CONSTRAINT IF EXISTS url_optional_data_source_metadata_url_id_fkey; + """) + + op.create_foreign_key( + "url_optional_data_source_metadata_url_id_fkey", + "url_optional_data_source_metadata", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Link URLs Redirect URL + # (Source URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF EXISTS link_urls_redirect_url_source_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_source_url_id_fkey", + "link_urls_redirect_url", + "urls", + ["source_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # (Destination URL ID) + op.execute(""" + ALTER TABLE link_urls_redirect_url + DROP CONSTRAINT IF EXISTS link_urls_redirect_url_destination_url_id_fkey; + """) + + op.create_foreign_key( + "link_urls_redirect_url_destination_url_id_fkey", + "link_urls_redirect_url", + "urls", + ["destination_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Reviewing User URL + op.execute(""" + ALTER TABLE reviewing_user_url + DROP CONSTRAINT IF EXISTS approving_user_url_url_id_fkey; + """) + + op.create_foreign_key( + "approving_user_url_url_id_fkey", + "reviewing_user_url", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # user_url_agency_suggestions + op.execute(""" + ALTER TABLE user_url_agency_suggestions + DROP CONSTRAINT IF EXISTS user_url_agency_suggestions_url_id_fkey; + """) + + op.create_foreign_key( + 
"user_url_agency_suggestions_url_id_fkey", + "user_url_agency_suggestions", + "urls", + ["url_id"], + ["id"], + ondelete="CASCADE" + ) + + # Duplicates + op.execute(""" + ALTER TABLE duplicates + DROP CONSTRAINT IF EXISTS duplicates_original_url_id_fkey; + """) + + op.create_foreign_key( + "duplicates_original_url_id_fkey", + "duplicates", + "urls", + ["original_url_id"], + ["id"], + ondelete="CASCADE" + ) + + # link_user_name_suggestions + op.execute(""" + ALTER TABLE link_user_name_suggestions + DROP CONSTRAINT IF EXISTS link_user_name_suggestions_suggestion_id_fkey; + """) + + op.create_foreign_key( + "link_user_name_suggestions_suggestion_id_fkey", + "link_user_name_suggestions", + "url_name_suggestions", + ["suggestion_id"], + ["id"], + ondelete="CASCADE" + ) + +def _delete_duplicate_urls(): + op.execute(""" + DELETE FROM urls + WHERE id IN ( + 4217, + 15902, + 3472, + 17387, + 24256, + 17617, + 17414, + 15259, + 17952, + 17651, + 18010, + 18496, + 18563, + 18587, + 18592, + 18092, + 18046, + 20467, + 24346, + 28241, + 25075, + 22508, + 22391, + 24256, + 22486, + 28109, + 26336, + 30701, + 17387, + 19348, + 18080, + 27863, + 18855, + 28830, + 18824, + 17414, + 15259, + 20676, + 27716, + 21475, + 23442, + 28553, + 8176, + 22270, + 19161, + 21250, + 15659, + 18821, + 27067, + 27567, + 27318, + 20640, + 21840, + 3472, + 28982, + 28910, + 19527, + 28776, + 15902, + 18468, + 29557, + 22977, + 27694, + 22678, + 19094, + 27203, + 26436, + 18868, + 22813, + 25007, + 7548, + 30088, + 20924, + 22575, + 28149, + 30705, + 28179, + 30660, + 2988, + 17182, + 18893, + 30317, + 19215, + 17651, + 21117, + 17617, + 23742, + 19620, + 16865, + 19320, + 20516, + 25248, + 26122, + 30158, + 30522, + 23307, + 18621, + 27855, + 26922, + 21397, + 18010, + 18592, + 2527, + 26279, + 18563, + 18242, + 21550, + 28288, + 22361, + 24660, + 2989, + 28765, + 10627, + 19625, + 12191, + 27523, + 18373, + 28565, + 25437, + 26077, + 28554, + 23229, + 25631, + 25528, + 18092, + 10765, + 26126, + 
51499, + 27375, + 24177, + 22734, + 22459, + 22439, + 18532, + 29064, + 20504, + 21643, + 21551, + 27698, + 19234, + 24308, + 22559, + 26227, + 19080, + 16010, + 3515, + 22658, + 20673, + 21854, + 19361, + 21768, + 26903, + 21253, + 23085, + 3761, + 3565 + ) + """) + +def _populate_column(): + op.execute( + """ + UPDATE urls + SET scheme = lower(split_part(url, '://', 1)) + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _remove_schemes_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = regexp_replace(url, '^[a-z][a-z0-9+.-]*://', '', 'i') + WHERE url ~* '^[a-z][a-z0-9+.-]*://'; + """ + ) + + +def _add_check_constraint_to_url_column(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT check_url_does_not_have_schema CHECK (url !~* '^[a-z][a-z0-9+.-]*://'); + """ + ) + + +def _add_column(): + op.add_column( + "urls", + sa.Column("scheme", sa.String(), nullable=True) + ) + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py b/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py new file mode 100644 index 00000000..faf10f91 --- /dev/null +++ b/alembic/versions/2025_10_14_1837-ff4e8b2f6348_add_updated_at_triggers.py @@ -0,0 +1,46 @@ +"""Add updated_at triggers + +Revision ID: ff4e8b2f6348 +Revises: a8f36f185694 +Create Date: 2025-10-14 18:37:07.121323 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import create_updated_at_trigger + +# revision identifiers, used by Alembic. 
+revision: str = 'ff4e8b2f6348' +down_revision: Union[str, None] = 'a8f36f185694' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + for table in [ + "agencies", + "auto_record_type_suggestions", + "auto_relevant_suggestions", + "flag_url_validated", + "link_batch_urls", + "link_urls_agency", + "link_urls_redirect_url", + "link_urls_root_url", + "tasks", + "url_compressed_html", + "url_internet_archives_probe_metadata", + "url_scrape_info", + "url_screenshot", + "url_web_metadata", + "urls", + "user_record_type_suggestions", + "user_url_type_suggestions", + ]: + create_updated_at_trigger(table) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py new file mode 100644 index 00000000..69faae2e --- /dev/null +++ b/alembic/versions/2025_10_17_1826-7fc6502f1fa3_add_trailing_slash_column.py @@ -0,0 +1,147 @@ +"""Add trailing slash column + +Revision ID: 7fc6502f1fa3 +Revises: ff4e8b2f6348 +Create Date: 2025-10-17 18:26:56.756915 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '7fc6502f1fa3' +down_revision: Union[str, None] = 'ff4e8b2f6348' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + _remove_duplicates() + _add_trailing_slash_column() + _migrate_trailing_slash_to_column() + _remove_trailing_slash_from_url_column() + _add_check_constraint_forbidding_trailing_slash_in_url() + +def _remove_duplicates(): + op.execute( + """ + DELETE FROM urls + WHERE id IN ( + 23504, + 29401, + 21032, + 23687, + 15760, + 17574, + 17669, + 21382, + 11697, + 18076, + 27764, + 11395, + 17702, + 26857, + 30843, + 21850, + 29471, + 26789, + 19428, + 18452, + 30547, + 24004, + 27857, + 30260, + 26968, + 27065, + 29073, + 21827, + 25615, + 28644, + 24417, + 29801, + 27625, + 15708, + 23517, + 26415, + 26081, + 7478, + 20368, + 19494, + 26624, + 3817, + 3597, + 3568, + 16113, + 24125, + 30625, + 29965, + 23134, + 19207, + 12158, + 3835, + 24730, + 17113, + 29987, + 21452, + 24605, + 5043, + 17237, + 25522, + 11065, + 12387, + 12210, + 11185, + 11961, + 4935, + 24200, + 29028, + 24371, + 28355, + 17620, + 19546, + 3598 + ) + """ + ) + +def _add_trailing_slash_column(): + op.add_column( + 'urls', + sa.Column( + 'trailing_slash', + sa.Boolean(), + nullable=False, + server_default=sa.text('false') + ) + ) + +def _migrate_trailing_slash_to_column(): + op.execute( + """ + UPDATE urls + SET trailing_slash = url ~ '/$' + """ + ) + +def _remove_trailing_slash_from_url_column(): + op.execute( + """ + UPDATE urls + SET url = rtrim(url, '/') + WHERE url like '%/'; + """ + ) + +def _add_check_constraint_forbidding_trailing_slash_in_url(): + op.execute( + """ + ALTER TABLE urls + ADD CONSTRAINT no_trailing_slash CHECK (url !~ '/$') + """ + ) + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py new file mode 
100644 index 00000000..2a7db8e5 --- /dev/null +++ b/alembic/versions/2025_10_18_1517-9d57b3b79d35_update_url_status_materialized_view.py @@ -0,0 +1,104 @@ +"""Update URL Status Materialized View + +Revision ID: 9d57b3b79d35 +Revises: 7fc6502f1fa3 +Create Date: 2025-10-18 15:17:23.653448 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '9d57b3b79d35' +down_revision: Union[str, None] = '7fc6502f1fa3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute("DROP MATERIALIZED VIEW IF EXISTS url_status_mat_view") + op.execute(""" + CREATE MATERIALIZED VIEW url_status_mat_view as + with + urls_with_relevant_errors as ( + select + ute.url_id + from + url_task_error ute + where + ute.task_type in ( + 'Screenshot', + 'HTML', + 'URL Probe' + ) + ) + , status_text as ( + select + u.id as url_id, + case + when ( + -- Validated as not relevant, individual record, or not found + fuv.type in ('not relevant', 'individual record', 'not found') + ) Then 'Accepted' + when ( + (fuv.type = 'data source' and uds.url_id is null) + OR + (fuv.type = 'meta url' and udmu.url_id is null) + ) Then 'Awaiting Submission' + when ( + (fuv.type = 'data source' and uds.url_id is not null) + OR + (fuv.type = 'meta url' and udmu.url_id is not null) + ) Then 'Submitted' + when ( + -- Has compressed HTML + uch.url_id is not null + AND + -- Has web metadata + uwm.url_id is not null + AND + -- Has screenshot + us.url_id is not null + ) THEN 'Community Labeling' + when uwre.url_id is not null then 'Error' + ELSE 'Intake' + END as status + + from + urls u + left join urls_with_relevant_errors uwre + on u.id = uwre.url_id + left join url_screenshot us + on u.id = us.url_id + left join url_compressed_html uch + on u.id = uch.url_id + left join url_web_metadata uwm + on u.id = uwm.url_id + left join 
flag_url_validated fuv + on u.id = fuv.url_id + left join url_ds_meta_url udmu + on u.id = udmu.url_id + left join url_data_source uds + on u.id = uds.url_id + ) + select + url_id, + status, + CASE status + WHEN 'Intake' THEN 100 + WHEN 'Error' THEN 110 + WHEN 'Community Labeling' THEN 200 + WHEN 'Accepted' THEN 300 + WHEN 'Awaiting Submission' THEN 380 + WHEN 'Submitted' THEN 390 + ELSE -1 + END as code + from status_text + """) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py new file mode 100644 index 00000000..c45f4f28 --- /dev/null +++ b/alembic/versions/2025_10_20_1620-6adf9d894180_enable_data_source_agency_submission.py @@ -0,0 +1,109 @@ +"""Enable data source/agency submission + +Revision ID: 6adf9d894180 +Revises: 9d57b3b79d35 +Create Date: 2025-10-20 16:20:44.081736 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import ENUM, ARRAY + + +# revision identifiers, used by Alembic. 
+revision: str = '6adf9d894180' +down_revision: Union[str, None] = '9d57b3b79d35' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +def upgrade() -> None: + _add_autogenerated_agency_id() + _add_new_columns_to_optional_ds_metadata() + +def _add_new_columns_to_optional_ds_metadata(): + table_name: str = "url_optional_data_source_metadata" + + agency_aggregation_enum = ENUM( + 'federal', + 'state', + 'county', + 'local', + name='agency_aggregation_enum', + create_type=True, + ) + agency_aggregation_enum.create(op.get_bind()) + + update_method_enum = ENUM( + 'Overwrite', + 'Insert', + 'No updates', + name='update_method_enum', + create_type=True + ) + update_method_enum.create(op.get_bind()) + + retention_schedule_enum = ENUM( + 'Future only', + '1 month', + '1 day', + '1 week', + '1-10 years', + '< 1 day', + '< 1 week', + '< 1 year', + '> 10 years', + name='retention_schedule_enum', + create_type=True + ) + retention_schedule_enum.create(op.get_bind()) + + access_type_enum = ENUM( + 'Webpage', + 'Download', + 'API', + name='access_type_enum', + create_type=True, + ) + access_type_enum.create(op.get_bind()) + + for column in [ + sa.Column('coverage_start', sa.Date(), nullable=True), + sa.Column('coverage_end', sa.Date(), nullable=True), + sa.Column("agency_supplied", sa.Boolean(), nullable=True), + sa.Column('agency_originated', sa.Boolean(), nullable=True), + sa.Column('agency_aggregation', agency_aggregation_enum), + sa.Column('agency_described_not_in_database', sa.Text(), nullable=True), + sa.Column('update_method', update_method_enum, nullable=True), + sa.Column('readme_url', sa.Text(), nullable=True), + sa.Column('originating_entity', sa.Text(), nullable=True), + sa.Column('retention_schedule', retention_schedule_enum, nullable=True), + sa.Column('scraper_url', sa.Text(), nullable=True), + sa.Column('submission_notes', sa.Text(), nullable=True), + sa.Column('access_notes', sa.Text(), nullable=True), + 
sa.Column('access_types', ARRAY( + access_type_enum + ), nullable=True), + ]: + op.add_column( + table_name, + column, + ) + +def _add_autogenerated_agency_id(): + op.execute( + """ + CREATE SEQUENCE agencies_agency_id START WITH 23191; + """ + ) + + op.execute( + """ + ALTER TABLE agencies + ALTER COLUMN agency_id SET DEFAULT nextval('agencies_agency_id'); + """ + ) + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py b/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py new file mode 100644 index 00000000..d6076e7a --- /dev/null +++ b/alembic/versions/2025_10_21_1123-f32ba7664e9f_set_batches_user_id_to_be_nullable.py @@ -0,0 +1,30 @@ +"""Set batches.user_id to be nullable + +Revision ID: f32ba7664e9f +Revises: 6adf9d894180 +Create Date: 2025-10-21 11:23:35.611484 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'f32ba7664e9f' +down_revision: Union[str, None] = '6adf9d894180' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.alter_column( + table_name='batches', + column_name='user_id', + nullable=True + ) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py new file mode 100644 index 00000000..03510b1c --- /dev/null +++ b/alembic/versions/2025_10_28_1539-a57c3b5b6e93_add_sync_log_table.py @@ -0,0 +1,644 @@ +"""Add sync_log table + +Revision ID: a57c3b5b6e93 +Revises: f32ba7664e9f +Create Date: 2025-10-28 15:39:50.494489 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import created_at_column, updated_at_column, create_updated_at_trigger, remove_enum_value + +# revision identifiers, used by Alembic. 
+revision: str = 'a57c3b5b6e93' +down_revision: Union[str, None] = 'f32ba7664e9f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + + + + +def upgrade() -> None: + _create_sync_log() + _create_ds_agency_link() + _migrate_agency_ids_to_ds_agency_link() + remove_id_column_from_agencies() + rename_agency_id_to_id() + _rename_existing_tables_to_ds_app_format() + _delete_meta_url_ds_app_links() + _alter_ds_app_link_data_source_table() + _alter_ds_app_link_meta_url_table() + _add_flag_deletion_tables() + _add_last_synced_at_columns() + _add_link_table_modification_triggers() + _add_updated_at_to_optional_data_source_metadata_table() + _update_sync_tasks() + _alter_agency_jurisdiction_type_column() + _add_updated_at_to_url_record_type_table() + _add_updated_at_trigger_to_url_optional_data_source_metadata() + _add_data_portal_type_other_to_ds_optional_metadata() + +def _delete_meta_url_ds_app_links(): + op.execute( + "DELETE FROM ds_app_link_meta_url;" + ) + +def _add_data_portal_type_other_to_ds_optional_metadata(): + op.add_column( + 'url_optional_data_source_metadata', + sa.Column( + 'data_portal_type_other', + sa.String(), + nullable=True + ) + ) + +def _add_updated_at_trigger_to_url_optional_data_source_metadata(): + create_updated_at_trigger( + "url_optional_data_source_metadata" + ) + +def _add_updated_at_to_url_record_type_table(): + op.add_column( + 'url_record_type', + updated_at_column() + ) + create_updated_at_trigger( + "url_record_type" + ) + + + +def _alter_agency_jurisdiction_type_column(): + op.alter_column( + 'agencies', + 'jurisdiction_type', + nullable=False, + ) + + +def _update_sync_tasks(): + + # Drop Views + op.execute("drop view url_task_count_1_day") + op.execute("drop view url_task_count_1_week") + op.execute("drop materialized view url_status_mat_view") + + + + targets: list[tuple[str, str]] = [ + ('tasks', 'task_type'), + ('url_task_error', 'task_type') + ] + + remove_enum_value( + 
enum_name="task_type", + value_to_remove="Sync Agencies", + targets=targets + ) + remove_enum_value( + enum_name="task_type", + value_to_remove="Sync Data Sources", + targets=targets + ) + new_enum_values: list[str] = [ + "Sync Agencies Add", + "Sync Agencies Update", + "Sync Agencies Delete", + "Sync Data Sources Add", + "Sync Data Sources Update", + "Sync Data Sources Delete", + "Sync Meta URLs Add", + "Sync Meta URLs Update", + "Sync Meta URLs Delete", + ] + for enum_value in new_enum_values: + op.execute(f"ALTER TYPE task_type ADD VALUE '{enum_value}';") + + # Recreate Views + op.execute(""" + create view url_task_count_1_day(task_type, count) as + SELECT + t.task_type, + count(ltu.url_id) AS count + FROM + tasks t + JOIN link_task_urls ltu + ON ltu.task_id = t.id + WHERE + t.updated_at > (now() - '1 day'::interval) + GROUP BY + t.task_type; + """) + + op.execute(""" + create view url_task_count_1_week(task_type, count) as + SELECT + t.task_type, + count(ltu.url_id) AS count + FROM + tasks t + JOIN link_task_urls ltu + ON ltu.task_id = t.id + WHERE + t.updated_at > (now() - '7 days'::interval) + GROUP BY + t.task_type; + """) + + op.execute( + """ + CREATE MATERIALIZED VIEW url_status_mat_view as + with + urls_with_relevant_errors as ( + select + ute.url_id + from + url_task_error ute + where + ute.task_type in ( + 'Screenshot', + 'HTML', + 'URL Probe' + ) + ) + , status_text as ( + select + u.id as url_id, + case + when ( + -- Validated as not relevant, individual record, or not found + fuv.type in ('not relevant', 'individual record', 'not found') + ) Then 'Accepted' + when ( + (fuv.type = 'data source' and uds.url_id is null) + OR + (fuv.type = 'meta url' and udmu.url_id is null) + ) Then 'Awaiting Submission' + when ( + (fuv.type = 'data source' and uds.url_id is not null) + OR + (fuv.type = 'meta url' and udmu.url_id is not null) + ) Then 'Submitted' + when ( + -- Has compressed HTML + uch.url_id is not null + AND + -- Has web metadata + uwm.url_id is not 
null + AND + -- Has screenshot + us.url_id is not null + ) THEN 'Community Labeling' + when uwre.url_id is not null then 'Error' + ELSE 'Intake' + END as status + + from + urls u + left join urls_with_relevant_errors uwre + on u.id = uwre.url_id + left join url_screenshot us + on u.id = us.url_id + left join url_compressed_html uch + on u.id = uch.url_id + left join url_web_metadata uwm + on u.id = uwm.url_id + left join flag_url_validated fuv + on u.id = fuv.url_id + left join ds_app_link_meta_url udmu + on u.id = udmu.url_id + left join ds_app_link_data_source uds + on u.id = uds.url_id + ) + select + url_id, + status, + CASE status + WHEN 'Intake' THEN 100 + WHEN 'Error' THEN 110 + WHEN 'Community Labeling' THEN 200 + WHEN 'Accepted' THEN 300 + WHEN 'Awaiting Submission' THEN 380 + WHEN 'Submitted' THEN 390 + ELSE -1 + END as code + from status_text + """ + ) + + +def last_synced_at_column(): + return sa.Column( + 'last_synced_at', + sa.DateTime(), + nullable=False, + server_default=sa.func.now() + ) + + +def _add_link_table_modification_triggers(): + op.execute(""" + -- trigger func that "touches" parent rows hit by changes to the link table + CREATE OR REPLACE FUNCTION touch_url_from_agency_link() + RETURNS trigger + LANGUAGE plpgsql AS $$ + BEGIN + IF TG_OP = 'INSERT' THEN + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT url_id FROM newtab) AS hit + WHERE u.id = hit.url_id + $q$; + + ELSIF TG_OP = 'DELETE' THEN + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT url_id FROM oldtab) AS hit + WHERE u.id = hit.url_id + $q$; + + ELSE -- UPDATE + EXECUTE $q$ + UPDATE urls u + SET updated_at = clock_timestamp() + FROM ( + SELECT DISTINCT url_id FROM newtab + UNION + SELECT DISTINCT url_id FROM oldtab + ) AS hit + WHERE u.id = hit.url_id + $q$; + END IF; + + RETURN NULL; -- statement-level trigger + END $$; + + -- statement-level trigger with transition tables + CREATE TRIGGER 
trg_link_urls_agency_touch_url_ins + AFTER INSERT ON link_urls_agency + REFERENCING NEW TABLE AS newtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_url_from_agency_link(); + + CREATE TRIGGER trg_link_urls_agency_touch_url_upd + AFTER UPDATE ON link_urls_agency + REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_url_from_agency_link(); + + CREATE TRIGGER trg_link_urls_agency_touch_url_del + AFTER DELETE ON link_urls_agency + REFERENCING OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_url_from_agency_link(); + + """) + + op.execute( + """ + -- trigger func that "touches" agency rows hit by changes to the link_agencies_locations table + CREATE OR REPLACE FUNCTION touch_agency_from_location_link() + RETURNS trigger + LANGUAGE plpgsql AS + $$ + BEGIN + IF TG_OP = 'INSERT' THEN + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT agency_id FROM newtab) AS hit + WHERE a.id = hit.agency_id + $q$; + + ELSIF TG_OP = 'DELETE' THEN + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM (SELECT DISTINCT agency_id FROM oldtab) AS hit + WHERE a.id = hit.agency_id + $q$; + + ELSE -- UPDATE + EXECUTE $q$ + UPDATE agencies a + SET updated_at = clock_timestamp() + FROM ( + SELECT DISTINCT agency_id FROM newtab + UNION + SELECT DISTINCT agency_id FROM oldtab + ) AS hit + WHERE a.id = hit.agency_id + $q$; + END IF; + + RETURN NULL; -- statement-level trigger + END + $$; + + -- statement-level trigger with transition tables + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_ins + AFTER INSERT ON link_agencies_locations + REFERENCING NEW TABLE AS newtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_agency_from_location_link(); + + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_upd + AFTER UPDATE ON link_agencies_locations + REFERENCING NEW TABLE AS newtab OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION 
touch_agency_from_location_link(); + + CREATE TRIGGER trg_link_agencies_locations_touch_agencies_del + AFTER DELETE ON link_agencies_locations + REFERENCING OLD TABLE AS oldtab + FOR EACH STATEMENT + EXECUTE FUNCTION touch_agency_from_location_link(); + """ + ) + + + + + + + +def _add_updated_at_to_optional_data_source_metadata_table(): + op.add_column( + "url_optional_data_source_metadata", + updated_at_column() + ) + create_updated_at_trigger( + "url_optional_data_source_metadata" + ) + +def _add_last_synced_at_columns(): + op.add_column( + 'ds_app_link_data_source', + last_synced_at_column() + ) + op.add_column( + 'ds_app_link_meta_url', + last_synced_at_column() + ) + + +def _alter_ds_app_link_data_source_table(): + # Drop unique constraint for data source id + op.drop_constraint( + 'uq_url_data_sources_data_source_id', + 'ds_app_link_data_source', + type_='unique' + ) + # Drop primary keys + op.drop_constraint( + 'url_data_sources_pkey', + 'ds_app_link_data_source', + type_='primary' + ) + # Rename `data_source_id` to `ds_data_source_id` + op.alter_column( + 'ds_app_link_data_source', + 'data_source_id', + new_column_name='ds_data_source_id', + ) + # Add new primary key + op.create_primary_key( + 'ds_app_link_data_source_pkey', + 'ds_app_link_data_source', + ['ds_data_source_id'] + ) + + # Drop url_id foreign key + op.drop_constraint( + 'url_data_sources_url_id_fkey', + 'ds_app_link_data_source', + type_='foreignkey' + ) + # Recreate foreign key with ON DELETE SET NULL + op.create_foreign_key( + 'ds_app_link_data_source_url_id_fkey', + 'ds_app_link_data_source', + 'urls', + ['url_id'], + ['id'], + ondelete='SET NULL' + ) + # Alter url_id column to be nullable + op.alter_column( + 'ds_app_link_data_source', + 'url_id', + nullable=True + ) + + + +def _alter_ds_app_link_meta_url_table(): + # Drop joint primary key for url_id and agency_id + op.drop_constraint( + 'url_ds_meta_url_pkey', + 'ds_app_link_meta_url', + type_='primary' + ) + # Drop unique constraint for 
ds_meta_url_id + op.drop_constraint( + 'url_ds_meta_url_ds_meta_url_id_key', + 'ds_app_link_meta_url', + type_='unique' + ) + # Drop agency_id column + op.drop_column( + 'ds_app_link_meta_url', + 'agency_id' + ) + # Make ds_meta_url_id primary key + op.create_primary_key( + 'ds_app_link_meta_url_pkey', + 'ds_app_link_meta_url', + ['ds_meta_url_id'] + ) + # Add unique constraint for url_id + op.create_unique_constraint( + 'uq_ds_app_link_meta_url_url_id', + 'ds_app_link_meta_url', + ['url_id'] + ) + # URL ID + ## Drop foreign key + op.drop_constraint( + 'url_ds_meta_url_url_id_fkey', + 'ds_app_link_meta_url', + type_='foreignkey' + ) + ## Recreate foreign key with ON DELETE SET NULL + op.create_foreign_key( + 'ds_app_link_meta_url_url_id_fkey', + 'ds_app_link_meta_url', + 'urls', + ['url_id'], + ['id'], + ondelete='SET NULL' + ) + ## Alter url_id column to be nullable + op.alter_column( + 'ds_app_link_meta_url', + 'url_id', + nullable=True + ) + + +def _add_flag_deletion_tables(): + op.create_table( + 'flag_ds_delete_agency', + sa.Column( + 'ds_agency_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_agency.ds_agency_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column() + ) + + op.create_table( + 'flag_ds_delete_data_source', + sa.Column( + 'ds_data_source_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_data_source.ds_data_source_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column(), + ) + + op.create_table( + 'flag_ds_delete_meta_url', + sa.Column( + 'ds_meta_url_id', + sa.Integer(), + sa.ForeignKey( + 'ds_app_link_meta_url.ds_meta_url_id', + ondelete='CASCADE' + ), + primary_key=True, + nullable=False + ), + created_at_column(), + ) + + +def _rename_existing_tables_to_ds_app_format(): + op.rename_table( + 'url_data_source', + 'ds_app_link_data_source' + ) + op.rename_table( + 'url_ds_meta_url', + 'ds_app_link_meta_url' + ) + +def _migrate_agency_ids_to_ds_agency_link(): + """ 
+ While this migration uses the existing DS agency IDs for both sm and ds agency ids + From this point onward the sm ID is internal to the SM application, + and the same is true for DS ID. + """ + + op.execute(""" + INSERT INTO ds_app_link_agency(agency_id, ds_agency_id) + SELECT agency_id, agency_id + FROM agencies + """) + + +def remove_id_column_from_agencies(): + op.drop_column( + 'agencies', + 'id' + ) + +def rename_agency_id_to_id(): + op.alter_column( + 'agencies', + 'agency_id', + new_column_name='id' + ) + +def _create_ds_agency_link(): + op.create_table( + 'ds_app_link_agency', + sa.Column( + 'agency_id', + sa.Integer(), + sa.ForeignKey( + 'agencies.agency_id', + ondelete='SET NULL' + ), + nullable=True + ), + sa.Column( + 'ds_agency_id', + sa.Integer(), + nullable=False, + primary_key=True + ), + created_at_column(), + last_synced_at_column(), + sa.UniqueConstraint( + "agency_id", name="uq_ds_app_link_agency_agency_id" + ) + ) + + +def _create_sync_log(): + op.create_table( + 'sync_log', + sa.Column( + 'resource_type', + sa.Enum( + 'agency', + 'data_source', + 'meta_url', + name='resource_type_enum' + ), + nullable=False, + ), + sa.Column( + 'sync_type', + sa.Enum( + 'add', + 'update', + 'delete', + name='sync_type_enum' + ), + nullable=False, + ), + sa.Column( + 'count', + sa.Integer(), + nullable=False, + ), + created_at_column(), + sa.PrimaryKeyConstraint( + 'resource_type', + 'sync_type', + 'created_at' + ) + ) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py new file mode 100644 index 00000000..e9e14ca8 --- /dev/null +++ b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py @@ -0,0 +1,67 @@ +"""Update record_formats and access_types to be not null + +Revision ID: de0305465e2c +Revises: a57c3b5b6e93 +Create Date: 2025-11-15 
14:41:45.619148 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'de0305465e2c' +down_revision: Union[str, None] = 'a57c3b5b6e93' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +TABLE_NAME = "url_optional_data_source_metadata" + + +def upgrade() -> None: + _update_record_formats() + _update_access_types() + _alter_record_formats_column() + _alter_access_types_column() + +def _alter_record_formats_column(): + op.alter_column( + table_name=TABLE_NAME, + column_name="record_formats", + nullable=False, + server_default='{}' + ) + + +def _alter_access_types_column(): + op.alter_column( + table_name=TABLE_NAME, + column_name="access_types", + nullable=False, + server_default='{}' + ) + + + +def _update_access_types(): + op.execute(""" + UPDATE url_optional_data_source_metadata + SET access_types = '{}' + WHERE access_types is null + + """) + + +def _update_record_formats(): + op.execute(""" + UPDATE url_optional_data_source_metadata + SET record_formats = '{}' + WHERE record_formats is null + """) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py b/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py new file mode 100644 index 00000000..ed7f9e49 --- /dev/null +++ b/alembic/versions/2025_11_16_1130-88ac26c3b025_add_task_log.py @@ -0,0 +1,37 @@ +"""Add task log + +Revision ID: 88ac26c3b025 +Revises: de0305465e2c +Create Date: 2025-11-16 11:30:25.742630 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import task_id_column, created_at_column + +# revision identifiers, used by Alembic. 
+revision: str = '88ac26c3b025' +down_revision: Union[str, None] = 'de0305465e2c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "tasks__log", + task_id_column(), + sa.Column( + "log", + sa.Text, + nullable=False, + ), + created_at_column(), + sa.PrimaryKeyConstraint("task_id"), + ) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py b/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py new file mode 100644 index 00000000..986d6187 --- /dev/null +++ b/alembic/versions/2025_11_18_0902-783268bd3daa_add_update_url_status_task.py @@ -0,0 +1,34 @@ +"""Add update_url_status task + +Revision ID: 783268bd3daa +Revises: 88ac26c3b025 +Create Date: 2025-11-18 09:02:54.985705 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import add_enum_value + +# revision identifiers, used by Alembic. 
+revision: str = '783268bd3daa' +down_revision: Union[str, None] = '88ac26c3b025' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + add_enum_value( + enum_name="url_status", + enum_value="broken" + ) + add_enum_value( + enum_name="task_type", + enum_value="Update URL Status" + ) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py b/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py new file mode 100644 index 00000000..fb927bf6 --- /dev/null +++ b/alembic/versions/2025_11_18_1907-b8a68f4260a4_rename_link_tables.py @@ -0,0 +1,35 @@ +"""Rename link tables + +Revision ID: b8a68f4260a4 +Revises: 783268bd3daa +Create Date: 2025-11-18 19:07:48.518828 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'b8a68f4260a4' +down_revision: Union[str, None] = '783268bd3daa' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + old_name_new_name = { + "link_task_urls": "link_tasks__urls", + "link_agencies_locations": "link_agencies__locations", + "link_agency_batches": "link_agencies__batches", + "link_batch_urls": "link_batches__urls", + "link_location_batches": "link_batches__locations", + "link_urls_agency": "link_agencies__urls", + } + for old_name, new_name in old_name_new_name.items(): + op.rename_table(old_name, new_name) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py new file mode 100644 index 00000000..faa827b4 --- /dev/null +++ b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py @@ -0,0 +1,39 @@ +"""Remove URL Error Status + +Revision ID: 
c4edeb795134 +Revises: b8a68f4260a4 +Create Date: 2025-11-20 15:30:15.783191 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import remove_enum_value + +# revision identifiers, used by Alembic. +revision: str = 'c4edeb795134' +down_revision: Union[str, None] = 'b8a68f4260a4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + UPDATE urls + SET status = 'ok' + WHERE status = 'error'; + """) + + remove_enum_value( + enum_name="url_status", + value_to_remove="error", + targets=[ + ("urls", "status") + ] + ) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py b/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py new file mode 100644 index 00000000..65982106 --- /dev/null +++ b/alembic/versions/2025_11_23_1850-1bb2dfad3275_eliminate_hanging_data_sources.py @@ -0,0 +1,32 @@ +"""Eliminate hanging data sources + +Revision ID: 1bb2dfad3275 +Revises: c4edeb795134 +Create Date: 2025-11-23 18:50:55.557428 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = '1bb2dfad3275' +down_revision: Union[str, None] = 'c4edeb795134' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + DELETE FROM ds_app_link_data_source ds + USING ds_app_link_meta_url mu, + flag_url_validated fuv + WHERE ds.url_id = mu.url_id + AND ds.url_id = fuv.url_id; + """) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py new file mode 100644 index 00000000..1f44dd25 --- /dev/null +++ b/alembic/versions/2025_11_23_1923-5ac9d50b91c5_add_integrity_monitor.py @@ -0,0 +1,97 @@ +"""Add integrity monitor + +Revision ID: 5ac9d50b91c5 +Revises: 1bb2dfad3275 +Create Date: 2025-11-23 19:23:45.487445 + +""" +from typing import Sequence, Union + +from alembic import op + +from src.util.alembic_helpers import add_enum_value + +# revision identifiers, used by Alembic. 
+revision: str = '5ac9d50b91c5' +down_revision: Union[str, None] = '1bb2dfad3275' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +def upgrade() -> None: + _create_integrity_task() + _create_incomplete_data_sources_view() + _create_incomplete_meta_urls_view() + _create_url_both_data_source_and_meta_url_view() + _create_non_federal_agencies_no_location_view() + +def _create_non_federal_agencies_no_location_view(): + op.execute(""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.id as agency_id + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """) + + +def _create_url_both_data_source_and_meta_url_view(): + op.execute(""" + create view integrity__url_both_data_source_and_meta_url_view as + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id + """) + + +def _create_incomplete_meta_urls_view(): + op.execute(""" + create view integrity__incomplete_meta_urls_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null + """) + + +def _create_incomplete_data_sources_view(): + op.execute(""" + create view integrity__incomplete_data_sources_view as + select + ds.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left 
join link_agencies__urls lau on lau.url_id = ds.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + or lau.url_id is null + """) + + +def _create_integrity_task(): + add_enum_value( + enum_name="task_type", + enum_value="Integrity Monitor", + ) + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py new file mode 100644 index 00000000..ec726c07 --- /dev/null +++ b/alembic/versions/2025_11_27_0907-d5f0cc2be6b6_add_html_duplicate_url_materialized_view.py @@ -0,0 +1,57 @@ +"""Add html duplicate url materialized view + +Revision ID: d5f0cc2be6b6 +Revises: 5ac9d50b91c5 +Create Date: 2025-11-27 09:07:28.767553 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'd5f0cc2be6b6' +down_revision: Union[str, None] = '5ac9d50b91c5' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute(""" + create extension if not exists pgcrypto; + """) + + op.execute(""" + CREATE MATERIALIZED VIEW mat_view__html_duplicate_url AS + WITH + hashes AS ( + SELECT + url_id, + digest(compressed_html, 'sha256') AS hash + FROM + url_compressed_html + ) + , duplicate_hashes as ( + SELECT + hash AS content_hash, + COUNT(*) AS n, + ARRAY_AGG(url_id ORDER BY url_id) AS url_ids + FROM + hashes + GROUP BY + hash + HAVING + COUNT(*) > 1 + ) + select + urls.id as url_id + from urls + join hashes h on h.url_id = urls.id + join duplicate_hashes dh on dh.content_hash = h.hash; + """) + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py new file mode 100644 index 
00000000..9a20bafb --- /dev/null +++ b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py @@ -0,0 +1,237 @@ +"""Remove ID columns + +Revision ID: 5d6412540aba +Revises: d5f0cc2be6b6 +Create Date: 2025-11-29 07:17:32.794305 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5d6412540aba' +down_revision: Union[str, None] = 'd5f0cc2be6b6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +TABLES = [ + "task_errors", # + "auto_record_type_suggestions", # + "auto_relevant_suggestions", # + "duplicates", # + "flag_url_validated", # + "link_agencies__locations", # + "link_urls_redirect_url", # + "link_urls_root_url", # + "reviewing_user_url", # + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_html_content", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "user_record_type_suggestions", # + "user_url_type_suggestions", # +] + +URL_ONLY_PRIMARY_KEY_TABLES = [ + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "auto_relevant_suggestions", # + "auto_record_type_suggestions", # + "flag_url_validated" # +] + + + +USER_URL_ID_PRIMARY_KEY_TABLES = [ + "user_record_type_suggestions", # + "user_url_type_suggestions", # + "reviewing_user_url" # +] + +BESPOKE_UNIQUE_IDS: dict[str, list[str]] = { + "task_errors": ["task_id"], # + "link_agencies__locations": ["agency_id", "location_id"], # + "link_urls_redirect_url": ["source_url_id", "destination_url_id"], # + "link_urls_root_url": ["url_id", "root_url_id"], # + "url_html_content": ["url_id", "content_type"], # +} + +def 
drop_views(): + op.execute("drop materialized view if exists url_status_mat_view") + op.execute("drop materialized view if exists batch_url_status_mat_view") + +def recreate_views(): + op.execute(""" + create materialized view url_status_mat_view as + WITH + urls_with_relevant_errors AS ( + SELECT + ute.url_id + FROM + url_task_error ute + WHERE + ute.task_type = ANY (ARRAY ['Screenshot'::task_type, 'HTML'::task_type, 'URL Probe'::task_type]) + ) + , status_text AS ( + SELECT + u.id AS url_id, + CASE + WHEN fuv.type = ANY + (ARRAY ['not relevant'::url_type, 'individual record'::url_type, 'not found'::url_type]) + THEN 'Accepted'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NULL THEN 'Awaiting Submission'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NOT NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NOT NULL THEN 'Submitted'::text + WHEN uch.url_id IS NOT NULL AND uwm.url_id IS NOT NULL AND us.url_id IS NOT NULL + THEN 'Community Labeling'::text + WHEN uwre.url_id IS NOT NULL THEN 'Error'::text + ELSE 'Intake'::text + END AS status + FROM + urls u + LEFT JOIN urls_with_relevant_errors uwre + ON u.id = uwre.url_id + LEFT JOIN url_screenshot us + ON u.id = us.url_id + LEFT JOIN url_compressed_html uch + ON u.id = uch.url_id + LEFT JOIN url_web_metadata uwm + ON u.id = uwm.url_id + LEFT JOIN flag_url_validated fuv + ON u.id = fuv.url_id + LEFT JOIN ds_app_link_meta_url udmu + ON u.id = udmu.url_id + LEFT JOIN ds_app_link_data_source uds + ON u.id = uds.url_id + ) + SELECT + status_text.url_id, + status_text.status, + CASE status_text.status + WHEN 'Intake'::text THEN 100 + WHEN 'Error'::text THEN 110 + WHEN 'Community Labeling'::text THEN 200 + WHEN 'Accepted'::text THEN 300 + WHEN 'Awaiting Submission'::text THEN 380 + WHEN 'Submitted'::text THEN 390 + ELSE '-1'::integer + END AS code + FROM + status_text; + """) + + op.execute(""" + create materialized 
view batch_url_status_mat_view as + WITH + batches_with_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + WHERE + lbu.batch_id = b_1.id + )) + ) + , batches_with_only_validated_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NOT NULL + )) + AND NOT (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NULL + )) + ) + SELECT + b.id AS batch_id, + CASE + WHEN b.status = 'error'::batch_status THEN 'Error'::text + WHEN bwu.id IS NULL THEN 'No URLs'::text + WHEN bwovu.id IS NOT NULL THEN 'Labeling Complete'::text + ELSE 'Has Unlabeled URLs'::text + END AS batch_url_status + FROM + batches b + LEFT JOIN batches_with_urls bwu + ON bwu.id = b.id + LEFT JOIN batches_with_only_validated_urls bwovu + ON bwovu.id = b.id; + """) + + + +def upgrade() -> None: + drop_views() + + for table in TABLES: + op.drop_column(table, "id") + + # Add new primary keys + for table, columns in BESPOKE_UNIQUE_IDS.items(): + suffix = "_".join(columns) + op.create_primary_key( + f"pk_{table}_{suffix}", + table, + columns + ) + + for table in URL_ONLY_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["url_id"] + ) + + for table in USER_URL_ID_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["user_id", "url_id"] + ) + + recreate_views() + + + + + +def downgrade() -> None: + pass diff --git a/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py new file mode 100644 index 00000000..e3dafbbc --- /dev/null +++ b/alembic/versions/2025_12_01_1632-1d3398f9cd8a_create_anonymous_session_users.py 
@@ -0,0 +1,163 @@ +"""Create anonymous_session_users + +Revision ID: 1d3398f9cd8a +Revises: 5d6412540aba +Create Date: 2025-12-01 16:32:27.842175 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import UUID + +from src.util.alembic_helpers import created_at_column + +# revision identifiers, used by Alembic. +revision: str = '1d3398f9cd8a' +down_revision: Union[str, None] = '5d6412540aba' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +def _alter_anonymous_annotation_agency(): + # Add new column + op.add_column( + "anonymous_annotation_agency", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_agency_pkey", + "anonymous_annotation_agency" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_agency_pkey", + "anonymous_annotation_agency", + ["session_id", "url_id", "agency_id"] + ) + +def _alter_anonymous_annotation_location(): + # Add new column + op.add_column( + "anonymous_annotation_location", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_location_pkey", + "anonymous_annotation_location" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_location_pkey", + "anonymous_annotation_location", + ["session_id", "url_id", "location_id"] + ) + +def _alter_anonymous_annotation_record_type(): + # Add new column + op.add_column( + "anonymous_annotation_record_type", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + 
"anonymous_annotation_record_type_pkey", + "anonymous_annotation_record_type" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_record_type_pkey", + "anonymous_annotation_record_type", + ["session_id", "url_id", "record_type"] + ) + +def _alter_anonymous_annotation_url_type(): + # Add new column + op.add_column( + "anonymous_annotation_url_type", + sa.Column( + "session_id", + UUID, + sa.ForeignKey("anonymous_sessions.id"), + nullable=False + ) + ) + + # Drop prior unique constraint/primary key + op.drop_constraint( + "anonymous_annotation_url_type_pkey", + "anonymous_annotation_url_type" + ) + + # Add new unique constraint/primary key + op.create_primary_key( + "anonymous_annotation_url_type_pkey", + "anonymous_annotation_url_type", + ["session_id", "url_id", "url_type"] + ) + +def upgrade() -> None: + # Create anonymous_sessions table + _create_anonymous_sessions_table() + + # Remove all prior anonymous annotations + _remove_prior_sessions() + + _alter_anonymous_annotation_agency() + _alter_anonymous_annotation_location() + _alter_anonymous_annotation_record_type() + _alter_anonymous_annotation_url_type() + + +def _remove_prior_sessions(): + for table in [ + "anonymous_annotation_agency", + "anonymous_annotation_location", + "anonymous_annotation_record_type", + "anonymous_annotation_url_type" + ]: + op.execute( + f""" + DELETE FROM {table} + """ + ) + + +def _create_anonymous_sessions_table(): + op.create_table( + "anonymous_sessions", + sa.Column( + "id", + UUID, + server_default=sa.text("gen_random_uuid()"), + primary_key=True + ), + created_at_column() + ) + + +def downgrade() -> None: + pass diff --git a/apply_migrations.py b/apply_migrations.py index 2b217c8b..cbacf0a4 100644 --- a/apply_migrations.py +++ b/apply_migrations.py @@ -1,15 +1,19 @@ from alembic import command from alembic.config import Config -from src.db.helpers.connect import get_postgres_connection_string +from src.util.helper_functions import 
get_from_env def apply_migrations(): print("Applying migrations...") alembic_config = Config("alembic.ini") + connection_string = ( + f"postgresql://{get_from_env('POSTGRES_USER')}:{get_from_env('POSTGRES_PASSWORD')}" + + f"@{get_from_env('POSTGRES_HOST')}:{get_from_env('POSTGRES_PORT')}/{get_from_env('POSTGRES_DB')}") + alembic_config.set_main_option( "sqlalchemy.url", - get_postgres_connection_string() + connection_string ) command.upgrade(alembic_config, "head") print("Migrations applied.") diff --git a/pyproject.toml b/pyproject.toml index 70f54673..eda8cd67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "lxml~=5.1.0", "marshmallow~=3.23.2", "openai~=1.60.1", - "pdap-access-manager==0.3.6", + "pdap-access-manager==0.4.4", "pillow>=11.3.0", "pip>=25.2", "playwright~=1.49.1", diff --git a/pytest.ini b/pytest.ini index ceaa093c..5c39d47c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,3 +3,4 @@ timeout = 300 asyncio_default_fixture_loop_scope=function markers = manual: mark test as manual-only (excluded from default test runs) +asyncio_mode = auto \ No newline at end of file diff --git a/src/api/endpoints/batch/dtos/post/__init__.py b/src/api/endpoints/agencies/__init__.py similarity index 100% rename from src/api/endpoints/batch/dtos/post/__init__.py rename to src/api/endpoints/agencies/__init__.py diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/__init__.py b/src/api/endpoints/agencies/by_id/__init__.py similarity index 100% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/__init__.py rename to src/api/endpoints/agencies/by_id/__init__.py diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py b/src/api/endpoints/agencies/by_id/delete/__init__.py similarity index 100% rename from src/core/tasks/url/operators/probe/queries/urls/exist/__init__.py rename to src/api/endpoints/agencies/by_id/delete/__init__.py diff --git a/src/api/endpoints/agencies/by_id/delete/query.py 
b/src/api/endpoints/agencies/by_id/delete/query.py new file mode 100644 index 00000000..627fc932 --- /dev/null +++ b/src/api/endpoints/agencies/by_id/delete/query.py @@ -0,0 +1,41 @@ +from sqlalchemy import delete, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class DeleteAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + ): + super().__init__() + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + # Check for existence of DS App Link. If so, add deletion flag + query = ( + select( + DSAppLinkAgency + ) + .where( + DSAppLinkAgency.agency_id == self.agency_id + ) + ) + ds_app_link_agency: DSAppLinkAgency | None = await self.sh.one_or_none(session, query=query) + if ds_app_link_agency is not None: + flag = FlagDSDeleteAgency( + ds_agency_id=ds_app_link_agency.ds_agency_id, + ) + session.add(flag) + + # Delete Agency + statement = ( + delete(Agency) + .where(Agency.id == self.agency_id) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/__init__.py b/src/api/endpoints/agencies/by_id/locations/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved/__init__.py rename to src/api/endpoints/agencies/by_id/locations/__init__.py diff --git a/src/core/tasks/url/operators/submit_approved/queries/__init__.py b/src/api/endpoints/agencies/by_id/locations/delete/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_approved/queries/__init__.py rename to src/api/endpoints/agencies/by_id/locations/delete/__init__.py diff --git a/src/api/endpoints/agencies/by_id/locations/delete/query.py 
b/src/api/endpoints/agencies/by_id/locations/delete/query.py new file mode 100644 index 00000000..9c96c65b --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/delete/query.py @@ -0,0 +1,29 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase + + +class DeleteAgencyLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + location_id: int, + ): + super().__init__() + self.agency_id = agency_id + self.location_id = location_id + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(LinkAgencyLocation) + .where( + (LinkAgencyLocation.agency_id == self.agency_id) + & (LinkAgencyLocation.location_id == self.location_id) + ) + ) + + await session.execute(statement) + diff --git a/src/core/tasks/url/operators/submit_meta_urls/__init__.py b/src/api/endpoints/agencies/by_id/locations/get/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_meta_urls/__init__.py rename to src/api/endpoints/agencies/by_id/locations/get/__init__.py diff --git a/src/api/endpoints/agencies/by_id/locations/get/query.py b/src/api/endpoints/agencies/by_id/locations/get/query.py new file mode 100644 index 00000000..e7ad22d5 --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/get/query.py @@ -0,0 +1,37 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.models.views.location_expanded import LocationExpandedView +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgencyLocationsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: 
int, + ): + super().__init__() + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> list[AgencyGetLocationsResponse]: + query = ( + select( + LinkAgencyLocation.location_id, + LocationExpandedView.full_display_name + ) + .where( + LinkAgencyLocation.agency_id == self.agency_id + ) + .join( + LocationExpandedView, + LocationExpandedView.id == LinkAgencyLocation.location_id + ) + ) + + result: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + return [AgencyGetLocationsResponse(**row) for row in result] \ No newline at end of file diff --git a/src/api/endpoints/agencies/by_id/locations/get/response.py b/src/api/endpoints/agencies/by_id/locations/get/response.py new file mode 100644 index 00000000..1e4a3078 --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/get/response.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AgencyGetLocationsResponse(BaseModel): + location_id: int + full_display_name: str diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/__init__.py b/src/api/endpoints/agencies/by_id/locations/post/__init__.py similarity index 100% rename from src/core/tasks/url/operators/submit_meta_urls/queries/__init__.py rename to src/api/endpoints/agencies/by_id/locations/post/__init__.py diff --git a/src/api/endpoints/agencies/by_id/locations/post/query.py b/src/api/endpoints/agencies/by_id/locations/post/query.py new file mode 100644 index 00000000..fd1bdf2f --- /dev/null +++ b/src/api/endpoints/agencies/by_id/locations/post/query.py @@ -0,0 +1,23 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase + + +class AddAgencyLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + location_id: int + ): + super().__init__() + self.agency_id = agency_id + self.location_id = location_id + + async def run(self, session: 
AsyncSession) -> None: + lal = LinkAgencyLocation( + agency_id=self.agency_id, + location_id=self.location_id, + ) + session.add(lal) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/relevant/__init__.py b/src/api/endpoints/agencies/by_id/put/__init__.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/__init__.py rename to src/api/endpoints/agencies/by_id/put/__init__.py diff --git a/src/api/endpoints/agencies/by_id/put/query.py b/src/api/endpoints/agencies/by_id/put/query.py new file mode 100644 index 00000000..942203fc --- /dev/null +++ b/src/api/endpoints/agencies/by_id/put/query.py @@ -0,0 +1,42 @@ +from fastapi import HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_id: int, + request: AgencyPutRequest, + ): + super().__init__() + self.agency_id = agency_id + self.request = request + + async def run(self, session: AsyncSession) -> None: + + query = ( + select( + Agency + ) + .where( + Agency.id == self.agency_id + ) + ) + + agency = await self.sh.one_or_none(session, query=query) + if not agency: + raise HTTPException(status_code=400, detail="Agency not found") + + if self.request.name is not None: + agency.name = self.request.name + if self.request.type is not None: + agency.type = self.request.type + if self.request.jurisdiction_type is not None: + agency.jurisdiction_type = self.request.jurisdiction_type + diff --git a/src/api/endpoints/agencies/by_id/put/request.py b/src/api/endpoints/agencies/by_id/put/request.py new file mode 100644 index 00000000..8d1457fb --- /dev/null +++ b/src/api/endpoints/agencies/by_id/put/request.py @@ -0,0 +1,8 @@ +from src.api.shared.models.request_base 
import RequestBase +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyPutRequest(RequestBase): + name: str | None = None + type: AgencyType | None = None + jurisdiction_type: JurisdictionType | None = None diff --git a/src/db/models/impl/url/suggestion/relevant/auto/__init__.py b/src/api/endpoints/agencies/root/__init__.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/__init__.py rename to src/api/endpoints/agencies/root/__init__.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py b/src/api/endpoints/agencies/root/get/__init__.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/pydantic/__init__.py rename to src/api/endpoints/agencies/root/get/__init__.py diff --git a/src/api/endpoints/agencies/root/get/query.py b/src/api/endpoints/agencies/root/get/query.py new file mode 100644 index 00000000..ae3b943d --- /dev/null +++ b/src/api/endpoints/agencies/root/get/query.py @@ -0,0 +1,52 @@ +from sqlalchemy import select, Result +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, selectinload + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class GetAgenciesQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> list[AgencyGetResponse]: + + query = ( + select( + Agency + ) + .options( + selectinload(Agency.locations) + ) + .offset((self.page - 1) * 100) + .limit(100) + ) + + results: Result[tuple[Agency]] = await session.execute(query) + responses: list[AgencyGetResponse] = [] + for result in results: + agency: Agency = result[0] + locations: 
list[AgencyGetLocationsResponse] = [ + AgencyGetLocationsResponse( + location_id=location.id, + full_display_name=location.full_display_name, + ) + for location in agency.locations + ] + responses.append(AgencyGetResponse( + id=agency.id, + name=agency.name, + type=agency.agency_type, + jurisdiction_type=agency.jurisdiction_type, + locations=locations, + )) + + return responses diff --git a/src/api/endpoints/agencies/root/get/response.py b/src/api/endpoints/agencies/root/get/response.py new file mode 100644 index 00000000..23590958 --- /dev/null +++ b/src/api/endpoints/agencies/root/get/response.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyGetResponse(BaseModel): + id: int + name: str + type: AgencyType + jurisdiction_type: JurisdictionType | None + locations: list[AgencyGetLocationsResponse] + +class AgencyGetOuterResponse(BaseModel): + results: list[AgencyGetResponse] \ No newline at end of file diff --git a/src/external/pdap/dtos/match_agency/__init__.py b/src/api/endpoints/agencies/root/post/__init__.py similarity index 100% rename from src/external/pdap/dtos/match_agency/__init__.py rename to src/api/endpoints/agencies/root/post/__init__.py diff --git a/src/api/endpoints/agencies/root/post/query.py b/src/api/endpoints/agencies/root/post/query.py new file mode 100644 index 00000000..43064f85 --- /dev/null +++ b/src/api/endpoints/agencies/root/post/query.py @@ -0,0 +1,44 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.api.endpoints.agencies.root.post.response import AgencyPostResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase 
+ + +class AddAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + request: AgencyPostRequest, + ): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> AgencyPostResponse: + agency = Agency( + name=self.request.name, + agency_type=self.request.type, + jurisdiction_type=self.request.jurisdiction_type, + ) + + session.add(agency) + await session.flush() + await session.refresh(agency) + agency_id: int = agency.id + + try: + + for location_id in self.request.location_ids: + lal = LinkAgencyLocation( + agency_id=agency_id, + location_id=location_id, + ) + session.add(lal) + + except Exception as e: + await session.rollback() + raise e + + return AgencyPostResponse(agency_id=agency_id) \ No newline at end of file diff --git a/src/api/endpoints/agencies/root/post/request.py b/src/api/endpoints/agencies/root/post/request.py new file mode 100644 index 00000000..009c863c --- /dev/null +++ b/src/api/endpoints/agencies/root/post/request.py @@ -0,0 +1,9 @@ +from src.api.shared.models.request_base import RequestBase +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType + + +class AgencyPostRequest(RequestBase): + name: str + type: AgencyType + jurisdiction_type: JurisdictionType + location_ids: list[int] \ No newline at end of file diff --git a/src/api/endpoints/agencies/root/post/response.py b/src/api/endpoints/agencies/root/post/response.py new file mode 100644 index 00000000..dfba5261 --- /dev/null +++ b/src/api/endpoints/agencies/root/post/response.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class AgencyPostResponse(BaseModel): + agency_id: int \ No newline at end of file diff --git a/src/api/endpoints/agencies/routes.py b/src/api/endpoints/agencies/routes.py new file mode 100644 index 00000000..b0a756aa --- /dev/null +++ b/src/api/endpoints/agencies/routes.py @@ -0,0 +1,107 @@ +from fastapi import APIRouter +from fastapi.params import Query, Depends, Path + +from src.api.dependencies 
import get_async_core +from src.api.endpoints.agencies.by_id.delete.query import DeleteAgencyQueryBuilder +from src.api.endpoints.agencies.by_id.locations.delete.query import DeleteAgencyLocationQueryBuilder +from src.api.endpoints.agencies.by_id.locations.get.query import GetAgencyLocationsQueryBuilder +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.api.endpoints.agencies.by_id.locations.post.query import AddAgencyLocationQueryBuilder +from src.api.endpoints.agencies.by_id.put.query import UpdateAgencyQueryBuilder +from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.api.endpoints.agencies.root.get.query import GetAgenciesQueryBuilder +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse +from src.api.endpoints.agencies.root.post.query import AddAgencyQueryBuilder +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.api.endpoints.agencies.root.post.response import AgencyPostResponse +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +agencies_router = APIRouter(prefix="/agencies", tags=["Agencies"]) + +@agencies_router.get("") +async def get_agencies( + async_core: AsyncCore = Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> list[AgencyGetResponse]: + return await async_core.adb_client.run_query_builder( + GetAgenciesQueryBuilder(page=page) + ) + +@agencies_router.post("") +async def create_agency( + request: AgencyPostRequest, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyPostResponse: + return await async_core.adb_client.run_query_builder( + AddAgencyQueryBuilder(request=request) + ) + +@agencies_router.delete("/{agency_id}") +async def delete_agency( + agency_id: int = Path( + description="Agency ID to delete" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await 
async_core.adb_client.run_query_builder( + DeleteAgencyQueryBuilder(agency_id=agency_id) + ) + return MessageResponse(message="Agency deleted.") + +@agencies_router.put("/{agency_id}") +async def update_agency( + request: AgencyPutRequest, + agency_id: int = Path( + description="Agency ID to update" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + UpdateAgencyQueryBuilder(agency_id=agency_id, request=request) + ) + return MessageResponse(message="Agency updated.") + +@agencies_router.get("/{agency_id}/locations") +async def get_agency_locations( + agency_id: int = Path( + description="Agency ID to get locations for" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> list[AgencyGetLocationsResponse]: + return await async_core.adb_client.run_query_builder( + GetAgencyLocationsQueryBuilder(agency_id=agency_id) + ) + +@agencies_router.post("/{agency_id}/locations/{location_id}") +async def add_location_to_agency( + agency_id: int = Path( + description="Agency ID to add location to" + ), + location_id: int = Path( + description="Location ID to add" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + AddAgencyLocationQueryBuilder(agency_id=agency_id, location_id=location_id) + ) + return MessageResponse(message="Location added to agency.") + +@agencies_router.delete("/{agency_id}/locations/{location_id}") +async def remove_location_from_agency( + agency_id: int = Path( + description="Agency ID to remove location from" + ), + location_id: int = Path( + description="Location ID to remove" + ), + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + DeleteAgencyLocationQueryBuilder(agency_id=agency_id, location_id=location_id) + ) + return MessageResponse(message="Location removed from agency.") diff --git 
a/src/api/endpoints/annotate/_shared/extract.py b/src/api/endpoints/annotate/_shared/extract.py index 390579d9..3fb7770b 100644 --- a/src/api/endpoints/annotate/_shared/extract.py +++ b/src/api/endpoints/annotate/_shared/extract.py @@ -3,8 +3,8 @@ from src.api.endpoints.annotate._shared.queries.get_annotation_batch_info import GetAnnotationBatchInfoQueryBuilder from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse, \ GetNextURLForAllAnnotationInnerResponse from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion @@ -15,16 +15,16 @@ from src.api.endpoints.annotate.all.get.queries.location_.core import GetLocationSuggestionsQueryBuilder from src.api.endpoints.annotate.all.get.queries.name.core import GetNameSuggestionsQueryBuilder from src.db.dto_converter import DTOConverter -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion async def extract_and_format_get_annotation_result( session: AsyncSession, url: URL, batch_id: int | None = None -): +) -> GetNextURLForAllAnnotationResponse: html_response_info = 
DTOConverter.html_content_list_to_html_response_info( url.html_content ) @@ -32,7 +32,7 @@ async def extract_and_format_get_annotation_result( convert_user_url_type_suggestion_to_url_type_annotation_suggestion( url.user_relevant_suggestions ) - record_type_suggestions: list[RecordTypeAnnotationSuggestion] = \ + record_type_suggestions: RecordTypeAnnotationResponseOuterInfo = \ convert_user_record_type_suggestion_to_record_type_annotation_suggestion( url.user_record_type_suggestions ) @@ -40,13 +40,13 @@ async def extract_and_format_get_annotation_result( await GetAgencySuggestionsQueryBuilder(url_id=url.id).run(session) location_suggestions: LocationAnnotationResponseOuterInfo = \ await GetLocationSuggestionsQueryBuilder(url_id=url.id).run(session) - name_suggestions: list[NameAnnotationSuggestion] = \ + name_suggestions: NameAnnotationResponseOuterInfo = \ await GetNameSuggestionsQueryBuilder(url_id=url.id).run(session) return GetNextURLForAllAnnotationResponse( next_annotation=GetNextURLForAllAnnotationInnerResponse( - url_info=URLMapping( + url_info=SimpleURLMapping( url_id=url.id, - url=url.url + url=url.full_url ), html_info=html_response_info, url_type_suggestions=url_type_suggestions, @@ -55,7 +55,7 @@ async def extract_and_format_get_annotation_result( batch_info=await GetAnnotationBatchInfoQueryBuilder( batch_id=batch_id, models=[ - UserUrlAgencySuggestion, + UserURLAgencySuggestion, ] ).run(session), location_suggestions=location_suggestions, diff --git a/src/api/endpoints/annotate/agency/post/dto.py b/src/api/endpoints/annotate/agency/post/dto.py index dc41720a..1a13f073 100644 --- a/src/api/endpoints/annotate/agency/post/dto.py +++ b/src/api/endpoints/annotate/agency/post/dto.py @@ -2,7 +2,9 @@ from pydantic import BaseModel +from src.api.shared.models.request_base import RequestBase -class URLAgencyAnnotationPostInfo(BaseModel): + +class URLAgencyAnnotationPostInfo(RequestBase): is_new: bool = False suggested_agency: int | None = None diff --git 
a/src/api/endpoints/annotate/all/get/models/agency.py b/src/api/endpoints/annotate/all/get/models/agency.py index 45806d98..593438ce 100644 --- a/src/api/endpoints/annotate/all/get/models/agency.py +++ b/src/api/endpoints/annotate/all/get/models/agency.py @@ -1,27 +1,9 @@ from pydantic import BaseModel, Field +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel -class AgencyAnnotationAutoSuggestion(BaseModel): - agency_id: int - agency_name: str - confidence: int = Field( - title="The confidence of the location", - ge=0, - le=100, - ) - -class AgencyAnnotationUserSuggestion(BaseModel): - agency_id: int - agency_name: str - user_count: int - -class AgencyAnnotationUserSuggestionOuterInfo(BaseModel): - suggestions: list[AgencyAnnotationUserSuggestion] +class AgencyAnnotationResponseOuterInfo(BaseModel): + suggestions: list[SuggestionModel] not_found_count: int = Field( - title="How many users listed the agency as not found.", - ge=0, + description="How many users indicated the agency could not be found." 
) - -class AgencyAnnotationResponseOuterInfo(BaseModel): - user: AgencyAnnotationUserSuggestionOuterInfo - auto: list[AgencyAnnotationAutoSuggestion] \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/models/location.py b/src/api/endpoints/annotate/all/get/models/location.py index fb467004..be277c41 100644 --- a/src/api/endpoints/annotate/all/get/models/location.py +++ b/src/api/endpoints/annotate/all/get/models/location.py @@ -1,35 +1,9 @@ from pydantic import BaseModel, Field - -class LocationAnnotationAutoSuggestion(BaseModel): - location_id: int - location_name: str = Field( - title="The full name of the location" - ) - confidence: int = Field( - title="The confidence of the location", - ge=0, - le=100, - ) - - -class LocationAnnotationUserSuggestion(BaseModel): - location_id: int - location_name: str = Field( - title="The full name of the location" - ) - user_count: int = Field( - title="The number of users who suggested this location", - ge=1, - ) - -class LocationAnnotationUserSuggestionOuterInfo(BaseModel): - suggestions: list[LocationAnnotationUserSuggestion] - not_found_count: int = Field( - title="How many users listed the location as not found.", - ge=0, - ) +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel class LocationAnnotationResponseOuterInfo(BaseModel): - user: LocationAnnotationUserSuggestionOuterInfo - auto: list[LocationAnnotationAutoSuggestion] \ No newline at end of file + suggestions: list[SuggestionModel] + not_found_count: int = Field( + description="How many users indicated the location could not be found." 
+ ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/models/name.py b/src/api/endpoints/annotate/all/get/models/name.py index 80857305..386b11de 100644 --- a/src/api/endpoints/annotate/all/get/models/name.py +++ b/src/api/endpoints/annotate/all/get/models/name.py @@ -1,7 +1,10 @@ from pydantic import BaseModel - class NameAnnotationSuggestion(BaseModel): - name: str - suggestion_id: int - endorsement_count: int \ No newline at end of file + id: int + display_name: str + user_count: int + robo_count: int + +class NameAnnotationResponseOuterInfo(BaseModel): + suggestions: list[NameAnnotationSuggestion] \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/models/record_type.py b/src/api/endpoints/annotate/all/get/models/record_type.py index a1c24911..a99dfd7b 100644 --- a/src/api/endpoints/annotate/all/get/models/record_type.py +++ b/src/api/endpoints/annotate/all/get/models/record_type.py @@ -1,11 +1,17 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.core.enums import RecordType - - -class RecordTypeAnnotationSuggestion(BaseModel): +class RecordTypeSuggestionModel(BaseModel): record_type: RecordType - endorsement_count: int + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. 
Null if no robo-label occurred.", + ge=0, + le=100, + ) +class RecordTypeAnnotationResponseOuterInfo(BaseModel): + suggestions: list[RecordTypeSuggestionModel] diff --git a/src/api/endpoints/annotate/all/get/models/response.py b/src/api/endpoints/annotate/all/get/models/response.py index 989dbf8d..7f924e3f 100644 --- a/src/api/endpoints/annotate/all/get/models/response.py +++ b/src/api/endpoints/annotate/all/get/models/response.py @@ -1,16 +1,11 @@ -from typing import Optional - from pydantic import Field, BaseModel -from src.api.endpoints.annotate.agency.get.dto import GetNextURLForAgencyAgencyInfo from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion from src.api.endpoints.annotate.dtos.shared.base.response import AnnotationInnerResponseInfoBase -from src.api.endpoints.annotate.relevance.get.dto import RelevanceAnnotationResponseInfo -from src.core.enums import RecordType class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): @@ -23,10 +18,10 @@ class GetNextURLForAllAnnotationInnerResponse(AnnotationInnerResponseInfoBase): url_type_suggestions: list[URLTypeAnnotationSuggestion] = Field( title="Whether the auto-labeler identified the URL as relevant or not" ) - record_type_suggestions: list[RecordTypeAnnotationSuggestion] = Field( + record_type_suggestions: RecordTypeAnnotationResponseOuterInfo = Field( title="What record type, if any, 
user and the auto-labeler identified the URL as" ) - name_suggestions: list[NameAnnotationSuggestion] | None = Field( + name_suggestions: NameAnnotationResponseOuterInfo = Field( title="User and Auto-Suggestions for names" ) diff --git a/src/api/endpoints/annotate/all/get/models/suggestion.py b/src/api/endpoints/annotate/all/get/models/suggestion.py new file mode 100644 index 00000000..bed981fe --- /dev/null +++ b/src/api/endpoints/annotate/all/get/models/suggestion.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel, Field + + +class SuggestionModel(BaseModel): + id: int + display_name: str + user_count: int + robo_confidence: int | None = Field( + description="The robo labeler's given confidence for its suggestion. Null if no robo-label occurred.", + ge=0, + le=100, + ) + + @property + def score(self) -> float: + robo_score = (self.robo_confidence or 0) / 100 + return self.user_count + robo_score \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/__init__.py b/src/api/endpoints/annotate/all/get/queries/_shared/__init__.py similarity index 100% rename from src/external/pdap/impl/meta_urls/__init__.py rename to src/api/endpoints/annotate/all/get/queries/_shared/__init__.py diff --git a/src/api/endpoints/annotate/all/get/queries/_shared/sort.py b/src/api/endpoints/annotate/all/get/queries/_shared/sort.py new file mode 100644 index 00000000..0dae85b4 --- /dev/null +++ b/src/api/endpoints/annotate/all/get/queries/_shared/sort.py @@ -0,0 +1,13 @@ +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel + + +def sort_suggestions( + suggestions: list[SuggestionModel] +) -> list[SuggestionModel]: + """ + Sort according to the following criterion: + - Each user suggestion is a point + - The robo suggestion is a point * (confidence /100) + - Sort in descending order of points + """ + return sorted(suggestions, key=lambda s: s.score, reverse=True) \ No newline at end of file diff --git 
a/src/api/endpoints/annotate/all/get/queries/agency/core.py b/src/api/endpoints/annotate/all/get/queries/agency/core.py index 28cfbd2d..f7cfaf42 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/core.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/core.py @@ -1,13 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo, \ - AgencyAnnotationUserSuggestionOuterInfo, AgencyAnnotationUserSuggestion, AgencyAnnotationAutoSuggestion -from src.api.endpoints.annotate.all.get.queries.agency.requester import GetAgencySuggestionsRequester -from src.db.queries.base.builder import QueryBuilderBase -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo, \ - AgencyAnnotationUserSuggestionOuterInfo, AgencyAnnotationUserSuggestion, AgencyAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.agency.requester import GetAgencySuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -30,18 +24,13 @@ async def run(self, session: AsyncSession) -> AgencyAnnotationResponseOuterInfo: location_id=self.location_id ) - user_suggestions: list[AgencyAnnotationUserSuggestion] = \ - await requester.get_user_agency_suggestions() - auto_suggestions: list[AgencyAnnotationAutoSuggestion] = \ - await requester.get_auto_agency_suggestions() + suggestions: list[SuggestionModel] = \ + await requester.get_agency_suggestions() not_found_count: int = \ await requester.get_not_found_count() return AgencyAnnotationResponseOuterInfo( - user=AgencyAnnotationUserSuggestionOuterInfo( - suggestions=user_suggestions, - not_found_count=not_found_count - ), - auto=auto_suggestions, + 
suggestions=suggestions, + not_found_count=not_found_count ) diff --git a/src/api/endpoints/annotate/all/get/queries/agency/requester.py b/src/api/endpoints/annotate/all/get/queries/agency/requester.py index fc309e50..9d933ae2 100644 --- a/src/api/endpoints/annotate/all/get/queries/agency/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/agency/requester.py @@ -1,17 +1,17 @@ from typing import Sequence -from sqlalchemy import func, select, RowMapping +from sqlalchemy import func, select, RowMapping, or_, and_ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.agency import AgencyAnnotationAutoSuggestion, \ - AgencyAnnotationUserSuggestion -from src.api.endpoints.annotate.all.get.queries.agency.suggestions_with_highest_confidence import \ - SuggestionsWithHighestConfidenceCTE +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel +from src.api.endpoints.annotate.all.get.queries._shared.sort import sort_suggestions +from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.templates.requester import RequesterBase @@ -27,102 +27,103 @@ def __init__( self.url_id = url_id self.location_id = location_id - async def get_user_agency_suggestions(self) -> list[AgencyAnnotationUserSuggestion]: - query = ( + async def 
get_agency_suggestions(self) -> list[SuggestionModel]: + # All agencies with either a user or robo annotation + valid_agencies_cte = ( select( - UserUrlAgencySuggestion.agency_id, - func.count(UserUrlAgencySuggestion.user_id).label("count"), - Agency.name.label("agency_name"), + Agency.id, ) - .join( - Agency, - Agency.agency_id == UserUrlAgencySuggestion.agency_id - ) - - ) - - if self.location_id is not None: - query = ( - query.join( - LinkAgencyLocation, - LinkAgencyLocation.agency_id == UserUrlAgencySuggestion.agency_id - ) - .where( - LinkAgencyLocation.location_id == self.location_id + .where( + or_( + exists_url( + UserURLAgencySuggestion + ), + exists_url( + URLAutoAgencyIDSubtask + ) ) ) + .cte("valid_agencies") + ) - query = ( - query.where( - UserUrlAgencySuggestion.url_id == self.url_id + # Number of users who suggested each agency + user_suggestions_cte = ( + select( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id, + func.count(UserURLAgencySuggestion.user_id).label('user_count') ) .group_by( - UserUrlAgencySuggestion.agency_id, - Agency.name + UserURLAgencySuggestion.agency_id, + UserURLAgencySuggestion.url_id, ) - .order_by( - func.count(UserUrlAgencySuggestion.user_id).desc() - ) - .limit(3) + .cte("user_suggestions") ) - results: Sequence[RowMapping] = await sh.mappings(self.session, query=query) - - return [ - AgencyAnnotationUserSuggestion( - agency_id=autosuggestion["agency_id"], - user_count=autosuggestion["count"], - agency_name=autosuggestion["agency_name"], - ) - for autosuggestion in results - ] - - - async def get_auto_agency_suggestions(self) -> list[AgencyAnnotationAutoSuggestion]: - cte = SuggestionsWithHighestConfidenceCTE() - query = ( + # Maximum confidence of robo annotation, if any + robo_suggestions_cte = ( select( - cte.agency_id, - cte.confidence, - Agency.name.label("agency_name"), + URLAutoAgencyIDSubtask.url_id, + Agency.id.label("agency_id"), + 
func.max(AgencyIDSubtaskSuggestion.confidence).label('robo_confidence') + ) + .join( + AgencyIDSubtaskSuggestion, + AgencyIDSubtaskSuggestion.subtask_id == URLAutoAgencyIDSubtask.id ) .join( Agency, - Agency.agency_id == cte.agency_id + Agency.id == AgencyIDSubtaskSuggestion.agency_id + ) + .group_by( + URLAutoAgencyIDSubtask.url_id, + Agency.id ) + .cte("robo_suggestions") ) - - if self.location_id is not None: - query = ( - query.join( - LinkAgencyLocation, - LinkAgencyLocation.agency_id == cte.agency_id - ) - .where( - LinkAgencyLocation.location_id == self.location_id + # Join user and robo suggestions + joined_suggestions_query = ( + select( + valid_agencies_cte.c.id, + Agency.name.label("display_name"), + func.coalesce(user_suggestions_cte.c.user_count, 0).label('user_count'), + func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label('robo_confidence'), + ) + .join( + Agency, + Agency.id == valid_agencies_cte.c.id + ) + .outerjoin( + user_suggestions_cte, + and_( + user_suggestions_cte.c.url_id == self.url_id, + user_suggestions_cte.c.agency_id == Agency.id ) ) - - query = ( - query.where( - cte.url_id == self.url_id + .outerjoin( + robo_suggestions_cte, + and_( + robo_suggestions_cte.c.url_id == self.url_id, + robo_suggestions_cte.c.agency_id == Agency.id + ) ) - .order_by( - cte.confidence.desc() + .where( + or_( + user_suggestions_cte.c.user_count > 0, + robo_suggestions_cte.c.robo_confidence > 0 + ) ) - .limit(3) ) - results: Sequence[RowMapping] = await sh.mappings(self.session, query=query) - - return [ - AgencyAnnotationAutoSuggestion( - agency_id=autosuggestion["agency_id"], - confidence=autosuggestion["confidence"], - agency_name=autosuggestion["agency_name"], + # Return suggestions + mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) + suggestions: list[SuggestionModel] = [ + SuggestionModel( + **mapping ) - for autosuggestion in results + for mapping in mappings ] + return sort_suggestions(suggestions) async def 
get_not_found_count(self) -> int: query = ( diff --git a/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py b/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py deleted file mode 100644 index 6d389b11..00000000 --- a/src/api/endpoints/annotate/all/get/queries/agency/suggestions_with_highest_confidence.py +++ /dev/null @@ -1,62 +0,0 @@ -from sqlalchemy import CTE, select, func, Column - -from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion - -SUGGESTIONS_WITH_HIGHEST_CONFIDENCE_CTE: CTE = ( - select( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id, - func.max(AgencyIDSubtaskSuggestion.confidence) - ) - .select_from(URLAutoAgencyIDSubtask) - .join( - AgencyIDSubtaskSuggestion, - URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id - ) - .group_by( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id - ) - .cte("suggestions_with_highest_confidence") -) - -class SuggestionsWithHighestConfidenceCTE: - - def __init__(self): - self._cte = ( - select( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id, - func.max(AgencyIDSubtaskSuggestion.confidence).label("confidence") - ) - .select_from(URLAutoAgencyIDSubtask) - .join( - AgencyIDSubtaskSuggestion, - URLAutoAgencyIDSubtask.id == AgencyIDSubtaskSuggestion.subtask_id - ) - .where( - AgencyIDSubtaskSuggestion.agency_id.isnot(None) - ) - .group_by( - URLAutoAgencyIDSubtask.url_id, - AgencyIDSubtaskSuggestion.agency_id - ) - .cte("suggestions_with_highest_confidence") - ) - - @property - def cte(self) -> CTE: - return self._cte - - @property - def url_id(self) -> Column[int]: - return self._cte.columns.url_id - - @property - def agency_id(self) -> Column[int]: - return self._cte.columns.agency_id - - @property - def confidence(self) -> 
Column[float]: - return self._cte.columns.confidence \ No newline at end of file diff --git a/src/api/endpoints/annotate/all/get/queries/convert.py b/src/api/endpoints/annotate/all/get/queries/convert.py index 535a7d15..fe9b0777 100644 --- a/src/api/endpoints/annotate/all/get/queries/convert.py +++ b/src/api/endpoints/annotate/all/get/queries/convert.py @@ -1,11 +1,12 @@ from collections import Counter -from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.record_type import RecordTypeAnnotationResponseOuterInfo, \ + RecordTypeSuggestionModel from src.api.endpoints.annotate.all.get.models.url_type import URLTypeAnnotationSuggestion from src.core.enums import RecordType from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion def convert_user_url_type_suggestion_to_url_type_annotation_suggestion( @@ -26,18 +27,20 @@ def convert_user_url_type_suggestion_to_url_type_annotation_suggestion( def convert_user_record_type_suggestion_to_record_type_annotation_suggestion( db_suggestions: list[UserRecordTypeSuggestion] -) -> list[RecordTypeAnnotationSuggestion]: +) -> RecordTypeAnnotationResponseOuterInfo: counter: Counter[RecordType] = Counter() for suggestion in db_suggestions: counter[suggestion.record_type] += 1 - anno_suggestions: list[RecordTypeAnnotationSuggestion] = [] + suggestions: list[RecordTypeSuggestionModel] = [] for record_type, endorsement_count in counter.most_common(3): - anno_suggestions.append( - RecordTypeAnnotationSuggestion( + suggestions.append( + RecordTypeSuggestionModel( record_type=record_type, - endorsement_count=endorsement_count, + user_count=endorsement_count, + robo_confidence=0, ) ) - - return 
anno_suggestions \ No newline at end of file + return RecordTypeAnnotationResponseOuterInfo( + suggestions=suggestions + ) diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index e37f2396..89975a08 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -8,10 +8,10 @@ from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL from src.db.models.views.url_anno_count import URLAnnotationCount from src.db.models.views.url_annotations_flags import URLAnnotationFlagsView @@ -61,17 +61,17 @@ async def run( URL.status == URLStatus.OK.value, # Must not have been previously annotated by user ~exists( - select(UserURLTypeSuggestion.id) + select(UserURLTypeSuggestion.url_id) .where( UserURLTypeSuggestion.url_id == URL.id, UserURLTypeSuggestion.user_id == self.user_id, ) ), ~exists( - select(UserUrlAgencySuggestion.id) + select(UserURLAgencySuggestion.url_id) .where( - UserUrlAgencySuggestion.url_id == URL.id, - UserUrlAgencySuggestion.user_id == self.user_id, + UserURLAgencySuggestion.url_id == URL.id, + UserURLAgencySuggestion.user_id == self.user_id, ) ), ~exists( diff --git 
a/src/api/endpoints/annotate/all/get/queries/location_/core.py b/src/api/endpoints/annotate/all/get/queries/location_/core.py index 85db523c..6081c5f7 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/core.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/core.py @@ -1,13 +1,7 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ - LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion, LocationAnnotationUserSuggestionOuterInfo -from src.api.endpoints.annotate.all.get.queries.location_.requester import GetLocationSuggestionsRequester -from src.db.queries.base.builder import QueryBuilderBase -from sqlalchemy.ext.asyncio import AsyncSession - -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo, \ - LocationAnnotationUserSuggestion, LocationAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationResponseOuterInfo +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.location_.requester import GetLocationSuggestionsRequester from src.db.queries.base.builder import QueryBuilderBase @@ -21,21 +15,17 @@ def __init__( super().__init__() self.url_id = url_id - + # TODO: Test async def run(self, session: AsyncSession) -> LocationAnnotationResponseOuterInfo: requester = GetLocationSuggestionsRequester(session) - user_suggestions: list[LocationAnnotationUserSuggestion] = \ - await requester.get_user_location_suggestions(self.url_id) - auto_suggestions: list[LocationAnnotationAutoSuggestion] = \ - await requester.get_auto_location_suggestions(self.url_id) + + suggestions: list[SuggestionModel] = \ + await requester.get_location_suggestions(self.url_id) not_found_count: int = \ await requester.get_not_found_count(self.url_id) return LocationAnnotationResponseOuterInfo( 
- user=LocationAnnotationUserSuggestionOuterInfo( - suggestions=user_suggestions, - not_found_count=not_found_count - ), - auto=auto_suggestions + suggestions=suggestions, + not_found_count=not_found_count ) diff --git a/src/api/endpoints/annotate/all/get/queries/location_/requester.py b/src/api/endpoints/annotate/all/get/queries/location_/requester.py index c60c8efe..fad8e834 100644 --- a/src/api/endpoints/annotate/all/get/queries/location_/requester.py +++ b/src/api/endpoints/annotate/all/get/queries/location_/requester.py @@ -1,9 +1,11 @@ from typing import Sequence -from sqlalchemy import select, func, RowMapping +from sqlalchemy import select, func, RowMapping, or_, and_ -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion, \ - LocationAnnotationAutoSuggestion +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel +from src.api.endpoints.annotate.all.get.queries._shared.sort import sort_suggestions +from src.db.helpers.query import exists_url +from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion @@ -11,52 +13,46 @@ from src.db.models.views.location_expanded import LocationExpandedView from src.db.templates.requester import RequesterBase -from src.db.helpers.session import session_helper as sh class GetLocationSuggestionsRequester(RequesterBase): - - async def get_user_location_suggestions(self, url_id: int) -> list[LocationAnnotationUserSuggestion]: - query = ( + async def get_location_suggestions(self, url_id: int) -> list[SuggestionModel]: + # All locations with either a user or robo annotation + valid_locations_cte = ( select( - 
UserLocationSuggestion.location_id, - LocationExpandedView.display_name.label("location_name"), - func.count(UserLocationSuggestion.user_id).label('user_count') - ) - .join( - LocationExpandedView, - LocationExpandedView.id == UserLocationSuggestion.location_id + LocationExpandedView.id, ) .where( - UserLocationSuggestion.url_id == url_id + or_( + exists_url( + UserLocationSuggestion + ), + exists_url( + AutoLocationIDSubtask + ) + ) ) - .group_by( + .cte("valid_locations") + ) + # Number of users who suggested each location + user_suggestions_cte = ( + select( + UserLocationSuggestion.url_id, UserLocationSuggestion.location_id, - LocationExpandedView.display_name + func.count(UserLocationSuggestion.user_id).label('user_count') ) - .order_by( - func.count(UserLocationSuggestion.user_id).desc() + .group_by( + UserLocationSuggestion.location_id, + UserLocationSuggestion.url_id, ) + .cte("user_suggestions") ) - raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) - return [ - LocationAnnotationUserSuggestion( - **raw_result - ) - for raw_result in raw_results - ] - - - - async def get_auto_location_suggestions( - self, - url_id: int - ) -> list[LocationAnnotationAutoSuggestion]: - query = ( + # Maximum confidence of robo annotation, if any + robo_suggestions_cte = ( select( - LocationExpandedView.full_display_name.label("location_name"), - LocationIDSubtaskSuggestion.location_id, - LocationIDSubtaskSuggestion.confidence, + AutoLocationIDSubtask.url_id, + LocationExpandedView.id.label("location_id"), + func.max(LocationIDSubtaskSuggestion.confidence).label('robo_confidence') ) .join( LocationExpandedView, @@ -66,20 +62,54 @@ async def get_auto_location_suggestions( AutoLocationIDSubtask, AutoLocationIDSubtask.id == LocationIDSubtaskSuggestion.subtask_id ) - .where( - AutoLocationIDSubtask.url_id == url_id + .group_by( + LocationExpandedView.id, + AutoLocationIDSubtask.url_id, + ) + .cte("robo_suggestions") + ) + # Join user and robo suggestions + 
joined_suggestions_query = ( + select( + valid_locations_cte.c.id, + LocationExpandedView.full_display_name.label("display_name"), + func.coalesce(user_suggestions_cte.c.user_count, 0).label("user_count"), + func.coalesce(robo_suggestions_cte.c.robo_confidence, 0).label("robo_confidence"), + ) + .join( + LocationExpandedView, + LocationExpandedView.id == valid_locations_cte.c.id ) - .order_by( - LocationIDSubtaskSuggestion.confidence.desc() + .outerjoin( + user_suggestions_cte, + and_( + user_suggestions_cte.c.url_id == url_id, + user_suggestions_cte.c.location_id == LocationExpandedView.id + ) + ) + .outerjoin( + robo_suggestions_cte, + and_( + robo_suggestions_cte.c.url_id == url_id, + robo_suggestions_cte.c.location_id == LocationExpandedView.id + ) + ) + .where( + or_( + user_suggestions_cte.c.user_count > 0, + robo_suggestions_cte.c.robo_confidence > 0 + ) ) ) - raw_results: Sequence[RowMapping] = await sh.mappings(self.session, query) - return [ - LocationAnnotationAutoSuggestion( - **raw_result + + mappings: Sequence[RowMapping] = await self.mappings(joined_suggestions_query) + suggestions: list[SuggestionModel] = [ + SuggestionModel( + **mapping ) - for raw_result in raw_results + for mapping in mappings ] + return sort_suggestions(suggestions) async def get_not_found_count(self, url_id: int) -> int: query = ( diff --git a/src/api/endpoints/annotate/all/get/queries/name/core.py b/src/api/endpoints/annotate/all/get/queries/name/core.py index b048cb2c..9438f14e 100644 --- a/src/api/endpoints/annotate/all/get/queries/name/core.py +++ b/src/api/endpoints/annotate/all/get/queries/name/core.py @@ -1,11 +1,12 @@ from typing import Sequence -from sqlalchemy import select, func, RowMapping +from sqlalchemy import select, func, RowMapping, case from sqlalchemy.ext.asyncio import AsyncSession -from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion +from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion, 
NameAnnotationResponseOuterInfo from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -19,14 +20,18 @@ def __init__( super().__init__() self.url_id = url_id - async def run(self, session: AsyncSession) -> list[NameAnnotationSuggestion]: + async def run(self, session: AsyncSession) -> NameAnnotationResponseOuterInfo: query = ( select( - URLNameSuggestion.id.label('suggestion_id'), - URLNameSuggestion.suggestion.label('name'), + URLNameSuggestion.id.label('id'), + URLNameSuggestion.suggestion.label('display_name'), func.count( LinkUserNameSuggestion.user_id - ).label('endorsement_count'), + ).label('user_count'), + case( + (URLNameSuggestion.source == NameSuggestionSource.HTML_METADATA_TITLE, 1), + else_=0 + ).label("robo_count") ) .outerjoin( LinkUserNameSuggestion, @@ -47,12 +52,15 @@ async def run(self, session: AsyncSession) -> list[NameAnnotationSuggestion]: ) mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - return [ + suggestions = [ NameAnnotationSuggestion( **mapping ) for mapping in mappings ] + return NameAnnotationResponseOuterInfo( + suggestions=suggestions + ) diff --git a/src/api/endpoints/annotate/all/post/models/request.py b/src/api/endpoints/annotate/all/post/models/request.py index 8de222de..32228bac 100644 --- a/src/api/endpoints/annotate/all/post/models/request.py +++ b/src/api/endpoints/annotate/all/post/models/request.py @@ -1,15 +1,15 @@ -from pydantic import BaseModel, model_validator, ConfigDict +from pydantic import model_validator from src.api.endpoints.annotate.all.post.models.agency import AnnotationPostAgencyInfo from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo 
from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType from src.core.exceptions import FailedValidationException from src.db.models.impl.flag.url_validated.enums import URLType -class AllAnnotationPostInfo(BaseModel): - model_config = ConfigDict(extra='forbid') +class AllAnnotationPostInfo(RequestBase): suggested_status: URLType record_type: RecordType | None = None diff --git a/src/api/endpoints/annotate/all/post/requester.py b/src/api/endpoints/annotate/all/post/requester.py index 14064e8a..8834ff76 100644 --- a/src/api/endpoints/annotate/all/post/requester.py +++ b/src/api/endpoints/annotate/all/post/requester.py @@ -6,12 +6,12 @@ from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.templates.requester import RequesterBase @@ -53,7 +53,7 @@ def add_relevant_annotation( def add_agency_ids(self, agency_ids: list[int]) -> None: for agency_id in agency_ids: - agency_suggestion = UserUrlAgencySuggestion( + 
agency_suggestion = UserURLAgencySuggestion( url_id=self.url_id, user_id=self.user_id, agency_id=agency_id, diff --git a/src/api/endpoints/annotate/anonymous/get/helpers.py b/src/api/endpoints/annotate/anonymous/get/helpers.py new file mode 100644 index 00000000..83a10845 --- /dev/null +++ b/src/api/endpoints/annotate/anonymous/get/helpers.py @@ -0,0 +1,27 @@ +from typing import Protocol, TypeVar +from uuid import UUID + +from marshmallow.fields import Bool +from sqlalchemy import Exists, select, exists, ColumnElement, Boolean + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.mixins import AnonymousSessionMixin, URLDependentMixin +from src.db.models.templates_.base import Base + + +class AnonymousURLModelProtocol( + Protocol, +): + session_id: ColumnElement[UUID] + url_id: ColumnElement[int] + +AnonModel = TypeVar("AnonModel", bound=AnonymousURLModelProtocol) + +def not_exists_anon_annotation(session_id: UUID, anon_model: AnonModel) -> ColumnElement[bool]: + return ~exists( + select(anon_model.url_id) + .where( + anon_model.url_id == URL.id, + anon_model.session_id == session_id, + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/annotate/anonymous/get/query.py b/src/api/endpoints/annotate/anonymous/get/query.py index 7e5f2e53..041d5cda 100644 --- a/src/api/endpoints/annotate/anonymous/get/query.py +++ b/src/api/endpoints/annotate/anonymous/get/query.py @@ -1,14 +1,21 @@ from typing import Any +from uuid import UUID -from sqlalchemy import Select, func +from sqlalchemy import Select, func, exists, select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload from src.api.endpoints.annotate._shared.extract import extract_and_format_get_annotation_result from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.anonymous.get.helpers import not_exists_anon_annotation +from src.api.endpoints.annotate.anonymous.get.response 
import GetNextURLForAnonymousAnnotationResponse from src.collectors.enums import URLStatus from src.db.helpers.query import not_exists_url +from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.views.unvalidated_url import UnvalidatedURL from src.db.models.views.url_anno_count import URLAnnotationCount @@ -18,7 +25,14 @@ class GetNextURLForAnonymousAnnotationQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse: + def __init__( + self, + session_id: UUID + ): + super().__init__() + self.session_id = session_id + + async def run(self, session: AsyncSession) -> GetNextURLForAnonymousAnnotationResponse: query = ( Select(URL) @@ -37,7 +51,31 @@ async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse ) .where( URL.status == URLStatus.OK.value, - not_exists_url(AnonymousAnnotationURLType) + # Must not have been previously annotated by user + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationURLType + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationRecordType + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationLocation + ), + not_exists_anon_annotation( + session_id=self.session_id, + anon_model=AnonymousAnnotationAgency + ), + ~exists( + select( + FlagURLSuspended.url_id + ) + .where( + FlagURLSuspended.url_id == URL.id, + ) + ) ) .options( 
joinedload(URL.html_content), @@ -46,7 +84,8 @@ async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse joinedload(URL.name_suggestions), ) .order_by( - func.random() + URLAnnotationCount.total_anno_count.desc(), + URL.id.asc() ) .limit(1) ) @@ -54,8 +93,13 @@ async def run(self, session: AsyncSession) -> GetNextURLForAllAnnotationResponse raw_results = (await session.execute(query)).unique() url: URL | None = raw_results.scalars().one_or_none() if url is None: - return GetNextURLForAllAnnotationResponse( - next_annotation=None + return GetNextURLForAnonymousAnnotationResponse( + next_annotation=None, + session_id=self.session_id ) - return await extract_and_format_get_annotation_result(session, url=url) + response: GetNextURLForAllAnnotationResponse = await extract_and_format_get_annotation_result(session, url=url) + return GetNextURLForAnonymousAnnotationResponse( + session_id=self.session_id, + next_annotation=response.next_annotation + ) diff --git a/src/api/endpoints/annotate/anonymous/get/response.py b/src/api/endpoints/annotate/anonymous/get/response.py new file mode 100644 index 00000000..e54403bc --- /dev/null +++ b/src/api/endpoints/annotate/anonymous/get/response.py @@ -0,0 +1,10 @@ +from uuid import UUID + +from pydantic import BaseModel + +from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationInnerResponse + + +class GetNextURLForAnonymousAnnotationResponse(BaseModel): + next_annotation: GetNextURLForAllAnnotationInnerResponse | None + session_id: UUID \ No newline at end of file diff --git a/src/api/endpoints/annotate/anonymous/post/query.py b/src/api/endpoints/annotate/anonymous/post/query.py index faa7aa1d..593d79d9 100644 --- a/src/api/endpoints/annotate/anonymous/post/query.py +++ b/src/api/endpoints/annotate/anonymous/post/query.py @@ -1,3 +1,5 @@ +from uuid import UUID + from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.annotate.all.post.models.request import 
AllAnnotationPostInfo @@ -11,10 +13,12 @@ class AddAnonymousAnnotationsToURLQueryBuilder(QueryBuilderBase): def __init__( self, + session_id: UUID, url_id: int, post_info: AllAnnotationPostInfo ): super().__init__() + self.session_id = session_id self.url_id = url_id self.post_info = post_info @@ -22,14 +26,16 @@ async def run(self, session: AsyncSession) -> None: url_type_suggestion = AnonymousAnnotationURLType( url_id=self.url_id, - url_type=self.post_info.suggested_status + url_type=self.post_info.suggested_status, + session_id=self.session_id ) session.add(url_type_suggestion) if self.post_info.record_type is not None: record_type_suggestion = AnonymousAnnotationRecordType( url_id=self.url_id, - record_type=self.post_info.record_type + record_type=self.post_info.record_type, + session_id=self.session_id ) session.add(record_type_suggestion) @@ -37,7 +43,8 @@ async def run(self, session: AsyncSession) -> None: location_suggestions = [ AnonymousAnnotationLocation( url_id=self.url_id, - location_id=location_id + location_id=location_id, + session_id=self.session_id ) for location_id in self.post_info.location_info.location_ids ] @@ -47,7 +54,8 @@ async def run(self, session: AsyncSession) -> None: agency_suggestions = [ AnonymousAnnotationAgency( url_id=self.url_id, - agency_id=agency_id + agency_id=agency_id, + session_id=self.session_id ) for agency_id in self.post_info.agency_info.agency_ids ] diff --git a/src/api/endpoints/annotate/dtos/shared/base/response.py b/src/api/endpoints/annotate/dtos/shared/base/response.py index edcc80e1..0d3ae253 100644 --- a/src/api/endpoints/annotate/dtos/shared/base/response.py +++ b/src/api/endpoints/annotate/dtos/shared/base/response.py @@ -1,14 +1,12 @@ -from typing import Optional - from pydantic import BaseModel, Field from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.mapping 
import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class AnnotationInnerResponseInfoBase(BaseModel): - url_info: URLMapping = Field( + url_info: SimpleURLMapping = Field( title="Information about the URL" ) html_info: ResponseHTMLInfo = Field( diff --git a/src/api/endpoints/annotate/routes.py b/src/api/endpoints/annotate/routes.py index a09ee1ec..1633eb5a 100644 --- a/src/api/endpoints/annotate/routes.py +++ b/src/api/endpoints/annotate/routes.py @@ -1,3 +1,6 @@ +import uuid +from uuid import UUID + from fastapi import APIRouter, Depends, Query from src.api.dependencies import get_async_core @@ -7,8 +10,10 @@ from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo from src.api.endpoints.annotate.all.post.query import AddAllAnnotationsToURLQueryBuilder from src.api.endpoints.annotate.anonymous.get.query import GetNextURLForAnonymousAnnotationQueryBuilder +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from src.api.endpoints.annotate.anonymous.post.query import AddAnonymousAnnotationsToURLQueryBuilder from src.core.core import AsyncCore +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info @@ -33,26 +38,38 @@ @annotate_router.get("/anonymous") async def get_next_url_for_all_annotations_anonymous( async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAllAnnotationResponse: + session_id: UUID | None = Query(description="The session id of the anonymous user.", default=None) +) -> GetNextURLForAnonymousAnnotationResponse: + # If session_id is not provided, generate new UUID + if session_id is None: + session_id: uuid.UUID = await async_core.adb_client.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) + return await async_core.adb_client.run_query_builder( - 
GetNextURLForAnonymousAnnotationQueryBuilder() + GetNextURLForAnonymousAnnotationQueryBuilder(session_id=session_id) ) + @annotate_router.post("/anonymous/{url_id}") async def annotate_url_for_all_annotations_and_get_next_url_anonymous( url_id: int, all_annotation_post_info: AllAnnotationPostInfo, async_core: AsyncCore = Depends(get_async_core), -) -> GetNextURLForAllAnnotationResponse: + session_id: UUID = Query(description="The session id of the anonymous user") +) -> GetNextURLForAnonymousAnnotationResponse: await async_core.adb_client.run_query_builder( AddAnonymousAnnotationsToURLQueryBuilder( url_id=url_id, - post_info=all_annotation_post_info + post_info=all_annotation_post_info, + session_id=session_id ) ) return await async_core.adb_client.run_query_builder( - GetNextURLForAnonymousAnnotationQueryBuilder() + GetNextURLForAnonymousAnnotationQueryBuilder( + session_id=session_id + ) ) diff --git a/src/api/endpoints/batch/routes.py b/src/api/endpoints/batch/routes.py index bd7bbf61..87839fb7 100644 --- a/src/api/endpoints/batch/routes.py +++ b/src/api/endpoints/batch/routes.py @@ -1,5 +1,3 @@ -from typing import Optional - from fastapi import Path, APIRouter from fastapi.params import Query, Depends @@ -7,12 +5,11 @@ from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.collectors.enums import CollectorType from src.core.core import AsyncCore -from src.core.enums import BatchStatus from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.security.dtos.access_info 
import AccessInfo from src.security.manager import get_access_info diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/__init__.py b/src/api/endpoints/check/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/submit_approved/__init__.py rename to src/api/endpoints/check/__init__.py diff --git a/src/api/endpoints/check/routes.py b/src/api/endpoints/check/routes.py new file mode 100644 index 00000000..9ea309a7 --- /dev/null +++ b/src/api/endpoints/check/routes.py @@ -0,0 +1,21 @@ +from fastapi import APIRouter, Depends + +from src.api.dependencies import get_async_core +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from src.api.endpoints.check.unique_url.wrapper import check_unique_url_wrapper +from src.core.core import AsyncCore + +check_router = APIRouter( + prefix="/check", + tags=["Check"] +) + +@check_router.get("/unique-url") +async def check_unique_url( + url: str, + async_core: AsyncCore = Depends(get_async_core), +) -> CheckUniqueURLResponse: + return await check_unique_url_wrapper( + adb_client=async_core.adb_client, + url=url + ) diff --git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/__init__.py b/src/api/endpoints/check/unique_url/__init__.py similarity index 100% rename from tests/automated/integration/tasks/url/impl/submit_meta_urls/__init__.py rename to src/api/endpoints/check/unique_url/__init__.py diff --git a/src/api/endpoints/check/unique_url/response.py b/src/api/endpoints/check/unique_url/response.py new file mode 100644 index 00000000..f9a15ddd --- /dev/null +++ b/src/api/endpoints/check/unique_url/response.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class CheckUniqueURLResponse(BaseModel): + unique_url: bool + url_id: int | None \ No newline at end of file diff --git a/src/api/endpoints/check/unique_url/wrapper.py b/src/api/endpoints/check/unique_url/wrapper.py new file mode 100644 index 00000000..63deddf1 --- /dev/null +++ 
b/src/api/endpoints/check/unique_url/wrapper.py @@ -0,0 +1,23 @@ +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from src.db.client.async_ import AsyncDatabaseClient +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder +from src.util.models.full_url import FullURL + + +async def check_unique_url_wrapper( + adb_client: AsyncDatabaseClient, + url: str +) -> CheckUniqueURLResponse: + result: URLExistsResult = (await adb_client.run_query_builder( + URLsExistInDBQueryBuilder(full_urls=[FullURL(url)]) + ))[0] + if result.exists: + return CheckUniqueURLResponse( + unique_url=False, + url_id=result.url_id + ) + return CheckUniqueURLResponse( + unique_url=True, + url_id=None + ) diff --git a/src/api/endpoints/collector/dtos/manual_batch/post.py b/src/api/endpoints/collector/dtos/manual_batch/post.py index 6ec62579..ce00e40b 100644 --- a/src/api/endpoints/collector/dtos/manual_batch/post.py +++ b/src/api/endpoints/collector/dtos/manual_batch/post.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, Field +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType @@ -16,7 +17,7 @@ class ManualBatchInnerInputDTO(BaseModel): supplying_entity: str | None = None -class ManualBatchInputDTO(BaseModel): +class ManualBatchInputDTO(RequestBase): name: str entries: list[ManualBatchInnerInputDTO] = Field( min_length=1, diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py index 4f8956dc..5ebe0e4b 100644 --- a/src/api/endpoints/collector/manual/query.py +++ b/src/api/endpoints/collector/manual/query.py @@ -9,9 +9,11 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata 
+from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class UploadManualBatchQueryBuilder(QueryBuilderBase): @@ -43,13 +45,17 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: duplicate_urls: list[str] = [] for entry in self.dto.entries: + url_and_scheme: URLAndScheme = get_url_and_scheme(entry.url) + url = URL( - url=entry.url, + url=url_and_scheme.url.rstrip('/'), + scheme=url_and_scheme.scheme, name=entry.name, description=entry.description, collector_metadata=entry.collector_metadata, status=URLStatus.OK.value, - source=URLSource.MANUAL + source=URLSource.MANUAL, + trailing_slash=url_and_scheme.url.endswith('/'), ) @@ -78,9 +84,10 @@ async def run(self, session: AsyncSession) -> ManualBatchResponseDTO: optional_metadata = URLOptionalDataSourceMetadata( url_id=url.id, - record_formats=entry.record_formats, + record_formats=entry.record_formats or [], data_portal_type=entry.data_portal_type, supplying_entity=entry.supplying_entity, + access_types=[] ) session.add(optional_metadata) url_ids.append(url.id) diff --git a/src/api/endpoints/contributions/shared/contributions.py b/src/api/endpoints/contributions/shared/contributions.py index 477f0365..ae72fc00 100644 --- a/src/api/endpoints/contributions/shared/contributions.py +++ b/src/api/endpoints/contributions/shared/contributions.py @@ -1,6 +1,6 @@ from sqlalchemy import select, func, CTE, Column -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class ContributionsCTEContainer: diff --git a/src/api/endpoints/contributions/user/queries/agreement/agency.py 
b/src/api/endpoints/contributions/user/queries/agreement/agency.py index 96011e06..01000bf2 100644 --- a/src/api/endpoints/contributions/user/queries/agreement/agency.py +++ b/src/api/endpoints/contributions/user/queries/agreement/agency.py @@ -1,56 +1,80 @@ -from sqlalchemy import select, func, exists, and_ +from sqlalchemy import select, func, exists, and_, or_, any_, cast, Float -from src.api.endpoints.contributions.user.queries.annotated_and_validated import AnnotatedAndValidatedCTEContainer from src.api.endpoints.contributions.user.queries.templates.agreement import AgreementCTEContainer +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion -def get_agency_agreement_cte_container( - inner_cte: AnnotatedAndValidatedCTEContainer -) -> AgreementCTEContainer: +def get_agency_agreement_cte_container() -> AgreementCTEContainer: - count_cte = ( + uuas = UserURLAgencySuggestion + fuv = FlagURLValidated + lau = LinkURLAgency + # CTE 1: All validated Meta URLs/Data Sources and their agencies + validated_urls_with_agencies = ( select( - inner_cte.user_id, - func.count() + uuas.url_id, + func.array_agg(lau.agency_id).label("agency_ids"), + ) + .join(fuv, fuv.url_id == uuas.url_id) + .join(lau, lau.url_id == uuas.url_id, isouter=True) + .where( + or_( + uuas.is_new.is_(None), + uuas.is_new.is_(False) + ), + or_( + fuv.type == "meta url", + fuv.type == "data source" + ), + ) + .group_by(uuas.url_id) + .cte("validated_urls_with_agencies") + ) + + # CTE 2 + cte_2 = ( + select( + validated_urls_with_agencies.c.url_id, + validated_urls_with_agencies.c.agency_ids, + uuas.is_new, + uuas.user_id, + uuas.agency_id.label("suggested_agency_id"), + (uuas.agency_id == any_(validated_urls_with_agencies.c.agency_ids)).label( + 
"is_suggested_agency_validated" + ), ) .join( - UserUrlAgencySuggestion, - and_( - inner_cte.user_id == UserUrlAgencySuggestion.user_id, - inner_cte.url_id == UserUrlAgencySuggestion.url_id - ) + validated_urls_with_agencies, + validated_urls_with_agencies.c.url_id == uuas.url_id, + ) + .cte("final") + ) + + count_cte = ( + select( + cte_2.c.user_id, + func.count() ) .group_by( - inner_cte.user_id + cte_2.c.user_id ) - .cte("agency_count_total") + .cte("count_cte") ) agreed_cte = ( select( - inner_cte.user_id, + cte_2.c.user_id, func.count() ) - .join( - UserUrlAgencySuggestion, - and_( - inner_cte.user_id == UserUrlAgencySuggestion.user_id, - inner_cte.url_id == UserUrlAgencySuggestion.url_id - ) - ) .where( - exists() - .where( - LinkURLAgency.url_id == UserUrlAgencySuggestion.url_id, - LinkURLAgency.agency_id == UserUrlAgencySuggestion.agency_id - ) + cte_2.c.is_suggested_agency_validated.is_(True) ) .group_by( - inner_cte.user_id + cte_2.c.user_id ) - .cte("agency_count_agreed") + .cte("agreed_cte") ) return AgreementCTEContainer( diff --git a/src/api/endpoints/contributions/user/queries/agreement/url_type.py b/src/api/endpoints/contributions/user/queries/agreement/url_type.py index cf028bf1..12feb834 100644 --- a/src/api/endpoints/contributions/user/queries/agreement/url_type.py +++ b/src/api/endpoints/contributions/user/queries/agreement/url_type.py @@ -3,7 +3,7 @@ from src.api.endpoints.contributions.user.queries.annotated_and_validated import AnnotatedAndValidatedCTEContainer from src.api.endpoints.contributions.user.queries.templates.agreement import AgreementCTEContainer from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion def get_url_type_agreement_cte_container( diff --git a/src/api/endpoints/contributions/user/queries/annotated_and_validated.py 
b/src/api/endpoints/contributions/user/queries/annotated_and_validated.py index a9740328..9c7c48f6 100644 --- a/src/api/endpoints/contributions/user/queries/annotated_and_validated.py +++ b/src/api/endpoints/contributions/user/queries/annotated_and_validated.py @@ -1,7 +1,7 @@ from sqlalchemy import select, Column, CTE from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class AnnotatedAndValidatedCTEContainer: diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index 57727215..d905026c 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -1,4 +1,5 @@ from sqlalchemy import select, RowMapping +from sqlalchemy.exc import NoResultFound from sqlalchemy.ext.asyncio import AsyncSession from src.api.endpoints.contributions.shared.contributions import ContributionsCTEContainer @@ -23,7 +24,7 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: contributions_cte = ContributionsCTEContainer() record_type_agree: AgreementCTEContainer = get_record_type_agreement_cte_container(inner_cte) - agency_agree: AgreementCTEContainer = get_agency_agreement_cte_container(inner_cte) + agency_agree: AgreementCTEContainer = get_agency_agreement_cte_container() url_type_agree: AgreementCTEContainer = get_url_type_agreement_cte_container(inner_cte) query = ( @@ -33,27 +34,40 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: agency_agree.agreement.label("agency"), url_type_agree.agreement.label("url_type") ) - .join( + .outerjoin( record_type_agree.cte, contributions_cte.user_id == record_type_agree.user_id ) - .join( + .outerjoin( agency_agree.cte, contributions_cte.user_id == agency_agree.user_id ) - .join( + 
.outerjoin( url_type_agree.cte, contributions_cte.user_id == url_type_agree.user_id ) + .where( + contributions_cte.user_id == self.user_id + ) ) - mapping: RowMapping = await sh.mapping(session, query=query) + try: + mapping: RowMapping = await sh.mapping(session, query=query) + except NoResultFound: + return ContributionsUserResponse( + count_validated=0, + agreement=ContributionsUserAgreement( + record_type=0, + agency=0, + url_type=0 + ) + ) return ContributionsUserResponse( count_validated=mapping.count, agreement=ContributionsUserAgreement( - record_type=mapping.record_type, - agency=mapping.agency, - url_type=mapping.url_type + record_type=mapping.record_type or 0, + agency=mapping.agency or 0, + url_type=mapping.url_type or 0 ) ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/__init__.py b/src/api/endpoints/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/_shared/__init__.py b/src/api/endpoints/data_source/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/_shared/build.py b/src/api/endpoints/data_source/_shared/build.py new file mode 100644 index 00000000..35b65343 --- /dev/null +++ b/src/api/endpoints/data_source/_shared/build.py @@ -0,0 +1,66 @@ +from sqlalchemy import Select, select, and_ +from sqlalchemy.orm import selectinload + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType + + +def build_data_source_get_query() -> Select: + return ( + select( + URL, + URL.id, + URL.url, + + # Required Attributes + URL.name, + 
URLRecordType.record_type, + + # Optional Attributes + URL.description, + LinkBatchURL.batch_id, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_aggregation, + URLOptionalDataSourceMetadata.agency_described_not_in_database, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.submission_notes, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id + ) + .join( + FlagURLValidated, + and_( + FlagURLValidated.url_id == URL.id, + FlagURLValidated.type == URLType.DATA_SOURCE + ) + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.url_id == URL.id + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URLOptionalDataSourceMetadata.url_id == URL.id + ) + .options( + selectinload(URL.confirmed_agencies), + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/_shared/process.py b/src/api/endpoints/data_source/_shared/process.py new file mode 100644 index 00000000..252ed7c0 --- /dev/null +++ b/src/api/endpoints/data_source/_shared/process.py @@ -0,0 +1,44 @@ +from sqlalchemy import RowMapping + +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from 
src.db.models.impl.url.record_type.sqlalchemy import URLRecordType + + +def process_data_source_get_mapping( + mapping: RowMapping +) -> DataSourceGetResponse: + url: URL = mapping[URL] + + url_agency_ids: list[int] = [] + for agency in url.confirmed_agencies: + url_agency_ids.append(agency.id) + + return DataSourceGetResponse( + url_id=mapping[URL.id], + url=mapping[URL.url], + name=mapping[URL.name], + record_type=mapping[URLRecordType.record_type], + agency_ids=url_agency_ids, + description=mapping[URL.description], + batch_id=mapping[LinkBatchURL.batch_id], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_aggregation=mapping[URLOptionalDataSourceMetadata.agency_aggregation], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + agency_described_not_in_database=mapping[URLOptionalDataSourceMetadata.agency_described_not_in_database], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + submission_notes=mapping[URLOptionalDataSourceMetadata.submission_notes], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [] + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/__init__.py b/src/api/endpoints/data_source/by_id/__init__.py 
new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/__init__.py b/src/api/endpoints/data_source/by_id/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/delete/__init__.py b/src/api/endpoints/data_source/by_id/agency/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py b/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py new file mode 100644 index 00000000..f04885af --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/delete/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.delete.query import RemoveURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def delete_data_source_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + RemoveURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/agency/get/__init__.py b/src/api/endpoints/data_source/by_id/agency/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/get/wrapper.py b/src/api/endpoints/data_source/by_id/agency/get/wrapper.py new file mode 100644 index 00000000..f58d4936 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/get/wrapper.py @@ -0,0 +1,14 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.get.query import GetRelatedAgenciesQueryBuilder +from src.db.client.async_ import 
AsyncDatabaseClient + + +async def get_data_source_agencies_wrapper( + url_id: int, + adb_client: AsyncDatabaseClient +) -> AgencyGetOuterResponse: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + return await adb_client.run_query_builder( + GetRelatedAgenciesQueryBuilder(url_id=url_id) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/agency/post/__init__.py b/src/api/endpoints/data_source/by_id/agency/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/post/wrapper.py b/src/api/endpoints/data_source/by_id/agency/post/wrapper.py new file mode 100644 index 00000000..97197103 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/post/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.shared.agency.post.query import AddURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def add_data_source_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_data_source_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + AddURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/agency/shared/__init__.py b/src/api/endpoints/data_source/by_id/agency/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/agency/shared/check.py b/src/api/endpoints/data_source/by_id/agency/shared/check.py new file mode 100644 index 00000000..2ef9640c --- /dev/null +++ b/src/api/endpoints/data_source/by_id/agency/shared/check.py @@ -0,0 +1,17 @@ +from src.api.shared.check.url_type.query import CheckURLTypeQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums 
import URLType + + +async def check_is_data_source_url( + url_id: int, + adb_client: AsyncDatabaseClient +) -> None: + """ + Raises: + Bad Request if url_type is not valid or does not exist + """ + + await adb_client.run_query_builder( + CheckURLTypeQueryBuilder(url_id=url_id, url_type=URLType.DATA_SOURCE) + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/get/__init__.py b/src/api/endpoints/data_source/by_id/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/get/query.py b/src/api/endpoints/data_source/by_id/get/query.py new file mode 100644 index 00000000..8f839543 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/get/query.py @@ -0,0 +1,24 @@ +from sqlalchemy import Select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.data_source._shared.build import build_data_source_get_query +from src.api.endpoints.data_source._shared.process import process_data_source_get_mapping +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourceByIDQueryBuilder(QueryBuilderBase): + def __init__( + self, + url_id: int, + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> DataSourceGetResponse: + query: Select = build_data_source_get_query() + query = query.where(URL.id == self.url_id) + + mapping: RowMapping = await self.sh.mapping(session, query=query) + return process_data_source_get_mapping(mapping=mapping) \ No newline at end of file diff --git a/src/api/endpoints/data_source/by_id/put/__init__.py b/src/api/endpoints/data_source/by_id/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/by_id/put/query.py b/src/api/endpoints/data_source/by_id/put/query.py new file mode 100644 index 
00000000..96106395 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/put/query.py @@ -0,0 +1,123 @@ +from sqlalchemy import update, select, literal, insert +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from src.api.shared.batch.url.link import UpdateBatchURLLinkQueryBuilder +from src.api.shared.record_type.put.query import UpdateRecordTypeQueryBuilder +from src.api.shared.url.put.query import UpdateURLQueryBuilder +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateDataSourceQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + request: DataSourcePutRequest, + ): + super().__init__() + self.url_id = url_id + self.request = request + + async def run(self, session: AsyncSession) -> None: + + if self.request.record_type is not None: + await UpdateRecordTypeQueryBuilder( + url_id=self.url_id, + record_type=self.request.record_type + ).run(session) + + # Update URL if any of the URL fields are not None + if ( + self.request.description is None and + self.request.name is None and + self.request.description is None + ): + return + + # Update Batch if Batch link is none + if self.request.batch_id is not None: + await UpdateBatchURLLinkQueryBuilder( + batch_id=self.request.batch_id, + url_id=self.url_id + ).run(session) + + await UpdateURLQueryBuilder( + url_id=self.url_id, + url=self.request.url, + name=self.request.name, + description=self.request.description, + ).run( + session, + ) + if not self.request.optional_data_source_metadata_not_none(): + return + value_dict = {} + if self.request.record_formats is not None: + value_dict["record_formats"] = self.request.record_formats + if self.request.data_portal_type is not None: + value_dict["data_portal_type"] = self.request.data_portal_type + if self.request.supplying_entity is not None: + 
value_dict["supplying_entity"] = self.request.supplying_entity + if self.request.coverage_start is not None: + value_dict["coverage_start"] = self.request.coverage_start + if self.request.coverage_end is not None: + value_dict["coverage_end"] = self.request.coverage_end + if self.request.agency_supplied is not None: + value_dict["agency_supplied"] = self.request.agency_supplied + if self.request.agency_originated is not None: + value_dict["agency_originated"] = self.request.agency_originated + if self.request.agency_aggregation is not None: + value_dict["agency_aggregation"] = self.request.agency_aggregation + if self.request.agency_described_not_in_database is not None: + value_dict["agency_described_not_in_database"] = self.request.agency_described_not_in_database + if self.request.update_method is not None: + value_dict["update_method"] = self.request.update_method + if self.request.readme_url is not None: + value_dict["readme_url"] = self.request.readme_url + if self.request.originating_entity is not None: + value_dict["originating_entity"] = self.request.originating_entity + if self.request.retention_schedule is not None: + value_dict["retention_schedule"] = self.request.retention_schedule + if self.request.scraper_url is not None: + value_dict["scraper_url"] = self.request.scraper_url + if self.request.submission_notes is not None: + value_dict["submission_notes"] = self.request.submission_notes + if self.request.access_notes is not None: + value_dict["access_notes"] = self.request.access_notes + if self.request.access_types is not None: + value_dict["access_types"] = self.request.access_types + + # Check for existing metadata object + query = ( + select( + literal(True) + ) + .where( + URLOptionalDataSourceMetadata.url_id == self.url_id + ) + ) + exists = await self.sh.one_or_none(session=session, query=query) + if not exists: + insert_obj = URLOptionalDataSourceMetadata( + url_id=self.url_id, + **value_dict + ) + session.add(insert_obj) + else: + statement 
= ( + update( + URLOptionalDataSourceMetadata + ) + .where( + URLOptionalDataSourceMetadata.url_id == self.url_id + ) + .values( + value_dict + ) + ) + + await session.execute(statement) + + diff --git a/src/api/endpoints/data_source/by_id/put/request.py b/src/api/endpoints/data_source/by_id/put/request.py new file mode 100644 index 00000000..28549c28 --- /dev/null +++ b/src/api/endpoints/data_source/by_id/put/request.py @@ -0,0 +1,59 @@ +from datetime import date + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum + + +class DataSourcePutRequest(BaseModel): + + # Required Attributes + url: str | None = None + name: str | None = None + record_type: RecordType | None = None + + # Optional Attributes + batch_id: int | None = None + description: str | None = None + + # Optional data source metadata + record_formats: list[str] | None = None + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + submission_notes: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] | None = None + + def optional_data_source_metadata_not_none(self) -> bool: + return ( + self.record_formats is not None or + self.data_portal_type is not None or + self.supplying_entity is not None or + self.coverage_start is not None or + self.coverage_end is not None or + self.agency_supplied is not None or + 
self.agency_originated is not None or + self.agency_aggregation is not None or + self.agency_described_not_in_database is not None or + self.update_method is not None or + self.readme_url is not None or + self.originating_entity is not None or + self.retention_schedule is not None or + self.scraper_url is not None or + self.submission_notes is not None or + self.access_notes is not None or + self.access_types is not None + ) \ No newline at end of file diff --git a/src/api/endpoints/data_source/get/__init__.py b/src/api/endpoints/data_source/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/data_source/get/query.py b/src/api/endpoints/data_source/get/query.py new file mode 100644 index 00000000..cc167d62 --- /dev/null +++ b/src/api/endpoints/data_source/get/query.py @@ -0,0 +1,50 @@ +from datetime import date +from typing import Any, Sequence + +from sqlalchemy import select, RowMapping, and_, Select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.data_source._shared.build import build_data_source_get_query +from src.api.endpoints.data_source._shared.process import process_data_source_get_mapping +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import 
QueryBuilderBase + + +class GetDataSourcesQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> DataSourceGetOuterResponse: + query: Select = build_data_source_get_query() + query = ( + query + .limit(100) + .offset((self.page - 1) * 100) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + responses: list[DataSourceGetResponse] = [] + + for mapping in mappings: + response: DataSourceGetResponse = process_data_source_get_mapping(mapping) + responses.append(response) + + return DataSourceGetOuterResponse( + results=responses, + ) + diff --git a/src/api/endpoints/data_source/get/response.py b/src/api/endpoints/data_source/get/response.py new file mode 100644 index 00000000..b80ee9e1 --- /dev/null +++ b/src/api/endpoints/data_source/get/response.py @@ -0,0 +1,43 @@ +from datetime import date + +from pydantic import BaseModel + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum + + +class DataSourceGetResponse(BaseModel): + url_id: int + url: str + + # Required Attributes + name: str + record_type: RecordType + agency_ids: list[int] + + # Optional Attributes + batch_id: int | None + description: str | None + + # Optional data source metadata + record_formats: list[str] + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str 
| None = None + submission_notes: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] + +class DataSourceGetOuterResponse(BaseModel): + results: list[DataSourceGetResponse] diff --git a/src/api/endpoints/data_source/routes.py b/src/api/endpoints/data_source/routes.py new file mode 100644 index 00000000..25787b85 --- /dev/null +++ b/src/api/endpoints/data_source/routes.py @@ -0,0 +1,94 @@ +from fastapi import APIRouter, Depends, Query + +from src.api.dependencies import get_async_core +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.api.endpoints.data_source.by_id.agency.delete.wrapper import delete_data_source_agency_link +from src.api.endpoints.data_source.by_id.agency.get.wrapper import get_data_source_agencies_wrapper +from src.api.endpoints.data_source.by_id.agency.post.wrapper import add_data_source_agency_link +from src.api.endpoints.data_source.by_id.agency.shared.check import check_is_data_source_url +from src.api.endpoints.data_source.by_id.get.query import GetDataSourceByIDQueryBuilder +from src.api.endpoints.data_source.get.query import GetDataSourcesQueryBuilder +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse +from src.api.endpoints.data_source.by_id.put.query import UpdateDataSourceQueryBuilder +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +data_sources_router = APIRouter( + prefix="/data-sources", + tags=["Data Sources"] +) + + +@data_sources_router.get("") +async def get_data_sources( + async_core: AsyncCore = Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> DataSourceGetOuterResponse: + return await async_core.adb_client.run_query_builder( + GetDataSourcesQueryBuilder(page=page) + ) + 
+@data_sources_router.get("/{url_id}") +async def get_data_source_by_id( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> DataSourceGetResponse: + return await async_core.adb_client.run_query_builder( + GetDataSourceByIDQueryBuilder(url_id) + ) + +@data_sources_router.put("/{url_id}") +async def update_data_source( + url_id: int , + request: DataSourcePutRequest, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await check_is_data_source_url(url_id=url_id, adb_client=async_core.adb_client) + await async_core.adb_client.run_query_builder( + UpdateDataSourceQueryBuilder( + url_id=url_id, + request=request + ) + ) + return MessageResponse(message="Data source updated.") + +@data_sources_router.get("/{url_id}/agencies") +async def get_data_source_agencies( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyGetOuterResponse: + return await get_data_source_agencies_wrapper( + url_id=url_id, + adb_client=async_core.adb_client + ) + +@data_sources_router.post("/{url_id}/agencies/{agency_id}") +async def add_agency_to_data_source( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await add_data_source_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency added to data source.") + +@data_sources_router.delete("/{url_id}/agencies/{agency_id}") +async def remove_agency_from_data_source( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await delete_data_source_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency removed from data source.") + diff --git a/src/api/endpoints/locations/__init__.py b/src/api/endpoints/locations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/api/endpoints/locations/post/__init__.py b/src/api/endpoints/locations/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/locations/post/query.py b/src/api/endpoints/locations/post/query.py new file mode 100644 index 00000000..61345191 --- /dev/null +++ b/src/api/endpoints/locations/post/query.py @@ -0,0 +1,46 @@ +from typing import Any + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.db import Locality, Location +from src.db.queries.base.builder import QueryBuilderBase + + +class AddLocationQueryBuilder(QueryBuilderBase): + + def __init__( + self, + request: AddLocationRequestModel + ): + super().__init__() + self.request = request + + async def run(self, session: AsyncSession) -> AddLocationResponseModel: + locality = Locality( + name=self.request.locality_name, + county_id=self.request.county_id + ) + session.add(locality) + await session.flush() + locality_id: int = locality.id + + query = ( + select( + Location.id + ) + .where( + Location.locality_id == locality_id + ) + ) + + mapping: RowMapping = await self.sh.mapping( + session=session, + query=query + ) + + return AddLocationResponseModel( + location_id=mapping[Location.id] + ) diff --git a/src/api/endpoints/locations/post/request.py b/src/api/endpoints/locations/post/request.py new file mode 100644 index 00000000..1f8eba3d --- /dev/null +++ b/src/api/endpoints/locations/post/request.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class AddLocationRequestModel(BaseModel): + locality_name: str + county_id: int diff --git a/src/api/endpoints/locations/post/response.py b/src/api/endpoints/locations/post/response.py new file mode 100644 index 00000000..6cd6a249 --- /dev/null +++ b/src/api/endpoints/locations/post/response.py @@ -0,0 +1,5 @@ 
+from pydantic import BaseModel + + +class AddLocationResponseModel(BaseModel): + location_id: int \ No newline at end of file diff --git a/src/api/endpoints/locations/routes.py b/src/api/endpoints/locations/routes.py new file mode 100644 index 00000000..4a0ef096 --- /dev/null +++ b/src/api/endpoints/locations/routes.py @@ -0,0 +1,22 @@ +from fastapi import APIRouter, Depends + +from src.api.dependencies import get_async_core +from src.api.endpoints.locations.post.query import AddLocationQueryBuilder +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.core.core import AsyncCore + +location_url_router = APIRouter( + prefix="/locations", + tags=["Locations"], + responses={404: {"description": "Not found"}} +) + +@location_url_router.post("") +async def create_location( + request: AddLocationRequestModel, + async_core: AsyncCore = Depends(get_async_core), +) -> AddLocationResponseModel: + return await async_core.adb_client.run_query_builder( + AddLocationQueryBuilder(request) + ) diff --git a/src/api/endpoints/meta_url/__init__.py b/src/api/endpoints/meta_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/__init__.py b/src/api/endpoints/meta_url/by_id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/delete/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py b/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py new file mode 100644 index 00000000..7adf695a --- /dev/null +++ 
b/src/api/endpoints/meta_url/by_id/agencies/delete/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.shared.agency.delete.query import RemoveURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def delete_meta_url_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + RemoveURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/by_id/agencies/get/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py b/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py new file mode 100644 index 00000000..17362a88 --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/agencies/get/wrapper.py @@ -0,0 +1,14 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.shared.agency.get.query import GetRelatedAgenciesQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def get_meta_url_agencies_wrapper( + url_id: int, + adb_client: AsyncDatabaseClient +) -> AgencyGetOuterResponse: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + return await adb_client.run_query_builder( + GetRelatedAgenciesQueryBuilder(url_id=url_id) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/by_id/agencies/shared/__init__.py b/src/api/endpoints/meta_url/by_id/agencies/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/agencies/shared/check.py 
b/src/api/endpoints/meta_url/by_id/agencies/shared/check.py new file mode 100644 index 00000000..72c79601 --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/agencies/shared/check.py @@ -0,0 +1,17 @@ +from src.api.shared.check.url_type.query import CheckURLTypeQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType + + +async def check_is_meta_url( + url_id: int, + adb_client: AsyncDatabaseClient +) -> None: + """ + Raises: + Bad Request if url_type is not valid or does not exist + """ + + await adb_client.run_query_builder( + CheckURLTypeQueryBuilder(url_id=url_id, url_type=URLType.META_URL) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/by_id/post/__init__.py b/src/api/endpoints/meta_url/by_id/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/post/wrapper.py b/src/api/endpoints/meta_url/by_id/post/wrapper.py new file mode 100644 index 00000000..4153e144 --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/post/wrapper.py @@ -0,0 +1,17 @@ +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.shared.agency.post.query import AddURLAgencyLinkQueryBuilder +from src.db.client.async_ import AsyncDatabaseClient + + +async def add_meta_url_agency_link( + url_id: int, + agency_id: int, + adb_client: AsyncDatabaseClient +) -> None: + await check_is_meta_url(url_id=url_id, adb_client=adb_client) + await adb_client.run_query_builder( + AddURLAgencyLinkQueryBuilder( + url_id=url_id, + agency_id=agency_id + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/by_id/put/__init__.py b/src/api/endpoints/meta_url/by_id/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/by_id/put/query.py b/src/api/endpoints/meta_url/by_id/put/query.py new file mode 100644 index 00000000..7392375c --- /dev/null +++ 
b/src/api/endpoints/meta_url/by_id/put/query.py @@ -0,0 +1,46 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest +from src.api.shared.batch.url.link import UpdateBatchURLLinkQueryBuilder +from src.api.shared.url.put.query import UpdateURLQueryBuilder +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateMetaURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + request: UpdateMetaURLRequest + ): + super().__init__() + self.url_id = url_id + self.request = request + + async def run(self, session: AsyncSession) -> None: + + # Update Batch ID if not none + if self.request.batch_id is not None: + await UpdateBatchURLLinkQueryBuilder( + batch_id=self.request.batch_id, + url_id=self.url_id + ).run(session) + + + # Update URL if any of the URL fields are not None + if ( + self.request.url is None and + self.request.name is None and + self.request.description is None + ): + return + + await UpdateURLQueryBuilder( + url_id=self.url_id, + url=self.request.url, + name=self.request.name, + description=self.request.description, + ).run( + session, + ) + diff --git a/src/api/endpoints/meta_url/by_id/put/request.py b/src/api/endpoints/meta_url/by_id/put/request.py new file mode 100644 index 00000000..456f2b99 --- /dev/null +++ b/src/api/endpoints/meta_url/by_id/put/request.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + + +class UpdateMetaURLRequest(BaseModel): + url: str | None = None + name: str | None = None + description: str | None = None + + batch_id: int | None = None + diff --git a/src/api/endpoints/meta_url/get/__init__.py b/src/api/endpoints/meta_url/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/meta_url/get/query.py b/src/api/endpoints/meta_url/get/query.py new file mode 100644 index 00000000..30db1e05 --- /dev/null +++ b/src/api/endpoints/meta_url/get/query.py @@ -0,0 +1,82 @@ +from
typing import Sequence + +from sqlalchemy import select, and_, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.meta_url.get.response import MetaURLGetOuterResponse, MetaURLGetResponse +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetMetaURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + page: int, + ): + super().__init__() + self.page = page + + async def run(self, session: AsyncSession) -> MetaURLGetOuterResponse: + query = ( + select( + URL, + URL.id, + URL.url, + + # Required Attributes + URL.name, + + # Optional Attributes + URL.description, + LinkBatchURL.batch_id, + ) + .join( + FlagURLValidated, + and_( + FlagURLValidated.url_id == URL.id, + FlagURLValidated.type == URLType.META_URL + ) + ) + .outerjoin( + LinkBatchURL, + LinkBatchURL.url_id == URL.id + ) + .options( + selectinload(URL.confirmed_agencies), + ) + .limit(100) + .offset((self.page - 1) * 100) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings(session, query=query) + responses: list[MetaURLGetResponse] = [] + + for mapping in mappings: + url: URL = mapping[URL] + url_id: int = mapping[URL.id] + url_url: str = mapping[URL.url] + url_name: str = mapping[URL.name] + url_agency_ids: list[int] = [] + for agency in url.confirmed_agencies: + url_agency_ids.append(agency.id) + url_description: str | None = mapping[URL.description] + link_batch_url_batch_id: int | None = mapping[LinkBatchURL.batch_id] + responses.append( + MetaURLGetResponse( + url_id=url_id, + url=url_url, + name=url_name, + agency_ids=url_agency_ids, + description=url_description, + batch_id=link_batch_url_batch_id, + ) + ) + + return 
MetaURLGetOuterResponse( + results=responses, + ) \ No newline at end of file diff --git a/src/api/endpoints/meta_url/get/response.py b/src/api/endpoints/meta_url/get/response.py new file mode 100644 index 00000000..1f683a65 --- /dev/null +++ b/src/api/endpoints/meta_url/get/response.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + + +class MetaURLGetResponse(BaseModel): + url_id: int + url: str + + # Required Attributes + name: str + agency_ids: list[int] + + # Optional Attributes + batch_id: int| None + description: str | None + +class MetaURLGetOuterResponse(BaseModel): + results: list[MetaURLGetResponse] \ No newline at end of file diff --git a/src/api/endpoints/meta_url/routes.py b/src/api/endpoints/meta_url/routes.py new file mode 100644 index 00000000..82a36756 --- /dev/null +++ b/src/api/endpoints/meta_url/routes.py @@ -0,0 +1,84 @@ +from fastapi import APIRouter, Depends, Query + +from src.api.dependencies import get_async_core +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from src.api.endpoints.meta_url.by_id.agencies.delete.wrapper import delete_meta_url_agency_link +from src.api.endpoints.meta_url.by_id.agencies.get.wrapper import get_meta_url_agencies_wrapper +from src.api.endpoints.meta_url.by_id.agencies.shared.check import check_is_meta_url +from src.api.endpoints.meta_url.by_id.post.wrapper import add_meta_url_agency_link +from src.api.endpoints.meta_url.get.query import GetMetaURLQueryBuilder +from src.api.endpoints.meta_url.get.response import MetaURLGetOuterResponse +from src.api.endpoints.meta_url.by_id.put.query import UpdateMetaURLQueryBuilder +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest +from src.api.shared.models.message_response import MessageResponse +from src.core.core import AsyncCore + +meta_urls_router = APIRouter( + prefix="/meta-urls", + tags=["Meta Urls"] +) + +@meta_urls_router.get("") +async def get_meta_urls( + async_core: AsyncCore = 
Depends(get_async_core), + page: int = Query( + description="Page number", + default=1 + ), +) -> MetaURLGetOuterResponse: + return await async_core.adb_client.run_query_builder( + GetMetaURLQueryBuilder(page=page) + ) + + +@meta_urls_router.put("/{url_id}") +async def update_meta_url( + url_id: int, + request: UpdateMetaURLRequest, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await check_is_meta_url(url_id=url_id, adb_client=async_core.adb_client) + await async_core.adb_client.run_query_builder( + UpdateMetaURLQueryBuilder( + url_id=url_id, + request=request + ) + ) + return MessageResponse(message="Meta URL updated.") + + +@meta_urls_router.get("/{url_id}/agencies") +async def get_meta_url_agencies( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> AgencyGetOuterResponse: + return await get_meta_url_agencies_wrapper( + url_id=url_id, + adb_client=async_core.adb_client + ) + +@meta_urls_router.post("/{url_id}/agencies/{agency_id}") +async def add_agency_to_meta_url( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await add_meta_url_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency added to meta URL.") + +@meta_urls_router.delete("/{url_id}/agencies/{agency_id}") +async def remove_agency_from_meta_url( + url_id: int, + agency_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await delete_meta_url_agency_link( + url_id=url_id, + agency_id=agency_id, + adb_client=async_core.adb_client + ) + return MessageResponse(message="Agency removed from meta URL.") diff --git a/src/api/endpoints/metrics/batches/aggregated/query/core.py b/src/api/endpoints/metrics/batches/aggregated/query/core.py index c17f0f6d..cc6259de 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/core.py +++ 
b/src/api/endpoints/metrics/batches/aggregated/query/core.py @@ -21,7 +21,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer diff --git a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py index ee8f8065..e3fa9d14 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/submitted_/query.py @@ -8,7 +8,7 @@ from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.base.builder import QueryBuilderBase @@ -20,15 +20,15 @@ async def run(self, session: AsyncSession) -> list[ query = ( select( Batch.strategy, - func.count(URLDataSource.id).label("count") + func.count(DSAppLinkDataSource.id).label("count") ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id ) .join( - URLDataSource, - URLDataSource.url_id == LinkBatchURL.url_id + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == LinkBatchURL.url_id ) .group_by(Batch.strategy) ) diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py index 9bcc3a57..a7b9e27a 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py +++ 
b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -5,10 +5,12 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.base.builder import QueryBuilderBase @@ -23,7 +25,9 @@ async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse] .select_from(Batch) .join(LinkBatchURL) .join(URL) - .where(URL.status == URLStatus.ERROR) + .where( + exists_url(URLTaskError) + ) .group_by(Batch.strategy, URL.status) ) diff --git a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py index ed2ff44f..6c54e45b 100644 --- a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py @@ -1,10 +1,11 @@ -from sqlalchemy import select, func, CTE, Column +from sqlalchemy import select, func -from src.collectors.enums import URLStatus +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError URL_ERROR_CTE = BatchesBreakdownURLCTE( select( @@ -19,7 +20,9 @@ URL, URL.id == LinkBatchURL.url_id ) - .where(URL.status == URLStatus.ERROR) + 
.where( + exists_url(URLTaskError) + ) .group_by(Batch.id) .cte("error") ) diff --git a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py index face1891..1fd616a6 100644 --- a/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/submitted/cte_.py @@ -3,20 +3,20 @@ from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource SUBMITTED_CTE = BatchesBreakdownURLCTE( select( Batch.id, - func.count(URLDataSource.id).label("count_submitted") + func.count(DSAppLinkDataSource.id).label("count_submitted") ) .join( LinkBatchURL, LinkBatchURL.batch_id == Batch.id ) .join( - URLDataSource, - URLDataSource.url_id == LinkBatchURL.url_id + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == LinkBatchURL.url_id ) .group_by(Batch.id) .cte("submitted") diff --git a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py index 2a951b4a..e086b752 100644 --- a/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py +++ b/src/api/endpoints/metrics/urls/aggregated/query/subqueries/oldest_pending_url.py @@ -27,8 +27,9 @@ async def run( ).where( URLStatusMatView.status.not_in( [ - URLStatusViewEnum.SUBMITTED_PIPELINE_COMPLETE.value, + URLStatusViewEnum.SUBMITTED.value, URLStatusViewEnum.ACCEPTED.value, + URLStatusViewEnum.AWAITING_SUBMISSION.value, ] ) ).order_by( diff --git a/src/api/endpoints/metrics/urls/breakdown/query/core.py b/src/api/endpoints/metrics/urls/breakdown/query/core.py index e585554c..bccc7d68 100644 --- 
a/src/api/endpoints/metrics/urls/breakdown/query/core.py +++ b/src/api/endpoints/metrics/urls/breakdown/query/core.py @@ -8,9 +8,9 @@ from src.collectors.enums import URLStatus from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase @@ -27,13 +27,13 @@ async def run(self, session: AsyncSession) -> GetMetricsURLsBreakdownPendingResp case((UserURLTypeSuggestion.url_id != None, literal(True)), else_=literal(False)).label( "has_user_relevant_annotation" ), - case((UserUrlAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( + case((UserURLAgencySuggestion.url_id != None, literal(True)), else_=literal(False)).label( "has_user_agency_annotation" ), ) .outerjoin(UserRecordTypeSuggestion, URL.id == UserRecordTypeSuggestion.url_id) .outerjoin(UserURLTypeSuggestion, URL.id == UserURLTypeSuggestion.url_id) - .outerjoin(UserUrlAgencySuggestion, URL.id == UserUrlAgencySuggestion.url_id) + .outerjoin(UserURLAgencySuggestion, URL.id == UserURLAgencySuggestion.url_id) ).cte("flags") month = func.date_trunc('month', URL.created_at) diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index 15641764..b05c6c67 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -6,14 +6,12 @@ from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo from 
src.api.endpoints.review.approve.query_.util import update_if_not_none -from src.collectors.enums import URLStatus -from src.db.constants import PLACEHOLDER_AGENCY_NAME from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from src.db.queries.base.builder import QueryBuilderBase @@ -39,7 +37,7 @@ async def run(self, session: AsyncSession) -> None: # Get existing agency ids existing_agencies = url.confirmed_agencies or [] - existing_agency_ids = [agency.agency_id for agency in existing_agencies] + existing_agency_ids = [agency.id for agency in existing_agencies] new_agency_ids = self.approval_info.agency_ids or [] await self._check_for_unspecified_agency_ids(existing_agency_ids, new_agency_ids) @@ -68,9 +66,10 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( - record_formats=self.approval_info.record_formats, + record_formats=self.approval_info.record_formats or [], data_portal_type=self.approval_info.data_portal_type, - supplying_entity=self.approval_info.supplying_entity + supplying_entity=self.approval_info.supplying_entity, + access_types=[] ) else: update_if_not_none( @@ -143,7 +142,7 @@ async def _add_new_agencies(self, existing_agency_ids, new_agency_ids, session): # Check if the new agency 
exists in the database query = ( select(Agency) - .where(Agency.agency_id == new_agency_id) + .where(Agency.id == new_agency_id) ) existing_agency = await session.execute(query) existing_agency = existing_agency.scalars().first() diff --git a/src/api/endpoints/root.py b/src/api/endpoints/root.py index b42a84d3..03b05ed4 100644 --- a/src/api/endpoints/root.py +++ b/src/api/endpoints/root.py @@ -3,7 +3,7 @@ from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo -root_router = APIRouter(prefix="", tags=["root"]) +root_router = APIRouter(prefix="", tags=["Root"]) @root_router.get("/") async def root( diff --git a/src/api/endpoints/search/agency/query.py b/src/api/endpoints/search/agency/query.py index 9476e039..5e36e9a5 100644 --- a/src/api/endpoints/search/agency/query.py +++ b/src/api/endpoints/search/agency/query.py @@ -20,17 +20,19 @@ def __init__( location_id: int | None, query: str | None, jurisdiction_type: JurisdictionType | None, + page: int ): super().__init__() self.location_id = location_id self.query = query self.jurisdiction_type = jurisdiction_type + self.page = page async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: query = ( select( - Agency.agency_id, + Agency.id.label("agency_id"), Agency.name.label("agency_name"), Agency.jurisdiction_type, Agency.agency_type, @@ -40,7 +42,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: if self.location_id is None: query = query.join( LinkAgencyLocation, - LinkAgencyLocation.agency_id == Agency.agency_id + LinkAgencyLocation.agency_id == Agency.id ).join( LocationExpandedView, LocationExpandedView.id == LinkAgencyLocation.location_id @@ -49,7 +51,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: with_location_id_cte_container = WithLocationIdCTEContainer(self.location_id) query = query.join( with_location_id_cte_container.cte, - with_location_id_cte_container.agency_id == Agency.agency_id 
+ with_location_id_cte_container.agency_id == Agency.id ).join( LocationExpandedView, LocationExpandedView.id == with_location_id_cte_container.location_id @@ -68,7 +70,7 @@ async def run(self, session: AsyncSession) -> list[AgencySearchResponse]: ).desc() ) - query = query.limit(50) + query = query.limit(10).offset((self.page - 1) * 10) mappings: Sequence[RowMapping] = await sh.mappings(session, query) diff --git a/src/api/endpoints/search/routes.py b/src/api/endpoints/search/routes.py index f2abb93c..58b661e8 100644 --- a/src/api/endpoints/search/routes.py +++ b/src/api/endpoints/search/routes.py @@ -11,7 +11,7 @@ from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo -search_router = APIRouter(prefix="/search", tags=["search"]) +search_router = APIRouter(prefix="/search", tags=["Search"]) @search_router.get("/url") @@ -40,6 +40,10 @@ async def search_agency( description="The jurisdiction type to search for", default=None ), + page: int = Query( + description="The page to search for", + default=1 + ), access_info: AccessInfo = Depends(get_access_info), async_core: AsyncCore = Depends(get_async_core), ) -> list[AgencySearchResponse]: @@ -53,6 +57,7 @@ async def search_agency( SearchAgencyQueryBuilder( location_id=location_id, query=query, - jurisdiction_type=jurisdiction_type + jurisdiction_type=jurisdiction_type, + page=page ) ) \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/__init__.py b/src/api/endpoints/submit/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/models/__init__.py b/src/api/endpoints/submit/data_source/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/models/response/__init__.py b/src/api/endpoints/submit/data_source/models/response/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/api/endpoints/submit/data_source/models/response/duplicate.py b/src/api/endpoints/submit/data_source/models/response/duplicate.py new file mode 100644 index 00000000..12367372 --- /dev/null +++ b/src/api/endpoints/submit/data_source/models/response/duplicate.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel + +from src.collectors.enums import URLStatus +from src.db.models.impl.flag.url_validated.enums import URLType + + +class SubmitDataSourceURLDuplicateSubmissionResponse(BaseModel): + message: str + url_id: int + url_type: URLType | None + url_status: URLStatus \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/models/response/standard.py b/src/api/endpoints/submit/data_source/models/response/standard.py new file mode 100644 index 00000000..b2d7ba3f --- /dev/null +++ b/src/api/endpoints/submit/data_source/models/response/standard.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class SubmitDataSourceURLProposalResponse(BaseModel): + url_id: int \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/queries/__init__.py b/src/api/endpoints/submit/data_source/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/submit/data_source/queries/core.py b/src/api/endpoints/submit/data_source/queries/core.py new file mode 100644 index 00000000..1f97cd11 --- /dev/null +++ b/src/api/endpoints/submit/data_source/queries/core.py @@ -0,0 +1,153 @@ +import uuid +from typing import Any + +from sqlalchemy.exc import IntegrityError +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.collectors.enums import URLStatus +from src.core.enums import BatchStatus +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLType 
+from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.queries.base.builder import QueryBuilderBase +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder +from src.util.models.full_url import FullURL + + +class SubmitDataSourceURLProposalQueryBuilder(QueryBuilderBase): + + def __init__(self, request: DataSourceSubmissionRequest): + super().__init__() + self.request = request + + async def run( + self, + session: AsyncSession + ) -> SubmitDataSourceURLProposalResponse: + full_url = FullURL(full_url=self.request.source_url) + + # Begin by attempting to submit the full URL + url = URL( + url=full_url.id_form, + scheme=full_url.scheme, + trailing_slash=full_url.has_trailing_slash, + name=self.request.name, + description=self.request.description, + status=URLStatus.OK, + source=URLSource.MANUAL, + ) + + session.add(url) + await session.flush() + + # Standard Path + url_id: int = url.id + + # Add Batch + batch = Batch( + strategy='manual', + status=BatchStatus.READY_TO_LABEL, + parameters={} + ) + session.add(batch) + await session.flush() + batch_id: int = batch.id + + # Add Batch URL link + batch_url_link = LinkBatchURL( + 
batch_id=batch_id, + url_id=url_id + ) + session.add(batch_url_link) + + # Create single-use session id + session_id: uuid.UUID = await MakeAnonymousSessionQueryBuilder().run(session=session) + + # Add URL Type Suggestion + url_type_suggestion = AnonymousAnnotationURLType( + url_id=url_id, + url_type=URLType.DATA_SOURCE, + session_id=session_id + ) + session.add(url_type_suggestion) + + # Optionally add Record Type as suggestion + if self.request.record_type is not None: + record_type_suggestion = AnonymousAnnotationRecordType( + url_id=url_id, + record_type=self.request.record_type.value, + session_id=session_id + ) + session.add(record_type_suggestion) + + # Optionally add Agency ID suggestions + if self.request.agency_ids is not None: + agency_id_suggestions = [ + AnonymousAnnotationAgency( + url_id=url_id, + agency_id=agency_id, + session_id=session_id + ) + for agency_id in self.request.agency_ids + ] + session.add_all(agency_id_suggestions) + + # Optionally add Location ID suggestions + if self.request.location_ids is not None: + location_id_suggestions = [ + AnonymousAnnotationLocation( + url_id=url_id, + location_id=location_id, + session_id=session_id + ) + for location_id in self.request.location_ids + ] + session.add_all(location_id_suggestions) + + # Optionally add name suggestion + if self.request.name is not None: + name_suggestion = URLNameSuggestion( + url_id=url_id, + suggestion=self.request.name, + source=NameSuggestionSource.USER + ) + session.add(name_suggestion) + + # Add data source metadata + ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + coverage_start=self.request.coverage_start, + coverage_end=self.request.coverage_end, + supplying_entity=self.request.supplying_entity, + agency_supplied=self.request.agency_supplied, + agency_originated=self.request.agency_originated, + agency_aggregation=self.request.agency_aggregation, + agency_described_not_in_database=self.request.agency_described_not_in_database, + 
data_portal_type=self.request.data_portal_type, + update_method=self.request.update_method, + readme_url=self.request.readme_url, + originating_entity=self.request.originating_entity, + retention_schedule=self.request.retention_schedule, + scraper_url=self.request.scraper_url, + submission_notes=self.request.submission_notes, + access_notes=self.request.access_notes, + access_types=self.request.access_types, + record_formats=self.request.record_formats, + ) + session.add(ds_metadata) + await session.flush() + + return SubmitDataSourceURLProposalResponse( + url_id=url_id, + ) \ No newline at end of file diff --git a/src/api/endpoints/submit/data_source/queries/duplicate.py b/src/api/endpoints/submit/data_source/queries/duplicate.py new file mode 100644 index 00000000..75346cf6 --- /dev/null +++ b/src/api/endpoints/submit/data_source/queries/duplicate.py @@ -0,0 +1,58 @@ +from http import HTTPStatus + +from fastapi import HTTPException +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.api.endpoints.submit.data_source.models.response.duplicate import \ + SubmitDataSourceURLDuplicateSubmissionResponse +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetDataSourceDuplicateQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url: str + ): + super().__init__() + self.url = url + + async def run(self, session: AsyncSession) -> None: + """ + Raises: + HTTPException including details on the duplicate result. 
+ """ + + query = ( + select( + URL.id, + URL.status, + FlagURLValidated.type + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + URL.url == self.url + ) + ) + mapping: RowMapping = await self.sh.mapping( + query=query, + session=session + ) + + model = SubmitDataSourceURLDuplicateSubmissionResponse( + message="Duplicate URL found", + url_id=mapping[URL.id], + url_status=mapping[URL.status], + url_type=mapping[FlagURLValidated.type] + ) + raise HTTPException( + detail=model.model_dump(mode='json'), + status_code=HTTPStatus.CONFLICT + ) + diff --git a/src/api/endpoints/submit/data_source/request.py b/src/api/endpoints/submit/data_source/request.py new file mode 100644 index 00000000..fe541923 --- /dev/null +++ b/src/api/endpoints/submit/data_source/request.py @@ -0,0 +1,37 @@ +from datetime import date + +from src.api.shared.models.request_base import RequestBase +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum + + +class DataSourceSubmissionRequest(RequestBase): + # Required + name: str + record_type: RecordType + source_url: str + description: str | None = None + + # Optional URL DS Metadata + coverage_start: date | None = None + coverage_end: date | None = None + supplying_entity: str | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + data_portal_type: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + submission_notes: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] = [] + record_formats: list[str] = [] + + # Links to 
other entities + agency_ids: list[int] = [] + location_ids: list[int] = [] diff --git a/src/api/endpoints/submit/data_source/wrapper.py b/src/api/endpoints/submit/data_source/wrapper.py new file mode 100644 index 00000000..20e5e158 --- /dev/null +++ b/src/api/endpoints/submit/data_source/wrapper.py @@ -0,0 +1,44 @@ +from fastapi import HTTPException + +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.queries.core import SubmitDataSourceURLProposalQueryBuilder + +from src.api.endpoints.submit.data_source.queries.duplicate import GetDataSourceDuplicateQueryBuilder +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.db.client.async_ import AsyncDatabaseClient +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder +from src.util.models.full_url import FullURL +from src.util.url import is_valid_url + + +async def submit_data_source_url_proposal( + request: DataSourceSubmissionRequest, + adb_client: AsyncDatabaseClient +) -> SubmitDataSourceURLProposalResponse: + + if not is_valid_url(request.source_url): + raise HTTPException( + status_code=400, + detail="Invalid URL" + ) + + full_url = FullURL(request.source_url) + + url_exists_results: URLExistsResult = (await adb_client.run_query_builder( + URLsExistInDBQueryBuilder( + full_urls=[full_url] + ) + ))[0] + if url_exists_results.exists: + await adb_client.run_query_builder( + GetDataSourceDuplicateQueryBuilder( + url=full_url.id_form + ) + ) + + return await adb_client.run_query_builder( + SubmitDataSourceURLProposalQueryBuilder( + request=request + ) + ) \ No newline at end of file diff --git a/src/api/endpoints/submit/routes.py b/src/api/endpoints/submit/routes.py index d91d1821..2eb46c15 100644 --- a/src/api/endpoints/submit/routes.py +++ b/src/api/endpoints/submit/routes.py @@ -1,6 +1,13 @@ from 
fastapi import APIRouter, Depends from src.api.dependencies import get_async_core + +from src.api.endpoints.submit.data_source.models.response.duplicate import \ + SubmitDataSourceURLDuplicateSubmissionResponse +from src.api.endpoints.submit.data_source.models.response.standard import SubmitDataSourceURLProposalResponse +from src.api.endpoints.submit.data_source.queries.core import SubmitDataSourceURLProposalQueryBuilder +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.api.endpoints.submit.data_source.wrapper import submit_data_source_url_proposal from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse from src.api.endpoints.submit.url.queries.core import SubmitURLQueryBuilder @@ -8,9 +15,11 @@ from src.security.dtos.access_info import AccessInfo from src.security.manager import get_access_info -submit_router = APIRouter(prefix="/submit", tags=["submit"]) +submit_router = APIRouter(prefix="/submit", tags=["Submit"]) -@submit_router.post("/url") +@submit_router.post( + "/url" +) async def submit_url( request: URLSubmissionRequest, access_info: AccessInfo = Depends(get_access_info), @@ -21,4 +30,22 @@ async def submit_url( request=request, user_id=access_info.user_id ) - ) \ No newline at end of file + ) + +@submit_router.post( + "/data-source", + response_model=SubmitDataSourceURLProposalResponse, + responses={ + 409: { + "model": SubmitDataSourceURLDuplicateSubmissionResponse + } + } +) +async def submit_data_source( + request: DataSourceSubmissionRequest, + async_core: AsyncCore = Depends(get_async_core), +): + return await submit_data_source_url_proposal( + request=request, + adb_client=async_core.adb_client + ) diff --git a/src/api/endpoints/submit/url/models/request.py b/src/api/endpoints/submit/url/models/request.py index 5b52d761..4e5656b0 100644 --- a/src/api/endpoints/submit/url/models/request.py +++ 
b/src/api/endpoints/submit/url/models/request.py @@ -1,11 +1,13 @@ from pydantic import BaseModel +from src.api.shared.models.request_base import RequestBase from src.core.enums import RecordType -class URLSubmissionRequest(BaseModel): +class URLSubmissionRequest(RequestBase): url: str record_type: RecordType | None = None name: str | None = None location_id: int | None = None - agency_id: int | None = None \ No newline at end of file + agency_id: int | None = None + description: str | None = None \ No newline at end of file diff --git a/src/api/endpoints/submit/url/queries/core.py b/src/api/endpoints/submit/url/queries/core.py index 081b5456..0d2c1c84 100644 --- a/src/api/endpoints/submit/url/queries/core.py +++ b/src/api/endpoints/submit/url/queries/core.py @@ -12,14 +12,14 @@ from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.queries.base.builder import QueryBuilderBase -from src.db.utils.validate import is_valid_url -from src.util.clean import clean_url +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import clean_url, get_url_and_scheme, is_valid_url class SubmitURLQueryBuilder(QueryBuilderBase): @@ -41,11 +41,13 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: if not valid: return convert_invalid_url_to_url_response(url_original) - 
# Clean URLs + # Clean URL url_clean: str = clean_url(url_original) + url_and_scheme: URLAndScheme = get_url_and_scheme(url_clean) + # Check if duplicate - is_duplicate: bool = await DeduplicateURLQueryBuilder(url=url_clean).run(session) + is_duplicate: bool = await DeduplicateURLQueryBuilder(url=url_and_scheme.url).run(session) if is_duplicate: return convert_duplicate_urls_to_url_response( clean_url=url_clean, @@ -56,9 +58,12 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: # Add URL url_insert = URL( - url=url_clean, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, source=URLSource.MANUAL, status=URLStatus.OK, + description=self.request.description, + trailing_slash=url_and_scheme.url.endswith('/'), ) session.add(url_insert) await session.flush() @@ -108,7 +113,7 @@ async def run(self, session: AsyncSession) -> URLSubmissionResponse: # Add agency ID as suggestion if exists if self.request.agency_id is not None: - agen_sugg = UserUrlAgencySuggestion( + agen_sugg = UserURLAgencySuggestion( user_id=self.user_id, url_id=url_insert.id, agency_id=self.request.agency_id diff --git a/src/api/endpoints/url/by_id/delete/__init__.py b/src/api/endpoints/url/by_id/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/url/by_id/delete/query.py b/src/api/endpoints/url/by_id/delete/query.py new file mode 100644 index 00000000..f8eba43d --- /dev/null +++ b/src/api/endpoints/url/by_id/delete/query.py @@ -0,0 +1,79 @@ +from typing import Any + +from sqlalchemy import select, delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from 
src.db.queries.base.builder import QueryBuilderBase + + +class DeleteURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> Any: + + await self._check_for_ds_app_link_data_source(session) + await self._check_for_ds_app_link_meta_url(session) + statement = ( + delete( + URL + ).where( + URL.id == self.url_id + ) + ) + await session.execute(statement) + + async def _check_for_ds_app_link_data_source( + self, + session: AsyncSession + ) -> Any: + """ + Check if a DS App Link Data Source exists for the URL + If so, add a deletion flag + """ + query = ( + select(DSAppLinkDataSource) + .where(DSAppLinkDataSource.url_id == self.url_id) + ) + ds_app_link_data_source: DSAppLinkDataSource | None = await self.sh.one_or_none( + session=session, + query=query + ) + if ds_app_link_data_source is not None: + delete_flag = FlagDSDeleteDataSource( + ds_data_source_id=ds_app_link_data_source.ds_data_source_id + ) + session.add(delete_flag) + + async def _check_for_ds_app_link_meta_url( + self, + session: AsyncSession + ) -> Any: + """ + Check if a DS App Link Meta URL exists for the URL + If so, add a deletion flag + """ + query = ( + select(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.url_id == self.url_id) + ) + ds_app_link_meta_url: DSAppLinkMetaURL | None = await self.sh.one_or_none( + session=session, + query=query + ) + if ds_app_link_meta_url is not None: + delete_flag = FlagDSDeleteMetaURL( + ds_meta_url_id=ds_app_link_meta_url.ds_meta_url_id + ) + session.add(delete_flag) + diff --git a/src/api/endpoints/url/get/query.py b/src/api/endpoints/url/get/query.py index d7198612..6885ef64 100644 --- a/src/api/endpoints/url/get/query.py +++ b/src/api/endpoints/url/get/query.py @@ -50,7 +50,7 @@ async def run(self, session: AsyncSession) -> GetURLsResponseInfo: GetURLsResponseInnerInfo( id=result.id, batch_id=result.batch.id if result.batch is not None else None, - 
url=result.url, + url=result.full_url, status=URLStatus(result.status), collector_metadata=result.collector_metadata, updated_at=result.updated_at, diff --git a/src/api/endpoints/url/routes.py b/src/api/endpoints/url/routes.py index c7bb59b0..7d184e6e 100644 --- a/src/api/endpoints/url/routes.py +++ b/src/api/endpoints/url/routes.py @@ -1,8 +1,10 @@ from fastapi import APIRouter, Query, Depends, Response from src.api.dependencies import get_async_core +from src.api.endpoints.url.by_id.delete.query import DeleteURLQueryBuilder from src.api.endpoints.url.by_id.screenshot.wrapper import get_url_screenshot_wrapper from src.api.endpoints.url.get.dto import GetURLsResponseInfo +from src.api.shared.models.message_response import MessageResponse from src.core.core import AsyncCore from src.security.manager import get_access_info from src.security.dtos.access_info import AccessInfo @@ -43,3 +45,13 @@ async def get_url_screenshot( content=raw_result, media_type="image/webp" ) + +@url_router.delete("/{url_id}") +async def delete_url( + url_id: int, + async_core: AsyncCore = Depends(get_async_core), +) -> MessageResponse: + await async_core.adb_client.run_query_builder( + DeleteURLQueryBuilder(url_id=url_id) + ) + return MessageResponse(message="URL deleted.") diff --git a/src/api/main.py b/src/api/main.py index 2d31dc1f..87fa0d3a 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -4,13 +4,20 @@ import uvicorn from discord_poster import DiscordPoster from fastapi import FastAPI -from pdap_access_manager import AccessManager +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.models.auth import AuthInfo +from starlette.middleware.cors import CORSMiddleware from starlette.responses import RedirectResponse +from src.api.endpoints.agencies.routes import agencies_router from src.api.endpoints.annotate.routes import annotate_router from src.api.endpoints.batch.routes import batch_router +from src.api.endpoints.check.routes import 
check_router from src.api.endpoints.collector.routes import collector_router from src.api.endpoints.contributions.routes import contributions_router +from src.api.endpoints.data_source.routes import data_sources_router +from src.api.endpoints.locations.routes import location_url_router +from src.api.endpoints.meta_url.routes import meta_urls_router from src.api.endpoints.metrics.routes import metrics_router from src.api.endpoints.root import root_router from src.api.endpoints.search.routes import search_router @@ -49,12 +56,9 @@ async def lifespan(app: FastAPI): env.read_env() # Initialize shared dependencies - db_client = DatabaseClient( - db_url=env_var_manager.get_postgres_connection_string() - ) - adb_client = AsyncDatabaseClient( - db_url=env_var_manager.get_postgres_connection_string(is_async=True) - ) + + db_client = DatabaseClient() + adb_client = AsyncDatabaseClient() await setup_database(db_client) core_logger = AsyncCoreLogger(adb_client=adb_client) @@ -72,10 +76,12 @@ async def lifespan(app: FastAPI): discord_poster=discord_poster ) pdap_client = PDAPClient( - access_manager=AccessManager( + access_manager=AccessManagerAsync( data_sources_url=env_var_manager.pdap_api_url, - email=env_var_manager.pdap_email, - password=env_var_manager.pdap_password, + auth=AuthInfo( + email=env_var_manager.pdap_email, + password=env_var_manager.pdap_password, + ), api_key=env_var_manager.pdap_api_key, session=session ) @@ -161,6 +167,17 @@ async def setup_database(db_client): version="0.1.0", lifespan=lifespan ) +app.add_middleware( + CORSMiddleware, + allow_origins=[ + "http://localhost:8888", # For local development + "https://pdap.io", + "https://pdap.dev" + ], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) @app.get("/docs", include_in_schema=False) async def redirect_docs(): @@ -177,7 +194,12 @@ async def redirect_docs(): search_router, metrics_router, submit_router, - contributions_router + contributions_router, + agencies_router, + 
data_sources_router, + meta_urls_router, + check_router, + location_url_router ] for router in routers: diff --git a/src/api/shared/__init__.py b/src/api/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/README.md b/src/api/shared/agency/README.md new file mode 100644 index 00000000..6afa1917 --- /dev/null +++ b/src/api/shared/agency/README.md @@ -0,0 +1 @@ +Logic for adding, removing and getting agencies by URL id \ No newline at end of file diff --git a/src/api/shared/agency/__init__.py b/src/api/shared/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/delete/__init__.py b/src/api/shared/agency/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/delete/query.py b/src/api/shared/agency/delete/query.py new file mode 100644 index 00000000..ca291a6f --- /dev/null +++ b/src/api/shared/agency/delete/query.py @@ -0,0 +1,29 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class RemoveURLAgencyLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + agency_id: int + ): + super().__init__() + self.url_id = url_id + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == self.url_id, + LinkURLAgency.agency_id == self.agency_id + ) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/api/shared/agency/get/__init__.py b/src/api/shared/agency/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/get/query.py b/src/api/shared/agency/get/query.py new file mode 100644 index 00000000..eccb3581 --- /dev/null +++ b/src/api/shared/agency/get/query.py @@ -0,0 +1,62 @@ 
+from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.api.endpoints.agencies.by_id.locations.get.response import AgencyGetLocationsResponse +from src.api.endpoints.agencies.root.get.response import AgencyGetResponse, AgencyGetOuterResponse +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class GetRelatedAgenciesQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int + ): + super().__init__() + self.url_id = url_id + + async def run(self, session: AsyncSession) -> AgencyGetOuterResponse: + query = ( + select( + Agency, + ) + .options( + selectinload(Agency.locations) + ) + .join( + LinkURLAgency, + LinkURLAgency.agency_id == Agency.id + ) + .where( + LinkURLAgency.url_id == self.url_id + ) + ) + + results: Sequence[RowMapping] = await self.sh.mappings( + session, + query=query + ) + responses: list[AgencyGetResponse] = [] + for result in results: + agency: Agency = result[Agency] + locations: list[AgencyGetLocationsResponse] = [ + AgencyGetLocationsResponse( + location_id=location.id, + full_display_name=location.full_display_name, + ) + for location in agency.locations + ] + responses.append(AgencyGetResponse( + id=agency.id, + name=agency.name, + type=agency.agency_type, + jurisdiction_type=agency.jurisdiction_type, + locations=locations, + )) + + return AgencyGetOuterResponse(results=responses) diff --git a/src/api/shared/agency/get/response.py b/src/api/shared/agency/get/response.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/post/__init__.py b/src/api/shared/agency/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/agency/post/query.py b/src/api/shared/agency/post/query.py new file mode 100644 index 
00000000..045d1c84 --- /dev/null +++ b/src/api/shared/agency/post/query.py @@ -0,0 +1,32 @@ +from fastapi import HTTPException +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class AddURLAgencyLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + agency_id: int + ): + super().__init__() + self.url_id = url_id + self.agency_id = agency_id + + async def run(self, session: AsyncSession) -> None: + link = LinkURLAgency( + url_id=self.url_id, + agency_id=self.agency_id + ) + session.add(link) + try: + await session.commit() + except Exception as e: + await session.rollback() + raise HTTPException( + status_code=500, + detail=f"Failed to add URL agency link: {e}" + ) \ No newline at end of file diff --git a/src/api/shared/batch/__init__.py b/src/api/shared/batch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/batch/url/__init__.py b/src/api/shared/batch/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/batch/url/link.py b/src/api/shared/batch/url/link.py new file mode 100644 index 00000000..2ea22525 --- /dev/null +++ b/src/api/shared/batch/url/link.py @@ -0,0 +1,36 @@ +from fastapi import HTTPException +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateBatchURLLinkQueryBuilder(QueryBuilderBase): + + def __init__( + self, + batch_id: int, + url_id: int + ): + super().__init__() + self.batch_id = batch_id + self.url_id = url_id + + async def run(self, session: AsyncSession) -> None: + + # Delete existing link if it exists + statement = ( + delete(LinkBatchURL) + .where( + LinkBatchURL.url_id==self.url_id + ) + ) + await session.execute(statement) + + # Add new 
link + link = LinkBatchURL( + batch_id=self.batch_id, + url_id=self.url_id + ) + session.add(link) diff --git a/src/api/shared/check/__init__.py b/src/api/shared/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/check/url_type/__init__.py b/src/api/shared/check/url_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/check/url_type/query.py b/src/api/shared/check/url_type/query.py new file mode 100644 index 00000000..be6287c2 --- /dev/null +++ b/src/api/shared/check/url_type/query.py @@ -0,0 +1,58 @@ +from fastapi import HTTPException +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class CheckURLTypeQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + url_type: URLType + ): + super().__init__() + self.url_id = url_id + self.url_type = url_type + + async def run(self, session: AsyncSession) -> None: + """ + Raises: + Bad Request if url_type is not valid or does not exist + """ + + query = ( + select( + URL.id, + FlagURLValidated.type + ) + .outerjoin( + FlagURLValidated, + FlagURLValidated.url_id == URL.id + ) + .where( + URL.id == self.url_id, + ) + ) + + result: RowMapping | None = await self.sh.mapping(session, query=query) + if result is None: + raise HTTPException( + status_code=404, + detail="URL not found" + ) + url_type: URLType | None = result.get("type") + if url_type is None: + raise HTTPException( + status_code=400, + detail="URL is not validated" + ) + if url_type != self.url_type: + raise HTTPException( + status_code=400, + detail="URL type does not match expected URL type" + ) \ No newline at end of file diff --git 
a/src/api/shared/models/__init__.py b/src/api/shared/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/endpoints/batch/dtos/post/abort.py b/src/api/shared/models/message_response.py similarity index 100% rename from src/api/endpoints/batch/dtos/post/abort.py rename to src/api/shared/models/message_response.py diff --git a/src/api/shared/models/request_base.py b/src/api/shared/models/request_base.py new file mode 100644 index 00000000..816cc226 --- /dev/null +++ b/src/api/shared/models/request_base.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel, ConfigDict + + +class RequestBase(BaseModel): + model_config = ConfigDict( + extra="forbid", + frozen=True + ) \ No newline at end of file diff --git a/src/api/shared/record_type/__init__.py b/src/api/shared/record_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/record_type/put/__init__.py b/src/api/shared/record_type/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/record_type/put/query.py b/src/api/shared/record_type/put/query.py new file mode 100644 index 00000000..f4cbae5c --- /dev/null +++ b/src/api/shared/record_type/put/query.py @@ -0,0 +1,32 @@ +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateRecordTypeQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + record_type: RecordType + ): + super().__init__() + self.url_id = url_id + self.record_type = record_type + + async def run(self, session: AsyncSession) -> None: + statement = ( + update( + URLRecordType + ) + .where( + URLRecordType.url_id == self.url_id + ) + .values( + record_type=self.record_type + ) + ) + await session.execute(statement) \ No newline at end of file diff --git 
a/src/api/shared/url/__init__.py b/src/api/shared/url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/url/put/__init__.py b/src/api/shared/url/put/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/api/shared/url/put/query.py b/src/api/shared/url/put/query.py new file mode 100644 index 00000000..a47a382c --- /dev/null +++ b/src/api/shared/url/put/query.py @@ -0,0 +1,50 @@ +from typing import Any + +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL + + +class UpdateURLQueryBuilder(QueryBuilderBase): + + def __init__( + self, + url_id: int, + url: str | None, + name: str | None, + description: str | None + ): + super().__init__() + self.url_id = url_id + self.url = url + self.name = name + self.description = description + + async def run(self, session: AsyncSession) -> Any: + values_dict = {} + if self.url is not None: + full_url = FullURL(self.url) + values_dict["url"] = full_url.id_form + values_dict["scheme"] = full_url.scheme + values_dict["trailing_slash"] = full_url.has_trailing_slash + if self.name is not None: + values_dict["name"] = self.name + if self.description is not None: + values_dict["description"] = self.description + + query = ( + update( + URL + ) + .where( + URL.id == self.url_id + ) + .values( + values_dict + ) + ) + + await session.execute(query) \ No newline at end of file diff --git a/src/collectors/enums.py b/src/collectors/enums.py index f40e5f19..16711a0c 100644 --- a/src/collectors/enums.py +++ b/src/collectors/enums.py @@ -14,3 +14,4 @@ class URLStatus(Enum): OK = "ok" ERROR = "error" DUPLICATE = "duplicate" + BROKEN = "broken" diff --git a/src/collectors/impl/example/core.py b/src/collectors/impl/example/core.py index 4bccf242..be0b8e07 100644 --- a/src/collectors/impl/example/core.py +++ 
b/src/collectors/impl/example/core.py @@ -24,7 +24,7 @@ async def run_implementation(self) -> None: await self.sleep() self.data = ExampleOutputDTO( message=f"Data collected by {self.batch_id}", - urls=["https://example.com", "https://example.com/2"], + urls=["example.com", "example.com/2"], parameters=self.dto.model_dump(), ) diff --git a/src/collectors/queries/insert/url.py b/src/collectors/queries/insert/url.py index af72a3aa..60f39a2c 100644 --- a/src/collectors/queries/insert/url.py +++ b/src/collectors/queries/insert/url.py @@ -4,6 +4,8 @@ from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class InsertURLQueryBuilder(QueryBuilderBase): @@ -15,11 +17,14 @@ def __init__(self, url_info: URLInfo): async def run(self, session: AsyncSession) -> int: """Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(self.url_info.url) url_entry = URL( - url=self.url_info.url, + url=url_and_scheme.url.rstrip('/'), + scheme=url_and_scheme.scheme, collector_metadata=self.url_info.collector_metadata, status=self.url_info.status.value, - source=self.url_info.source + source=self.url_info.source, + trailing_slash=url_and_scheme.url.endswith('/'), ) if self.url_info.created_at is not None: url_entry.created_at = self.url_info.created_at diff --git a/src/collectors/queries/insert/urls/query.py b/src/collectors/queries/insert/urls/query.py index 75176158..77f3fe1b 100644 --- a/src/collectors/queries/insert/urls/query.py +++ b/src/collectors/queries/insert/urls/query.py @@ -2,9 +2,9 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.collectors.queries.insert.urls.request_manager import InsertURLsRequestManager -from src.util.clean import clean_url +from src.util.url import clean_url from src.db.dtos.url.insert import 
InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.queries.base.builder import QueryBuilderBase @@ -32,7 +32,7 @@ async def run(self, session: AsyncSession) -> InsertURLsInfo: async with session.begin_nested() as sp: url_id = await rm.insert_url(url_info) url_mappings.append( - URLMapping( + SimpleURLMapping( url_id=url_id, url=url_info.url ) diff --git a/src/core/core.py b/src/core/core.py index 7d4ac083..ad2f20d5 100644 --- a/src/core/core.py +++ b/src/core/core.py @@ -1,16 +1,12 @@ from http import HTTPStatus -from typing import Optional from fastapi import HTTPException from pydantic import BaseModel -from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse -from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo -from src.api.endpoints.annotate.all.post.query import AddAllAnnotationsToURLQueryBuilder from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.api.endpoints.collector.dtos.collector_start import CollectorStartInfo @@ -23,9 +19,6 @@ from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.api.endpoints.metrics.dtos.get.urls.breakdown.pending import GetMetricsURLsBreakdownPendingResponseDTO from 
src.api.endpoints.metrics.dtos.get.urls.breakdown.submitted import GetMetricsURLsBreakdownSubmittedResponseDTO -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from src.api.endpoints.review.enums import RejectionReason -from src.api.endpoints.review.next.dto import GetNextURLForFinalReviewOuterResponse from src.api.endpoints.search.dtos.response import SearchURLResponse from src.api.endpoints.task.by_id.dto import TaskInfo from src.api.endpoints.task.dtos.get.task_status import GetTaskStatusResponseInfo @@ -39,7 +32,6 @@ from src.db.enums import TaskType from src.db.models.impl.batch.pydantic.info import BatchInfo from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum -from src.security.dtos.access_info import AccessInfo class AsyncCore: diff --git a/src/core/tasks/base/operator.py b/src/core/tasks/base/operator.py index 51f07a47..ff5ec4e5 100644 --- a/src/core/tasks/base/operator.py +++ b/src/core/tasks/base/operator.py @@ -1,5 +1,6 @@ import traceback from abc import ABC, abstractmethod +from typing import Any from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo @@ -7,8 +8,10 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.enums import TaskType from src.db.models.impl.task.enums import TaskStatus +from src.db.models.impl.task.log import TaskLog from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall +from src.db.queries.base.builder import QueryBuilderBase class TaskOperatorBase(ABC): @@ -53,9 +56,17 @@ async def run_task(self) -> TaskOperatorRunInfo: message=str(e) + "\n" + stack_trace ) - @abstractmethod - async def run_info(self, outcome: TaskOperatorOutcome, message: str) -> TaskOperatorRunInfo: - raise NotImplementedError + async def run_info( + self, + outcome: TaskOperatorOutcome, + message: str + ) -> TaskOperatorRunInfo: + return 
TaskOperatorRunInfo( + task_id=self.task_id, + task_type=self.task_type, + outcome=outcome, + message=message + ) @abstractmethod @@ -82,4 +93,18 @@ async def add_task_errors( ) for error in errors ] - await self.adb_client.bulk_insert(inserts) \ No newline at end of file + await self.adb_client.bulk_insert(inserts) + + async def add_task_log( + self, + log: str + ) -> None: + task_log = TaskLog( + task_id=self.task_id, + log=log + ) + await self.adb_client.add(task_log) + + # Convenience forwarder functions + async def run_query_builder(self, query_builder: QueryBuilderBase) -> Any: + return await self.adb_client.run_query_builder(query_builder) \ No newline at end of file diff --git a/src/core/tasks/handler.py b/src/core/tasks/handler.py index 92b96103..7ed0d230 100644 --- a/src/core/tasks/handler.py +++ b/src/core/tasks/handler.py @@ -2,7 +2,6 @@ from discord_poster import DiscordPoster -from src.core.enums import BatchStatus from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.enums import TaskOperatorOutcome from src.db.client.async_ import AsyncDatabaseClient diff --git a/src/core/tasks/scheduled/impl/huggingface/operator.py b/src/core/tasks/scheduled/impl/huggingface/operator.py index 9bb7a85e..f644ff94 100644 --- a/src/core/tasks/scheduled/impl/huggingface/operator.py +++ b/src/core/tasks/scheduled/impl/huggingface/operator.py @@ -1,7 +1,7 @@ from itertools import count from src.core.tasks.mixins.prereq import HasPrerequisitesMixin -from src.core.tasks.scheduled.impl.huggingface.queries.check.core import CheckValidURLsUpdatedQueryBuilder +from src.core.tasks.scheduled.impl.huggingface.queries.prereq.core import CheckValidURLsUpdatedQueryBuilder from src.core.tasks.scheduled.impl.huggingface.queries.get.core import GetForLoadingToHuggingFaceQueryBuilder from src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput from src.core.tasks.scheduled.templates.operator import 
ScheduledTaskOperatorBase diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/cte.py b/src/core/tasks/scheduled/impl/huggingface/queries/cte.py new file mode 100644 index 00000000..8ea75b0c --- /dev/null +++ b/src/core/tasks/scheduled/impl/huggingface/queries/cte.py @@ -0,0 +1,38 @@ +from datetime import datetime + +from sqlalchemy import select, Column + +from src.db.enums import TaskType +from src.db.helpers.query import exists_url, no_url_task_error, not_exists_url +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.materialized_views.html_duplicate_url import HTMLDuplicateURLMaterializedView + + +class HuggingfacePrereqCTEContainer: + + def __init__(self): + self.cte = ( + select( + URL.id, + URL.updated_at + ) + .join( + URLCompressedHTML, + URL.id == URLCompressedHTML.url_id + ) + .where( + exists_url(FlagURLValidated), + not_exists_url(HTMLDuplicateURLMaterializedView), + no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE) + ) + ) + + @property + def url_id(self) -> Column[int]: + return self.cte.c.id + + @property + def updated_at(self) -> Column[datetime]: + return self.cte.c.updated_at \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py index 5b6bd08d..10986a05 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/get/core.py @@ -1,6 +1,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer from src.core.tasks.scheduled.impl.huggingface.queries.get.convert import convert_fine_to_coarse_record_type, \ convert_validated_type_to_relevant from 
src.core.tasks.scheduled.impl.huggingface.queries.get.model import GetForLoadingToHuggingFaceOutput @@ -23,21 +24,26 @@ def __init__(self, page: int): async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOutput]: - label_url_id = 'url_id' label_url = 'url' label_record_type_fine = 'record_type_fine' label_html = 'html' label_type = 'type' + cte = HuggingfacePrereqCTEContainer() + query = ( select( - URL.id.label(label_url_id), - URL.url.label(label_url), + cte.url_id, + URL.full_url.label(label_url), URLRecordType.record_type.label(label_record_type_fine), URLCompressedHTML.compressed_html.label(label_html), FlagURLValidated.type.label(label_type) ) + .join( + URL, + cte.url_id == URL.id + ) .join( URLRecordType, URL.id == URLRecordType.url_id @@ -65,7 +71,7 @@ async def run(self, session: AsyncSession) -> list[GetForLoadingToHuggingFaceOut final_results = [] for result in db_results: output = GetForLoadingToHuggingFaceOutput( - url_id=result[label_url_id], + url_id=result[cte.url_id], url=result[label_url], relevant=convert_validated_type_to_relevant( URLType(result[label_type]) diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/prereq/__init__.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/core.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py similarity index 78% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/core.py rename to src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py index c76fa2e1..fdf82ba9 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/core.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/core.py @@ -1,6 +1,6 @@ from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.scheduled.impl.huggingface.queries.check.requester import CheckValidURLsUpdatedRequester +from 
src.core.tasks.scheduled.impl.huggingface.queries.prereq.requester import CheckValidURLsUpdatedRequester from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py similarity index 75% rename from src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py rename to src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py index ef43bd3d..1eaa306d 100644 --- a/src/core/tasks/scheduled/impl/huggingface/queries/check/requester.py +++ b/src/core/tasks/scheduled/impl/huggingface/queries/prereq/requester.py @@ -6,6 +6,7 @@ from sqlalchemy.sql.functions import count from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.impl.huggingface.queries.cte import HuggingfacePrereqCTEContainer from src.db.enums import TaskType from src.db.helpers.query import not_exists_url, no_url_task_error, exists_url from src.db.helpers.session import session_helper as sh @@ -32,21 +33,17 @@ async def latest_upload(self) -> datetime: ) async def has_valid_urls(self, last_upload_at: datetime | None) -> bool: + cte = HuggingfacePrereqCTEContainer() query = ( - select(count(URL.id)) - .join( - URLCompressedHTML, - URL.id == URLCompressedHTML.url_id - ) - .where( - exists_url(FlagURLValidated), - no_url_task_error(TaskType.PUSH_TO_HUGGINGFACE) + select( + cte.url_id ) ) if last_upload_at is not None: - query = query.where(URL.updated_at > last_upload_at) - url_count = await sh.scalar( + query = query.where(cte.updated_at > last_upload_at) + query = query.limit(1) + result = await sh.one_or_none( session=self.session, query=query ) - return url_count > 0 + return result is not None \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/__init__.py b/src/core/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/core/tasks/scheduled/impl/integrity/exceptions.py b/src/core/tasks/scheduled/impl/integrity/exceptions.py new file mode 100644 index 00000000..3e9f797e --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/exceptions.py @@ -0,0 +1,4 @@ + + +class IntegrityMonitorTaskException(Exception): + pass \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/operator.py b/src/core/tasks/scheduled/impl/integrity/operator.py new file mode 100644 index 00000000..42ca43bb --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/operator.py @@ -0,0 +1,30 @@ +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.impl.integrity.exceptions import IntegrityMonitorTaskException +from src.core.tasks.scheduled.impl.integrity.queries.get import GetIntegrityTaskDataQueryBuilder +from src.core.tasks.scheduled.impl.integrity.queries.prereq import GetIntegrityTaskPrerequisitesQueryBuilder +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType + + +class IntegrityMonitorTaskOperator( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin +): + + @property + def task_type(self) -> TaskType: + return TaskType.INTEGRITY_MONITOR + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + query_builder=GetIntegrityTaskPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + failing_views: list[str] = await self.run_query_builder( + query_builder=GetIntegrityTaskDataQueryBuilder() + ) + raise IntegrityMonitorTaskException( + f"Integrity Monitor Task failed for the following views {failing_views}", + ) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/__init__.py b/src/core/tasks/scheduled/impl/integrity/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/integrity/queries/cte.py 
b/src/core/tasks/scheduled/impl/integrity/queries/cte.py new file mode 100644 index 00000000..dc894ea7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/cte.py @@ -0,0 +1,61 @@ +from sqlalchemy import select, literal, Exists, Label, or_ + +from src.db.models.templates_.base import Base +from src.db.models.views.integrity.incomplete_data_sources import IntegrityIncompleteDataSource +from src.db.models.views.integrity.incomplete_meta_urls import IntegrityIncompleteMetaURL +from src.db.models.views.integrity.non_federal_agencies_no_location import IntegrityNonFederalAgenciesNoLocation +from src.db.models.views.integrity.url_both_data_source_and_meta_url import IntegrityURLBothDataSourceAndMetaURL + + +def any_row_exists( + model: type[Base] +) -> Exists: + return ( + select( + literal(1) + ) + .select_from( + model + ) + .exists() + ) + +class IntegrityTaskCTEContainer: + + def __init__( + self, + ): + self.models: list[type[Base]] = [ + IntegrityURLBothDataSourceAndMetaURL, + IntegrityNonFederalAgenciesNoLocation, + IntegrityIncompleteMetaURL, + IntegrityIncompleteDataSource, + ] + + expressions: list[Label[bool]] = [ + any_row_exists(model) + .label(model.__tablename__) + for model in self.models + ] + + self.cte = ( + select( + *expressions + ) + .cte( + name="integrity_task_cte", + ) + ) + + @property + def any_rows_exist_query(self) -> select: + expression = [ + getattr(self.cte.c, model.__tablename__) + for model in self.models + ] + return select(or_(*expression)) + + @property + def select_all_columns_query(self) -> select: + return select(self.cte) + diff --git a/src/core/tasks/scheduled/impl/integrity/queries/get.py b/src/core/tasks/scheduled/impl/integrity/queries/get.py new file mode 100644 index 00000000..b8632fa2 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/get.py @@ -0,0 +1,20 @@ +from sqlalchemy import RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskDataQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[str]: + cte = IntegrityTaskCTEContainer() + mapping: RowMapping = await self.sh.mapping( + session=session, + query=cte.select_all_columns_query + ) + return [ + model.__tablename__ + for model in cte.models + if mapping[model.__tablename__] + ] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/integrity/queries/prereq.py b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py new file mode 100644 index 00000000..12a6fa33 --- /dev/null +++ b/src/core/tasks/scheduled/impl/integrity/queries/prereq.py @@ -0,0 +1,16 @@ +from typing import Any + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.integrity.queries.cte import IntegrityTaskCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class GetIntegrityTaskPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> Any: + cte = IntegrityTaskCTEContainer() + return await self.sh.scalar( + session=session, + query=cte.any_rows_exist_query + ) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py index efd5e45c..4d4be86d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/convert.py @@ -1,10 +1,10 @@ from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def convert_ia_url_mapping_to_ia_metadata( - url_mapper: URLMapper, + url_mapper: 
SimpleURLMapper, ia_mapping: InternetArchivesURLMapping ) -> URLInternetArchiveMetadataPydantic: iam = ia_mapping.ia_metadata diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py index f4773417..4c58df00 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/operator.py @@ -12,7 +12,7 @@ CheckURLInternetArchivesTaskPrerequisitesQueryBuilder from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.checked_for_ia.pydantic import FlagURLCheckedForInternetArchivesPydantic from src.db.models.impl.url.internet_archives.probe.pydantic import URLInternetArchiveMetadataPydantic @@ -20,7 +20,7 @@ from src.external.internet_archives.client import InternetArchivesClient from src.external.internet_archives.models.ia_url_mapping import InternetArchivesURLMapping from src.util.progress_bar import get_progress_bar_disabled -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper class InternetArchivesProbeTaskOperator( @@ -51,10 +51,10 @@ async def inner_task_logic(self) -> None: DeleteOldUnsuccessfulIACheckedFlagsQueryBuilder() ) - url_mappings: list[URLMapping] = await self._get_url_mappings() + url_mappings: list[SimpleURLMapping] = await self._get_url_mappings() if len(url_mappings) == 0: return - mapper = URLMapper(url_mappings) + mapper = SimpleURLMapper(url_mappings) await self.link_urls_to_task(mapper.get_all_ids()) @@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None: await self._add_errors_to_db(mapper, ia_mappings=subsets.error) await self._add_ia_metadata_to_db(mapper, 
ia_mappings=subsets.has_metadata) - async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + async def _add_errors_to_db(self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: url_error_info_list: list[URLTaskErrorSmall] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) @@ -76,7 +76,7 @@ async def _add_errors_to_db(self, mapper: URLMapper, ia_mappings: list[InternetA url_error_info_list.append(url_error_info) await self.add_task_errors(url_error_info_list) - async def _get_url_mappings(self) -> list[URLMapping]: + async def _get_url_mappings(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForInternetArchivesTaskQueryBuilder() ) @@ -93,7 +93,7 @@ async def _search_for_internet_archive_links(self, urls: list[str]) -> list[Inte async def _add_ia_metadata_to_db( self, - url_mapper: URLMapper, + url_mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping], ) -> None: insert_objects: list[URLInternetArchiveMetadataPydantic] = [ @@ -106,7 +106,7 @@ async def _add_ia_metadata_to_db( await self.adb_client.bulk_insert(insert_objects) async def _add_ia_flags_to_db( - self, mapper: URLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: + self, mapper: SimpleURLMapper, ia_mappings: list[InternetArchivesURLMapping]) -> None: flags: list[FlagURLCheckedForInternetArchivesPydantic] = [] for ia_mapping in ia_mappings: url_id = mapper.get_id(ia_mapping.url) diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py index 7de8b290..e6886134 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/cte.py @@ -12,7 +12,7 @@ def __init__(self): self._cte = ( select( URL.id.label("url_id"), - URL.url + URL.full_url.label("url") ) 
.where( or_( diff --git a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py index 3306943a..a806b691 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py +++ b/src/core/tasks/scheduled/impl/internet_archives/probe/queries/get.py @@ -1,18 +1,15 @@ -from sqlalchemy import select, or_, exists, text, func +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.scheduled.impl.internet_archives.probe.queries.cte import CheckURLInternetArchivesCTEContainer -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.query import not_exists_url -from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives -from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForInternetArchivesTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = CheckURLInternetArchivesCTEContainer() query = ( select( @@ -24,7 +21,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + SimpleURLMapping( url_id=mapping["url_id"], url=mapping["url"] ) for mapping in db_mappings diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py b/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py index 1d20b1c2..09a708bc 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/mapper.py @@ -7,6 +7,8 @@ def __init__(self, entries: list[InternetArchivesSaveTaskEntry]): self._url_to_entry: dict[str, 
InternetArchivesSaveTaskEntry] = { entry.url: entry for entry in entries } + if len(self._url_to_entry) != len(entries): + raise ValueError("Duplicate URLs found in entries") def get_is_new(self, url: str) -> bool: return self._url_to_entry[url].is_new diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py index 6e4ae84e..280aa51d 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/models/entry.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InternetArchivesSaveTaskEntry(BaseModel): @@ -8,8 +8,8 @@ class InternetArchivesSaveTaskEntry(BaseModel): url_id: int is_new: bool - def to_url_mapping(self) -> URLMapping: - return URLMapping( + def to_url_mapping(self) -> SimpleURLMapping: + return SimpleURLMapping( url_id=self.url_id, url=self.url ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py index b0f9eeea..fa4c36f0 100644 --- a/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py +++ b/src/core/tasks/scheduled/impl/internet_archives/save/queries/shared/get_valid_entries.py @@ -1,5 +1,7 @@ from sqlalchemy import select, or_, func, text +from src.db.enums import TaskType +from src.db.helpers.query import no_url_task_error from src.db.models.impl.flag.checked_for_ia.sqlalchemy import FlagURLCheckedForInternetArchives from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata @@ -9,7 +11,7 @@ IA_SAVE_VALID_ENTRIES_QUERY = ( select( URL.id, - URL.url, + 
URL.full_url.label("url"), (URLInternetArchivesSaveMetadata.url_id.is_(None)).label("is_new"), ) # URL must have been previously probed for its online status. @@ -39,6 +41,7 @@ URLInternetArchivesSaveMetadata.url_id.is_(None), URLInternetArchivesSaveMetadata.last_uploaded_at < func.now() - text("INTERVAL '1 month'") ), + no_url_task_error(TaskType.IA_SAVE), # Must have returned a 200 status code URLWebMetadata.status_code == 200 ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/constants.py b/src/core/tasks/scheduled/impl/sync_to_ds/constants.py new file mode 100644 index 00000000..d4bb072f --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/constants.py @@ -0,0 +1,3 @@ + + +PER_REQUEST_ENTITY_LIMIT = 1000 \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py new file mode 100644 index 00000000..d21f1259 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/core.py @@ -0,0 +1,52 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.add_links import \ + DSAppSyncAgenciesAddInsertLinksQueryBuilder +from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.get import DSAppSyncAgenciesAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.prereq import \ + DSAppSyncAgenciesAddPrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.agencies.add.core import AddAgenciesRequestBuilder +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncAgenciesAddTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_ADD + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + DSAppSyncAgenciesAddPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + request: AddAgenciesOuterRequest = await self.get_request_input() + db_ids: list[int] = [r.request_id for r in request.agencies] + await self.add_task_log(f"Adding agencies with the following db_ids: {db_ids}") + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + async def get_request_input(self) -> AddAgenciesOuterRequest: + return await self.run_query_builder( + DSAppSyncAgenciesAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddAgenciesOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddAgenciesRequestBuilder(request) + ) + + async def insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/__init__.py 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py new file mode 100644 index 00000000..36a3ebc0 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/add_links.py @@ -0,0 +1,26 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncAgenciesAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSAppSyncAddResponseInnerModel] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + inserts: list[DSAppLinkAgency] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkAgency( + ds_agency_id=mapping.app_id, + agency_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py new file mode 100644 index 00000000..b91feb11 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/cte.py @@ -0,0 +1,32 @@ +""" +Agencies to be added to the DS database must not have a +ds app link entry +""" +from sqlalchemy import Column, select, exists, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency + + +class DSAppLinkSyncAgencyAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + Agency.id + ) + .where( + ~exists( + 
select(DSAppLinkAgency.agency_id) + .where(DSAppLinkAgency.agency_id == Agency.id) + ) + ).cte("ds_app_link_sync_agency_add_prerequisites") + ) + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py new file mode 100644 index 00000000..f037115a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/get.py @@ -0,0 +1,71 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.cte import \ + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest, AddAgenciesInnerRequest + + +class DSAppSyncAgenciesAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AddAgenciesOuterRequest: + cte = DSAppLinkSyncAgencyAddPrerequisitesCTEContainer() + + location_id_cte = ( + select( + LinkAgencyLocation.agency_id, + func.array_agg(LinkAgencyLocation.location_id).label("location_ids"), + ) + .group_by( + LinkAgencyLocation.agency_id, + ) + .cte("location_id_cte") + ) + + query = ( + select( + cte.agency_id, + Agency.name, + Agency.jurisdiction_type, + Agency.agency_type, + location_id_cte.c.location_ids, + ) + .join( + Agency, + Agency.id == 
cte.agency_id, + ) + .join( + location_id_cte, + location_id_cte.c.agency_id == cte.agency_id, + ).limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddAgenciesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddAgenciesInnerRequest( + request_id=mapping[cte.agency_id], + content=AgencySyncContentModel( + name=mapping[Agency.name], + jurisdiction_type=mapping[Agency.jurisdiction_type], + agency_type=mapping[Agency.agency_type], + location_ids=mapping["location_ids"] + ) + ) + ) + + return AddAgenciesOuterRequest( + agencies=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py new file mode 100644 index 00000000..61097fc6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/add/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.queries.cte import \ + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyAddPrerequisitesCTEContainer().agency_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py new file mode 
100644 index 00000000..806ba230 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/core.py @@ -0,0 +1,68 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.delete_flags import \ + DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.delete_links import \ + DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.get import \ + DSAppSyncAgenciesDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.prereq import \ + DSAppSyncAgenciesDeletePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.agencies.delete.core import DeleteAgenciesRequestBuilder + + +class DSAppSyncAgenciesDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_DELETE + + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesDeletePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + ds_app_ids: list[int] = await self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting agencies with the following ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> list[int]: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteAgenciesRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, 
+ ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py new file mode 100644 index 00000000..d93f6a1d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/cte.py @@ -0,0 +1,29 @@ +""" +Agencies to be deleted from the DS database must be flagged for deletion +""" +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency + + +class DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkAgency.ds_agency_id + ) + .join( + FlagDSDeleteAgency, + FlagDSDeleteAgency.ds_agency_id == DSAppLinkAgency.ds_agency_id + ).cte("ds_app_link_sync_agency_delete_prerequisites") + ) + + @property + def ds_agency_id(self) -> Column[int]: + return self._cte.columns.ds_agency_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py new file mode 100644 index 00000000..f1633337 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_flags.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(FlagDSDeleteAgency) + .where(FlagDSDeleteAgency.ds_agency_id.in_(self._ds_agency_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py new file mode 100644 index 00000000..0ad20ee0 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/delete_links.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(DSAppLinkAgency) + .where(DSAppLinkAgency.ds_agency_id.in_(self._ds_agency_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py new file mode 100644 index 00000000..c155f921 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/get.py @@ -0,0 +1,29 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.cte import \ + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + cte = DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_agency_id, + ).limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + return [mapping[cte.ds_agency_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py new file mode 100644 index 00000000..fdafab72 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/delete/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.queries.cte import \ + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyDeletePrerequisitesCTEContainer().ds_agency_id + ) + ) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py new file mode 100644 index 00000000..814f9a1e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/core.py @@ -0,0 +1,60 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.get import \ + DSAppSyncAgenciesUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.prereq import \ + DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.update_links import \ + DSAppSyncAgenciesUpdateAlterLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.agencies.update.core import UpdateAgenciesRequestBuilder +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest + + +class DSAppSyncAgenciesUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_AGENCIES_UPDATE + + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + request: UpdateAgenciesOuterRequest = await self.get_inputs() + ds_app_ids: list[int] = [ + agency.app_id + for agency in request.agencies + ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) + await self.update_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating agencies with the following 
ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> UpdateAgenciesOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncAgenciesUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateAgenciesOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateAgenciesRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncAgenciesUpdateAlterLinksQueryBuilder( + ds_agency_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py new file mode 100644 index 00000000..57a9957c --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/cte.py @@ -0,0 +1,34 @@ +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency + + +class DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkAgency.agency_id, + DSAppLinkAgency.ds_agency_id, + ) + .join( + Agency, + Agency.id == DSAppLinkAgency.agency_id, + ) + .where( + Agency.updated_at > DSAppLinkAgency.last_synced_at + ).cte("ds_app_link_sync_agency_update_prerequisites") + ) + + @property + def ds_agency_id(self) -> Column[int]: + return self._cte.columns.ds_agency_id + + @property + def agency_id(self) -> Column[int]: + return self._cte.columns.agency_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py new file mode 100644 index 00000000..0488f51b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/get.py @@ -0,0 +1,77 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.cte import \ + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest + + +class DSAppSyncAgenciesUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateAgenciesOuterRequest: + cte = DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer() + + location_id_cte = ( + select( + LinkAgencyLocation.agency_id, + func.array_agg(LinkAgencyLocation.location_id).label("location_ids"), + ) + .join( + Agency, + Agency.id == LinkAgencyLocation.agency_id, + ) + .group_by( + LinkAgencyLocation.agency_id, + ) + .cte() + ) + + query = ( + select( + cte.ds_agency_id, + Agency.name, + Agency.jurisdiction_type, + Agency.agency_type, + location_id_cte.c.location_ids, + ) + .join( + Agency, + Agency.id == cte.agency_id, + ) + .join( + location_id_cte, + location_id_cte.c.agency_id == cte.agency_id, + ) + .limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: 
Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateAgenciesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateAgenciesInnerRequest( + app_id=mapping[cte.ds_agency_id], + content=AgencySyncContentModel( + name=mapping[Agency.name], + jurisdiction_type=mapping[Agency.jurisdiction_type], + agency_type=mapping[Agency.agency_type], + location_ids=mapping["location_ids"] + ) + ) + ) + + return UpdateAgenciesOuterRequest( + agencies=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py new file mode 100644 index 00000000..5327f4a8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.queries.cte import \ + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncAgencyUpdatePrerequisitesCTEContainer().agency_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py new file mode 100644 index 00000000..8950ccd6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/agencies/update/queries/update_links.py @@ -0,0 +1,25 @@ +from sqlalchemy import update, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.agency.ds_link.sqlalchemy 
import DSAppLinkAgency +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncAgenciesUpdateAlterLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_agency_ids: list[int] + ): + super().__init__() + self._ds_agency_ids = ds_agency_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + update(DSAppLinkAgency) + .where(DSAppLinkAgency.ds_agency_id.in_(self._ds_agency_ids)) + .values({ + DSAppLinkAgency.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py new file mode 100644 index 00000000..6acd74fd --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/core.py @@ -0,0 +1,57 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.add_links import \ + DSAppSyncDataSourcesAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.get import \ + DSAppSyncDataSourcesAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.prereq import \ + DSAppSyncDataSourcesAddPrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.data_sources.add.core import AddDataSourcesRequestBuilder +from src.external.pdap.impl.sync.data_sources.add.request import 
AddDataSourcesOuterRequest, AddDataSourcesInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncDataSourcesAddTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_ADD + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + DSAppSyncDataSourcesAddPrerequisitesQueryBuilder() + ) + + + async def inner_task_logic(self) -> None: + request: AddDataSourcesOuterRequest = await self.get_request_input() + await self.log_db_ids(request.data_sources) + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + async def log_db_ids(self, data_sources: list[AddDataSourcesInnerRequest]): + db_ids: list[int] = [d.request_id for d in data_sources] + await self.add_task_log(f"Adding data sources with the following db_ids: {db_ids}") + + async def get_request_input(self) -> AddDataSourcesOuterRequest: + return await self.run_query_builder( + DSAppSyncDataSourcesAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddDataSourcesOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddDataSourcesRequestBuilder(request) + ) + + async def insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py new file mode 100644 index 
00000000..88c88d4b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/add_links.py @@ -0,0 +1,26 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncDataSourcesAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSAppSyncAddResponseInnerModel] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + inserts: list[DSAppLinkDataSource] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkDataSource( + ds_data_source_id=mapping.app_id, + url_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py new file mode 100644 index 00000000..8c8bc945 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/cte.py @@ -0,0 +1,39 @@ +""" +Data sources to be added to the DS database must not have a +ds app link entry +""" +from sqlalchemy import select, exists, CTE, Column + +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource + + +class DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + URL.id + ) + .join( + FlagURLValidated, + FlagURLValidated.url_id == URL.id, + ) + .where( + FlagURLValidated.type == URLType.DATA_SOURCE, + ~exists( + select(DSAppLinkDataSource.url_id) + 
.where(DSAppLinkDataSource.url_id == URL.id) + ) + ).cte("ds_app_link_sync_data_source_add_prerequisites") + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py new file mode 100644 index 00000000..04710ba6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py @@ -0,0 +1,131 @@ +from typing import Sequence + +from sqlalchemy import RowMapping, func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ + DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.enums import DataSourcesURLStatus +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest + + +class DSAppSyncDataSourcesAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest: + cte = 
DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.url_id, + # Required + URL.full_url, + URL.name, + URL.status, + URLRecordType.record_type, + agency_id_cte.c.agency_ids, + # Optional + URL.description, + URLOptionalDataSourceMetadata.record_formats, + URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types, + URLInternetArchivesProbeMetadata.archive_url, + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ).limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddDataSourcesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddDataSourcesInnerRequest( + request_id=mapping[cte.url_id], + content=DataSourceSyncContentModel( + # Required + source_url=mapping["full_url"], + name=mapping[URL.name], + 
record_type=mapping[URLRecordType.record_type], + agency_ids=mapping["agency_ids"], + # Optional + description=mapping[URL.description], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + ) + ) + ) + + return AddDataSourcesOuterRequest( + data_sources=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py new file mode 100644 index 00000000..d375f524 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \ + 
DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer().url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py new file mode 100644 index 00000000..0c5bd53e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/core.py @@ -0,0 +1,68 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.delete_flags import \ + DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.delete_links import \ + DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.get import \ + DSAppSyncDataSourcesDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.prereq import \ + DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.data_sources.delete.core import DeleteDataSourcesRequestBuilder + + +class DSAppSyncDataSourcesDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_DELETE + + async def meets_task_prerequisites(self) -> 
bool: + return await self.run_query_builder( + DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + ds_app_ids: list[int] = await self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting data sources with the following ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> list[int]: + return await self.run_query_builder( + DSAppSyncDataSourcesDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteDataSourcesRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py new file mode 100644 index 00000000..12ad5c84 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/cte.py @@ -0,0 +1,29 @@ +""" +Data sources to be deleted from the DS database must be flagged for deletion +""" +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.flag.ds_delete.data_source 
import FlagDSDeleteDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource + + +class DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkDataSource.ds_data_source_id + ) + .join( + FlagDSDeleteDataSource, + FlagDSDeleteDataSource.ds_data_source_id == DSAppLinkDataSource.ds_data_source_id + ).cte("ds_app_link_sync_data_source_delete_prerequisites") + ) + + @property + def ds_data_source_id(self) -> Column[int]: + return self._cte.columns.ds_data_source_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py new file mode 100644 index 00000000..ef869a9c --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_flags.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(FlagDSDeleteDataSource) + .where(FlagDSDeleteDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py new file mode 100644 index 00000000..9b417ce8 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/delete_links.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(DSAppLinkDataSource) + .where(DSAppLinkDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py new file mode 100644 index 00000000..0e8e5732 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/get.py @@ -0,0 +1,29 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.cte import \ + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + cte = DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_data_source_id, + ) + ).limit(PER_REQUEST_ENTITY_LIMIT) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + return 
[mapping[cte.ds_data_source_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py new file mode 100644 index 00000000..1f3e797a --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/delete/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.queries.cte import \ + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncDataSourceDeletePrerequisitesCTEContainer().ds_data_source_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py new file mode 100644 index 00000000..0a0c4d21 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/core.py @@ -0,0 +1,60 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.get import \ + DSAppSyncDataSourcesUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.prereq import \ + DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.update_links import \ + 
DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.data_sources.update.core import UpdateDataSourcesRequestBuilder +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest + + +class DSAppSyncDataSourcesUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_DATA_SOURCES_UPDATE + + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + request: UpdateDataSourcesOuterRequest = await self.get_inputs() + ds_app_ids: list[int] = [ + ds.app_id + for ds in request.data_sources + ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) + await self.update_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating data sources with the following ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> UpdateDataSourcesOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateDataSourcesOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateDataSourcesRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder( + ds_data_source_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py new file mode 100644 index 00000000..8f0ff65e --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/cte.py @@ -0,0 +1,49 @@ +from sqlalchemy import select, or_, Column, CTE + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType + + +class DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkDataSource.url_id, + DSAppLinkDataSource.ds_data_source_id, + ) + .join( + URL, + URL.id == DSAppLinkDataSource.url_id, + ) + .outerjoin( + URLRecordType, + URL.id == URLRecordType.url_id, + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .where( + or_( + URL.updated_at > DSAppLinkDataSource.last_synced_at, + URLOptionalDataSourceMetadata.updated_at > DSAppLinkDataSource.last_synced_at, + URLRecordType.created_at > DSAppLinkDataSource.last_synced_at, + URLRecordType.updated_at > DSAppLinkDataSource.last_synced_at, + ) + ).cte("ds_app_link_sync_data_source_update_prerequisites") + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def ds_data_source_id(self) -> Column[int]: + return self._cte.columns.ds_data_source_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py new file mode 100644 index 00000000..a710b6f7 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py @@ -0,0 +1,134 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.enums import DataSourcesURLStatus +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest, \ + UpdateDataSourcesInnerRequest + + +class DSAppSyncDataSourcesUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest: + cte = DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.ds_data_source_id, + # Required + URL.full_url, + URL.name, + URL.status, + URLRecordType.record_type, + agency_id_cte.c.agency_ids, + # Optional + URL.description, + URLOptionalDataSourceMetadata.record_formats, + 
URLOptionalDataSourceMetadata.data_portal_type, + URLOptionalDataSourceMetadata.supplying_entity, + URLOptionalDataSourceMetadata.coverage_start, + URLOptionalDataSourceMetadata.coverage_end, + URLOptionalDataSourceMetadata.agency_supplied, + URLOptionalDataSourceMetadata.agency_originated, + URLOptionalDataSourceMetadata.update_method, + URLOptionalDataSourceMetadata.readme_url, + URLOptionalDataSourceMetadata.originating_entity, + URLOptionalDataSourceMetadata.retention_schedule, + URLOptionalDataSourceMetadata.scraper_url, + URLOptionalDataSourceMetadata.access_notes, + URLOptionalDataSourceMetadata.access_types, + URLOptionalDataSourceMetadata.data_portal_type_other, + URLInternetArchivesProbeMetadata.archive_url, + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .outerjoin( + URLOptionalDataSourceMetadata, + URL.id == URLOptionalDataSourceMetadata.url_id, + ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) + .join( + URLRecordType, + URLRecordType.url_id == URL.id, + ) + .outerjoin( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + ).limit(PER_REQUEST_ENTITY_LIMIT) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateDataSourcesInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateDataSourcesInnerRequest( + app_id=mapping[cte.ds_data_source_id], + content=DataSourceSyncContentModel( + # Required + source_url=mapping["full_url"], + name=mapping[URL.name], + record_type=mapping[URLRecordType.record_type], + agency_ids=mapping["agency_ids"] or [], + # Optional + description=mapping[URL.description], + record_formats=mapping[URLOptionalDataSourceMetadata.record_formats] or [], + data_portal_type=mapping[URLOptionalDataSourceMetadata.data_portal_type], + supplying_entity=mapping[URLOptionalDataSourceMetadata.supplying_entity], + 
coverage_start=mapping[URLOptionalDataSourceMetadata.coverage_start], + coverage_end=mapping[URLOptionalDataSourceMetadata.coverage_end], + agency_supplied=mapping[URLOptionalDataSourceMetadata.agency_supplied], + agency_originated=mapping[URLOptionalDataSourceMetadata.agency_originated], + update_method=mapping[URLOptionalDataSourceMetadata.update_method], + readme_url=mapping[URLOptionalDataSourceMetadata.readme_url], + originating_entity=mapping[URLOptionalDataSourceMetadata.originating_entity], + retention_schedule=mapping[URLOptionalDataSourceMetadata.retention_schedule], + scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url], + access_notes=mapping[URLOptionalDataSourceMetadata.access_notes], + access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [], + data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other], + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + ) + ) + ) + + return UpdateDataSourcesOuterRequest( + data_sources=inner_requests, + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py new file mode 100644 index 00000000..e31ff1d7 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \ + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + 
query=select( + DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer().ds_data_source_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py new file mode 100644 index 00000000..ffba7ec8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/update_links.py @@ -0,0 +1,25 @@ +from sqlalchemy import update, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncDataSourcesUpdateAlterLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_data_source_ids: list[int] + ): + super().__init__() + self._ds_data_source_ids = ds_data_source_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + update(DSAppLinkDataSource) + .where(DSAppLinkDataSource.ds_data_source_id.in_(self._ds_data_source_ids)) + .values({ + DSAppLinkDataSource.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py new file mode 100644 index 00000000..08ee031d --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/core.py @@ -0,0 +1,55 @@ +from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.add_links import \ + DSAppSyncMetaURLsAddInsertLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.get import DSAppSyncMetaURLsAddGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.prereq import \ + DSAppSyncMetaURLsAddPrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.meta_urls.add.core import AddMetaURLsRequestBuilder +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncMetaURLsAddTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_ADD + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + DSAppSyncMetaURLsAddPrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + request: AddMetaURLsOuterRequest = await self.get_request_input() + await self.log_db_ids(request.meta_urls) + responses: list[DSAppSyncAddResponseInnerModel] = await self.make_request(request) + await self.insert_ds_app_links(responses) + + async def log_db_ids(self, meta_urls: list[AddMetaURLsInnerRequest]): + db_ids: list[int] = [m.request_id for m in meta_urls] + await self.add_task_log(f"Adding meta urls with the following db_ids: {db_ids}") + + async def get_request_input(self) -> AddMetaURLsOuterRequest: + return await self.run_query_builder( + DSAppSyncMetaURLsAddGetQueryBuilder() + ) + + async def make_request( + self, + request: AddMetaURLsOuterRequest + ) -> list[DSAppSyncAddResponseInnerModel]: + return await self.pdap_client.run_request_builder( + AddMetaURLsRequestBuilder(request) + ) + + async def 
insert_ds_app_links( + self, + responses: list[DSAppSyncAddResponseInnerModel] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsAddInsertLinksQueryBuilder(responses) + ) diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py new file mode 100644 index 00000000..52a288f3 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/add_links.py @@ -0,0 +1,26 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel + + +class DSAppSyncMetaURLsAddInsertLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + mappings: list[DSAppSyncAddResponseInnerModel] + ): + super().__init__() + self._mappings = mappings + + async def run(self, session: AsyncSession) -> None: + inserts: list[DSAppLinkMetaURL] = [] + for mapping in self._mappings: + inserts.append( + DSAppLinkMetaURL( + ds_meta_url_id=mapping.app_id, + url_id=mapping.request_id, + ) + ) + session.add_all(inserts) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py new file mode 100644 index 00000000..178e19e8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/cte.py @@ -0,0 +1,32 @@ +""" +Meta URLs to be added to the DS database must not have a +ds app link entry +""" +from sqlalchemy import select, exists, Column, CTE + +from 
src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.models.views.meta_url import MetaURL + + +class DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + MetaURL.url_id + ) + .where( + ~exists( + select(DSAppLinkMetaURL.url_id) + .where(DSAppLinkMetaURL.url_id == MetaURL.url_id) + ) + ).cte("ds_app_link_sync_meta_url_add_prerequisites") + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py new file mode 100644 index 00000000..5a784295 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py @@ -0,0 +1,84 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest + + +class DSAppSyncMetaURLsAddGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> 
AddMetaURLsOuterRequest: + cte = DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids"), + + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.url_id, + URL.full_url, + URL.status, + URLInternetArchivesProbeMetadata.archive_url, + agency_id_cte.c.agency_ids + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) + .join( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + .limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[AddMetaURLsInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + AddMetaURLsInnerRequest( + request_id=mapping[cte.url_id], + content=MetaURLSyncContentModel( + url=mapping["full_url"], + agency_ids=mapping["agency_ids"], + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + ) + ) + ) + + return AddMetaURLsOuterRequest( + meta_urls=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py new file mode 100644 index 00000000..9439b6d0 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \ + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class 
DSAppSyncMetaURLsAddPrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer().url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py new file mode 100644 index 00000000..76fc9c4b --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/core.py @@ -0,0 +1,68 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.delete_flags import \ + DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.delete_links import \ + DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.get import \ + DSAppSyncMetaURLsDeleteGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.prereq import \ + DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.meta_urls.delete.core import DeleteMetaURLsRequestBuilder + + +class DSAppSyncMetaURLsDeleteTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_DELETE + + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + ds_app_ids: list[int] = await 
self.get_inputs() + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(ds_app_ids) + await self.delete_flags(ds_app_ids) + await self.delete_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Deleting meta urls with the following ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> list[int]: + return await self.run_query_builder( + DSAppSyncMetaURLsDeleteGetQueryBuilder() + ) + + async def make_request( + self, + ds_app_ids: list[int] + ) -> None: + await self.pdap_client.run_request_builder( + DeleteMetaURLsRequestBuilder(ds_app_ids) + ) + + async def delete_flags( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) + + async def delete_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.run_query_builder( + DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py new file mode 100644 index 00000000..91887e48 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/cte.py @@ -0,0 +1,29 @@ +""" +Meta URLs to be deleted from the DS database must be flagged for deletion +""" +from sqlalchemy import Column, CTE, select + +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL + + +class DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + 
DSAppLinkMetaURL.ds_meta_url_id + ) + .join( + FlagDSDeleteMetaURL, + FlagDSDeleteMetaURL.ds_meta_url_id == DSAppLinkMetaURL.ds_meta_url_id + ).cte("ds_app_link_sync_meta_url_delete_prerequisites") + ) + + @property + def ds_meta_url_id(self) -> Column[int]: + return self._cte.columns.ds_meta_url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py new file mode 100644 index 00000000..4bee4ccc --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_flags.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeleteRemoveFlagsQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(FlagDSDeleteMetaURL) + .where(FlagDSDeleteMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py new file mode 100644 index 00000000..0fb66bb5 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/delete_links.py @@ -0,0 +1,22 @@ +from sqlalchemy import delete +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.queries.base.builder import QueryBuilderBase + + +class 
DSAppSyncMetaURLsDeleteRemoveLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + delete(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py new file mode 100644 index 00000000..0d3b09cc --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/get.py @@ -0,0 +1,30 @@ +from typing import Sequence + +from sqlalchemy import select, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.cte import \ + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeleteGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[int]: + """Get DS App links to delete.""" + cte = DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer() + + query = ( + select( + cte.ds_meta_url_id, + ) + .limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + return [mapping[cte.ds_meta_url_id] for mapping in mappings] \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py new file mode 100644 index 00000000..8bc7dbd8 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/delete/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.queries.cte import \ + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsDeletePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLDeletePrerequisitesCTEContainer().ds_meta_url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py new file mode 100644 index 00000000..ff0b06ec --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/core.py @@ -0,0 +1,60 @@ +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.get import \ + DSAppSyncMetaURLsUpdateGetQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.prereq import \ + DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.update_links import \ + DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder +from src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase +from src.db.enums import TaskType +from src.external.pdap.impl.sync.meta_urls.update.core import UpdateMetaURLsRequestBuilder +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest + + +class 
DSAppSyncMetaURLsUpdateTaskOperator( + DSSyncTaskOperatorBase +): + + @property + def task_type(self) -> TaskType: + return TaskType.SYNC_META_URLS_UPDATE + + async def meets_task_prerequisites(self) -> bool: + return await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder() + ) + + async def inner_task_logic(self) -> None: + request: UpdateMetaURLsOuterRequest = await self.get_inputs() + ds_app_ids: list[int] = [ + meta_url.app_id + for meta_url in request.meta_urls + ] + await self.log_ds_app_ids(ds_app_ids) + await self.make_request(request) + await self.update_links(ds_app_ids) + + async def log_ds_app_ids(self, ds_app_ids: list[int]): + await self.add_task_log(f"Updating meta urls with the following ds_app_ids: {ds_app_ids}") + + async def get_inputs(self) -> UpdateMetaURLsOuterRequest: + return await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdateGetQueryBuilder() + ) + + async def make_request( + self, + request: UpdateMetaURLsOuterRequest + ): + await self.pdap_client.run_request_builder( + UpdateMetaURLsRequestBuilder(request) + ) + + async def update_links( + self, + ds_app_ids: list[int] + ) -> None: + await self.adb_client.run_query_builder( + DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder( + ds_meta_url_ids=ds_app_ids + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py new file mode 100644 index 00000000..20123566 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/cte.py @@ -0,0 +1,33 @@ +from sqlalchemy import select, Column, CTE + +from src.db.models.impl.url.core.sqlalchemy import URL 
+from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL + +class DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer: + + def __init__(self): + self._cte = ( + select( + DSAppLinkMetaURL.url_id, + DSAppLinkMetaURL.ds_meta_url_id, + ) + .join( + URL, + URL.id == DSAppLinkMetaURL.url_id, + ) + .where( + URL.updated_at > DSAppLinkMetaURL.last_synced_at, + ).cte("ds_app_link_sync_meta_url_update_prerequisites") + ) + + @property + def url_id(self) -> Column[int]: + return self._cte.columns.url_id + + @property + def ds_meta_url_id(self) -> Column[int]: + return self._cte.columns.ds_meta_url_id + + @property + def cte(self) -> CTE: + return self._cte \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py new file mode 100644 index 00000000..8cdb8ed6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py @@ -0,0 +1,83 @@ +from typing import Sequence + +from sqlalchemy import select, func, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer +from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.queries.base.builder import QueryBuilderBase +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, 
UpdateMetaURLsInnerRequest + + +class DSAppSyncMetaURLsUpdateGetQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest: + cte = DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer() + + agency_id_cte = ( + select( + LinkURLAgency.url_id, + func.array_agg(LinkURLAgency.agency_id).label("agency_ids") + ) + .group_by( + LinkURLAgency.url_id + ) + .cte() + ) + + query = ( + select( + cte.ds_meta_url_id, + URL.full_url, + URL.status, + agency_id_cte.c.agency_ids, + URLInternetArchivesProbeMetadata.archive_url, + ) + .select_from( + cte.cte + ) + .join( + URL, + URL.id == cte.url_id, + ) + .outerjoin( + URLInternetArchivesProbeMetadata, + URL.id == URLInternetArchivesProbeMetadata.url_id, + ) + .outerjoin( + agency_id_cte, + cte.url_id == agency_id_cte.c.url_id + ) + .limit(PER_REQUEST_ENTITY_LIMIT) + ) + + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query, + ) + + inner_requests: list[UpdateMetaURLsInnerRequest] = [] + for mapping in mappings: + inner_requests.append( + UpdateMetaURLsInnerRequest( + app_id=mapping[cte.ds_meta_url_id], + content=MetaURLSyncContentModel( + url=mapping['full_url'], + agency_ids=mapping["agency_ids"] or [], + internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None, + url_status=convert_sm_url_status_to_ds_url_status( + sm_url_status=mapping[URL.status], + ), + ) + ) + ) + + return UpdateMetaURLsOuterRequest( + meta_urls=inner_requests, + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py new file mode 100644 index 00000000..761bb2c5 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/prereq.py @@ -0,0 +1,17 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from 
src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \ + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsUpdatePrerequisitesQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + return await self.sh.results_exist( + session=session, + query=select( + DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer().ds_meta_url_id + ) + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py new file mode 100644 index 00000000..baafcaa8 --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/update_links.py @@ -0,0 +1,25 @@ +from sqlalchemy import update, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.db.queries.base.builder import QueryBuilderBase + + +class DSAppSyncMetaURLsUpdateAlterLinksQueryBuilder(QueryBuilderBase): + + def __init__( + self, + ds_meta_url_ids: list[int] + ): + super().__init__() + self._ds_meta_url_ids = ds_meta_url_ids + + async def run(self, session: AsyncSession) -> None: + statement = ( + update(DSAppLinkMetaURL) + .where(DSAppLinkMetaURL.ds_meta_url_id.in_(self._ds_meta_url_ids)) + .values({ + DSAppLinkMetaURL.last_synced_at: func.now(), + }) + ) + await session.execute(statement) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py new file mode 100644 index 00000000..3f586b20 --- /dev/null +++ 
b/src/core/tasks/scheduled/impl/sync_to_ds/shared/convert.py @@ -0,0 +1,14 @@ +from src.collectors.enums import URLStatus +from src.external.pdap.enums import DataSourcesURLStatus + + +def convert_sm_url_status_to_ds_url_status( + sm_url_status: URLStatus +) -> DataSourcesURLStatus: + match sm_url_status: + case URLStatus.OK: + return DataSourcesURLStatus.OK + case URLStatus.BROKEN: + return DataSourcesURLStatus.BROKEN + case _: + raise ValueError(f"URL status has no corresponding DS Status: {sm_url_status}") \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/templates/__init__.py b/src/core/tasks/scheduled/impl/sync_to_ds/templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py new file mode 100644 index 00000000..63a72a2f --- /dev/null +++ b/src/core/tasks/scheduled/impl/sync_to_ds/templates/operator.py @@ -0,0 +1,21 @@ +from abc import ABC + +from src.core.tasks.mixins.prereq import HasPrerequisitesMixin +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +class DSSyncTaskOperatorBase( + ScheduledTaskOperatorBase, + HasPrerequisitesMixin, + ABC +): + + def __init__( + self, + adb_client: AsyncDatabaseClient, + pdap_client: PDAPClient + ): + super().__init__(adb_client) + self.pdap_client = pdap_client diff --git a/src/core/tasks/scheduled/impl/update_url_status/__init__.py b/src/core/tasks/scheduled/impl/update_url_status/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/scheduled/impl/update_url_status/operator.py b/src/core/tasks/scheduled/impl/update_url_status/operator.py new file mode 100644 index 00000000..82285996 --- /dev/null +++ b/src/core/tasks/scheduled/impl/update_url_status/operator.py @@ -0,0 
+1,15 @@ +from src.core.tasks.scheduled.impl.update_url_status.query import UpdateURLStatusQueryBuilder +from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase +from src.db.enums import TaskType + + +class UpdateURLStatusOperator(ScheduledTaskOperatorBase): + + @property + def task_type(self) -> TaskType: + return TaskType.UPDATE_URL_STATUS + + async def inner_task_logic(self) -> None: + await self.adb_client.run_query_builder( + UpdateURLStatusQueryBuilder() + ) \ No newline at end of file diff --git a/src/core/tasks/scheduled/impl/update_url_status/query.py b/src/core/tasks/scheduled/impl/update_url_status/query.py new file mode 100644 index 00000000..963405b6 --- /dev/null +++ b/src/core/tasks/scheduled/impl/update_url_status/query.py @@ -0,0 +1,49 @@ +from sqlalchemy import update, exists, select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.base.builder import QueryBuilderBase + + +class UpdateURLStatusQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> None: + + # Update broken URLs to nonbroken if their status is not 404 + query_broken_to_ok = ( + update(URL) + .values( + status=URLStatus.OK + ) + .where( + exists( + select(1).where( + URLWebMetadata.url_id == URL.id, # <-- correlate + URLWebMetadata.status_code != 404, + URL.status == URLStatus.BROKEN + ) + ) + ) + ) + + # Update ok URLs to broken if their status is 404 + query_ok_to_broken = ( + update(URL) + .values( + status=URLStatus.BROKEN + ) + .where( + exists( + select(1).where( + URLWebMetadata.url_id == URL.id, # <-- correlate + URLWebMetadata.status_code == 404, + URL.status == URLStatus.OK + ) + ) + ) + ) + + await session.execute(query_broken_to_ok) + await session.execute(query_ok_to_broken) \ No newline at end of file diff --git 
a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 82ac92cc..61169a66 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -6,13 +6,25 @@ from src.core.tasks.scheduled.impl.delete_logs.operator import DeleteOldLogsTaskOperator from src.core.tasks.scheduled.impl.delete_stale_screenshots.operator import DeleteStaleScreenshotsTaskOperator from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator from src.core.tasks.scheduled.impl.internet_archives.probe.operator import InternetArchivesProbeTaskOperator from src.core.tasks.scheduled.impl.internet_archives.save.operator import InternetArchivesSaveTaskOperator from src.core.tasks.scheduled.impl.mark_never_completed.operator import MarkTaskNeverCompletedOperator -from src.core.tasks.scheduled.impl.mark_never_completed.query import MarkTaskNeverCompletedQueryBuilder from src.core.tasks.scheduled.impl.refresh_materialized_views.operator import RefreshMaterializedViewsOperator from src.core.tasks.scheduled.impl.run_url_tasks.operator import RunURLTasksTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core 
import DSAppSyncMetaURLsAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.scheduled.impl.task_cleanup.operator import TaskCleanupOperator +from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.db.client.async_ import AsyncDatabaseClient from src.external.huggingface.hub.client import HuggingFaceHubClient @@ -115,5 +127,106 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: operator=RefreshMaterializedViewsOperator(adb_client=self.adb_client), interval_minutes=IntervalEnum.DAILY.value, enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") - ) + ), + ScheduledTaskEntry( + operator=IntegrityMonitorTaskOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.DAILY.value, + enabled=self.setup_flag("INTEGRITY_MONITOR_TASK_FLAG") + ), + # Sync + ## Adds + ### Agency + ScheduledTaskEntry( + operator=DSAppSyncAgenciesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") + ), + ### Meta URL + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") + ), + ### Data Source + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") + ), + ## Updates + ### Agency + ScheduledTaskEntry( + 
operator=DSAppSyncAgenciesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") + ), + ### Meta URL + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") + ), + ### Data Source + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") + ), + ## Deletes + ### Data Source + ScheduledTaskEntry( + operator=DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") + ), + ### Meta URL + ScheduledTaskEntry( + operator=DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") + ), + ### Agency + ScheduledTaskEntry( + operator=DSAppSyncAgenciesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ), + interval_minutes=IntervalEnum.HOURLY.value, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + ), + ### URL + ScheduledTaskEntry( + operator=UpdateURLStatusOperator( + adb_client=self.adb_client + ), + interval_minutes=IntervalEnum.DAILY.value, + enabled=self.setup_flag("UPDATE_URL_STATUS_TASK_FLAG") + ), + ] diff --git a/src/core/tasks/scheduled/manager.py b/src/core/tasks/scheduled/manager.py index 87cb5a27..adf386a6 100644 --- a/src/core/tasks/scheduled/manager.py +++ 
b/src/core/tasks/scheduled/manager.py @@ -1,3 +1,5 @@ +from datetime import datetime + from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.handler import TaskHandler from src.core.tasks.mixins.link_urls import LinkURLsMixin @@ -5,6 +7,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader from src.core.tasks.scheduled.models.entry import ScheduledTaskEntry from src.core.tasks.scheduled.registry.core import ScheduledJobRegistry +from src.core.tasks.scheduled.registry.format import format_job_datetime from src.core.tasks.scheduled.templates.operator import ScheduledTaskOperatorBase @@ -44,12 +47,17 @@ async def add_scheduled_tasks(self): enabled_entries.append(entry) initial_lag: int = 1 + + print("Adding the following scheduled tasks:") + print(f"TASK_NAME | TASK_INTERVAL") for idx, entry in enumerate(enabled_entries): - await self._registry.add_job( + next_run_time: datetime = await self._registry.add_job( func=self.run_task, entry=entry, minute_lag=idx + initial_lag ) + run_time_str: str = format_job_datetime(next_run_time) + print(f"{entry.operator.task_type.value:<25}| {run_time_str}") def shutdown(self): self._registry.shutdown_scheduler() diff --git a/src/core/tasks/scheduled/registry/core.py b/src/core/tasks/scheduled/registry/core.py index e9fc205b..e85c15f0 100644 --- a/src/core/tasks/scheduled/registry/core.py +++ b/src/core/tasks/scheduled/registry/core.py @@ -25,7 +25,7 @@ async def add_job( func: Callable, entry: ScheduledTaskEntry, minute_lag: int - ) -> None: + ) -> datetime: """ Modifies: self._jobs @@ -40,10 +40,8 @@ async def add_job( misfire_grace_time=60, kwargs={"operator": entry.operator} ) - run_time_str: str = format_job_datetime(job.next_run_time) - print(f"Adding {job.id} task to scheduler. 
" + - f"First run at {run_time_str}") self._jobs[entry.operator.task_type] = job + return job.next_run_time def start_scheduler(self) -> None: """ diff --git a/src/core/tasks/url/loader.py b/src/core/tasks/url/loader.py index b5910f5e..70c3eebe 100644 --- a/src/core/tasks/url/loader.py +++ b/src/core/tasks/url/loader.py @@ -5,6 +5,17 @@ from environs import Env from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import DSAppSyncDataSourcesAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core import DSAppSyncMetaURLsAddTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader @@ -21,8 +32,6 @@ from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator from 
src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator from src.core.tasks.url.operators.suspend.core import SuspendURLTaskOperator from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.client.async_ import AsyncDatabaseClient @@ -96,26 +105,6 @@ def _get_agency_identification_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_AGENCY_IDENTIFICATION_TASK_FLAG") ) - def _get_submit_approved_url_task_operator(self) -> URLTaskEntry: - operator = SubmitApprovedURLTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ) - return URLTaskEntry( - operator=operator, - enabled=self.setup_flag("URL_SUBMIT_APPROVED_TASK_FLAG") - ) - - def _get_submit_meta_urls_task_operator(self) -> URLTaskEntry: - operator = SubmitMetaURLsTaskOperator( - adb_client=self.adb_client, - pdap_client=self.pdap_client - ) - return URLTaskEntry( - operator=operator, - enabled=self.setup_flag("URL_SUBMIT_META_URLS_TASK_FLAG") - ) - def _get_url_miscellaneous_metadata_task_operator(self) -> URLTaskEntry: operator = URLMiscellaneousMetadataTaskOperator( adb_client=self.adb_client @@ -204,6 +193,109 @@ def _get_suspend_url_task_operator(self) -> URLTaskEntry: enabled=self.setup_flag("URL_SUSPEND_TASK_FLAG") ) + # DS App Sync + ## Agency + ### Add + def _get_ds_app_sync_agency_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_agency_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesUpdateTaskOperator( + adb_client=self.adb_client, + 
pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_agency_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncAgenciesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + ) + + ## Data Source + ### Add + def _get_ds_app_sync_data_source_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_data_source_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_data_source_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") + ) + + ## Meta URL + ### Add + def _get_ds_app_sync_meta_url_add_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsAddTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") + ) + + ### Update + def _get_ds_app_sync_meta_url_update_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return 
URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") + ) + + ### Delete + def _get_ds_app_sync_meta_url_delete_task_operator(self) -> URLTaskEntry: + operator = DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=self.adb_client, + pdap_client=self.pdap_client + ) + return URLTaskEntry( + operator=operator, + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") + ) + async def load_entries(self) -> list[URLTaskEntry]: return [ @@ -213,12 +305,23 @@ async def load_entries(self) -> list[URLTaskEntry]: self._get_url_record_type_task_operator(), self._get_agency_identification_task_operator(), self._get_url_miscellaneous_metadata_task_operator(), - self._get_submit_approved_url_task_operator(), - self._get_submit_meta_urls_task_operator(), self._get_url_auto_relevance_task_operator(), self._get_url_screenshot_task_operator(), self._get_location_id_task_operator(), self._get_auto_validate_task_operator(), self._get_auto_name_task_operator(), self._get_suspend_url_task_operator(), + # DS App Sync + ## Agency + self._get_ds_app_sync_agency_add_task_operator(), + self._get_ds_app_sync_agency_update_task_operator(), + self._get_ds_app_sync_agency_delete_task_operator(), + ## Data Source + self._get_ds_app_sync_data_source_add_task_operator(), + self._get_ds_app_sync_data_source_update_task_operator(), + self._get_ds_app_sync_data_source_delete_task_operator(), + ## Meta URL + self._get_ds_app_sync_meta_url_add_task_operator(), + self._get_ds_app_sync_meta_url_update_task_operator(), + self._get_ds_app_sync_meta_url_delete_task_operator(), ] diff --git a/src/core/tasks/url/models/entry.py b/src/core/tasks/url/models/entry.py index eeb09047..69269c1e 100644 --- a/src/core/tasks/url/models/entry.py +++ b/src/core/tasks/url/models/entry.py @@ -1,5 +1,7 @@ from pydantic import BaseModel +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from 
src.core.tasks.scheduled.impl.sync_to_ds.templates.operator import DSSyncTaskOperatorBase from src.core.tasks.url.operators.base import URLTaskOperatorBase @@ -8,5 +10,5 @@ class URLTaskEntry(BaseModel): class Config: arbitrary_types_allowed = True - operator: URLTaskOperatorBase + operator: URLTaskOperatorBase | DSSyncTaskOperatorBase enabled: bool \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py index 95c9e704..5cead5d3 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/convert.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/convert.py @@ -2,21 +2,15 @@ from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus -def convert_match_agency_response_to_subtask_data( + +def convert_agency_suggestions_to_subtask_data( url_id: int, - response: MatchAgencyResponse, + agency_suggestions: list[AgencySuggestion], subtask_type: AutoAgencyIDSubtaskType, - task_id: int -): - suggestions: list[AgencySuggestion] = \ - _convert_match_agency_response_to_suggestions( - response - ) - agencies_found: bool = len(suggestions) > 0 + task_id: int, +) -> AutoAgencyIDSubtaskData: + agencies_found: bool = len(agency_suggestions) > 0 subtask_pydantic = URLAutoAgencyIDSubtaskPydantic( url_id=url_id, type=subtask_type, @@ -25,30 +19,6 @@ def convert_match_agency_response_to_subtask_data( ) return AutoAgencyIDSubtaskData( pydantic_model=subtask_pydantic, - 
suggestions=suggestions + suggestions=agency_suggestions ) -def _convert_match_agency_response_to_suggestions( - match_response: MatchAgencyResponse, -) -> list[AgencySuggestion]: - if match_response.status == MatchAgencyResponseStatus.EXACT_MATCH: - match_info: MatchAgencyInfo = match_response.matches[0] - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=100 - ) - ] - if match_response.status == MatchAgencyResponseStatus.NO_MATCH: - return [] - if match_response.status != MatchAgencyResponseStatus.PARTIAL_MATCH: - raise ValueError(f"Unknown Match Agency Response Status: {match_response.status}") - total_confidence: int = 100 - confidence_per_match: int = total_confidence // len(match_response.matches) - return [ - AgencySuggestion( - agency_id=int(match_info.id), - confidence=confidence_per_match - ) - for match_info in match_response.matches - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py index d1af5391..2603191a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/ckan_/core.py @@ -3,17 +3,18 @@ from typing_extensions import override from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.params import CKANAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.query import \ GetCKANAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from 
src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import \ AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -35,12 +36,14 @@ async def inner_logic(self) -> None: subtask_data_list: list[AutoAgencyIDSubtaskData] = [] for param in params: agency_name: str = param.collector_metadata["agency_name"] - response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.CKAN, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py index 272717b5..dd7a5a8c 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/homepage_match_/queries/ctes/whitelisted_root_urls.py @@ -34,7 +34,7 @@ # The connected URLs must be Meta URLs FlagURLValidated.type == URLType.META_URL, # Root URL can't be "https://catalog.data.gov" - URL.url != "https://catalog.data.gov" + URL.url != 
"catalog.data.gov" ) .group_by( URL.id diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py index 4fa92c2e..030139ad 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/impl/muckrock_/core.py @@ -6,18 +6,19 @@ from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType from src.core.tasks.url.operators.agency_identification.subtasks.convert import \ - convert_match_agency_response_to_subtask_data + convert_agency_suggestions_to_subtask_data from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.params import \ MuckrockAgencyIDSubtaskParams from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.query import \ GetMuckrockAgencyIDSubtaskParamsQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.models.subtask import AutoAgencyIDSubtaskData +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.core.tasks.url.operators.agency_identification.subtasks.queries.match_agency import MatchAgencyQueryBuilder from src.core.tasks.url.operators.agency_identification.subtasks.templates.subtask import AgencyIDSubtaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.pydantic import URLAutoAgencyIDSubtaskPydantic from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse @final @@ -52,12 +53,14 @@ async def inner_logic(self) -> None: ) subtask_data_list.append(data) 
continue - match_agency_response: MatchAgencyResponse = await self.pdap_client.match_agency( - name=agency_lookup_response.name + agency_suggestions: list[AgencySuggestion] = await self.adb_client.run_query_builder( + MatchAgencyQueryBuilder( + agency_name=agency_lookup_response.name + ) ) - subtask_data: AutoAgencyIDSubtaskData = convert_match_agency_response_to_subtask_data( + subtask_data: AutoAgencyIDSubtaskData = convert_agency_suggestions_to_subtask_data( url_id=param.url_id, - response=match_agency_response, + agency_suggestions=agency_suggestions, subtask_type=AutoAgencyIDSubtaskType.MUCKROCK, task_id=self.task_id ) diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py new file mode 100644 index 00000000..4b5d6516 --- /dev/null +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/match_agency.py @@ -0,0 +1,44 @@ +from typing import Sequence + +from sqlalchemy import select, func, desc, RowMapping +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.agency_identification.subtasks.models.suggestion import AgencySuggestion +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.queries.base.builder import QueryBuilderBase + + +class MatchAgencyQueryBuilder(QueryBuilderBase): + + def __init__( + self, + agency_name: str + ): + super().__init__() + self.agency_name = agency_name + + async def run(self, session: AsyncSession) -> list[AgencySuggestion]: + query = ( + select( + Agency.id, + func.similarity(Agency.name, self.agency_name).label("similarity") + ) + .where( + func.similarity(Agency.name, self.agency_name) > 0.5 + ) + .order_by( + desc("similarity") + ) + .limit(10) + ) + mappings: Sequence[RowMapping] = await self.sh.mappings( + session=session, + query=query + ) + return [ + AgencySuggestion( + agency_id=mapping[Agency.id], + 
confidence=int(mapping["similarity"] * 100) + ) + for mapping in mappings + ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index 17055d1a..7a15b67a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -29,7 +29,7 @@ # One of the locations must be linked to an agency exists( select( - LinkAgencyLocation.id + LinkAgencyLocation.location_id ) .join( LocationIDSubtaskSuggestion, diff --git a/src/core/tasks/url/operators/auto_relevant/core.py b/src/core/tasks/url/operators/auto_relevant/core.py index 86cc179e..3acff217 100644 --- a/src/core/tasks/url/operators/auto_relevant/core.py +++ b/src/core/tasks/url/operators/auto_relevant/core.py @@ -5,7 +5,7 @@ from src.core.tasks.url.operators.auto_relevant.sort import separate_success_and_error_subsets from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.db.client.async_ import AsyncDatabaseClient -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput from src.db.enums import TaskType from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall diff --git a/src/core/tasks/url/operators/auto_relevant/queries/cte.py b/src/core/tasks/url/operators/auto_relevant/queries/cte.py index 8ad33867..c8b816fd 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/cte.py +++ 
b/src/core/tasks/url/operators/auto_relevant/queries/cte.py @@ -6,7 +6,7 @@ from src.db.helpers.query import not_exists_url, no_url_task_error from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion class AutoRelevantPrerequisitesCTEContainer: diff --git a/src/core/tasks/url/operators/auto_relevant/queries/get.py b/src/core/tasks/url/operators/auto_relevant/queries/get.py index 6f6c59b0..b566bb42 100644 --- a/src/core/tasks/url/operators/auto_relevant/queries/get.py +++ b/src/core/tasks/url/operators/auto_relevant/queries/get.py @@ -9,7 +9,7 @@ from src.core.tasks.url.operators.auto_relevant.queries.cte import AutoRelevantPrerequisitesCTEContainer from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion from src.db.queries.base.builder import QueryBuilderBase from src.db.statement_composer import StatementComposer from src.db.utils.compression import decompress_html diff --git a/src/core/tasks/url/operators/base.py b/src/core/tasks/url/operators/base.py index e1d70d5e..8fc0b422 100644 --- a/src/core/tasks/url/operators/base.py +++ b/src/core/tasks/url/operators/base.py @@ -22,15 +22,3 @@ async def conclude_task(self): outcome=TaskOperatorOutcome.SUCCESS, message="Task completed successfully" ) - - async def run_info( - self, - outcome: TaskOperatorOutcome, - message: str - ) -> TaskOperatorRunInfo: - return TaskOperatorRunInfo( - task_id=self.task_id, - task_type=self.task_type, - outcome=outcome, - message=message - ) diff --git 
a/src/core/tasks/url/operators/html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py index bee7183c..a2d554ff 100644 --- a/src/core/tasks/url/operators/html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,6 +1,7 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent class HTMLContentInfoGetter: @@ -10,7 +11,7 @@ def __init__(self, response_html_info: ResponseHTMLInfo, url_id: int): self.url_id = url_id self.html_content_infos = [] - def get_all_html_content(self) -> list[URLHTMLContentInfo]: + def get_all_html_content(self) -> list[URLHTMLContent]: for content_type in HTMLContentType: self.add_html_content(content_type) return self.html_content_infos @@ -20,9 +21,9 @@ def add_html_content(self, content_type: HTMLContentType): val = getattr(self.response_html_info, lower_str) if val is None or val.strip() == "": return - uhci = URLHTMLContentInfo( + uhc = URLHTMLContent( url_id=self.url_id, - content_type=content_type, + content_type=content_type.value, content=val ) - self.html_content_infos.append(uhci) + self.html_content_infos.append(uhc) diff --git a/src/core/tasks/url/operators/html/queries/get.py b/src/core/tasks/url/operators/html/queries/get.py index 832d9917..a6cbe4a8 100644 --- a/src/core/tasks/url/operators/html/queries/get.py +++ b/src/core/tasks/url/operators/html/queries/get.py @@ -19,7 +19,7 @@ async def run(self, session: AsyncSession) -> list[URLInfo]: url_info = URLInfo( id=url.id, batch_id=url.batch.id if url.batch is not None else None, - url=url.url, + url=url.full_url, collector_metadata=url.collector_metadata, status=url.status, created_at=url.created_at, diff --git 
a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py index ca827c7e..e55b9843 100644 --- a/src/core/tasks/url/operators/html/queries/insert/convert.py +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -6,6 +6,7 @@ from src.db.enums import TaskType from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic @@ -33,8 +34,8 @@ def _convert_to_html_content_info_getter(tdo: UrlHtmlTDO) -> HTMLContentInfoGett url_id=tdo.url_info.id ) -def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]: - html_content_infos = [] +def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContent]: + html_content_infos: list[URLHTMLContent] = [] for tdo in tdos: if tdo.url_response_info.status != HTTPStatus.OK: continue diff --git a/src/core/tasks/url/operators/html/queries/insert/query.py b/src/core/tasks/url/operators/html/queries/insert/query.py index e0bff2e6..86f04e72 100644 --- a/src/core/tasks/url/operators/html/queries/insert/query.py +++ b/src/core/tasks/url/operators/html/queries/insert/query.py @@ -3,6 +3,10 @@ from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \ convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from 
src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -14,17 +18,20 @@ def __init__(self, tdos: list[UrlHtmlTDO], task_id: int): self.task_id = task_id async def run(self, session: AsyncSession) -> None: - compressed_html_models = convert_to_compressed_html(self.tdos) - url_html_content_list = convert_to_html_content_info_list(self.tdos) - scrape_info_list = convert_to_scrape_infos(self.tdos) + compressed_html_models: list[URLCompressedHTMLPydantic] = convert_to_compressed_html(self.tdos) + url_html_content_list: list[URLHTMLContent] = convert_to_html_content_info_list(self.tdos) + scrape_info_list: list[URLScrapeInfoInsertModel] = convert_to_scrape_infos(self.tdos) url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id) for models in [ compressed_html_models, - url_html_content_list, scrape_info_list, url_errors ]: await sh.bulk_insert(session, models=models) + await sh.add_all(session=session, models=url_html_content_list) + + + diff --git a/src/core/tasks/url/operators/probe/convert.py b/src/core/tasks/url/operators/probe/convert.py index dcb211f0..e568de91 100644 --- a/src/core/tasks/url/operators/probe/convert.py +++ b/src/core/tasks/url/operators/probe/convert.py @@ -1,5 +1,6 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMetadataPydantic]: @@ -16,3 +17,19 @@ def convert_tdo_to_web_metadata_list(tdos: list[URLProbeTDO]) -> list[URLWebMeta results.append(web_metadata_object) return results +def convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos: list[URLProbeTDO] 
+) -> list[URLWebMetadataPydantic]: + results: list[URLWebMetadataPydantic] = [] + for tdo in tdos: + response: URLProbeRedirectResponsePair = tdo.response.response + dest = response.destination + web_metadata_object = URLWebMetadataPydantic( + url_id=tdo.url_mapping.url_id, + accessed=dest.status_code != 404, + status_code=dest.status_code, + content_type=dest.content_type, + error_message=dest.error + ) + results.append(web_metadata_object) + return results diff --git a/src/core/tasks/url/operators/probe/core.py b/src/core/tasks/url/operators/probe/core.py index 1c961155..4f38c1d9 100644 --- a/src/core/tasks/url/operators/probe/core.py +++ b/src/core/tasks/url/operators/probe/core.py @@ -1,18 +1,25 @@ from typing import final + from typing_extensions import override from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list -from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos +from src.core.tasks.url.operators.probe.convert import convert_tdo_to_web_metadata_list, \ + convert_tdos_with_functional_equivalents_to_web_metadata_list +from src.core.tasks.url.operators.probe.filter import filter_non_redirect_tdos, filter_redirect_tdos, \ + filter_functionally_equivalent_urls +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets +from src.core.tasks.url.operators.probe.models.upsert_functional_equivalents import URLFunctionalEquivalentsUpsertModel from src.core.tasks.url.operators.probe.queries.insert_redirects.query import InsertRedirectsQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.exists import HasURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.queries.urls.not_probed.get.query import GetURLsWithoutProbeQueryBuilder from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.models.impl.url.web_metadata.insert import 
URLWebMetadataPydantic -from src.external.url_request.core import URLRequestInterface from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.enums import TaskType +from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall +from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.external.url_request.core import URLRequestInterface + @final class URLProbeTaskOperator(URLTaskOperatorBase): @@ -36,7 +43,7 @@ async def meets_task_prerequisites(self) -> bool: return await self.has_urls_without_probe() async def get_urls_without_probe(self) -> list[URLProbeTDO]: - url_mappings: list[URLMapping] = await self.adb_client.run_query_builder( + url_mappings: list[FullURLMapping] = await self.adb_client.run_query_builder( GetURLsWithoutProbeQueryBuilder() ) return [URLProbeTDO(url_mapping=url_mapping) for url_mapping in url_mappings] @@ -57,26 +64,76 @@ async def probe_urls(self, tdos: list[URLProbeTDO]) -> None: URLProbeTDO.response """ url_to_tdo: dict[str, URLProbeTDO] = { - tdo.url_mapping.url: tdo for tdo in tdos + tdo.url_mapping.full_url.id_form: tdo for tdo in tdos } responses = await self.url_request_interface.probe_urls( - urls=[tdo.url_mapping.url for tdo in tdos] + urls=[tdo.url_mapping.full_url for tdo in tdos] ) # Re-associate the responses with the URL mappings for response in responses: - tdo = url_to_tdo[response.original_url] + tdo = url_to_tdo[response.original_url.id_form] tdo.response = response async def update_database(self, tdos: list[URLProbeTDO]) -> None: - non_redirect_tdos = filter_non_redirect_tdos(tdos) + none_tdos: list[URLProbeTDO] = [ + tdo for tdo in tdos if tdo.response is None + ] + await self.upload_none_errors(none_tdos) + + non_error_tdos = [ + tdo for tdo in tdos if tdo.response is not None + ] + + non_redirect_tdos = filter_non_redirect_tdos(non_error_tdos) 
web_metadata_objects: list[URLWebMetadataPydantic] = convert_tdo_to_web_metadata_list(non_redirect_tdos) await self.adb_client.bulk_upsert(web_metadata_objects) - redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(tdos) + redirect_tdos: list[URLProbeTDO] = filter_redirect_tdos(non_error_tdos) + + # Filter redirects into true redirects and functional equivalents + redirect_subsets: RedirectTDOSubsets = filter_functionally_equivalent_urls(redirect_tdos) + + await self._insert_true_redirects(redirect_subsets.true_redirects) - query_builder = InsertRedirectsQueryBuilder(tdos=redirect_tdos) - await self.adb_client.run_query_builder(query_builder) + await self._update_functional_equivalents(redirect_subsets.functional_equivalents) + + async def upload_none_errors( + self, + tdos: list[URLProbeTDO] + ) -> None: + error_url_ids: list[int] = [tdo.url_mapping.url_id for tdo in tdos] + task_errors = [ + URLTaskErrorSmall( + url_id=url_id, + error="TDO response is None" + ) + for url_id in error_url_ids + ] + await self.add_task_errors(task_errors) + + + async def _insert_true_redirects(self, tdos: list[URLProbeTDO]) -> None: + await self.adb_client.run_query_builder( + InsertRedirectsQueryBuilder(tdos=tdos) + ) + async def _update_functional_equivalents(self, tdos: list[URLProbeTDO]) -> None: + # For non-true redirects, treat the redirected URL as the true URL and update database + url_updates = [ + URLFunctionalEquivalentsUpsertModel( + id=tdo.url_mapping.url_id, + url=tdo.response.response.destination.url.without_scheme.rstrip('/'), + trailing_slash=tdo.response.response.destination.url.without_scheme.endswith('/') + ) + for tdo in tdos + ] + await self.adb_client.bulk_update(url_updates) + # For these URLs, also update web metadata + func_equiv_web_metadata_objects: list[URLWebMetadataPydantic] = \ + convert_tdos_with_functional_equivalents_to_web_metadata_list( + tdos + ) + await self.adb_client.bulk_upsert(func_equiv_web_metadata_objects) async def 
has_urls_without_probe(self) -> bool: return await self.adb_client.run_query_builder( diff --git a/src/core/tasks/url/operators/probe/filter.py b/src/core/tasks/url/operators/probe/filter.py index 4a129676..2f9313e8 100644 --- a/src/core/tasks/url/operators/probe/filter.py +++ b/src/core/tasks/url/operators/probe/filter.py @@ -1,8 +1,30 @@ +from src.core.tasks.url.operators.probe.models.subsets import RedirectTDOSubsets from src.core.tasks.url.operators.probe.tdo import URLProbeTDO +from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair +from src.util.models.full_url import FullURL def filter_non_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: return [tdo for tdo in tdos if not tdo.response.is_redirect] def filter_redirect_tdos(tdos: list[URLProbeTDO]) -> list[URLProbeTDO]: - return [tdo for tdo in tdos if tdo.response.is_redirect] \ No newline at end of file + return [tdo for tdo in tdos if tdo.response.is_redirect] + +def filter_functionally_equivalent_urls(tdos: list[URLProbeTDO]) -> RedirectTDOSubsets: + true_redirects: list[URLProbeTDO] = [] + functional_equivalents: list[URLProbeTDO] = [] + for tdo in tdos: + og_url: FullURL = tdo.url_mapping.full_url + response: URLProbeRedirectResponsePair = tdo.response.response + redirect_url: FullURL = response.destination.url + + if og_url.id_form != redirect_url.id_form: + true_redirects.append(tdo) + # Otherwise, they are functional equivalents. 
+ else: + functional_equivalents.append(tdo) + + return RedirectTDOSubsets( + true_redirects=true_redirects, + functional_equivalents=functional_equivalents + ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/models/__init__.py b/src/core/tasks/url/operators/probe/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/probe/models/subsets.py b/src/core/tasks/url/operators/probe/models/subsets.py new file mode 100644 index 00000000..8cad6434 --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/subsets.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +from src.core.tasks.url.operators.probe.tdo import URLProbeTDO + + +class RedirectTDOSubsets(BaseModel): + true_redirects: list[URLProbeTDO] + functional_equivalents: list[URLProbeTDO] diff --git a/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py new file mode 100644 index 00000000..434f43af --- /dev/null +++ b/src/core/tasks/url/operators/probe/models/upsert_functional_equivalents.py @@ -0,0 +1,22 @@ +from pydantic import BaseModel + +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.templates_.base import Base +from src.db.templates.markers.bulk.upsert import BulkUpsertableModel + + +class URLFunctionalEquivalentsUpsertModel(BulkUpsertableModel): + + @classmethod + def id_field(cls) -> str: + return "id" + + @classmethod + def sa_model(cls) -> type[Base]: + """Defines the SQLAlchemy model.""" + return URL + + id: int + url: str + trailing_slash: bool + diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py index eb0597ba..80d58110 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/convert.py @@ -1,10 
+1,11 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.util.models.full_url import FullURL +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme def convert_url_response_mapping_to_web_metadata_list( @@ -23,23 +24,15 @@ def convert_url_response_mapping_to_web_metadata_list( results.append(web_metadata_object) return results - -def convert_to_url_mappings(url_exists_results: list[UrlExistsResult]) -> list[URLMapping]: - return [ - URLMapping( - url=url_exists_result.url, - url_id=url_exists_result.url_id - ) for url_exists_result in url_exists_results - ] - - -def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: - results = [] +def convert_to_url_insert_models(urls: list[FullURL]) -> list[URLInsertModel]: + results: list[URLInsertModel] = [] for url in urls: results.append( URLInsertModel( - url=url, - source=URLSource.REDIRECT + url=url.without_scheme.rstrip('/'), + scheme=url.scheme, + source=URLSource.REDIRECT, + trailing_slash=url.without_scheme.endswith('/') ) ) return results diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py index 3de66e85..1f6d83e5 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/extract.py @@ -1,5 +1,4 @@ from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from 
src.db.dtos.url.mapping import URLMapping from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py deleted file mode 100644 index 1f36893d..00000000 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/filter.py +++ /dev/null @@ -1,14 +0,0 @@ -from src.db.dtos.url.mapping import URLMapping - - -def filter_new_dest_urls( - url_mappings_in_db: list[URLMapping], - all_dest_urls: list[str] -) -> list[str]: - extant_destination_urls: set[str] = set([url_mapping.url for url_mapping in url_mappings_in_db]) - new_dest_urls: list[str] = [ - url - for url in all_dest_urls - if url not in extant_destination_urls - ] - return new_dest_urls \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py index 53f2b2e1..3f83e941 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/map.py @@ -1,15 +1,16 @@ from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL def map_url_mappings_to_probe_responses( - url_mappings: list[URLMapping], - url_to_probe_responses: dict[str, URLProbeResponse] + url_mappings: list[FullURLMapping], + url_to_probe_responses: dict[FullURL, URLProbeResponse] ) -> list[URLResponseMapping]: results = [] for url_mapping in url_mappings: - response = url_to_probe_responses[url_mapping.url] + response = url_to_probe_responses[url_mapping.full_url] results.append( URLResponseMapping( 
url_mapping=url_mapping, diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py new file mode 100644 index 00000000..c5b26c24 --- /dev/null +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/subset.py @@ -0,0 +1,9 @@ +from pydantic import BaseModel + +from src.util.models.full_url import FullURL + + +class DestinationURLSubsets(BaseModel): + new_urls: list[FullURL] + exist_with_alterations: list[FullURL] + exist_as_is: list[FullURL] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py index efbd5db8..fd90ab65 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/models/url_response_map.py @@ -1,9 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.probe.models.response import URLProbeResponse class URLResponseMapping(BaseModel): - url_mapping: URLMapping + url_mapping: FullURLMapping response: URLProbeResponse \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py index 0ba70c47..79dd7d9a 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/query.py @@ -1,14 +1,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.probe.queries.insert_redirects.extract import extract_response_pairs -from 
src.core.tasks.url.operators.probe.queries.insert_redirects.filter import filter_new_dest_urls from src.core.tasks.url.operators.probe.queries.insert_redirects.request_manager import InsertRedirectsRequestManager +from src.db.queries.urls_exist.model import URLExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import FullURLMapper class InsertRedirectsQueryBuilder(QueryBuilderBase): @@ -19,7 +20,7 @@ def __init__( super().__init__() self.tdos = tdos self.source_url_mappings = [tdo.url_mapping for tdo in self.tdos] - self._mapper = URLMapper(self.source_url_mappings) + self._mapper = FullURLMapper(self.source_url_mappings) self._response_pairs: list[URLProbeRedirectResponsePair] = extract_response_pairs(self.tdos) @@ -27,12 +28,12 @@ def __init__( pair.destination for pair in self._response_pairs ] - self._destination_urls: list[str] = [ + self._destination_urls: list[FullURL] = [ response.url for response in self._destination_probe_responses ] - self._destination_url_to_probe_response_mapping: dict[str, URLProbeResponse] = { + self._destination_url_to_probe_response_mapping: dict[FullURL, URLProbeResponse] = { response.url: response for response in self._destination_probe_responses } @@ -50,29 +51,39 @@ async def run(self, session: AsyncSession) -> None: session=session ) - # Get all destination URLs already in the database - dest_url_mappings_in_db: list[URLMapping] = await rm.get_url_mappings_in_db( + url_exist_results: list[URLExistsResult] = await rm.check_if_urls_exist_in_db( 
urls=self._destination_urls ) - # Filter out to only have those URLs that are new in the database - new_dest_urls: list[str] = filter_new_dest_urls( - url_mappings_in_db=dest_url_mappings_in_db, - all_dest_urls=self._destination_urls - ) + # Two Options: + # - URLs that do not exist in any form in the database + # - URLs that exist as-is or in slightly modified version (url scheme or trailing slash differs) + new_urls: list[FullURL] = [] + extant_url_mappings: list[FullURLMapping] = [] + for result in url_exist_results: + if not result.exists: + new_urls.append(result.query_url) + else: + extant_url_mappings.append( + FullURLMapping( + full_url=result.query_url, + url_id=result.url_id + ) + ) # Add the new URLs - new_dest_url_mappings: list[URLMapping] = await rm.insert_new_urls( - urls=new_dest_urls + new_dest_url_mappings: list[FullURLMapping] = await rm.insert_new_urls( + urls=new_urls ) - all_dest_url_mappings: list[URLMapping] = dest_url_mappings_in_db + new_dest_url_mappings - self._mapper.add_mappings(all_dest_url_mappings) + all_url_mappings: list[FullURLMapping] = extant_url_mappings + new_dest_url_mappings + + self._mapper.add_mappings(all_url_mappings) # Add web metadata for new URLs await rm.add_web_metadata( - all_dest_url_mappings=all_dest_url_mappings, + all_dest_url_mappings=all_url_mappings, dest_url_to_probe_response_mappings=self._destination_url_to_probe_response_mapping, tdos=self.tdos ) diff --git a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py index 35dfded5..64e6299a 100644 --- a/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py +++ b/src/core/tasks/url/operators/probe/queries/insert_redirects/request_manager.py @@ -3,23 +3,23 @@ from sqlalchemy import select, tuple_, RowMapping from sqlalchemy.ext.asyncio import AsyncSession -from src.core.tasks.url.operators.probe.queries.insert_redirects.convert 
import convert_to_url_mappings, \ - convert_to_url_insert_models, convert_tdo_to_url_response_mappings, \ +from src.core.tasks.url.operators.probe.queries.insert_redirects.convert import convert_to_url_insert_models, \ + convert_tdo_to_url_response_mappings, \ convert_url_response_mapping_to_web_metadata_list from src.core.tasks.url.operators.probe.queries.insert_redirects.map import map_url_mappings_to_probe_responses from src.core.tasks.url.operators.probe.queries.insert_redirects.models.url_response_map import URLResponseMapping -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult -from src.core.tasks.url.operators.probe.queries.urls.exist.query import URLsExistInDBQueryBuilder +from src.db.queries.urls_exist.model import URLExistsResult from src.core.tasks.url.operators.probe.tdo import URLProbeTDO -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.db.helpers.session import session_helper as sh from src.db.models.impl.link.url_redirect_url.pydantic import LinkURLRedirectURLPydantic from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL -from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.insert import URLWebMetadataPydantic +from src.db.queries.urls_exist.query import URLsExistInDBQueryBuilder from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse -from src.util.url_mapper import URLMapper +from src.util.models.full_url import FullURL +from src.util.url_mapper_.full import FullURLMapper class InsertRedirectsRequestManager: @@ -27,24 +27,23 @@ class InsertRedirectsRequestManager: def __init__(self, session: AsyncSession): self.session = session - async def get_url_mappings_in_db( + async def check_if_urls_exist_in_db( self, - urls: list[str], - ): - results: list[UrlExistsResult] = 
await URLsExistInDBQueryBuilder( - urls=urls + urls: list[FullURL], + ) -> list[URLExistsResult]: + results: list[URLExistsResult] = await URLsExistInDBQueryBuilder( + full_urls=urls ).run(self.session) - extant_urls = [result for result in results if result.exists] - return convert_to_url_mappings(extant_urls) + return results - async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def insert_new_urls(self, urls: list[FullURL]) -> list[FullURLMapping]: if len(urls) == 0: return [] deduplicated_urls = list(set(urls)) insert_models = convert_to_url_insert_models(deduplicated_urls) url_ids = await sh.bulk_insert(self.session, models=insert_models, return_ids=True) url_mappings = [ - URLMapping(url=url, url_id=url_id) + FullURLMapping(full_url=url, url_id=url_id) for url, url_id in zip(deduplicated_urls, url_ids) ] @@ -52,8 +51,8 @@ async def insert_new_urls(self, urls: list[str]) -> list[URLMapping]: async def add_web_metadata( self, - all_dest_url_mappings: list[URLMapping], - dest_url_to_probe_response_mappings: dict[str, URLProbeResponse], + all_dest_url_mappings: list[FullURLMapping], + dest_url_to_probe_response_mappings: dict[FullURL, URLProbeResponse], tdos: list[URLProbeTDO], ) -> None: dest_url_response_mappings = map_url_mappings_to_probe_responses( @@ -72,7 +71,7 @@ async def add_web_metadata( async def add_redirect_links( self, response_pairs: list[URLProbeRedirectResponsePair], - mapper: URLMapper + mapper: FullURLMapper ) -> None: # Get all existing links and exclude link_tuples: list[tuple[int, int]] = [] diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py b/src/core/tasks/url/operators/probe/queries/urls/exist/model.py deleted file mode 100644 index 1245044c..00000000 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/model.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import BaseModel - - -class UrlExistsResult(BaseModel): - url: str - url_id: int | None - - @property - def exists(self): - 
return self.url_id is not None \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py b/src/core/tasks/url/operators/probe/queries/urls/exist/query.py deleted file mode 100644 index 5176add9..00000000 --- a/src/core/tasks/url/operators/probe/queries/urls/exist/query.py +++ /dev/null @@ -1,29 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.probe.queries.urls.exist.model import UrlExistsResult -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh - -class URLsExistInDBQueryBuilder(QueryBuilderBase): - """Checks if URLs exist in the database.""" - - def __init__(self, urls: list[str]): - super().__init__() - self.urls = urls - - async def run(self, session: AsyncSession) -> list[UrlExistsResult]: - query = select(URL.id, URL.url).where(URL.url.in_(self.urls)) - db_mappings = await sh.mappings(session, query=query) - - url_to_id_map: dict[str, int] = { - row["url"]: row["id"] - for row in db_mappings - } - return [ - UrlExistsResult( - url=url, - url_id=url_to_id_map.get(url) - ) for url in self.urls - ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py index 5954c197..087bef65 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py @@ -26,7 +26,7 @@ async def run(self, session: AsyncSession) -> bool: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ), no_url_task_error(TaskType.PROBE_URL) diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py 
b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 36450252..e8eafd15 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -4,23 +4,23 @@ from sqlalchemy.ext.asyncio import AsyncSession from typing_extensions import override, final -from src.util.clean import clean_url -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.db.helpers.session import session_helper as sh from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata -from src.db.helpers.session import session_helper as sh from src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL @final class GetURLsWithoutProbeQueryBuilder(QueryBuilderBase): @override - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[FullURLMapping]: query = ( select( URL.id.label("url_id"), - URL.url + URL.full_url ) .outerjoin( URLWebMetadata, @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ) ) @@ -36,8 +36,8 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: ) db_mappings = await sh.mappings(session, query=query) return [ - URLMapping( + FullURLMapping( url_id=mapping["url_id"], - url=clean_url(mapping["url"]) + full_url=FullURL(mapping["full_url"]) ) for mapping in db_mappings ] \ No newline at end of file diff --git a/src/core/tasks/url/operators/probe/tdo.py b/src/core/tasks/url/operators/probe/tdo.py index 5208fd80..0fcb806c 100644 --- a/src/core/tasks/url/operators/probe/tdo.py +++ b/src/core/tasks/url/operators/probe/tdo.py @@ -1,9 +1,12 @@ from pydantic 
import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.full import FullURLMapping from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper class URLProbeTDO(BaseModel): - url_mapping: URLMapping + class Config: + arbitrary_types_allowed = True + + url_mapping: FullURLMapping response: URLProbeResponseOuterWrapper | None = None diff --git a/src/core/tasks/url/operators/record_type/core.py b/src/core/tasks/url/operators/record_type/core.py index 8e31fa8d..9f63a6a5 100644 --- a/src/core/tasks/url/operators/record_type/core.py +++ b/src/core/tasks/url/operators/record_type/core.py @@ -1,9 +1,13 @@ from src.core.enums import RecordType from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.operators.record_type.llm_api.record_classifier.openai import OpenAIRecordClassifier +from src.core.tasks.url.operators.record_type.queries.get import GetRecordTypeTaskURLsQueryBuilder +from src.core.tasks.url.operators.record_type.queries.prereq import RecordTypeTaskPrerequisiteQueryBuilder from src.core.tasks.url.operators.record_type.tdo import URLRecordTypeTDO from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.with_html import URLWithHTML from src.db.enums import TaskType +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -18,18 +22,22 @@ def __init__( self.classifier = classifier @property - def task_type(self): + def task_type(self) -> TaskType: return TaskType.RECORD_TYPE - async def meets_task_prerequisites(self): - return await self.adb_client.has_urls_with_html_data_and_without_auto_record_type_suggestion() + async def meets_task_prerequisites(self) -> bool: + return await self.run_query_builder( + RecordTypeTaskPrerequisiteQueryBuilder() + ) async def get_tdos(self) -> list[URLRecordTypeTDO]: - urls_with_html = await 
self.adb_client.get_urls_with_html_data_and_without_auto_record_type_suggestion() + urls_with_html: list[URLWithHTML] = await self.run_query_builder( + GetRecordTypeTaskURLsQueryBuilder() + ) tdos = [URLRecordTypeTDO(url_with_html=url_with_html) for url_with_html in urls_with_html] return tdos - async def inner_task_logic(self): + async def inner_task_logic(self) -> None: # Get pending urls from Source Collector # with HTML data and without Record Type Metadata tdos = await self.get_tdos() @@ -41,7 +49,10 @@ async def inner_task_logic(self): await self.put_results_into_database(success_subset) await self.update_errors_in_database(error_subset) - async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): + async def update_errors_in_database( + self, + tdos: list[URLRecordTypeTDO] + ) -> None: task_errors: list[URLTaskErrorSmall] = [] for tdo in tdos: error_info = URLTaskErrorSmall( @@ -51,20 +62,42 @@ async def update_errors_in_database(self, tdos: list[URLRecordTypeTDO]): task_errors.append(error_info) await self.add_task_errors(task_errors) - async def put_results_into_database(self, tdos: list[URLRecordTypeTDO]): - suggestions = [] + async def put_results_into_database( + self, + tdos: list[URLRecordTypeTDO] + ) -> None: + url_and_record_type_list = [] for tdo in tdos: url_id = tdo.url_with_html.url_id record_type = tdo.record_type - suggestions.append((url_id, record_type)) - await self.adb_client.add_auto_record_type_suggestions(suggestions) + url_and_record_type_list.append((url_id, record_type)) + # Add to database + suggestions: list[AutoRecordTypeSuggestion] = [] + for url_id, record_type in url_and_record_type_list: + suggestion = AutoRecordTypeSuggestion( + url_id=url_id, + record_type=record_type.value + ) + suggestions.append(suggestion) + await self.adb_client.add_all(suggestions) - async def separate_success_and_error_subsets(self, tdos: list[URLRecordTypeTDO]): + @staticmethod + async def separate_success_and_error_subsets( + tdos: 
list[URLRecordTypeTDO] + ) -> tuple[list[URLRecordTypeTDO], list[URLRecordTypeTDO]]: success_subset = [tdo for tdo in tdos if not tdo.is_errored()] error_subset = [tdo for tdo in tdos if tdo.is_errored()] return success_subset, error_subset - async def get_ml_classifications(self, tdos: list[URLRecordTypeTDO]): + async def get_ml_classifications( + self, + tdos: list[URLRecordTypeTDO] + ) -> None: + """ + Modifies: + - tdo.record_type + - tdo.error + """ for tdo in tdos: try: record_type_str = await self.classifier.classify_url(tdo.url_with_html.html_infos) diff --git a/src/core/tasks/url/operators/record_type/queries/__init__.py b/src/core/tasks/url/operators/record_type/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/core/tasks/url/operators/record_type/queries/cte.py b/src/core/tasks/url/operators/record_type/queries/cte.py new file mode 100644 index 00000000..22d3db10 --- /dev/null +++ b/src/core/tasks/url/operators/record_type/queries/cte.py @@ -0,0 +1,31 @@ +from sqlalchemy import select, CTE, Column + +from src.db.enums import TaskType +from src.db.helpers.query import not_exists_url, no_url_task_error +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion + + +class RecordTypeTaskPrerequisiteCTEContainer: + + def __init__(self): + self.cte: CTE = ( + select( + URL.id + ) + .join( + URLCompressedHTML + ) + .where( + not_exists_url(AutoRecordTypeSuggestion), + no_url_task_error( + TaskType.RECORD_TYPE + ) + ) + .cte("record_type_task_prerequisite") + ) + + @property + def url_id(self) -> Column[int]: + return self.cte.columns.id \ No newline at end of file diff --git a/src/core/tasks/url/operators/record_type/queries/get.py b/src/core/tasks/url/operators/record_type/queries/get.py new file mode 100644 index 00000000..c5b7e7e0 --- /dev/null +++ 
b/src/core/tasks/url/operators/record_type/queries/get.py @@ -0,0 +1,36 @@ +from typing import Sequence + +from sqlalchemy import select, Row +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from src.core.tasks.url.operators.record_type.queries.cte import RecordTypeTaskPrerequisiteCTEContainer +from src.db.dto_converter import DTOConverter +from src.db.dtos.url.with_html import URLWithHTML +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.queries.base.builder import QueryBuilderBase + + +class GetRecordTypeTaskURLsQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> list[URLWithHTML]: + cte = RecordTypeTaskPrerequisiteCTEContainer() + query = ( + select( + URL + ) + .join( + cte.cte, + cte.url_id == URL.id + ) + .options( + selectinload(URL.html_content) + ) + .limit(100) + .order_by(URL.id) + ) + urls: Sequence[Row[URL]] = await self.sh.scalars( + session=session, + query=query + ) + return DTOConverter.url_list_to_url_with_html_list(urls) diff --git a/src/core/tasks/url/operators/record_type/queries/prereq.py b/src/core/tasks/url/operators/record_type/queries/prereq.py new file mode 100644 index 00000000..32b70adb --- /dev/null +++ b/src/core/tasks/url/operators/record_type/queries/prereq.py @@ -0,0 +1,18 @@ +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.tasks.url.operators.record_type.queries.cte import RecordTypeTaskPrerequisiteCTEContainer +from src.db.queries.base.builder import QueryBuilderBase + + +class RecordTypeTaskPrerequisiteQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> bool: + container = RecordTypeTaskPrerequisiteCTEContainer() + query = ( + select( + container.url_id + ) + ) + return await self.sh.results_exist(session=session, query=query) + diff --git a/src/core/tasks/url/operators/root_url/convert.py b/src/core/tasks/url/operators/root_url/convert.py index 
405cbc49..1c7a3cdc 100644 --- a/src/core/tasks/url/operators/root_url/convert.py +++ b/src/core/tasks/url/operators/root_url/convert.py @@ -1,17 +1,17 @@ from src.core.tasks.url.operators.root_url.extract import extract_root_url from src.core.tasks.url.operators.root_url.models.root_mapping import URLRootURLMapping -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper def convert_to_flag_root_url_pydantic(url_ids: list[int]) -> list[FlagRootURLPydantic]: return [FlagRootURLPydantic(url_id=url_id) for url_id in url_ids] -def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLRootURLMapping]: +def convert_to_url_root_url_mapping(url_mappings: list[SimpleURLMapping]) -> list[URLRootURLMapping]: return [ URLRootURLMapping( url=mapping.url, @@ -22,18 +22,19 @@ def convert_to_url_root_url_mapping(url_mappings: list[URLMapping]) -> list[URLR def convert_to_url_insert_models(urls: list[str]) -> list[URLInsertModel]: return [ URLInsertModel( - url=url, - source=URLSource.ROOT_URL + url=url.rstrip('/'), + source=URLSource.ROOT_URL, + trailing_slash=url.endswith('/') ) for url in urls ] def convert_to_root_url_links( - root_db_mappings: list[URLMapping], - branch_db_mappings: list[URLMapping], + root_db_mappings: list[SimpleURLMapping], + branch_db_mappings: list[SimpleURLMapping], url_root_url_mappings: list[URLRootURLMapping] ) -> list[LinkURLRootURLPydantic]: - root_mapper = URLMapper(root_db_mappings) - branch_mapper = URLMapper(branch_db_mappings) + root_mapper = SimpleURLMapper(root_db_mappings) + branch_mapper = 
SimpleURLMapper(branch_db_mappings) results: list[LinkURLRootURLPydantic] = [] for url_root_url_mapping in url_root_url_mappings: diff --git a/src/core/tasks/url/operators/root_url/core.py b/src/core/tasks/url/operators/root_url/core.py index e32654da..ece5929f 100644 --- a/src/core/tasks/url/operators/root_url/core.py +++ b/src/core/tasks/url/operators/root_url/core.py @@ -11,12 +11,12 @@ from src.core.tasks.url.operators.root_url.queries.lookup.response import LookupRootsURLResponse from src.core.tasks.url.operators.root_url.queries.prereq import CheckPrereqsForRootURLTaskQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.flag.root_url.pydantic import FlagRootURLPydantic from src.db.models.impl.link.urls_root_url.pydantic import LinkURLRootURLPydantic from src.db.models.impl.url.core.pydantic.insert import URLInsertModel -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper @final @@ -37,14 +37,14 @@ def task_type(self) -> TaskType: @override async def inner_task_logic(self) -> None: - all_task_mappings: list[URLMapping] = await self._get_urls_for_root_url_task() + all_task_mappings: list[SimpleURLMapping] = await self._get_urls_for_root_url_task() await self.link_urls_to_task( url_ids=[mapping.url_id for mapping in all_task_mappings] ) # Get the Root URLs for all URLs - mapper = URLMapper(all_task_mappings) + mapper = SimpleURLMapper(all_task_mappings) # -- Identify and Derive Root URLs -- @@ -65,7 +65,7 @@ async def inner_task_logic(self) -> None: for response in derived_root_url_lookup_responses if response.url_id is None ] - new_derived_root_url_mappings: list[URLMapping] = await self._add_new_urls(derived_root_urls_not_in_db) + new_derived_root_url_mappings: list[SimpleURLMapping] = await 
self._add_new_urls(derived_root_urls_not_in_db) # Add these to the mapper mapper.add_mappings(new_derived_root_url_mappings) @@ -105,7 +105,7 @@ async def inner_task_logic(self) -> None: async def _add_root_url_links( self, - mapper: URLMapper, + mapper: SimpleURLMapper, root_url_mappings: list[URLRootURLMapping], ): # For all task URLs that are not root URLs (i.e. 'branch' URLs): @@ -115,8 +115,8 @@ async def _add_root_url_links( branch_urls: list[str] = [mapping.url for mapping in root_url_mappings] root_urls: list[str] = [mapping.root_url for mapping in root_url_mappings] - root_url_db_mappings: list[URLMapping] = await self._lookup_root_urls(root_urls) - task_url_db_mappings: list[URLMapping] = mapper.get_mappings_by_url(branch_urls) + root_url_db_mappings: list[SimpleURLMapping] = await self._lookup_root_urls(root_urls) + task_url_db_mappings: list[SimpleURLMapping] = mapper.get_mappings_by_url(branch_urls) links: list[LinkURLRootURLPydantic] = convert_to_root_url_links( root_db_mappings=root_url_db_mappings, @@ -131,7 +131,7 @@ async def _flag_root_urls( ): await self._flag_as_root_urls(url_ids) - async def _get_urls_for_root_url_task(self) -> list[URLMapping]: + async def _get_urls_for_root_url_task(self) -> list[SimpleURLMapping]: builder = GetURLsForRootURLTaskQueryBuilder() return await self.adb_client.run_query_builder(builder) @@ -139,15 +139,15 @@ async def _lookup_root_urls(self, urls: list[str]) -> list[LookupRootsURLRespons builder = LookupRootURLsQueryBuilder(urls=list(set(urls))) return await self.adb_client.run_query_builder(builder) - async def _add_new_urls(self, urls: list[str]) -> list[URLMapping]: + async def _add_new_urls(self, urls: list[str]) -> list[SimpleURLMapping]: if len(urls) == 0: return [] insert_models: list[URLInsertModel] = convert_to_url_insert_models(urls) url_ids: list[int] = await self.adb_client.bulk_insert(insert_models, return_ids=True) - mappings: list[URLMapping] = [] + mappings: list[SimpleURLMapping] = [] for url, 
url_id in zip(urls, url_ids): mappings.append( - URLMapping( + SimpleURLMapping( url=url, url_id=url_id ) diff --git a/src/core/tasks/url/operators/root_url/extract.py b/src/core/tasks/url/operators/root_url/extract.py index e384fd15..67a66c6f 100644 --- a/src/core/tasks/url/operators/root_url/extract.py +++ b/src/core/tasks/url/operators/root_url/extract.py @@ -2,6 +2,7 @@ def extract_root_url(url: str) -> str: - parsed_url: ParseResult = urlparse(url) - root_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - return root_url \ No newline at end of file + # URLs in DB should not have HTTPS -- add to enable url parse to function properly + parsed_url: ParseResult = urlparse(f"https://{url}") + root_url = parsed_url.netloc + return root_url.rstrip("/") \ No newline at end of file diff --git a/src/core/tasks/url/operators/root_url/models/root_mapping.py b/src/core/tasks/url/operators/root_url/models/root_mapping.py index 7b115f36..03f87f66 100644 --- a/src/core/tasks/url/operators/root_url/models/root_mapping.py +++ b/src/core/tasks/url/operators/root_url/models/root_mapping.py @@ -7,4 +7,5 @@ class URLRootURLMapping(BaseModel): @property def is_root_url(self) -> bool: - return self.url == self.root_url \ No newline at end of file + # Add rstrip to handle trailing slashes + return self.url.rstrip("/") == self.root_url.rstrip("/") \ No newline at end of file diff --git a/src/core/tasks/url/operators/root_url/queries/get.py b/src/core/tasks/url/operators/root_url/queries/get.py index 3643f343..e02651b3 100644 --- a/src/core/tasks/url/operators/root_url/queries/get.py +++ b/src/core/tasks/url/operators/root_url/queries/get.py @@ -2,7 +2,7 @@ from typing_extensions import override from src.core.tasks.url.operators.root_url.queries._shared.urls_without_root_id import URLS_WITHOUT_ROOT_ID_QUERY -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.helpers.session import session_helper as sh from 
src.db.queries.base.builder import QueryBuilderBase @@ -10,13 +10,13 @@ class GetURLsForRootURLTaskQueryBuilder(QueryBuilderBase): @override - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: query = ( URLS_WITHOUT_ROOT_ID_QUERY ) mappings = await sh.mappings(session, query=query) return [ - URLMapping( + SimpleURLMapping( url_id=mapping["id"], url=mapping["url"] ) for mapping in mappings diff --git a/src/core/tasks/url/operators/screenshot/core.py b/src/core/tasks/url/operators/screenshot/core.py index 96627ab8..2afea9ed 100644 --- a/src/core/tasks/url/operators/screenshot/core.py +++ b/src/core/tasks/url/operators/screenshot/core.py @@ -8,7 +8,7 @@ from src.core.tasks.url.operators.screenshot.queries.get import GetURLsForScreenshotTaskQueryBuilder from src.core.tasks.url.operators.screenshot.queries.prereq import URLsForScreenshotTaskPrerequisitesQueryBuilder from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType from src.db.models.impl.url.screenshot.pydantic import URLScreenshotPydantic from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall @@ -31,7 +31,7 @@ async def meets_task_prerequisites(self) -> bool: URLsForScreenshotTaskPrerequisitesQueryBuilder() ) - async def get_urls_without_screenshot(self) -> list[URLMapping]: + async def get_urls_without_screenshot(self) -> list[SimpleURLMapping]: return await self.adb_client.run_query_builder( GetURLsForScreenshotTaskQueryBuilder() ) @@ -47,7 +47,7 @@ async def upload_errors(self, outcomes: list[URLScreenshotOutcome]) -> None: await self.add_task_errors(insert_models) async def inner_task_logic(self) -> None: - url_mappings: list[URLMapping] = await self.get_urls_without_screenshot() + url_mappings: list[SimpleURLMapping] = await self.get_urls_without_screenshot() 
await self.link_urls_to_task( url_ids=[url_mapping.url_id for url_mapping in url_mappings] ) diff --git a/src/core/tasks/url/operators/screenshot/get.py b/src/core/tasks/url/operators/screenshot/get.py index 7c0d6a42..7598c43e 100644 --- a/src/core/tasks/url/operators/screenshot/get.py +++ b/src/core/tasks/url/operators/screenshot/get.py @@ -1,12 +1,12 @@ from src.core.tasks.url.operators.screenshot.models.outcome import URLScreenshotOutcome -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse from src.external.url_request.screenshot_.core import get_screenshots -from src.util.url_mapper import URLMapper +from src.util.url_mapper_.simple import SimpleURLMapper -async def get_url_screenshots(mappings: list[URLMapping]) -> list[URLScreenshotOutcome]: - mapper = URLMapper(mappings) +async def get_url_screenshots(mappings: list[SimpleURLMapping]) -> list[URLScreenshotOutcome]: + mapper = SimpleURLMapper(mappings) responses: list[URLScreenshotResponse] = await get_screenshots( urls=mapper.get_all_urls() ) diff --git a/src/core/tasks/url/operators/screenshot/queries/cte.py b/src/core/tasks/url/operators/screenshot/queries/cte.py index d961aabf..f1b3b1d2 100644 --- a/src/core/tasks/url/operators/screenshot/queries/cte.py +++ b/src/core/tasks/url/operators/screenshot/queries/cte.py @@ -13,7 +13,7 @@ def __init__(self): self._cte: CTE = ( select( URL.id.label("url_id"), - URL.url, + URL.full_url.label("url"), ) .join( URLWebMetadata, diff --git a/src/core/tasks/url/operators/screenshot/queries/get.py b/src/core/tasks/url/operators/screenshot/queries/get.py index e2dd94df..f3bf2839 100644 --- a/src/core/tasks/url/operators/screenshot/queries/get.py +++ b/src/core/tasks/url/operators/screenshot/queries/get.py @@ -1,18 +1,18 @@ -from typing import Any, Sequence +from typing import Sequence from sqlalchemy import select, RowMapping from 
sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.screenshot.constants import TASK_URL_LIMIT from src.core.tasks.url.operators.screenshot.queries.cte import URLScreenshotPrerequisitesCTEContainer -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh class GetURLsForScreenshotTaskQueryBuilder(QueryBuilderBase): - async def run(self, session: AsyncSession) -> list[URLMapping]: + async def run(self, session: AsyncSession) -> list[SimpleURLMapping]: cte = URLScreenshotPrerequisitesCTEContainer() query = select( @@ -22,4 +22,4 @@ async def run(self, session: AsyncSession) -> list[URLMapping]: mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - return [URLMapping(**mapping) for mapping in mappings] + return [SimpleURLMapping(**mapping) for mapping in mappings] diff --git a/src/core/tasks/url/operators/submit_approved/convert.py b/src/core/tasks/url/operators/submit_approved/convert.py deleted file mode 100644 index 1c4a8298..00000000 --- a/src/core/tasks/url/operators/submit_approved/convert.py +++ /dev/null @@ -1,19 +0,0 @@ -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall - - -async def convert_to_task_errors( - submitted_url_infos: list[SubmittedURLInfo] -) -> list[URLTaskErrorSmall]: - task_errors: list[URLTaskErrorSmall] = [] - error_response_objects = [ - response_object for response_object in submitted_url_infos - if response_object.request_error is not None - ] - for error_response_object in error_response_objects: - error_info = URLTaskErrorSmall( - url_id=error_response_object.url_id, - error=error_response_object.request_error, - ) - task_errors.append(error_info) - return task_errors diff --git 
a/src/core/tasks/url/operators/submit_approved/core.py b/src/core/tasks/url/operators/submit_approved/core.py deleted file mode 100644 index e16a1269..00000000 --- a/src/core/tasks/url/operators/submit_approved/core.py +++ /dev/null @@ -1,50 +0,0 @@ -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.submit_approved.convert import convert_to_task_errors -from src.core.tasks.url.operators.submit_approved.filter import filter_successes -from src.core.tasks.url.operators.submit_approved.queries.get import GetValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.queries.has_validated import HasValidatedURLsQueryBuilder -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.db.client.async_ import AsyncDatabaseClient -from src.db.enums import TaskType -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall -from src.external.pdap.client import PDAPClient - - -class SubmitApprovedURLTaskOperator(URLTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self): - return TaskType.SUBMIT_APPROVED - - async def meets_task_prerequisites(self): - return await self.adb_client.run_query_builder(HasValidatedURLsQueryBuilder()) - - async def inner_task_logic(self): - # Retrieve all URLs that are validated and not submitted - tdos: list[SubmitApprovedURLTDO] = await self.get_validated_urls() - - # Link URLs to this task - await self.link_urls_to_task(url_ids=[tdo.url_id for tdo in tdos]) - - # Submit each URL, recording errors if they exist - submitted_url_infos: list[SubmittedURLInfo] = await self.pdap_client.submit_data_source_urls(tdos) - - task_errors: list[URLTaskErrorSmall] = await convert_to_task_errors(submitted_url_infos) - success_infos = await 
filter_successes(submitted_url_infos) - - # Update the database for successful submissions - await self.adb_client.mark_urls_as_submitted(infos=success_infos) - - # Update the database for failed submissions - await self.add_task_errors(task_errors) - - async def get_validated_urls(self) -> list[SubmitApprovedURLTDO]: - return await self.adb_client.run_query_builder(GetValidatedURLsQueryBuilder()) diff --git a/src/core/tasks/url/operators/submit_approved/filter.py b/src/core/tasks/url/operators/submit_approved/filter.py deleted file mode 100644 index 4ba2fad8..00000000 --- a/src/core/tasks/url/operators/submit_approved/filter.py +++ /dev/null @@ -1,11 +0,0 @@ -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo - - -async def filter_successes( - submitted_url_infos: list[SubmittedURLInfo] -) -> list[SubmittedURLInfo]: - success_infos = [ - response_object for response_object in submitted_url_infos - if response_object.data_source_id is not None - ] - return success_infos diff --git a/src/core/tasks/url/operators/submit_approved/queries/cte.py b/src/core/tasks/url/operators/submit_approved/queries/cte.py deleted file mode 100644 index cf7ccb71..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/cte.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlalchemy import CTE, select, exists -from sqlalchemy.orm import aliased - -from src.collectors.enums import URLStatus -from src.db.enums import TaskType -from src.db.helpers.query import not_exists_url, no_url_task_error -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource - -VALIDATED_URLS_WITHOUT_DS_SQ =( - select(URL) - .join( - FlagURLValidated, - FlagURLValidated.url_id == URL.id - ) - .where( - URL.status == URLStatus.OK, - URL.name.isnot(None), - 
FlagURLValidated.type == URLType.DATA_SOURCE, - not_exists_url(URLDataSource), - no_url_task_error(TaskType.SUBMIT_APPROVED) - ) - .subquery() -) - -VALIDATED_URLS_WITHOUT_DS_ALIAS = aliased( - URL, - VALIDATED_URLS_WITHOUT_DS_SQ -) \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/get.py b/src/core/tasks/url/operators/submit_approved/queries/get.py deleted file mode 100644 index d4138f9a..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/get.py +++ /dev/null @@ -1,68 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload - -from src.core.tasks.url.operators.submit_approved.queries.cte import VALIDATED_URLS_WITHOUT_DS_ALIAS -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class GetValidatedURLsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> list[SubmitApprovedURLTDO]: - query = await self._build_query() - urls = await sh.scalars(session, query) - return await self._process_results(urls) - - async def _process_results(self, urls): - results: list[SubmitApprovedURLTDO] = [] - for url in urls: - try: - tdo = await self._process_result(url) - except Exception as e: - raise ValueError(f"Failed to process url {url.id}") from e - results.append(tdo) - return results - - @staticmethod - async def _build_query(): - query = ( - select(VALIDATED_URLS_WITHOUT_DS_ALIAS) - .options( - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.optional_data_source_metadata), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.confirmed_agencies), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.reviewing_user), - selectinload(VALIDATED_URLS_WITHOUT_DS_ALIAS.record_type), - ).limit(100) - ) - return query - - @staticmethod - 
async def _process_result(url: URL) -> SubmitApprovedURLTDO: - agency_ids = [] - for agency in url.confirmed_agencies: - agency_ids.append(agency.agency_id) - optional_metadata = url.optional_data_source_metadata - if optional_metadata is None: - record_formats = None - data_portal_type = None - supplying_entity = None - else: - record_formats = optional_metadata.record_formats - data_portal_type = optional_metadata.data_portal_type - supplying_entity = optional_metadata.supplying_entity - tdo = SubmitApprovedURLTDO( - url_id=url.id, - url=url.url, - name=url.name, - agency_ids=agency_ids, - description=url.description, - record_type=url.record_type.record_type, - record_formats=record_formats, - data_portal_type=data_portal_type, - supplying_entity=supplying_entity, - approving_user_id=url.reviewing_user.user_id - ) - return tdo \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py b/src/core/tasks/url/operators/submit_approved/queries/has_validated.py deleted file mode 100644 index 2cbee486..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/has_validated.py +++ /dev/null @@ -1,18 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.submit_approved.queries.cte import VALIDATED_URLS_WITHOUT_DS_ALIAS -from src.db.helpers.session import session_helper as sh -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.queries.base.builder import QueryBuilderBase - - -class HasValidatedURLsQueryBuilder(QueryBuilderBase): - - async def run(self, session: AsyncSession) -> bool: - query = ( - select(VALIDATED_URLS_WITHOUT_DS_ALIAS) - .limit(1) - ) - url: URL | None = await sh.one_or_none(session, query=query) - return url is not None \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py b/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py 
deleted file mode 100644 index 4ebfef56..00000000 --- a/src/core/tasks/url/operators/submit_approved/queries/mark_submitted.py +++ /dev/null @@ -1,29 +0,0 @@ -from sqlalchemy import update -from sqlalchemy.ext.asyncio import AsyncSession - -from src.collectors.enums import URLStatus -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.db.queries.base.builder import QueryBuilderBase - - -class MarkURLsAsSubmittedQueryBuilder(QueryBuilderBase): - - def __init__(self, infos: list[SubmittedURLInfo]): - super().__init__() - self.infos = infos - - async def run(self, session: AsyncSession): - for info in self.infos: - url_id = info.url_id - data_source_id = info.data_source_id - - url_data_source_object = URLDataSource( - url_id=url_id, - data_source_id=data_source_id - ) - if info.submitted_at is not None: - url_data_source_object.created_at = info.submitted_at - session.add(url_data_source_object) - diff --git a/src/core/tasks/url/operators/submit_approved/tdo.py b/src/core/tasks/url/operators/submit_approved/tdo.py deleted file mode 100644 index 89d89d9e..00000000 --- a/src/core/tasks/url/operators/submit_approved/tdo.py +++ /dev/null @@ -1,26 +0,0 @@ -from datetime import datetime - -from pydantic import BaseModel - -from src.core.enums import RecordType - - -class SubmitApprovedURLTDO(BaseModel): - url_id: int - url: str - record_type: RecordType - agency_ids: list[int] - name: str - description: str | None = None - approving_user_id: int - record_formats: list[str] | None = None - data_portal_type: str | None = None - supplying_entity: str | None = None - data_source_id: int | None = None - request_error: str | None = None - -class SubmittedURLInfo(BaseModel): - url_id: int - data_source_id: int | None - request_error: str | None - submitted_at: datetime | None = None \ No newline at end of file diff 
--git a/src/core/tasks/url/operators/submit_meta_urls/core.py b/src/core/tasks/url/operators/submit_meta_urls/core.py deleted file mode 100644 index e06901da..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/core.py +++ /dev/null @@ -1,78 +0,0 @@ -from src.core.tasks.url.operators.base import URLTaskOperatorBase -from src.core.tasks.url.operators.submit_meta_urls.queries.get import GetMetaURLsForSubmissionQueryBuilder -from src.core.tasks.url.operators.submit_meta_urls.queries.prereq import \ - MeetsMetaURLSSubmissionPrerequisitesQueryBuilder -from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.enums import TaskType -from src.db.models.impl.url.ds_meta_url.pydantic import URLDSMetaURLPydantic -from src.db.models.impl.url.task_error.pydantic_.small import URLTaskErrorSmall -from src.external.pdap.client import PDAPClient -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest -from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse -from src.util.url_mapper import URLMapper - - -class SubmitMetaURLsTaskOperator(URLTaskOperatorBase): - - def __init__( - self, - adb_client: AsyncDatabaseClient, - pdap_client: PDAPClient - ): - super().__init__(adb_client) - self.pdap_client = pdap_client - - @property - def task_type(self) -> TaskType: - return TaskType.SUBMIT_META_URLS - - async def meets_task_prerequisites(self) -> bool: - return await self.adb_client.run_query_builder( - MeetsMetaURLSSubmissionPrerequisitesQueryBuilder() - ) - - async def inner_task_logic(self) -> None: - requests: list[SubmitMetaURLsRequest] = await self.adb_client.run_query_builder( - GetMetaURLsForSubmissionQueryBuilder() - ) - - url_mappings: list[URLMapping] = [ - URLMapping( - url=request.url, - url_id=request.url_id, - ) - for request in requests - ] - - mapper = URLMapper(url_mappings) - - await 
self.link_urls_to_task(mapper.get_all_ids()) - - responses: list[SubmitMetaURLsResponse] = \ - await self.pdap_client.submit_meta_urls(requests) - - errors: list[URLTaskErrorSmall] = [] - inserts: list[URLDSMetaURLPydantic] = [] - - for response in responses: - url_id: int = mapper.get_id(response.url) - if response.status == SubmitMetaURLsStatus.SUCCESS: - inserts.append( - URLDSMetaURLPydantic( - url_id=url_id, - agency_id=response.agency_id, - ds_meta_url_id=response.meta_url_id - ) - ) - else: - errors.append( - URLTaskErrorSmall( - url_id=url_id, - error=response.error, - ) - ) - - await self.add_task_errors(errors) - await self.adb_client.bulk_insert(inserts) diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py b/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py deleted file mode 100644 index d350258c..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/cte.py +++ /dev/null @@ -1,61 +0,0 @@ -from sqlalchemy import select, exists, Column, CTE - -from src.db.enums import TaskType -from src.db.helpers.query import no_url_task_error -from src.db.models.impl.agency.sqlalchemy import Agency -from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL -from src.db.models.views.meta_url import MetaURL - - -class SubmitMetaURLsPrerequisitesCTEContainer: - - def __init__(self): - - self._cte = ( - select( - URL.id.label("url_id"), - URL.url, - LinkURLAgency.agency_id, - ) - # Validated as Meta URL - .join( - MetaURL, - MetaURL.url_id == URL.id - ) - .join( - LinkURLAgency, - LinkURLAgency.url_id == URL.id - ) - # Does not have a submission - .where( - ~exists( - select( - URLDSMetaURL.ds_meta_url_id - ) - .where( - URLDSMetaURL.url_id == URL.id, - URLDSMetaURL.agency_id == LinkURLAgency.agency_id - ) - ), - no_url_task_error(TaskType.SUBMIT_META_URLS) - ) - 
.cte("submit_meta_urls_prerequisites") - ) - - @property - def cte(self) -> CTE: - return self._cte - - @property - def url_id(self) -> Column[int]: - return self._cte.c.url_id - - @property - def agency_id(self) -> Column[int]: - return self._cte.c.agency_id - - @property - def url(self) -> Column[str]: - return self._cte.c.url \ No newline at end of file diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/get.py b/src/core/tasks/url/operators/submit_meta_urls/queries/get.py deleted file mode 100644 index 518393f6..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/get.py +++ /dev/null @@ -1,34 +0,0 @@ -from typing import Any, Sequence - -from sqlalchemy import select, RowMapping -from sqlalchemy.ext.asyncio import AsyncSession - -from src.core.tasks.url.operators.submit_meta_urls.queries.cte import SubmitMetaURLsPrerequisitesCTEContainer -from src.db.queries.base.builder import QueryBuilderBase -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest - -from src.db.helpers.session import session_helper as sh - -class GetMetaURLsForSubmissionQueryBuilder(QueryBuilderBase): - - - async def run(self, session: AsyncSession) -> list[SubmitMetaURLsRequest]: - cte = SubmitMetaURLsPrerequisitesCTEContainer() - query = ( - select( - cte.url_id, - cte.agency_id, - cte.url - ) - ) - - mappings: Sequence[RowMapping] = await sh.mappings(session, query=query) - - return [ - SubmitMetaURLsRequest( - url_id=mapping["url_id"], - agency_id=mapping["agency_id"], - url=mapping["url"], - ) - for mapping in mappings - ] diff --git a/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py b/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py deleted file mode 100644 index 3b5538be..00000000 --- a/src/core/tasks/url/operators/submit_meta_urls/queries/prereq.py +++ /dev/null @@ -1,20 +0,0 @@ -from sqlalchemy import select -from sqlalchemy.ext.asyncio import AsyncSession - -from 
src.core.tasks.url.operators.submit_meta_urls.queries.cte import SubmitMetaURLsPrerequisitesCTEContainer -from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh - - -class MeetsMetaURLSSubmissionPrerequisitesQueryBuilder(QueryBuilderBase): - - - async def run(self, session: AsyncSession) -> bool: - cte = SubmitMetaURLsPrerequisitesCTEContainer() - query = ( - select( - cte.url_id, - ) - ) - - return await sh.has_results(session, query=query) \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py new file mode 100644 index 00000000..d09029a4 --- /dev/null +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/constants.py @@ -0,0 +1,3 @@ + + +ANONYMOUS_VOTE_RATIO = 0.5 \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py index e9df9db4..36fe0a87 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/agency.py @@ -1,24 +1,66 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id.label("entity"), + func.count().label("votes") + ) + 
.group_by( + UserURLAgencySuggestion.url_id, + UserURLAgencySuggestion.agency_id + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationAgency.url_id, + AnonymousAnnotationAgency.agency_id.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationAgency.url_id, + AnonymousAnnotationAgency.agency_id + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_agency_union") +) + AGENCY_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( - select( - UserUrlAgencySuggestion.url_id, - UserUrlAgencySuggestion.agency_id.label("entity"), - func.count().label("votes") - ) - .join( - UnvalidatedURL, - UserUrlAgencySuggestion.url_id == UnvalidatedURL.url_id - ) - .group_by( - UserUrlAgencySuggestion.url_id, - UserUrlAgencySuggestion.agency_id - ) - .cte("counts_agency") + select( + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") + ) + .join( + UnvalidatedURL, + _union_counts.c.url_id == UnvalidatedURL.url_id + ) + .group_by( + _union_counts.c.url_id, + _union_counts.c.entity, ) + .cte("counts_agency") + ) ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py index 2ef385cc..4e180e18 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/location.py @@ -1,24 +1,67 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer +from 
src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserLocationSuggestion.url_id, + UserLocationSuggestion.location_id.label("entity"), + func.count().label("votes") + ) + .group_by( + UserLocationSuggestion.url_id, + UserLocationSuggestion.location_id + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationLocation.url_id, + AnonymousAnnotationLocation.location_id.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationLocation.url_id, + AnonymousAnnotationLocation.location_id + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_location_union") +) + LOCATION_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( - select( - UserLocationSuggestion.url_id, - UserLocationSuggestion.location_id.label("entity"), - func.count().label("votes") - ) - .join( - UnvalidatedURL, - UserLocationSuggestion.url_id == UnvalidatedURL.url_id - ) - .group_by( - UserLocationSuggestion.url_id, - UserLocationSuggestion.location_id - ) - .cte("counts_location") + select( + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") + ) + .join( + UnvalidatedURL, + _union_counts.c.url_id == UnvalidatedURL.url_id + ) + .group_by( + _union_counts.c.url_id, + _union_counts.c.entity, ) + .cte("counts_location") + ) ) \ No newline at end of file diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py 
b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py index 6300ec92..65b1f9b0 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/record_type.py @@ -1,23 +1,66 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserRecordTypeSuggestion.url_id, + UserRecordTypeSuggestion.record_type.label("entity"), + func.count().label("votes") + ) + .group_by( + UserRecordTypeSuggestion.url_id, + UserRecordTypeSuggestion.record_type + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationRecordType.url_id, + AnonymousAnnotationRecordType.record_type.label("entity"), + (func.count() * ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationRecordType.url_id, + AnonymousAnnotationRecordType.record_type + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_record_type_union") +) + + RECORD_TYPE_COUNTS_CTE = ValidatedCountsCTEContainer( ( select( - UserRecordTypeSuggestion.url_id, - UserRecordTypeSuggestion.record_type.label("entity"), - func.count().label("votes") + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") ) .join( UnvalidatedURL, - UserRecordTypeSuggestion.url_id == UnvalidatedURL.url_id + _union_counts.c.url_id == 
UnvalidatedURL.url_id ) .group_by( - UserRecordTypeSuggestion.url_id, - UserRecordTypeSuggestion.record_type + _union_counts.c.url_id, + _union_counts.c.entity, ) .cte("counts_record_type") ) diff --git a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py index 0e3de946..72638f19 100644 --- a/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py +++ b/src/core/tasks/url/operators/validate/queries/ctes/counts/impl/url_type.py @@ -1,24 +1,65 @@ from sqlalchemy import select, func +from src.core.tasks.url.operators.validate.queries.ctes.counts.constants import ANONYMOUS_VOTE_RATIO from src.core.tasks.url.operators.validate.queries.ctes.counts.core import ValidatedCountsCTEContainer -from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.views.unvalidated_url import UnvalidatedURL +_user_counts = ( + select( + UserURLTypeSuggestion.url_id, + UserURLTypeSuggestion.type.label("entity"), + func.count().label("votes") + ) + .group_by( + UserURLTypeSuggestion.url_id, + UserURLTypeSuggestion.type + ) +) + +_anon_counts = ( + select( + AnonymousAnnotationURLType.url_id, + AnonymousAnnotationURLType.url_type.label("entity"), + (func.count() / ANONYMOUS_VOTE_RATIO).label("votes") + ) + .group_by( + AnonymousAnnotationURLType.url_id, + AnonymousAnnotationURLType.url_type + ) +) + +_union_counts = ( + select( + _user_counts.c.url_id, + _user_counts.c.entity, + _user_counts.c.votes + ) + .union_all( + select( + _anon_counts.c.url_id, + _anon_counts.c.entity, + _anon_counts.c.votes + ) + ) + .cte("counts_url_type_union") +) + 
URL_TYPES_VALIDATION_COUNTS_CTE = ValidatedCountsCTEContainer( ( select( - UserURLTypeSuggestion.url_id, - UserURLTypeSuggestion.type.label("entity"), - func.count().label("votes") + _union_counts.c.url_id, + _union_counts.c.entity, + func.sum(_union_counts.c.votes).label("votes") ) .join( UnvalidatedURL, - UserURLTypeSuggestion.url_id == UnvalidatedURL.url_id + _union_counts.c.url_id == UnvalidatedURL.url_id ) .group_by( - UserURLTypeSuggestion.url_id, - UserURLTypeSuggestion.type + _union_counts.c.url_id, + _union_counts.c.entity, ) .cte("counts_url_type") ) diff --git a/src/core/tasks/url/operators/validate/queries/insert.py b/src/core/tasks/url/operators/validate/queries/insert.py index 31bdfa74..00dc36ac 100644 --- a/src/core/tasks/url/operators/validate/queries/insert.py +++ b/src/core/tasks/url/operators/validate/queries/insert.py @@ -4,14 +4,14 @@ from sqlalchemy.ext.asyncio import AsyncSession from src.core.tasks.url.operators.validate.queries.get.models.response import GetURLsForAutoValidationResponse +from src.db.helpers.session import session_helper as sh from src.db.models.impl.flag.auto_validated.pydantic import FlagURLAutoValidatedPydantic from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic from src.db.models.impl.link.url_agency.pydantic import LinkURLAgencyPydantic -from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic from src.db.queries.base.builder import QueryBuilderBase -from src.db.helpers.session import session_helper as sh + class InsertURLAutoValidationsQueryBuilder(QueryBuilderBase): diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 93c36544..125c594e 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -1,10 +1,10 @@ -from datetime import datetime, timedelta +from datetime import datetime from functools import wraps -from 
typing import Optional, Type, Any, List, Sequence +from typing import Optional, Any, List +from uuid import UUID, uuid4 -from sqlalchemy import select, exists, func, Select, and_, update, delete, Row, text -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker -from sqlalchemy.orm import selectinload +from sqlalchemy import select, func, Select, and_, update, Row, text +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker, AsyncEngine from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder @@ -46,19 +46,16 @@ from src.core.tasks.url.operators.html.queries.get import \ GetPendingURLsWithoutHTMLDataQueryBuilder from src.core.tasks.url.operators.misc_metadata.tdo import URLMiscellaneousMetadataTDO -from src.core.tasks.url.operators.submit_approved.queries.mark_submitted import MarkURLsAsSubmittedQueryBuilder -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo from src.db.client.helpers import add_standard_limit_and_offset from src.db.client.types import UserSuggestionModel from src.db.config_manager import ConfigManager from src.db.constants import PLACEHOLDER_AGENCY_NAME -from src.db.dto_converter import DTOConverter from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.dtos.url.insert import InsertURLsInfo from src.db.dtos.url.raw_html import RawHTMLInfo from src.db.enums import TaskType from src.db.helpers.session import session_helper as sh -from src.db.models.impl.agency.enums import AgencyType +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.backlog_snapshot import BacklogSnapshot from src.db.models.impl.batch.pydantic.info import BatchInfo @@ -74,25 +71,24 @@ from src.db.models.impl.task.core import Task 
from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.task.error import TaskError -from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous import AnonymousSession from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.models.templates_.base import Base from 
src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.db.queries.base.builder import QueryBuilderBase -from src.db.queries.implementations.core.get.html_content_info import GetHTMLContentInfoQueryBuilder from src.db.queries.implementations.core.get.recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder from src.db.queries.implementations.core.metrics.urls.aggregated.pending import \ GetMetricsURLSAggregatedPendingQueryBuilder @@ -102,18 +98,20 @@ from src.db.templates.markers.bulk.insert import BulkInsertableModel from src.db.templates.markers.bulk.upsert import BulkUpsertableModel from src.db.utils.compression import decompress_html, compress_html +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme class AsyncDatabaseClient: - def __init__(self, db_url: str | None = None): - if db_url is None: + def __init__(self, engine: AsyncEngine | None = None): + if engine is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) - self.db_url = db_url - echo = ConfigManager.get_sqlalchemy_echo() - self.engine = create_async_engine( - url=db_url, - echo=echo, - ) + echo = ConfigManager.get_sqlalchemy_echo() + engine = create_async_engine( + url=db_url, + echo=echo, + ) + self.engine = engine self.session_maker = async_sessionmaker(bind=self.engine, expire_on_commit=False) self.statement_composer = StatementComposer() @@ -142,8 +140,8 @@ async def wrapper(self, *args, **kwargs): return wrapper @session_manager - async def execute(self, session: AsyncSession, statement): - await session.execute(statement) + async def execute(self, session: AsyncSession, statement) -> Any: + return await session.execute(statement) @session_manager async def add( @@ -167,14 +165,9 @@ async def add_all( async def bulk_update( self, session: AsyncSession, - model: Base, - mappings: list[dict], + models: list[Base], ): - # Note, mapping must include primary key - await 
session.execute( - update(model), - mappings - ) + await sh.bulk_update(session=session, models=models) @session_manager async def bulk_upsert( @@ -293,18 +286,6 @@ async def add_user_relevant_suggestion( # region record_type - @session_manager - async def add_auto_record_type_suggestions( - self, - session: AsyncSession, - url_and_record_type_list: list[tuple[int, RecordType]] - ): - for url_id, record_type in url_and_record_type_list: - suggestion = AutoRecordTypeSuggestion( - url_id=url_id, - record_type=record_type.value - ) - session.add(suggestion) async def add_auto_record_type_suggestion( self, @@ -345,10 +326,6 @@ async def add_user_record_type_suggestion( # endregion record_type - @session_manager - async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - await self._add_models(session, URLHTMLContent, html_content_infos) - @session_manager async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: statement = self.statement_composer.has_non_errored_urls_without_html_data() @@ -378,67 +355,23 @@ async def add_miscellaneous_metadata(self, session: AsyncSession, tdos: list[URL for tdo in tdos: metadata_object = URLOptionalDataSourceMetadata( url_id=tdo.url_id, - record_formats=tdo.record_formats, + record_formats=tdo.record_formats or [], data_portal_type=tdo.data_portal_type, - supplying_entity=tdo.supplying_entity + supplying_entity=tdo.supplying_entity, + access_types=[], ) session.add(metadata_object) async def get_non_errored_urls_without_html_data(self) -> list[URLInfo]: return await self.run_query_builder(GetPendingURLsWithoutHTMLDataQueryBuilder()) - async def get_urls_with_html_data_and_without_models( - self, - session: AsyncSession, - model: Type[Base] - ): - statement = (select(URL) - .options(selectinload(URL.html_content)) - .where(URL.status == URLStatus.OK.value)) - statement = self.statement_composer.exclude_urls_with_extant_model( - statement=statement, - 
model=model - ) - statement = statement.limit(100).order_by(URL.id) - raw_result = await session.execute(statement) - urls: Sequence[Row[URL]] = raw_result.unique().scalars().all() - final_results = DTOConverter.url_list_to_url_with_html_list(urls) - - return final_results - @session_manager - async def get_urls_with_html_data_and_without_auto_record_type_suggestion( - self, - session: AsyncSession - ): - return await self.get_urls_with_html_data_and_without_models( - session=session, - model=AutoRecordTypeSuggestion - ) - - async def has_urls_with_html_data_and_without_models( + async def one_or_none_model( self, session: AsyncSession, - model: Type[Base] - ) -> bool: - statement = (select(URL) - .join(URLCompressedHTML) - .where(URL.status == URLStatus.OK.value)) - # Exclude URLs with auto suggested record types - statement = self.statement_composer.exclude_urls_with_extant_model( - statement=statement, - model=model - ) - statement = statement.limit(1) - scalar_result = await session.scalars(statement) - return bool(scalar_result.first()) - - @session_manager - async def has_urls_with_html_data_and_without_auto_record_type_suggestion(self, session: AsyncSession) -> bool: - return await self.has_urls_with_html_data_and_without_models( - session=session, - model=AutoRecordTypeSuggestion - ) + model: Base + ) -> Row | None: + return await sh.one_or_none(session=session, query=select(model)) @session_manager async def get_all( @@ -450,6 +383,12 @@ async def get_all( """Get all records of a model. 
Used primarily in testing.""" return await sh.get_all(session=session, model=model, order_by_attribute=order_by_attribute) + + @session_manager + async def has_no_rows(self, session: AsyncSession, model: Base) -> bool: + results: list[Base] = await sh.get_all(session=session, model=model) + return len(results) == 0 + async def get_urls( self, page: int, @@ -502,9 +441,6 @@ async def get_task_info( ) -> TaskInfo: return await self.run_query_builder(GetTaskInfoQueryBuilder(task_id)) - async def get_html_content_info(self, url_id: int) -> list[URLHTMLContentInfo]: - return await self.run_query_builder(GetHTMLContentInfoQueryBuilder(url_id)) - @session_manager async def link_urls_to_task( self, @@ -584,11 +520,14 @@ async def upsert_new_agencies( Add or update agencies in the database """ for suggestion in suggestions: - query = select(Agency).where(Agency.agency_id == suggestion.pdap_agency_id) + query = select(Agency).where(Agency.id == suggestion.pdap_agency_id) result = await session.execute(query) agency = result.scalars().one_or_none() if agency is None: - agency = Agency(agency_id=suggestion.pdap_agency_id) + agency = Agency( + id=suggestion.pdap_agency_id, + jurisdiction_type=JurisdictionType.LOCAL + ) agency.name = suggestion.agency_name agency.agency_type = AgencyType.UNKNOWN session.add(agency) @@ -620,17 +559,18 @@ async def add_agency_manual_suggestion( # Check if agency exists in database -- if not, add with placeholder if agency_id is not None: - statement = select(Agency).where(Agency.agency_id == agency_id) + statement = select(Agency).where(Agency.id == agency_id) result = await session.execute(statement) if len(result.all()) == 0: agency = Agency( - agency_id=agency_id, + id=agency_id, name=PLACEHOLDER_AGENCY_NAME, agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) await session.merge(agency) - url_agency_suggestion = UserUrlAgencySuggestion( + url_agency_suggestion = UserURLAgencySuggestion( url_id=url_id, 
agency_id=agency_id, user_id=user_id, @@ -638,12 +578,6 @@ async def add_agency_manual_suggestion( ) session.add(url_agency_suggestion) - @session_manager - async def get_urls_with_confirmed_agencies(self, session: AsyncSession) -> list[URL]: - statement = select(URL).where(exists().where(LinkURLAgency.url_id == URL.id)) - results = await session.execute(statement) - return list(results.scalars().all()) - async def approve_url( self, approval_info: FinalReviewApprovalInfo, @@ -756,9 +690,6 @@ async def update_batch_post_collection( batch.status = batch_status.value batch.compute_time = compute_time - async def mark_urls_as_submitted(self, infos: list[SubmittedURLInfo]): - await self.run_query_builder(MarkURLsAsSubmittedQueryBuilder(infos)) - async def get_duplicates_by_batch_id(self, batch_id: int, page: int) -> list[DuplicateInfo]: return await self.run_query_builder( GetDuplicatesByBatchIDQueryBuilder( @@ -793,15 +724,6 @@ async def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputIn logs = raw_results.scalars().all() return ([LogOutputInfo(**log.__dict__) for log in logs]) - async def delete_old_logs(self): - """ - Delete logs older than a day - """ - statement = delete(Log).where( - Log.created_at < datetime.now() - timedelta(days=7) - ) - await self.execute(statement) - async def get_next_url_for_all_annotations( self, user_id: int, @@ -828,9 +750,10 @@ async def upload_manual_batch( @session_manager async def search_for_url(self, session: AsyncSession, url: str) -> SearchURLResponse: - query = select(URL).where(URL.url == url) + url_and_scheme: URLAndScheme = get_url_and_scheme(url) + query = select(URL).where(URL.url == url_and_scheme.url) raw_results = await session.execute(query) - url = raw_results.scalars().one_or_none() + url: URL | None = raw_results.scalars().one_or_none() if url is None: return SearchURLResponse( found=False, @@ -863,11 +786,11 @@ async def get_urls_breakdown_submitted_metrics( ) -> 
GetMetricsURLsBreakdownSubmittedResponseDTO: # Build the query - month = func.date_trunc('month', URLDataSource.created_at) + month = func.date_trunc('month', DSAppLinkDataSource.created_at) query = ( select( month.label('month'), - func.count(URLDataSource.id).label('count_submitted'), + func.count(DSAppLinkDataSource.id).label('count_submitted'), ) .group_by(month) .order_by(month.asc()) @@ -933,12 +856,6 @@ async def mark_all_as_404(self, url_ids: List[int]): query = update(URLWebMetadata).where(URLWebMetadata.url_id.in_(url_ids)).values(status_code=404) await self.execute(query) - @session_manager - async def mark_as_checked_for_duplicates(self, session: AsyncSession, url_ids: list[int]): - for url_id in url_ids: - url_checked_for_duplicate = URLCheckedForDuplicate(url_id=url_id) - session.add(url_checked_for_duplicate) - async def get_urls_aggregated_pending_metrics(self): return await self.run_query_builder(GetMetricsURLSAggregatedPendingQueryBuilder()) @@ -1000,4 +917,7 @@ async def refresh_materialized_views(self): ) await self.execute( text("REFRESH MATERIALIZED VIEW batch_url_status_mat_view") + ) + await self.execute( + text("REFRESH MATERIALIZED VIEW mat_view__html_duplicate_url") ) \ No newline at end of file diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 006d6f0e..e29909cf 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -1,41 +1,44 @@ from functools import wraps from typing import List -from sqlalchemy import create_engine, update, Select +from sqlalchemy import create_engine, Select, Engine from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import sessionmaker, scoped_session, Session -from src.collectors.enums import URLStatus +from src.core.enums import BatchStatus +from src.core.env_var_manager import EnvVarManager from src.db.config_manager import ConfigManager +from src.db.dtos.url.insert import InsertURLsInfo +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from 
src.db.models.impl.batch.pydantic.info import BatchInfo +from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo -from src.db.dtos.url.insert import InsertURLsInfo -from src.db.models.impl.log.pydantic.info import LogInfo -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.core.pydantic.info import URLInfo -from src.db.models.templates_.base import Base from src.db.models.impl.duplicate.sqlalchemy import Duplicate +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.log.pydantic.info import LogInfo from src.db.models.impl.log.sqlalchemy import Log -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.core.pydantic.info import URLInfo from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.batch.sqlalchemy import Batch -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo -from src.core.env_var_manager import EnvVarManager -from src.core.enums import BatchStatus +from src.db.models.templates_.base import Base +from src.util.models.url_and_scheme import URLAndScheme +from src.util.url import get_url_and_scheme # Database Client class DatabaseClient: - def __init__(self, db_url: str | None = None): + def __init__( + self, + engine: Engine | None = None + ): """Initialize the DatabaseClient.""" - if db_url is None: + if engine is None: db_url = EnvVarManager.get().get_postgres_connection_string(is_async=True) + engine = create_engine( + url=db_url, + echo=ConfigManager.get_sqlalchemy_echo(), + ) - self.engine = create_engine( - url=db_url, - echo=ConfigManager.get_sqlalchemy_echo(), - ) + self.engine = engine self.session_maker = scoped_session(sessionmaker(bind=self.engine)) self.session = None @@ -116,11 +119,14 @@ def get_url_info_by_url( @session_manager def 
insert_url(self, session, url_info: URLInfo) -> int: """Insert a new URL into the database.""" + url_and_scheme: URLAndScheme = get_url_and_scheme(url_info.url) url_entry = URL( - url=url_info.url, + url=url_and_scheme.url, + scheme=url_and_scheme.scheme, collector_metadata=url_info.collector_metadata, status=url_info.status, name=url_info.name, + trailing_slash=url_and_scheme.url.endswith('/'), source=url_info.source ) if url_info.created_at is not None: @@ -137,13 +143,13 @@ def insert_url(self, session, url_info: URLInfo) -> int: return url_entry.id def insert_urls(self, url_infos: List[URLInfo], batch_id: int) -> InsertURLsInfo: - url_mappings = [] + url_mappings: list[SimpleURLMapping] = [] duplicates = [] for url_info in url_infos: url_info.batch_id = batch_id try: url_id = self.insert_url(url_info) - url_mappings.append(URLMapping(url_id=url_id, url=url_info.url)) + url_mappings.append(SimpleURLMapping(url_id=url_id, url=url_info.url)) except IntegrityError as e: orig_url_info = self.get_url_info_by_url(url_info.url) duplicate_info = DuplicateInsertInfo( @@ -211,25 +217,6 @@ def update_url( url = session.query(URL).filter_by(id=url_info.id).first() url.collector_metadata = url_info.collector_metadata - @session_manager - def mark_urls_as_submitted( - self, - session: Session, - infos: list[SubmittedURLInfo] - ): - for info in infos: - url_id = info.url_id - data_source_id = info.data_source_id - - url_data_source_object = URLDataSource( - url_id=url_id, - data_source_id=data_source_id - ) - if info.submitted_at is not None: - url_data_source_object.created_at = info.submitted_at - session.add(url_data_source_object) - - if __name__ == "__main__": client = DatabaseClient() print("Database client initialized.") diff --git a/src/db/client/types.py b/src/db/client/types.py index ffce5621..e4f70301 100644 --- a/src/db/client/types.py +++ b/src/db/client/types.py @@ -1,5 +1,5 @@ -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion 
+from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion -UserSuggestionModel = UserURLTypeSuggestion or UserRecordTypeSuggestion or UserUrlAgencySuggestion +UserSuggestionModel = UserURLTypeSuggestion or UserRecordTypeSuggestion or UserURLAgencySuggestion diff --git a/src/db/constants.py b/src/db/constants.py index a3574a96..c8821e7e 100644 --- a/src/db/constants.py +++ b/src/db/constants.py @@ -1,6 +1,6 @@ -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME" @@ -9,5 +9,5 @@ USER_ANNOTATION_MODELS = [ UserURLTypeSuggestion, UserRecordTypeSuggestion, - UserUrlAgencySuggestion + UserURLAgencySuggestion ] \ No newline at end of file diff --git a/src/db/dto_converter.py b/src/db/dto_converter.py index f0c9b097..4c91a353 100644 --- a/src/db/dto_converter.py +++ b/src/db/dto_converter.py @@ -13,11 +13,11 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from 
src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion class DTOConverter: diff --git a/src/db/dtos/url/insert.py b/src/db/dtos/url/insert.py index f3143668..672cbb9f 100644 --- a/src/db/dtos/url/insert.py +++ b/src/db/dtos/url/insert.py @@ -1,10 +1,10 @@ from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class InsertURLsInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] url_ids: list[int] total_count: int = 0 original_count: int = 0 diff --git a/src/db/dtos/url/mapping_/__init__.py b/src/db/dtos/url/mapping_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/dtos/url/mapping_/full.py b/src/db/dtos/url/mapping_/full.py new file mode 100644 index 00000000..c60f367c --- /dev/null +++ b/src/db/dtos/url/mapping_/full.py @@ -0,0 +1,14 @@ +from pydantic import BaseModel, ConfigDict + +from src.util.models.full_url import FullURL + + +class FullURLMapping(BaseModel): + """Mapping between full URL and url_id""" + model_config = ConfigDict( + arbitrary_types_allowed=True, + frozen=True # <- makes it immutable & hashable + ) + + full_url: FullURL + url_id: int \ No newline at end of file diff --git a/src/db/dtos/url/mapping.py b/src/db/dtos/url/mapping_/simple.py similarity index 84% rename from src/db/dtos/url/mapping.py rename to src/db/dtos/url/mapping_/simple.py index d48a4649..ff2e4f6b 100644 --- a/src/db/dtos/url/mapping.py +++ b/src/db/dtos/url/mapping_/simple.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, ConfigDict -class 
URLMapping(BaseModel): +class SimpleURLMapping(BaseModel): """Mapping between url and url_id.""" model_config = ConfigDict(frozen=True) # <- makes it immutable & hashable diff --git a/src/db/enums.py b/src/db/enums.py index b232c188..65f446c5 100644 --- a/src/db/enums.py +++ b/src/db/enums.py @@ -55,8 +55,6 @@ class TaskType(PyEnum): # Scheduled Tasks PUSH_TO_HUGGINGFACE = "Push to Hugging Face" - SYNC_AGENCIES = "Sync Agencies" - SYNC_DATA_SOURCES = "Sync Data Sources" POPULATE_BACKLOG_SNAPSHOT = "Populate Backlog Snapshot" DELETE_OLD_LOGS = "Delete Old Logs" DELETE_STALE_SCREENSHOTS = "Delete Stale Screenshots" @@ -64,6 +62,19 @@ RUN_URL_TASKS = "Run URL Task Cycles" TASK_CLEANUP = "Task Cleanup" REFRESH_MATERIALIZED_VIEWS = "Refresh Materialized Views" + UPDATE_URL_STATUS = "Update URL Status" + INTEGRITY_MONITOR = "Integrity Monitor" + + # Sync Tasks + SYNC_AGENCIES_ADD = "Sync Agencies Add" + SYNC_AGENCIES_UPDATE = "Sync Agencies Update" + SYNC_AGENCIES_DELETE = "Sync Agencies Delete" + SYNC_DATA_SOURCES_ADD = "Sync Data Sources Add" + SYNC_DATA_SOURCES_UPDATE = "Sync Data Sources Update" + SYNC_DATA_SOURCES_DELETE = "Sync Data Sources Delete" + SYNC_META_URLS_ADD = "Sync Meta URLs Add" + SYNC_META_URLS_UPDATE = "Sync Meta URLs Update" + SYNC_META_URLS_DELETE = "Sync Meta URLs Delete" class ChangeLogOperationType(PyEnum): INSERT = "INSERT" diff --git a/src/db/helpers/session/session_helper.py b/src/db/helpers/session/session_helper.py index 43369ff3..f451f30c 100644 --- a/src/db/helpers/session/session_helper.py +++ b/src/db/helpers/session/session_helper.py @@ -52,6 +52,12 @@ async def bulk_upsert( session: AsyncSession, models: list[BulkUpsertableModel], ) -> None: + """Bulk upsert sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated on conflict. Do not include + attributes in the BulkUpsertableModel unless you intend to update them. 
+ + """ if len(models) == 0: return # Parse models to get sa_model and id_field @@ -205,15 +211,19 @@ async def bulk_update( session: AsyncSession, models: list[BulkUpdatableModel], ): - """Bulk update sqlalchemy models via their pydantic counterparts.""" + """Bulk update sqlalchemy models via their pydantic counterparts. + + WARNING: All non-id fields in the model will be updated. Do not include + attributes in the BulkUpdatableModel unless you intend to update them. + """ if len(models) == 0: return parser = BulkActionParser(models) sa_model = parser.sa_model - id_field = parser.id_field - update_fields = parser.get_non_id_fields() + id_field: str = parser.id_field + update_fields: list[str] = parser.get_non_id_fields() for model in models: diff --git a/src/db/models/helpers.py b/src/db/models/helpers.py index f547e8d4..e1c77978 100644 --- a/src/db/models/helpers.py +++ b/src/db/models/helpers.py @@ -11,7 +11,7 @@ def get_agency_id_foreign_column( return Column( 'agency_id', Integer(), - ForeignKey('agencies.agency_id', ondelete='CASCADE'), + ForeignKey('agencies.id', ondelete='CASCADE'), nullable=nullable ) @@ -46,9 +46,12 @@ def location_id_column() -> Column[int]: CURRENT_TIME_SERVER_DEFAULT = func.now() +VIEW_ARG = {"info": "view"} + def url_id_primary_key_constraint() -> PrimaryKeyConstraint: return PrimaryKeyConstraint('url_id') + def county_column(nullable: bool = False) -> Column[int]: return Column( Integer(), diff --git a/src/db/models/impl/agency/ds_link/__init__.py b/src/db/models/impl/agency/ds_link/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/agency/ds_link/sqlalchemy.py b/src/db/models/impl/agency/ds_link/sqlalchemy.py new file mode 100644 index 00000000..32911882 --- /dev/null +++ b/src/db/models/impl/agency/ds_link/sqlalchemy.py @@ -0,0 +1,19 @@ +from sqlalchemy import Integer, Column + +from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin, LastSyncedAtMixin +from 
src.db.models.templates_.base import Base + + +class DSAppLinkAgency( + Base, + CreatedAtMixin, + AgencyDependentMixin, + LastSyncedAtMixin +): + __tablename__ = "ds_app_link_agency" + + ds_agency_id = Column( + Integer, + primary_key=True, + nullable=False + ) \ No newline at end of file diff --git a/src/db/models/impl/agency/sqlalchemy.py b/src/db/models/impl/agency/sqlalchemy.py index 002b0255..9e99a0be 100644 --- a/src/db/models/impl/agency/sqlalchemy.py +++ b/src/db/models/impl/agency/sqlalchemy.py @@ -2,8 +2,8 @@ References an agency in the data sources database. """ -from sqlalchemy import Column, Integer, String, DateTime -from sqlalchemy.orm import relationship +from sqlalchemy import Column, Integer, String, DateTime, Sequence +from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.agency.enums import AgencyType, JurisdictionType @@ -18,18 +18,22 @@ class Agency( ): __tablename__ = "agencies" - # TODO: Rename agency_id to ds_agency_id - - agency_id = Column(Integer, primary_key=True) name = Column(String, nullable=False) - agency_type = enum_column(AgencyType, name="agency_type_enum") - jurisdiction_type = enum_column( + agency_type: Mapped[AgencyType] = enum_column(AgencyType, name="agency_type_enum") + jurisdiction_type: Mapped[JurisdictionType] = enum_column( JurisdictionType, name="jurisdiction_type_enum", - nullable=True, + nullable=False, ) # Relationships automated_suggestions = relationship("AgencyIDSubtaskSuggestion") - user_suggestions = relationship("UserUrlAgencySuggestion", back_populates="agency") + user_suggestions = relationship("UserURLAgencySuggestion", back_populates="agency") confirmed_urls = relationship("LinkURLAgency", back_populates="agency") + + locations = relationship( + "LocationExpandedView", + primaryjoin="Agency.id == LinkAgencyLocation.agency_id", + secondaryjoin="LocationExpandedView.id == LinkAgencyLocation.location_id", + secondary="link_agencies__locations", 
+ ) diff --git a/src/db/models/impl/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py index 564ce163..72c8b39b 100644 --- a/src/db/models/impl/batch/sqlalchemy.py +++ b/src/db/models/impl/batch/sqlalchemy.py @@ -1,7 +1,8 @@ from sqlalchemy import Column, Integer, TIMESTAMP, Float, JSON from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped +from src.core.enums import BatchStatus from src.db.models.helpers import CURRENT_TIME_SERVER_DEFAULT from src.db.models.impl.log.sqlalchemy import Log from src.db.models.templates_.with_id import WithIDBase @@ -23,9 +24,9 @@ class Batch(WithIDBase): 'manual', name='batch_strategy'), nullable=False) - user_id = Column(Integer, nullable=False) + user_id = Column(Integer, nullable=True) # Gives the status of the batch - status = Column( + status: Mapped[BatchStatus] = Column( batch_status_enum, nullable=False ) @@ -40,7 +41,7 @@ class Batch(WithIDBase): # Relationships urls = relationship( "URL", - secondary="link_batch_urls", + secondary="link_batches__urls", back_populates="batch", overlaps="url" ) diff --git a/src/db/models/impl/duplicate/sqlalchemy.py b/src/db/models/impl/duplicate/sqlalchemy.py index 03c492e3..2b50409d 100644 --- a/src/db/models/impl/duplicate/sqlalchemy.py +++ b/src/db/models/impl/duplicate/sqlalchemy.py @@ -1,15 +1,19 @@ -from sqlalchemy import Column, Integer, ForeignKey +from sqlalchemy import Column, Integer, ForeignKey, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class Duplicate(BatchDependentMixin, WithIDBase): +class Duplicate(BatchDependentMixin, Base): """ Identifies duplicates which occur within a batch """ __tablename__ = 'duplicates' + __table_args__ = ( + PrimaryKeyConstraint("batch_id"), + ) original_url_id = Column( Integer, diff --git 
a/src/db/models/impl/flag/ds_delete/__init__.py b/src/db/models/impl/flag/ds_delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/flag/ds_delete/agency.py b/src/db/models/impl/flag/ds_delete/agency.py new file mode 100644 index 00000000..2559376d --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/agency.py @@ -0,0 +1,20 @@ +from sqlalchemy import ForeignKey, Integer, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteAgency( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_agency" + + ds_agency_id = Column( + Integer, + ForeignKey( + "ds_app_link_agency.ds_agency_id", + ondelete="CASCADE" + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/flag/ds_delete/data_source.py b/src/db/models/impl/flag/ds_delete/data_source.py new file mode 100644 index 00000000..38d3cba8 --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/data_source.py @@ -0,0 +1,20 @@ +from sqlalchemy import ForeignKey, Integer, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteDataSource( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_data_source" + + ds_data_source_id = Column( + Integer, + ForeignKey( + "ds_app_link_data_source.ds_data_source_id", + ondelete="CASCADE" + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/flag/ds_delete/meta_url.py b/src/db/models/impl/flag/ds_delete/meta_url.py new file mode 100644 index 00000000..1fc90d06 --- /dev/null +++ b/src/db/models/impl/flag/ds_delete/meta_url.py @@ -0,0 +1,20 @@ +from sqlalchemy import Column, Integer, ForeignKey + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class FlagDSDeleteMetaURL( + Base, + CreatedAtMixin +): + __tablename__ = "flag_ds_delete_meta_url" + + ds_meta_url_id = 
Column( + Integer, + ForeignKey( + 'ds_app_link_meta_url.ds_meta_url_id', + ondelete='CASCADE' + ), + primary_key=True, + ) \ No newline at end of file diff --git a/src/db/models/impl/flag/url_validated/sqlalchemy.py b/src/db/models/impl/flag/url_validated/sqlalchemy.py index 97abf056..081441d8 100644 --- a/src/db/models/impl/flag/url_validated/sqlalchemy.py +++ b/src/db/models/impl/flag/url_validated/sqlalchemy.py @@ -1,4 +1,5 @@ from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType @@ -19,7 +20,7 @@ class FlagURLValidated( ), ) - type = enum_column( + type: Mapped[URLType] = enum_column( enum_type=URLType, name="url_type", ) diff --git a/src/db/models/impl/link/agency_batch/sqlalchemy.py b/src/db/models/impl/link/agency_batch/sqlalchemy.py index dcb670d3..32518ed4 100644 --- a/src/db/models/impl/link/agency_batch/sqlalchemy.py +++ b/src/db/models/impl/link/agency_batch/sqlalchemy.py @@ -10,7 +10,7 @@ class LinkAgencyBatch( BatchDependentMixin, AgencyDependentMixin, ): - __tablename__ = "link_agency_batches" + __tablename__ = "link_agencies__batches" __table_args__ = ( PrimaryKeyConstraint( 'batch_id', diff --git a/src/db/models/impl/link/agency_location/sqlalchemy.py b/src/db/models/impl/link/agency_location/sqlalchemy.py index 18a3ae5f..c4203d44 100644 --- a/src/db/models/impl/link/agency_location/sqlalchemy.py +++ b/src/db/models/impl/link/agency_location/sqlalchemy.py @@ -1,10 +1,15 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.mixins import AgencyDependentMixin, LocationDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkAgencyLocation( - WithIDBase, + Base, AgencyDependentMixin, LocationDependentMixin, ): - __tablename__ = "link_agencies_locations" \ No newline at end of file + __tablename__ = "link_agencies__locations" + 
__table_args__ = ( + PrimaryKeyConstraint("agency_id", "location_id"), + ) \ No newline at end of file diff --git a/src/db/models/impl/link/batch_url/sqlalchemy.py b/src/db/models/impl/link/batch_url/sqlalchemy.py index 951ac539..ac747e01 100644 --- a/src/db/models/impl/link/batch_url/sqlalchemy.py +++ b/src/db/models/impl/link/batch_url/sqlalchemy.py @@ -11,5 +11,5 @@ class LinkBatchURL( BatchDependentMixin, WithIDBase ): - __tablename__ = "link_batch_urls" + __tablename__ = "link_batches__urls" diff --git a/src/db/models/impl/link/location_batch/sqlalchemy.py b/src/db/models/impl/link/location_batch/sqlalchemy.py index e73a5ec8..e3ea5e55 100644 --- a/src/db/models/impl/link/location_batch/sqlalchemy.py +++ b/src/db/models/impl/link/location_batch/sqlalchemy.py @@ -11,7 +11,7 @@ class LinkLocationBatch( CreatedAtMixin ): - __tablename__ = "link_location_batches" + __tablename__ = "link_batches__locations" __table_args__ = ( PrimaryKeyConstraint( 'batch_id', diff --git a/src/db/models/impl/link/task_url.py b/src/db/models/impl/link/task_url.py index 2535d317..d04d8275 100644 --- a/src/db/models/impl/link/task_url.py +++ b/src/db/models/impl/link/task_url.py @@ -4,7 +4,7 @@ class LinkTaskURL(Base): - __tablename__ = 'link_task_urls' + __tablename__ = 'link_tasks__urls' __table_args__ = (UniqueConstraint( "task_id", "url_id", diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index 875fa25f..7111bc6d 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -1,19 +1,20 @@ -from sqlalchemy import UniqueConstraint +from sqlalchemy import UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class 
LinkURLAgency(URLDependentMixin, WithIDBase): - __tablename__ = "link_urls_agency" + __tablename__ = "link_agencies__urls" + __table_args__ = ( + UniqueConstraint("url_id", "agency_id"), + ) agency_id: Mapped[int] = get_agency_id_foreign_column() - url = relationship("URL", back_populates="confirmed_agencies") - agency = relationship("Agency", back_populates="confirmed_urls") + url = relationship("URL") + agency = relationship("Agency") - __table_args__ = ( - UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), - ) diff --git a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py index 312cbb57..c470e323 100644 --- a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py +++ b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py @@ -1,10 +1,21 @@ -from src.db.models.helpers import url_id_column -from src.db.models.templates_.standard import StandardBase +from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped +from src.db.models.helpers import url_id_column +from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base -class LinkURLRedirectURL(StandardBase): +class LinkURLRedirectURL( + Base, + CreatedAtMixin, + UpdatedAtMixin +): __tablename__ = "link_urls_redirect_url" - source_url_id = url_id_column() - destination_url_id = url_id_column() + __table_args__ = ( + PrimaryKeyConstraint("source_url_id", "destination_url_id"), + ) + + source_url_id: Mapped[int] = url_id_column() + destination_url_id: Mapped[int] = url_id_column() diff --git a/src/db/models/impl/link/urls_root_url/sqlalchemy.py b/src/db/models/impl/link/urls_root_url/sqlalchemy.py index a856dd31..d55a181f 100644 --- a/src/db/models/impl/link/urls_root_url/sqlalchemy.py +++ b/src/db/models/impl/link/urls_root_url/sqlalchemy.py @@ -1,14 +1,20 @@ +from sqlalchemy import PrimaryKeyConstraint +from sqlalchemy.orm import Mapped + from 
src.db.models.helpers import url_id_column from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkURLRootURL( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "link_urls_root_url" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "root_url_id"), + ) - root_url_id = url_id_column() \ No newline at end of file + root_url_id: Mapped[int] = url_id_column() \ No newline at end of file diff --git a/src/db/models/impl/sync_log/__init__.py b/src/db/models/impl/sync_log/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/sync_log/enums.py b/src/db/models/impl/sync_log/enums.py new file mode 100644 index 00000000..e1fe483a --- /dev/null +++ b/src/db/models/impl/sync_log/enums.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class ResourceType(Enum): + AGENCY = 'agency' + DATA_SOURCE = 'data_source' + META_URL = 'meta_url' + +class SyncType(Enum): + ADD = 'add' + UPDATE = 'update' + DELETE = 'delete' \ No newline at end of file diff --git a/src/db/models/impl/sync_log/sqlalchemy.py b/src/db/models/impl/sync_log/sqlalchemy.py new file mode 100644 index 00000000..b545940f --- /dev/null +++ b/src/db/models/impl/sync_log/sqlalchemy.py @@ -0,0 +1,17 @@ +from sqlalchemy import PrimaryKeyConstraint, Column, Integer, DateTime + +from src.db.models.helpers import enum_column +from src.db.models.impl.sync_log.enums import ResourceType, SyncType +from src.db.models.templates_.base import Base + + +class SyncLog(Base): + __tablename__ = 'sync_log' + __table_args__ = ( + PrimaryKeyConstraint('resource_type', 'sync_type', 'created_at'), + ) + + resource_type = enum_column(ResourceType, name='resource_type_enum') + sync_type = enum_column(SyncType, name='sync_type_enum') + count = Column(Integer, nullable=False) + created_at = Column(DateTime, nullable=False) \ No 
newline at end of file diff --git a/src/db/models/impl/task/core.py b/src/db/models/impl/task/core.py index 566dd116..2eb8fd44 100644 --- a/src/db/models/impl/task/core.py +++ b/src/db/models/impl/task/core.py @@ -32,7 +32,7 @@ class Task(UpdatedAtMixin, WithIDBase): # Relationships urls = relationship( "URL", - secondary="link_task_urls", + secondary="link_tasks__urls", back_populates="tasks" ) errors = relationship(TaskError) diff --git a/src/db/models/impl/task/error.py b/src/db/models/impl/task/error.py index 2de0c66a..cd04a2ea 100644 --- a/src/db/models/impl/task/error.py +++ b/src/db/models/impl/task/error.py @@ -1,11 +1,12 @@ -from sqlalchemy import Column, Text, UniqueConstraint +from sqlalchemy import Column, Text, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): +class TaskError(UpdatedAtMixin, TaskDependentMixin, Base): __tablename__ = 'task_errors' error = Column(Text, nullable=False) @@ -13,8 +14,8 @@ class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): # Relationships task = relationship("Task") - __table_args__ = (UniqueConstraint( + __table_args__ = (PrimaryKeyConstraint( "task_id", "error", - name="uq_task_id_error"), + ), ) diff --git a/src/db/models/impl/task/log.py b/src/db/models/impl/task/log.py new file mode 100644 index 00000000..9efd86da --- /dev/null +++ b/src/db/models/impl/task/log.py @@ -0,0 +1,17 @@ +from sqlalchemy import Column, Text, PrimaryKeyConstraint + +from src.db.models.mixins import TaskDependentMixin, UpdatedAtMixin, CreatedAtMixin +from src.db.models.templates_.base import Base + + +class TaskLog( + Base, + TaskDependentMixin, + CreatedAtMixin, +): + __tablename__ = "tasks__log" + __table_args__ = ( + PrimaryKeyConstraint("task_id"), + ) + + log = 
Column(Text, nullable=False) diff --git a/src/db/models/impl/url/checked_for_duplicate.py b/src/db/models/impl/url/checked_for_duplicate.py index bb7cf666..89192573 100644 --- a/src/db/models/impl/url/checked_for_duplicate.py +++ b/src/db/models/impl/url/checked_for_duplicate.py @@ -1,11 +1,13 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base -class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, WithIDBase): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'url_checked_for_duplicate' + __table_args__ = (PrimaryKeyConstraint("url_id"),) # Relationships url = relationship("URL", uselist=False, back_populates="checked_for_duplicate") diff --git a/src/db/models/impl/url/core/pydantic/insert.py b/src/db/models/impl/url/core/pydantic/insert.py index f04dd3df..ed73b6c1 100644 --- a/src/db/models/impl/url/core/pydantic/insert.py +++ b/src/db/models/impl/url/core/pydantic/insert.py @@ -14,7 +14,9 @@ def sa_model(cls) -> type[Base]: return URL url: str + scheme: str | None = None collector_metadata: dict | None = None name: str | None = None status: URLStatus = URLStatus.OK - source: URLSource \ No newline at end of file + source: URLSource + trailing_slash: bool \ No newline at end of file diff --git a/src/db/models/impl/url/core/pydantic/upsert.py b/src/db/models/impl/url/core/pydantic/upsert.py index 8a101c70..0ee5695a 100644 --- a/src/db/models/impl/url/core/pydantic/upsert.py +++ b/src/db/models/impl/url/core/pydantic/upsert.py @@ -15,4 +15,6 @@ def sa_model(cls) -> type[Base]: return URL id: int - name: str | None + name: str | None = None + url: str | None = None + trailing_slash: bool | None = None diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 
3582dd56..de4af177 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -1,13 +1,17 @@ -from sqlalchemy import Column, Text, String, JSON -from sqlalchemy.orm import relationship +from sqlalchemy import Column, Text, String, JSON, case, literal, Boolean +from sqlalchemy.ext.hybrid import hybrid_property +from sqlalchemy.orm import relationship, Mapped +from sqlalchemy.util import hybridproperty from src.collectors.enums import URLStatus from src.db.models.helpers import enum_column +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate from src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin @@ -19,18 +23,43 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # The batch this URL is associated with url = Column(Text, unique=True) + scheme: Mapped[str | None] = Column(String, nullable=True) name = Column(String) description = Column(Text) # The metadata from the collector collector_metadata = Column(JSON) # The outcome of the URL: submitted, human_labeling, rejected, duplicate, etc. 
- status = enum_column( + status: Mapped[URLStatus] = enum_column( URLStatus, name='url_status', nullable=False ) + trailing_slash = Column(Boolean, nullable=False) - source = enum_column( + @hybrid_property + def full_url(self) -> str: + if self.scheme is None: + return self.url + url: str = f"{self.scheme}://{self.url}" + if self.trailing_slash: + url += "/" + return url + + @full_url.expression + def full_url(cls): + return case( + ( + (cls.scheme != None) & (cls.trailing_slash == True), + (cls.scheme + literal("://") + cls.url + literal("/")) + ), + ( + (cls.scheme != None) & (cls.trailing_slash == False), + (cls.scheme + literal("://") + cls.url) + ), + else_=cls.url + ) + + source: Mapped[URLSource] = enum_column( URLSource, name='url_source', nullable=False @@ -39,7 +68,7 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): # Relationships batch = relationship( "Batch", - secondary="link_batch_urls", + secondary="link_batches__urls", back_populates="urls", uselist=False, ) @@ -48,44 +77,60 @@ class URL(UpdatedAtMixin, CreatedAtMixin, WithIDBase): uselist=False, ) duplicates = relationship("Duplicate", back_populates="original_url") - html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") + html_content = relationship("URLHTMLContent", cascade="all, delete-orphan") task_errors = relationship( URLTaskError, cascade="all, delete-orphan" ) tasks = relationship( "Task", - secondary="link_task_urls", + secondary="link_tasks__urls", back_populates="urls", ) - auto_agency_subtasks = relationship( - "URLAutoAgencyIDSubtask" + + + name_suggestions = relationship( + URLNameSuggestion + ) + # Location + user_location_suggestions = relationship( + UserLocationSuggestion + ) + user_location_suggestion_not_found = relationship( + LinkUserSuggestionLocationNotFound ) auto_location_subtasks = relationship( AutoLocationIDSubtask ) - name_suggestions = relationship( - URLNameSuggestion - ) + + # Agency user_agency_suggestions = 
relationship( - "UserUrlAgencySuggestion", back_populates="url") + "UserURLAgencySuggestion", back_populates="url") + auto_agency_subtasks = relationship( + "URLAutoAgencyIDSubtask" + ) + # Record Type auto_record_type_suggestion = relationship( "AutoRecordTypeSuggestion", uselist=False, back_populates="url") user_record_type_suggestions = relationship( "UserRecordTypeSuggestion", back_populates="url") + # Relvant/URL Type auto_relevant_suggestion = relationship( "AutoRelevantSuggestion", uselist=False, back_populates="url") user_relevant_suggestions = relationship( "UserURLTypeSuggestion", back_populates="url") + reviewing_user = relationship( "ReviewingUserURL", uselist=False, back_populates="url") optional_data_source_metadata = relationship( "URLOptionalDataSourceMetadata", uselist=False, back_populates="url") confirmed_agencies = relationship( - "LinkURLAgency", + "Agency", + secondary="link_agencies__urls" + ) data_source = relationship( - "URLDataSource", + "DSAppLinkDataSource", back_populates="url", uselist=False ) diff --git a/src/db/models/impl/url/data_source/pydantic.py b/src/db/models/impl/url/data_source/pydantic.py index 7d02c5df..49a83ac8 100644 --- a/src/db/models/impl/url/data_source/pydantic.py +++ b/src/db/models/impl/url/data_source/pydantic.py @@ -1,11 +1,11 @@ -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.templates.markers.bulk.insert import BulkInsertableModel class URLDataSourcePydantic(BulkInsertableModel): - data_source_id: int + ds_data_source_id: int url_id: int @classmethod - def sa_model(cls) -> type[URLDataSource]: - return URLDataSource \ No newline at end of file + def sa_model(cls) -> type[DSAppLinkDataSource]: + return DSAppLinkDataSource \ No newline at end of file diff --git a/src/db/models/impl/url/data_source/sqlalchemy.py b/src/db/models/impl/url/data_source/sqlalchemy.py index be7bf047..74c9bdf0 100644 --- 
a/src/db/models/impl/url/data_source/sqlalchemy.py +++ b/src/db/models/impl/url/data_source/sqlalchemy.py @@ -1,14 +1,27 @@ -from sqlalchemy import Column, Integer +from sqlalchemy import Column, Integer, ForeignKey from sqlalchemy.orm import relationship -from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LastSyncedAtMixin from src.db.models.templates_.with_id import WithIDBase -class URLDataSource(CreatedAtMixin, URLDependentMixin, WithIDBase): - __tablename__ = "url_data_source" +class DSAppLinkDataSource( + CreatedAtMixin, + URLDependentMixin, + WithIDBase, + LastSyncedAtMixin +): + __tablename__ = "ds_app_link_data_source" - data_source_id = Column(Integer, nullable=False) + url_id = Column( + Integer, + ForeignKey( + 'urls.id', + ondelete="SET NULL", + ), + nullable=True + ) + ds_data_source_id = Column(Integer, nullable=False, primary_key=True) # Relationships url = relationship( diff --git a/src/db/models/impl/url/ds_meta_url/pydantic.py b/src/db/models/impl/url/ds_meta_url/pydantic.py index 8f7674e9..60a83e3b 100644 --- a/src/db/models/impl/url/ds_meta_url/pydantic.py +++ b/src/db/models/impl/url/ds_meta_url/pydantic.py @@ -1,6 +1,6 @@ from pydantic import BaseModel -from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL class URLDSMetaURLPydantic(BaseModel): @@ -10,5 +10,5 @@ class URLDSMetaURLPydantic(BaseModel): agency_id: int @classmethod - def sa_model(cls) -> type[URLDSMetaURL]: - return URLDSMetaURL \ No newline at end of file + def sa_model(cls) -> type[DSAppLinkMetaURL]: + return DSAppLinkMetaURL \ No newline at end of file diff --git a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py index e642a694..1d74c12d 100644 --- a/src/db/models/impl/url/ds_meta_url/sqlalchemy.py +++ b/src/db/models/impl/url/ds_meta_url/sqlalchemy.py 
@@ -1,20 +1,26 @@ -from sqlalchemy import Column, Integer, PrimaryKeyConstraint, UniqueConstraint +from sqlalchemy import Column, Integer, PrimaryKeyConstraint, UniqueConstraint, ForeignKey -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AgencyDependentMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AgencyDependentMixin, LastSyncedAtMixin from src.db.models.templates_.base import Base -class URLDSMetaURL( +class DSAppLinkMetaURL( Base, - URLDependentMixin, - AgencyDependentMixin, - CreatedAtMixin + CreatedAtMixin, + LastSyncedAtMixin ): - __tablename__ = "url_ds_meta_url" + __tablename__ = "ds_app_link_meta_url" - ds_meta_url_id = Column(Integer) + url_id = Column( + Integer, + ForeignKey( + 'urls.id', + ondelete="SET NULL", + ), + nullable=True + ) + ds_meta_url_id = Column(Integer, primary_key=True) __table_args__ = ( - PrimaryKeyConstraint("url_id", "agency_id"), - UniqueConstraint("ds_meta_url_id"), + UniqueConstraint("url_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/html/compressed/sqlalchemy.py b/src/db/models/impl/url/html/compressed/sqlalchemy.py index 995c5b25..4974e5f0 100644 --- a/src/db/models/impl/url/html/compressed/sqlalchemy.py +++ b/src/db/models/impl/url/html/compressed/sqlalchemy.py @@ -1,16 +1,20 @@ -from sqlalchemy import Column, LargeBinary +from sqlalchemy import Column, LargeBinary, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - WithIDBase + Base ): __tablename__ = 'url_compressed_html' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) compressed_html: Mapped[bytes] = Column(LargeBinary, nullable=False) diff --git a/src/db/models/impl/url/html/content/sqlalchemy.py 
b/src/db/models/impl/url/html/content/sqlalchemy.py index 63e4da76..ded0957b 100644 --- a/src/db/models/impl/url/html/content/sqlalchemy.py +++ b/src/db/models/impl/url/html/content/sqlalchemy.py @@ -1,21 +1,20 @@ -from sqlalchemy import UniqueConstraint, Column, Text +from sqlalchemy import UniqueConstraint, Column, Text, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLHTMLContent( UpdatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = 'url_html_content' - __table_args__ = (UniqueConstraint( - "url_id", - "content_type", - name="uq_url_id_content_type"), + __table_args__ = ( + PrimaryKeyConstraint("url_id", "content_type"), ) content_type = Column( diff --git a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py index 122905a7..ca9d1b0a 100644 --- a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py +++ b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py @@ -1,14 +1,18 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.templates_.base import Base class URLInternetArchivesProbeMetadata( - StandardBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_probe_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) archive_url: Mapped[str] digest: Mapped[str] diff --git a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py index 791f4077..f0aff36f 100644 --- a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py +++ 
b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py @@ -1,14 +1,17 @@ -from sqlalchemy import Column, DateTime, func +from sqlalchemy import Column, DateTime, func, PrimaryKeyConstraint from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLInternetArchivesSaveMetadata( - WithIDBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_save_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) created_at = Column(DateTime, nullable=False, server_default=func.now()) last_uploaded_at = Column(DateTime, nullable=False, server_default=func.now()) diff --git a/src/db/models/impl/url/optional_data_source_metadata.py b/src/db/models/impl/url/optional_data_source_metadata.py deleted file mode 100644 index bb2a95e5..00000000 --- a/src/db/models/impl/url/optional_data_source_metadata.py +++ /dev/null @@ -1,16 +0,0 @@ -from sqlalchemy import Column, ARRAY, String -from sqlalchemy.orm import relationship - -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase - - -class URLOptionalDataSourceMetadata(URLDependentMixin, WithIDBase): - __tablename__ = 'url_optional_data_source_metadata' - - record_formats = Column(ARRAY(String), nullable=True) - data_portal_type = Column(String, nullable=True) - supplying_entity = Column(String, nullable=True) - - # Relationships - url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") diff --git a/src/db/models/impl/url/optional_ds_metadata/__init__.py b/src/db/models/impl/url/optional_ds_metadata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/optional_ds_metadata/enums.py b/src/db/models/impl/url/optional_ds_metadata/enums.py new file mode 100644 index 00000000..3b08e6f0 --- /dev/null +++ b/src/db/models/impl/url/optional_ds_metadata/enums.py @@ -0,0 +1,29 @@ 
+from enum import Enum + + +class AgencyAggregationEnum(Enum): + FEDERAL = "federal" + STATE = "state" + COUNTY = "county" + LOCALITY = "local" + +class UpdateMethodEnum(Enum): + OVERWRITE = "Overwrite" + INSERT = "Insert" + NO_UPDATES = "No updates" + +class RetentionScheduleEnum(Enum): + FUTURE_ONLY = "Future only" + ONE_MONTH = "1 month" + ONE_DAY = "1 day" + ONE_WEEK = "1 week" + ONE_TO_TEN_YEARS = "1-10 years" + LT_1_DAY = "< 1 day" + LT_1_WEEK = "< 1 week" + LT_1_YEAR = "< 1 year" + GT_10_YEARS = "> 10 years" + +class AccessTypeEnum(Enum): + WEBPAGE = "Webpage" + DOWNLOAD = "Download" + API = "API" \ No newline at end of file diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py new file mode 100644 index 00000000..04541ad6 --- /dev/null +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -0,0 +1,48 @@ +from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum, PrimaryKeyConstraint +from sqlalchemy.orm import relationship, Mapped + +from src.db.models.helpers import enum_column +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, \ + RetentionScheduleEnum, UpdateMethodEnum +from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base + + +class URLOptionalDataSourceMetadata( + URLDependentMixin, + Base, + UpdatedAtMixin +): + __tablename__ = 'url_optional_data_source_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) + + record_formats = Column(ARRAY(String), nullable=False, default=[]) + data_portal_type = Column(String, nullable=True) + supplying_entity = Column(String, nullable=True) + coverage_start = Column(Date, nullable=True) + coverage_end = Column(Date, nullable=True) + agency_supplied = Column(Boolean, nullable=True) + agency_originated = Column(Boolean, nullable=True) + agency_aggregation: Mapped[AgencyAggregationEnum] = 
enum_column(AgencyAggregationEnum, name="agency_aggregation_enum") + agency_described_not_in_database = Column(String, nullable=True) + update_method: Mapped[UpdateMethodEnum] = enum_column(UpdateMethodEnum, name="update_method_enum") + readme_url = Column(String, nullable=True) + originating_entity = Column(String, nullable=True) + retention_schedule: Mapped[RetentionScheduleEnum] = enum_column(RetentionScheduleEnum, name="retention_schedule_enum") + scraper_url = Column(String, nullable=True) + submission_notes = Column(String, nullable=True) + access_notes = Column(String, nullable=True) + access_types: Mapped[list[AccessTypeEnum]] = Column(ARRAY( + Enum( + AccessTypeEnum, + name="access_type_enum", + native_enum=True, + values_callable=lambda AccessTypeEnum: [e.value for e in AccessTypeEnum] + ) + ), nullable=False, default=[]) + data_portal_type_other = Column(String, nullable=True) + + # Relationships + url = relationship("URL", uselist=False, back_populates="optional_data_source_metadata") diff --git a/src/db/models/impl/url/record_type/sqlalchemy.py b/src/db/models/impl/url/record_type/sqlalchemy.py index 7e8f2fac..23137fae 100644 --- a/src/db/models/impl/url/record_type/sqlalchemy.py +++ b/src/db/models/impl/url/record_type/sqlalchemy.py @@ -2,13 +2,14 @@ from src.core.enums import RecordType from src.db.models.helpers import url_id_primary_key_constraint, enum_column -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin from src.db.models.templates_.base import Base class URLRecordType( Base, CreatedAtMixin, + UpdatedAtMixin, URLDependentMixin ): __tablename__ = "url_record_type" diff --git a/src/db/models/impl/url/reviewing_user.py b/src/db/models/impl/url/reviewing_user.py index 9213a157..379cfee5 100644 --- a/src/db/models/impl/url/reviewing_user.py +++ b/src/db/models/impl/url/reviewing_user.py @@ -1,16 +1,17 @@ -from sqlalchemy import UniqueConstraint, 
Column, Integer +from sqlalchemy import UniqueConstraint, Column, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, WithIDBase): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'reviewing_user_url' __table_args__ = ( - UniqueConstraint( - "url_id", - name="approving_user_url_uq_user_id_url_id"), + PrimaryKeyConstraint( + "url_id", + ), ) user_id = Column(Integer, nullable=False) diff --git a/src/db/models/impl/url/scrape_info/sqlalchemy.py b/src/db/models/impl/url/scrape_info/sqlalchemy.py index b50f2903..bd59c6ff 100644 --- a/src/db/models/impl/url/scrape_info/sqlalchemy.py +++ b/src/db/models/impl/url/scrape_info/sqlalchemy.py @@ -1,15 +1,22 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.helpers import enum_column from src.db.models.impl.url.scrape_info.enums import ScrapeStatus -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base class URLScrapeInfo( - StandardBase, + Base, + CreatedAtMixin, + UpdatedAtMixin, URLDependentMixin ): __tablename__ = 'url_scrape_info' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) status = enum_column( enum_type=ScrapeStatus, diff --git a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py index 89371498..7a297ef1 100644 --- a/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/subtask/sqlalchemy.py @@ -1,4 +1,4 @@ -from sqlalchemy.orm import relationship +from sqlalchemy.orm import relationship, Mapped 
from src.db.models.helpers import enum_column from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode @@ -16,7 +16,7 @@ class URLAutoAgencyIDSubtask( __tablename__ = "url_auto_agency_id_subtasks" - type = enum_column( + type: Mapped[AutoAgencyIDSubtaskType] = enum_column( AutoAgencyIDSubtaskType, name="agency_auto_suggestion_method" ) @@ -24,7 +24,7 @@ class URLAutoAgencyIDSubtask( sa.Boolean(), nullable=False ) - detail = enum_column( + detail: Mapped[SubtaskDetailCode] = enum_column( SubtaskDetailCode, name="agency_id_subtask_detail_code", ) diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index de6ee029..3f8b8186 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -12,6 +12,7 @@ class AgencyIDSubtaskSuggestion( ): __tablename__ = "agency_id_subtask_suggestions" + subtask_id = sa.Column( sa.Integer, sa.ForeignKey("url_auto_agency_id_subtasks.id"), diff --git a/src/db/models/impl/url/suggestion/agency/user.py b/src/db/models/impl/url/suggestion/agency/user.py index f7c43aad..79fa933c 100644 --- a/src/db/models/impl/url/suggestion/agency/user.py +++ b/src/db/models/impl/url/suggestion/agency/user.py @@ -1,21 +1,21 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, Integer +from sqlalchemy import Column, Boolean, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class UserUrlAgencySuggestion(URLDependentMixin, WithIDBase): +class UserURLAgencySuggestion(URLDependentMixin, Base): __tablename__ = "user_url_agency_suggestions" + __table_args__ = ( + 
PrimaryKeyConstraint("agency_id", "url_id", "user_id"), + ) agency_id: Mapped[int] = get_agency_id_foreign_column(nullable=True) user_id = Column(Integer, nullable=False) is_new = Column(Boolean, nullable=True) - agency = relationship("Agency", back_populates="user_suggestions") - url = relationship("URL", back_populates="user_agency_suggestions") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), - ) + agency = relationship("Agency") + url = relationship("URL") diff --git a/src/db/models/impl/url/suggestion/anonymous/__init__.py b/src/db/models/impl/url/suggestion/anonymous/__init__.py index e69de29b..fddc715f 100644 --- a/src/db/models/impl/url/suggestion/anonymous/__init__.py +++ b/src/db/models/impl/url/suggestion/anonymous/__init__.py @@ -0,0 +1 @@ +from src.db.models.impl.url.suggestion.anonymous.session.sqlalchemy import AnonymousSession \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py index afea2f23..6f750289 100644 --- a/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/agency/sqlalchemy.py @@ -1,6 +1,6 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import URLDependentMixin, AgencyDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, AgencyDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base @@ -8,9 +8,10 @@ class AnonymousAnnotationAgency( Base, URLDependentMixin, AgencyDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_agency" __table_args__ = ( - PrimaryKeyConstraint("url_id", "agency_id"), + PrimaryKeyConstraint("session_id", "url_id", "agency_id"), ) \ No newline at end of file diff --git 
a/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py index f02cb7ba..3e39810b 100644 --- a/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/location/sqlalchemy.py @@ -1,6 +1,6 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import LocationDependentMixin, URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import LocationDependentMixin, URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base @@ -8,10 +8,11 @@ class AnonymousAnnotationLocation( Base, URLDependentMixin, LocationDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_location" __table_args__ = ( - PrimaryKeyConstraint("url_id", "location_id"), + PrimaryKeyConstraint("session_id", "url_id", "location_id"), ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py index 25a9ddec..22f37839 100644 --- a/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/record_type/sqlalchemy.py @@ -3,18 +3,19 @@ from src.core.enums import RecordType from src.db.models.helpers import enum_column -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base class AnonymousAnnotationRecordType( Base, URLDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_record_type" __table_args__ = ( - PrimaryKeyConstraint("url_id", "record_type"), + PrimaryKeyConstraint("session_id", "url_id", "record_type"), ) record_type: Mapped[RecordType] = 
enum_column( diff --git a/src/db/models/impl/url/suggestion/anonymous/session/__init__.py b/src/db/models/impl/url/suggestion/anonymous/session/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py new file mode 100644 index 00000000..cbb43448 --- /dev/null +++ b/src/db/models/impl/url/suggestion/anonymous/session/sqlalchemy.py @@ -0,0 +1,17 @@ +from sqlalchemy import text, Column + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base +from sqlalchemy.dialects.postgresql import UUID + + +class AnonymousSession( + Base, + CreatedAtMixin +): + __tablename__ = "anonymous_sessions" + id = Column( + UUID(as_uuid=True), + primary_key=True, + server_default=text("gen_random_uuid()") + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py b/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py index f9033ffa..f0cbc6a7 100644 --- a/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/anonymous/url_type/sqlalchemy.py @@ -3,18 +3,19 @@ from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, AnonymousSessionMixin from src.db.models.templates_.base import Base class AnonymousAnnotationURLType( Base, URLDependentMixin, - CreatedAtMixin + CreatedAtMixin, + AnonymousSessionMixin ): __tablename__ = "anonymous_annotation_url_type" __table_args__ = ( - PrimaryKeyConstraint("url_id", "url_type"), + PrimaryKeyConstraint("session_id", "url_id", "url_type"), ) url_type: Mapped[URLType] = enum_column( diff --git a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py 
b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py index a9d4ae8b..18ac3851 100644 --- a/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/location/user/sqlalchemy.py @@ -1,5 +1,7 @@ from sqlalchemy import Integer, Column, PrimaryKeyConstraint +from sqlalchemy.orm import relationship +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound from src.db.models.mixins import CreatedAtMixin, URLDependentMixin, LocationDependentMixin from src.db.models.templates_.base import Base @@ -18,4 +20,5 @@ class UserLocationSuggestion( user_id = Column( Integer, nullable=False, - ) \ No newline at end of file + ) + diff --git a/src/db/models/impl/url/suggestion/record_type/auto.py b/src/db/models/impl/url/suggestion/record_type/auto.py index 2aaed526..1c2c68d1 100644 --- a/src/db/models/impl/url/suggestion/record_type/auto.py +++ b/src/db/models/impl/url/suggestion/record_type/auto.py @@ -1,8 +1,9 @@ -from sqlalchemy import Column, UniqueConstraint +from sqlalchemy import Column, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values @@ -11,13 +12,13 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) __table_args__ = ( - UniqueConstraint("url_id", name="auto_record_type_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/record_type/user.py 
b/src/db/models/impl/url/suggestion/record_type/user.py index 5b9dde8c..4e271225 100644 --- a/src/db/models/impl/url/suggestion/record_type/user.py +++ b/src/db/models/impl/url/suggestion/record_type/user.py @@ -1,22 +1,27 @@ -from sqlalchemy import Column, Integer, UniqueConstraint +from sqlalchemy import Column, Integer, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): +class UserRecordTypeSuggestion( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + Base, +): __tablename__ = "user_record_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) user_id = Column(Integer, nullable=False) record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_record_type_suggestions"), - ) # Relationships - url = relationship("URL", back_populates="user_record_type_suggestions") diff --git a/src/db/models/impl/url/suggestion/url_type/__init__.py b/src/db/models/impl/url/suggestion/url_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/url_type/auto/__init__.py b/src/db/models/impl/url/suggestion/url_type/auto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/url_type/auto/pydantic/__init__.py b/src/db/models/impl/url/suggestion/url_type/auto/pydantic/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py 
b/src/db/models/impl/url/suggestion/url_type/auto/pydantic/input.py similarity index 100% rename from src/db/models/impl/url/suggestion/relevant/auto/pydantic/input.py rename to src/db/models/impl/url/suggestion/url_type/auto/pydantic/input.py diff --git a/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py similarity index 74% rename from src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py rename to src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py index 49dc7457..19b5dc09 100644 --- a/src/db/models/impl/url/suggestion/relevant/auto/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py @@ -1,11 +1,17 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float +from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): +class AutoRelevantSuggestion( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + Base, +): __tablename__ = "auto_relevant_suggestions" relevant = Column(Boolean, nullable=True) @@ -14,6 +20,7 @@ class AutoRelevantSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, __table_args__ = ( UniqueConstraint("url_id", name="auto_relevant_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/relevant/user.py b/src/db/models/impl/url/suggestion/url_type/user.py similarity index 80% rename from src/db/models/impl/url/suggestion/relevant/user.py rename to src/db/models/impl/url/suggestion/url_type/user.py index c7070b5e..52bbc4eb 100644 --- 
a/src/db/models/impl/url/suggestion/relevant/user.py +++ b/src/db/models/impl/url/suggestion/url_type/user.py @@ -1,10 +1,11 @@ -from sqlalchemy import Column, UniqueConstraint, Integer +from sqlalchemy import Column, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase @@ -12,9 +13,12 @@ class UserURLTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "user_url_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) user_id = Column(Integer, nullable=False) type: Mapped[URLType | None] = enum_column( @@ -23,10 +27,6 @@ class UserURLTypeSuggestion( nullable=True ) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_relevant_suggestions"), - ) - # Relationships url = relationship("URL", back_populates="user_relevant_suggestions") diff --git a/src/db/models/impl/url/web_metadata/sqlalchemy.py b/src/db/models/impl/url/web_metadata/sqlalchemy.py index 45f5233c..3170a189 100644 --- a/src/db/models/impl/url/web_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/web_metadata/sqlalchemy.py @@ -1,17 +1,20 @@ -from sqlalchemy import Column, Text, Boolean, Integer +from sqlalchemy import Column, Text, Boolean, Integer, PrimaryKeyConstraint from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLWebMetadata( - WithIDBase, + Base, URLDependentMixin, CreatedAtMixin, UpdatedAtMixin ): """Contains information about the web page.""" __tablename__ = 
"url_web_metadata" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) accessed = Column( Boolean(), diff --git a/src/db/models/materialized_views/__init__.py b/src/db/models/materialized_views/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/materialized_views/html_duplicate_url.py b/src/db/models/materialized_views/html_duplicate_url.py new file mode 100644 index 00000000..703bbbea --- /dev/null +++ b/src/db/models/materialized_views/html_duplicate_url.py @@ -0,0 +1,9 @@ +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class HTMLDuplicateURLMaterializedView( + Base, + URLDependentViewMixin +): + __tablename__ = "mat_view__html_duplicate_url" \ No newline at end of file diff --git a/src/db/models/mixins.py b/src/db/models/mixins.py index 12a0b2a1..640ec955 100644 --- a/src/db/models/mixins.py +++ b/src/db/models/mixins.py @@ -3,7 +3,9 @@ from sqlalchemy import Column, Integer, ForeignKey, TIMESTAMP, event from src.db.models.exceptions import WriteToViewError -from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT +from src.db.models.helpers import get_created_at_column, CURRENT_TIME_SERVER_DEFAULT, url_id_primary_key_constraint, \ + VIEW_ARG +from sqlalchemy.dialects.postgresql import UUID class URLDependentMixin: @@ -58,10 +60,16 @@ class AgencyDependentMixin: nullable=False ) - class CreatedAtMixin: created_at = get_created_at_column() +class LastSyncedAtMixin: + last_synced_at = Column( + TIMESTAMP, + nullable=False, + server_default=CURRENT_TIME_SERVER_DEFAULT + ) + class UpdatedAtMixin: updated_at = Column( @@ -84,3 +92,19 @@ def __declare_last__(cls) -> None: @staticmethod def _block_write(mapper, connection, target): raise WriteToViewError(f"{type(target).__name__} is a read-only view.") + +class URLDependentViewMixin(URLDependentMixin, ViewMixin): + __table_args__ = ( + url_id_primary_key_constraint(), + VIEW_ARG + ) + +class 
AnonymousSessionMixin: + session_id = Column( + UUID(as_uuid=True), + ForeignKey( + 'anonymous_sessions.id', + ondelete="CASCADE", + ), + nullable=False + ) \ No newline at end of file diff --git a/src/db/models/views/batch_url_status/core.py b/src/db/models/views/batch_url_status/core.py index 888ca169..1ec0711d 100644 --- a/src/db/models/views/batch_url_status/core.py +++ b/src/db/models/views/batch_url_status/core.py @@ -11,7 +11,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu where lbu.batch_id = b.id ) @@ -26,7 +26,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu left join flag_url_validated fuv on fuv.url_id = lbu.url_id where lbu.batch_id = b.id @@ -36,7 +36,7 @@ select 1 from - link_batch_urls lbu + link_batches__urls lbu left join flag_url_validated fuv on fuv.url_id = lbu.url_id where lbu.batch_id = b.id diff --git a/src/db/models/views/dependent_locations.py b/src/db/models/views/dependent_locations.py index 95f3db98..425e25a6 100644 --- a/src/db/models/views/dependent_locations.py +++ b/src/db/models/views/dependent_locations.py @@ -31,6 +31,7 @@ """ from sqlalchemy import Column, Integer, ForeignKey +from src.db.models.helpers import VIEW_ARG from src.db.models.mixins import ViewMixin from src.db.models.templates_.base import Base @@ -39,7 +40,7 @@ class DependentLocationView(Base, ViewMixin): __tablename__ = "dependent_locations" __table_args__ = ( - {"info": "view"} + VIEW_ARG, ) parent_location_id = Column( diff --git a/src/db/models/views/integrity/__init__.py b/src/db/models/views/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/views/integrity/incomplete_data_sources.py b/src/db/models/views/integrity/incomplete_data_sources.py new file mode 100644 index 00000000..06efa3b4 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_data_sources.py @@ -0,0 +1,38 @@ +""" + create view integrity__incomplete_data_sources_view as + select + ds.url_id, + fuv.url_id is not 
null as has_validated_flag, + fuv.type as validated_type, + urt.url_id is not null as has_record_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_data_source ds + left join flag_url_validated fuv on fuv.url_id = ds.url_id + left join url_record_type urt on urt.url_id = ds.url_id + left join link_agencies__urls lau on lau.url_id = ds.url_id + where + fuv.url_id is null + or fuv.type != 'data source' + or urt.url_id is null + or lau.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + +class IntegrityIncompleteDataSource( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_data_sources_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_record_type = Column(Boolean) + has_agency_flag = Column(Boolean) diff --git a/src/db/models/views/integrity/incomplete_meta_urls.py b/src/db/models/views/integrity/incomplete_meta_urls.py new file mode 100644 index 00000000..a837c156 --- /dev/null +++ b/src/db/models/views/integrity/incomplete_meta_urls.py @@ -0,0 +1,36 @@ +""" + create view integrity__incomplete_meta_urls_view as + select + mu.url_id, + fuv.url_id is not null as has_validated_flag, + fuv.type as validated_type, + lau.url_id is not null as has_agency_flag + from ds_app_link_meta_url mu + left join flag_url_validated fuv on fuv.url_id = mu.url_id + left join link_agencies__urls lau on lau.url_id = mu.url_id + where + fuv.url_id is null + or fuv.type != 'meta url' + or lau.url_id is null + """ +from sqlalchemy import Column, Boolean + +from src.db.models.helpers import enum_column +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.mixins import URLDependentViewMixin +from 
src.db.models.templates_.base import Base + +class IntegrityIncompleteMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__incomplete_meta_urls_view" + + has_validated_flag = Column(Boolean) + validated_type = enum_column( + enum_type=URLType, + name="url_type", + ) + has_agency_flag = Column(Boolean) + + diff --git a/src/db/models/views/integrity/non_federal_agencies_no_location.py b/src/db/models/views/integrity/non_federal_agencies_no_location.py new file mode 100644 index 00000000..73e547b9 --- /dev/null +++ b/src/db/models/views/integrity/non_federal_agencies_no_location.py @@ -0,0 +1,27 @@ +""" + create view integrity__non_federal_agencies_no_location_view as + select + ag.id as agency_id + from agencies ag + left join link_agencies__locations link on ag.id = link.agency_id + where ag.jurisdiction_type != 'federal' + and link.location_id is null + """ +from sqlalchemy import String, Column, PrimaryKeyConstraint + +from src.db.models.helpers import VIEW_ARG +from src.db.models.mixins import ViewMixin, AgencyDependentMixin +from src.db.models.templates_.base import Base + +class IntegrityNonFederalAgenciesNoLocation( + Base, + ViewMixin, + AgencyDependentMixin, +): + __tablename__ = "integrity__non_federal_agencies_no_location_view" + __table_args__ = ( + PrimaryKeyConstraint("agency_id"), + VIEW_ARG, + ) + + name = Column(String) diff --git a/src/db/models/views/integrity/url_both_data_source_and_meta_url.py b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..0de88314 --- /dev/null +++ b/src/db/models/views/integrity/url_both_data_source_and_meta_url.py @@ -0,0 +1,21 @@ +""" + create view integrity__url_both_data_source_and_meta_url_view as + select + ds.url_id + from + ds_app_link_data_source ds + join ds_app_link_meta_url mu + on mu.url_id = ds.url_id +""" + +from src.db.models.mixins import URLDependentViewMixin +from src.db.models.templates_.base import Base + + +class 
IntegrityURLBothDataSourceAndMetaURL( + Base, + URLDependentViewMixin +): + __tablename__ = "integrity__url_both_data_source_and_meta_url_view" + + diff --git a/src/db/models/views/location_expanded.py b/src/db/models/views/location_expanded.py index 1eb973aa..cf60005b 100644 --- a/src/db/models/views/location_expanded.py +++ b/src/db/models/views/location_expanded.py @@ -45,7 +45,6 @@ class LocationExpandedView( WithIDBase, ViewMixin, - LocationDependentMixin ): __tablename__ = "locations_expanded" diff --git a/src/db/models/views/meta_url.py b/src/db/models/views/meta_url.py index 20437075..a2d64ca9 100644 --- a/src/db/models/views/meta_url.py +++ b/src/db/models/views/meta_url.py @@ -9,18 +9,13 @@ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class MetaURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): __tablename__ = "meta_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file diff --git a/src/db/models/views/unvalidated_url.py b/src/db/models/views/unvalidated_url.py index bcfa9293..baf5f071 100644 --- a/src/db/models/views/unvalidated_url.py +++ b/src/db/models/views/unvalidated_url.py @@ -11,18 +11,13 @@ """ from sqlalchemy import PrimaryKeyConstraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class UnvalidatedURL( Base, - ViewMixin, - URLDependentMixin, + URLDependentViewMixin ): - __tablename__ = "unvalidated_url_view" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) \ No newline at end of file + __tablename__ = "unvalidated_url_view" \ No newline at end of file diff --git 
a/src/db/models/views/url_anno_count.py b/src/db/models/views/url_anno_count.py index 232f0d21..2e910afb 100644 --- a/src/db/models/views/url_anno_count.py +++ b/src/db/models/views/url_anno_count.py @@ -98,21 +98,16 @@ from sqlalchemy import PrimaryKeyConstraint, Column, Integer from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationCount( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_count_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) auto_agency_count = Column(Integer, nullable=False) auto_location_count = Column(Integer, nullable=False) diff --git a/src/db/models/views/url_annotations_flags.py b/src/db/models/views/url_annotations_flags.py index 57d8e866..c133fbfc 100644 --- a/src/db/models/views/url_annotations_flags.py +++ b/src/db/models/views/url_annotations_flags.py @@ -18,26 +18,21 @@ LEFT JOIN public.user_relevant_suggestions urs ON u.id = urs.url_id LEFT JOIN public.user_url_agency_suggestions uuas ON u.id = uuas.url_id LEFT JOIN public.reviewing_user_url ruu ON u.id = ruu.url_id - LEFT JOIN public.link_urls_agency cua on u.id = cua.url_id + LEFT JOIN public.link_agencies__urls cua on u.id = cua.url_id ) """ from sqlalchemy import PrimaryKeyConstraint, Column, Boolean -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLAnnotationFlagsView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_annotation_flags" - __table_args__ = ( - PrimaryKeyConstraint("url_id"), - {"info": "view"} - ) has_auto_record_type_suggestion = Column(Boolean, nullable=False) 
has_auto_relevant_suggestion = Column(Boolean, nullable=False) diff --git a/src/db/models/views/url_status/core.py b/src/db/models/views/url_status/core.py index 77a01139..be771fe5 100644 --- a/src/db/models/views/url_status/core.py +++ b/src/db/models/views/url_status/core.py @@ -59,19 +59,14 @@ from sqlalchemy import String, Column from src.db.models.helpers import url_id_primary_key_constraint -from src.db.models.mixins import ViewMixin, URLDependentMixin +from src.db.models.mixins import ViewMixin, URLDependentMixin, URLDependentViewMixin from src.db.models.templates_.base import Base class URLStatusMatView( Base, - ViewMixin, - URLDependentMixin + URLDependentViewMixin ): __tablename__ = "url_status_mat_view" - __table_args__ = ( - url_id_primary_key_constraint(), - {"info": "view"} - ) status = Column(String) \ No newline at end of file diff --git a/src/db/models/views/url_status/enums.py b/src/db/models/views/url_status/enums.py index 82995812..a467a33d 100644 --- a/src/db/models/views/url_status/enums.py +++ b/src/db/models/views/url_status/enums.py @@ -4,6 +4,7 @@ class URLStatusViewEnum(Enum): INTAKE = "Intake" ACCEPTED = "Accepted" - SUBMITTED_PIPELINE_COMPLETE = "Submitted/Pipeline Complete" + AWAITING_SUBMISSION = "Awaiting Submission" + SUBMITTED = "Submitted" ERROR = "Error" COMMUNITY_LABELING = "Community Labeling" \ No newline at end of file diff --git a/src/db/queries/base/builder.py b/src/db/queries/base/builder.py index f0ef345c..8a1829d0 100644 --- a/src/db/queries/base/builder.py +++ b/src/db/queries/base/builder.py @@ -1,4 +1,4 @@ -from typing import Any, Generic, Optional +from typing import Any, Generic from sqlalchemy import FromClause, ColumnClause from sqlalchemy.ext.asyncio import AsyncSession @@ -12,6 +12,7 @@ class QueryBuilderBase(Generic[LabelsType]): def __init__(self, labels: LabelsType | None = None): self.query: FromClause | None = None self.labels = labels + self.sh = sh def get(self, key: str) -> ColumnClause: return 
getattr(self.query.c, key) diff --git a/src/db/queries/implementations/anonymous_session.py b/src/db/queries/implementations/anonymous_session.py new file mode 100644 index 00000000..0ff00ea3 --- /dev/null +++ b/src/db/queries/implementations/anonymous_session.py @@ -0,0 +1,16 @@ +from uuid import UUID + +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.models.impl.url.suggestion.anonymous import AnonymousSession +from src.db.queries.base.builder import QueryBuilderBase + + +class MakeAnonymousSessionQueryBuilder(QueryBuilderBase): + + async def run(self, session: AsyncSession) -> UUID: + return await self.sh.add( + session=session, + model=AnonymousSession(), + return_id=True + ) diff --git a/src/db/queries/implementations/core/common/annotation_exists_/constants.py b/src/db/queries/implementations/core/common/annotation_exists_/constants.py index 1237634e..190291ef 100644 --- a/src/db/queries/implementations/core/common/annotation_exists_/constants.py +++ b/src/db/queries/implementations/core/common/annotation_exists_/constants.py @@ -1,9 +1,9 @@ from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion ALL_ANNOTATION_MODELS = [ AutoRecordTypeSuggestion, @@ -11,5 +11,5 @@ URLAutoAgencyIDSubtask, UserURLTypeSuggestion, 
UserRecordTypeSuggestion, - UserUrlAgencySuggestion + UserURLAgencySuggestion ] diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py index 4921337f..27240b7d 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/builder.py @@ -8,7 +8,7 @@ from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.models.views.batch_url_status.core import BatchURLStatusMatView from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from src.db.queries.base.builder import QueryBuilderBase diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py index b74020c4..953a5c0d 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -1,9 +1,11 @@ from sqlalchemy import select, func from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ URLCountsCTEContainer @@ -21,7 +23,7 @@ URL.id == 
LinkBatchURL.url_id, ) .where( - URL.status == URLStatus.ERROR + exists_url(URLTaskError) ) .group_by( Batch.id diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py index 5ab305cc..3b9e0c55 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/submitted.py @@ -5,7 +5,7 @@ from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ URLCountsCTEContainer @@ -23,8 +23,8 @@ URL.id == LinkBatchURL.url_id, ) .join( - URLDataSource, - URLDataSource.url_id == URL.id, + DSAppLinkDataSource, + DSAppLinkDataSource.url_id == URL.id, ) .group_by( Batch.id diff --git a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py index 17136cce..d609e2b3 100644 --- a/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py +++ b/src/db/queries/implementations/core/metrics/urls/aggregated/pending.py @@ -6,9 +6,9 @@ from src.api.endpoints.metrics.dtos.get.urls.aggregated.pending import GetMetricsURLsAggregatedPendingResponseDTO from src.collectors.enums import URLStatus from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import 
UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.models.mixins import URLDependentMixin from src.db.queries.base.builder import QueryBuilderBase from src.db.queries.implementations.core.common.annotation_exists_.core import AnnotationExistsCTEQueryBuilder @@ -25,7 +25,7 @@ def has_user_record_type_annotation(self): @property def has_user_agency_annotation(self): - return self.get_exists_for_model(UserUrlAgencySuggestion) + return self.get_exists_for_model(UserURLAgencySuggestion) def get_exists_for_model(self, model: Type[URLDependentMixin]): return self.query.c[ diff --git a/src/db/queries/urls_exist/__init__.py b/src/db/queries/urls_exist/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/queries/urls_exist/model.py b/src/db/queries/urls_exist/model.py new file mode 100644 index 00000000..72e20cfa --- /dev/null +++ b/src/db/queries/urls_exist/model.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel + +from src.util.models.full_url import FullURL + + +class URLExistsResult(BaseModel): + class Config: + arbitrary_types_allowed = True + + query_url: FullURL + db_url: FullURL | None + url_id: int | None + + @property + def exists(self) -> bool: + return self.url_id is not None + + @property + def urls_match(self) -> bool: + return self.query_url.id_form == self.db_url.id_form \ No newline at end of file diff --git a/src/db/queries/urls_exist/query.py b/src/db/queries/urls_exist/query.py new file mode 100644 index 00000000..510cf78f --- /dev/null +++ b/src/db/queries/urls_exist/query.py @@ -0,0 +1,52 @@ +from sqlalchemy import select, func +from sqlalchemy.ext.asyncio import AsyncSession + +from src.db.queries.urls_exist.model import URLExistsResult +from src.db.helpers.session import session_helper as sh +from src.db.models.impl.url.core.sqlalchemy import URL +from 
src.db.queries.base.builder import QueryBuilderBase +from src.util.models.full_url import FullURL + + +class URLsExistInDBQueryBuilder(QueryBuilderBase): + """Checks if URLs exist in the database.""" + + def __init__(self, full_urls: list[FullURL]): + super().__init__() + self.full_urls = full_urls + self.id_form_urls = [ + url.id_form + for url in full_urls + ] + + async def run(self, session: AsyncSession) -> list[URLExistsResult]: + norm_url = func.rtrim(URL.url, '/').label("norm_url") + + query = select( + URL.id, + norm_url + ).where( + norm_url.in_(self.id_form_urls) + ) + db_mappings = await sh.mappings(session, query=query) + + url_to_id_map: dict[str, int] = { + row["norm_url"]: row["id"] + for row in db_mappings + } + id_to_db_url_map: dict[int, FullURL] = { + row["id"]: FullURL(row["norm_url"]) + for row in db_mappings + } + results: list[URLExistsResult] = [] + for full_url in self.full_urls: + url_id: int | None = url_to_id_map.get(full_url.id_form) + db_url: FullURL | None = id_to_db_url_map.get(url_id) + result = URLExistsResult( + query_url=full_url, + db_url=db_url, + url_id=url_id + ) + results.append(result) + + return results \ No newline at end of file diff --git a/src/db/queries/urls_exist/requester.py b/src/db/queries/urls_exist/requester.py new file mode 100644 index 00000000..45335b87 --- /dev/null +++ b/src/db/queries/urls_exist/requester.py @@ -0,0 +1,41 @@ +from sqlalchemy.ext.asyncio import AsyncSession + +from src.core.enums import RecordType +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.templates.requester import RequesterBase + + +class URLSuggestRequester(RequesterBase): + + def __init__( + self, + session: AsyncSession, + url_id: int + ): + super().__init__(session=session) + self.url_id = url_id + + async def optionally_add_url_type_suggestion( + self, + url_type: URLType | None + ) -> None: + if url_type is None: + return + # TODO + + async def optionally_add_record_type_suggestion(self, 
record_type: RecordType | None): + if record_type is None: + return + # TODO + + async def optionally_add_agency_id_suggestions(self, agency_ids: list[int]): + if len(agency_ids) == 0: + return + # TODO + + async def optionally_add_name_suggestion(self, name: str | None): + if name is None: + return + # TODO + + diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 0ae843b3..faa965a8 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -5,8 +5,6 @@ from sqlalchemy.orm import selectinload from src.collectors.enums import URLStatus -from src.core.enums import BatchStatus -from src.db.constants import STANDARD_ROW_LIMIT from src.db.enums import TaskType from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL @@ -14,7 +12,7 @@ from src.db.models.impl.task.core import Task from src.db.models.impl.task.enums import TaskStatus from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.scrape_info.sqlalchemy import URLScrapeInfo from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata from src.db.types import UserSuggestionType @@ -40,7 +38,7 @@ def has_non_errored_urls_without_html_data() -> Select: .join(URLWebMetadata) .outerjoin(URLScrapeInfo) .where( - URLScrapeInfo.id == None, + URLScrapeInfo.url_id == None, ~exists(exclude_subquery), URLWebMetadata.status_code == HTTPStatus.OK.value, URLWebMetadata.content_type.like("%html%"), diff --git a/src/db/templates/requester.py b/src/db/templates/requester.py index b56af87f..9588ea9d 100644 --- a/src/db/templates/requester.py +++ b/src/db/templates/requester.py @@ -4,6 +4,7 @@ """ from abc import ABC +from sqlalchemy import Select from sqlalchemy.ext.asyncio import 
AsyncSession import src.db.helpers.session.session_helper as sh @@ -16,5 +17,11 @@ def __init__(self, session: AsyncSession): self.session = session self.session_helper = sh + async def scalar(self, query: Select): + return await sh.scalar(self.session, query=query) + + async def mappings(self, query: Select): + return await sh.mappings(self.session, query=query) + async def run_query_builder(self, query_builder: QueryBuilderBase): return await query_builder.run(session=self.session) \ No newline at end of file diff --git a/src/db/types.py b/src/db/types.py index dcee196f..c224a36c 100644 --- a/src/db/types.py +++ b/src/db/types.py @@ -1,10 +1,10 @@ from typing import TypeVar -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from src.db.queries.base.labels import LabelsBase -UserSuggestionType = UserUrlAgencySuggestion | UserURLTypeSuggestion | UserRecordTypeSuggestion +UserSuggestionType = UserURLAgencySuggestion | UserURLTypeSuggestion | UserRecordTypeSuggestion LabelsType = TypeVar("LabelsType", bound=LabelsBase) \ No newline at end of file diff --git a/src/db/utils/validate.py b/src/db/utils/validate.py index 4837e12c..9d77f910 100644 --- a/src/db/utils/validate.py +++ b/src/db/utils/validate.py @@ -1,7 +1,4 @@ from typing import Protocol -from urllib.parse import urlparse - -from pydantic import BaseModel def validate_has_protocol(obj: object, protocol: type[Protocol]): @@ -13,15 +10,3 @@ def validate_all_models_of_same_type(objects: list[object]): if not all(isinstance(model, type(first_model)) for model in objects): raise TypeError("Models must be of the same type") -def is_valid_url(url: str) 
-> bool: - try: - result = urlparse(url) - # If scheme is missing, `netloc` will be empty, so we check path too - if result.scheme in ("http", "https") and result.netloc: - return True - if not result.scheme and result.path: - # no scheme, treat path as potential domain - return "." in result.path - return False - except ValueError: - return False diff --git a/src/external/pdap/_templates/__init__.py b/src/external/pdap/_templates/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/_templates/request_builder.py b/src/external/pdap/_templates/request_builder.py new file mode 100644 index 00000000..2cde6c51 --- /dev/null +++ b/src/external/pdap/_templates/request_builder.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod +from http import HTTPStatus +from typing import Any + +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo +from pydantic import BaseModel + + +class PDAPRequestBuilderBase(ABC): + + def __init__(self): + self.access_manager: AccessManagerAsync | None = None + + async def run(self, access_manager: AccessManagerAsync) -> Any: + self.access_manager = access_manager + return await self.inner_logic() + + def build_url(self, path: str) -> str: + return f"{self.access_manager.data_sources_url}/{path}" + + async def post( + self, + url: str, + model: BaseModel + ) -> dict: + request_info = RequestInfo( + type_=RequestType.POST, + url=url, + json_=model.model_dump(mode='json'), + headers=await self.access_manager.jwt_header() + ) + response_info: ResponseInfo = await self.access_manager.make_request(request_info) + if response_info.status_code != HTTPStatus.OK: + raise Exception(f"Failed to make request to PDAP: {response_info.data}") + return response_info.data + + @abstractmethod + async def inner_logic(self) -> Any: + 
raise NotImplementedError diff --git a/src/external/pdap/client.py b/src/external/pdap/client.py index 1c950ad3..38c67e08 100644 --- a/src/external/pdap/client.py +++ b/src/external/pdap/client.py @@ -1,70 +1,27 @@ from typing import Any -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo +from pdap_access_manager.access_manager.async_ import AccessManagerAsync +from pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo -from src.core.tasks.url.operators.submit_approved.tdo import SubmitApprovedURLTDO, SubmittedURLInfo -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase from src.external.pdap.dtos.unique_url_duplicate import UniqueURLDuplicateInfo -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.external.pdap.impl.meta_urls.core import submit_meta_urls -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest class PDAPClient: def __init__( self, - access_manager: AccessManager, + access_manager: AccessManagerAsync, ): self.access_manager = access_manager - async def match_agency( + async def run_request_builder( self, - name: str, - state: str | None = None, - county: str | None = None, - locality: str | None = None - ) -> MatchAgencyResponse: - """ - Returns agencies, if any, that match or partially match the search criteria - """ - url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.MATCH, - subdomains=["agency"] - ) - - headers: dict[str, str] = await self.access_manager.jwt_header() - headers['Content-Type']: str = "application/json" - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - headers=headers, - json_={ - "name": name, - "state": 
state, - "county": county, - "locality": locality - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - matches: list[MatchAgencyInfo] = [] - for agency in response_info.data["agencies"]: - mai = MatchAgencyInfo( - id=agency['id'], - submitted_name=agency['name'] - ) - if len(agency['locations']) > 0: - first_location: dict[str, Any] = agency['locations'][0] - mai.state = first_location['state'] - mai.county = first_location['county'] - mai.locality = first_location['locality'] - matches.append(mai) - - return MatchAgencyResponse( - status=MatchAgencyResponseStatus(response_info.data["status"]), - matches=matches - ) + request_builder: PDAPRequestBuilderBase + ) -> Any: + return await request_builder.run(self.access_manager) async def is_url_duplicate( self, @@ -73,10 +30,8 @@ async def is_url_duplicate( """ Check if a URL is unique. Returns duplicate info otherwise """ - url: str = self.access_manager.build_url( - namespace=DataSourcesNamespaces.CHECK, - subdomains=["unique-url"] - ) + url: str = f"{self.access_manager.data_sources_url}/v2/check/unique-url" + request_info = RequestInfo( type_=RequestType.GET, url=url, @@ -90,70 +45,3 @@ async def is_url_duplicate( ] is_duplicate: bool = (len(duplicates) != 0) return is_duplicate - - async def submit_data_source_urls( - self, - tdos: list[SubmitApprovedURLTDO] - ) -> list[SubmittedURLInfo]: - """ - Submits URLs to Data Sources App, - modifying tdos in-place with data source id or error - """ - request_url = self.access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=["data-sources"] - ) - - # Build url-id dictionary - url_id_dict: dict[str, int] = {} - for tdo in tdos: - url_id_dict[tdo.url] = tdo.url_id - - data_sources_json: list[dict[str, Any]] = [] - for tdo in tdos: - data_sources_json.append( - { - "name": tdo.name, - "description": tdo.description, - "source_url": tdo.url, - "record_type": tdo.record_type.value, - "record_formats": 
tdo.record_formats, - "data_portal_type": tdo.data_portal_type, - "last_approval_editor": tdo.approving_user_id, - "supplying_entity": tdo.supplying_entity, - "agency_ids": tdo.agency_ids - } - ) - - headers: dict[str, str] = await self.access_manager.jwt_header() - request_info = RequestInfo( - type_=RequestType.POST, - url=request_url, - headers=headers, - json_={ - "data_sources": data_sources_json - } - ) - response_info: ResponseInfo = await self.access_manager.make_request(request_info) - data_sources_response_json: list[dict[str, Any]] = response_info.data["data_sources"] - - results: list[SubmittedURLInfo] = [] - for data_source in data_sources_response_json: - url: str = data_source["url"] - response_object = SubmittedURLInfo( - url_id=url_id_dict[url], - data_source_id=data_source["data_source_id"], - request_error=data_source["error"] - ) - results.append(response_object) - - return results - - async def submit_meta_urls( - self, - requests: list[SubmitMetaURLsRequest] - ): - return await submit_meta_urls( - self.access_manager, - requests=requests - ) \ No newline at end of file diff --git a/src/external/pdap/dtos/match_agency/post.py b/src/external/pdap/dtos/match_agency/post.py deleted file mode 100644 index 2be0b90e..00000000 --- a/src/external/pdap/dtos/match_agency/post.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class MatchAgencyInfo(BaseModel): - id: int - submitted_name: str - state: str | None = None - county: str | None = None - locality: str | None = None diff --git a/src/external/pdap/dtos/match_agency/response.py b/src/external/pdap/dtos/match_agency/response.py deleted file mode 100644 index aa4d9ec3..00000000 --- a/src/external/pdap/dtos/match_agency/response.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import List - -from pydantic import BaseModel - -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.enums import 
MatchAgencyResponseStatus - - -class MatchAgencyResponse(BaseModel): - status: MatchAgencyResponseStatus - matches: List[MatchAgencyInfo] diff --git a/src/external/pdap/enums.py b/src/external/pdap/enums.py index c532f820..55819619 100644 --- a/src/external/pdap/enums.py +++ b/src/external/pdap/enums.py @@ -1,12 +1,6 @@ from enum import Enum -class MatchAgencyResponseStatus(Enum): - EXACT_MATCH = "Exact Match" - PARTIAL_MATCH = "Partial Matches" - NO_MATCH = "No Match" - - class ApprovalStatus(Enum): APPROVED = "approved" REJECTED = "rejected" diff --git a/src/external/pdap/impl/meta_urls/core.py b/src/external/pdap/impl/meta_urls/core.py deleted file mode 100644 index 4a34fbeb..00000000 --- a/src/external/pdap/impl/meta_urls/core.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Any - -from pdap_access_manager import AccessManager, DataSourcesNamespaces, RequestInfo, RequestType, ResponseInfo - -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from src.external.pdap.impl.meta_urls.request import SubmitMetaURLsRequest -from src.external.pdap.impl.meta_urls.response import SubmitMetaURLsResponse - - -async def submit_meta_urls( - access_manager: AccessManager, - requests: list[SubmitMetaURLsRequest] -) -> list[SubmitMetaURLsResponse]: - - - # Build url-id dictionary - url_id_dict: dict[str, int] = {} - for request in requests: - url_id_dict[request.url] = request.url_id - - meta_urls_json: list[dict[str, Any]] = [] - for request in requests: - meta_urls_json.append( - { - "url": request.url, - "agency_id": request.agency_id - } - ) - - headers: dict[str, str] = await access_manager.jwt_header() - url: str = access_manager.build_url( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=["meta-urls"] - ) - request_info = RequestInfo( - type_=RequestType.POST, - url=url, - headers=headers, - json_={ - "meta_urls": meta_urls_json - } - ) - - response_info: ResponseInfo = await access_manager.make_request(request_info) - 
meta_urls_response_json: list[dict[str, Any]] = response_info.data["meta_urls"] - - responses: list[SubmitMetaURLsResponse] = [] - for meta_url in meta_urls_response_json: - responses.append( - SubmitMetaURLsResponse( - url=meta_url["url"], - status=SubmitMetaURLsStatus(meta_url["status"]), - agency_id=meta_url["agency_id"], - meta_url_id=meta_url["meta_url_id"], - error=meta_url["error"] - ) - ) - return responses \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/enums.py b/src/external/pdap/impl/meta_urls/enums.py deleted file mode 100644 index e49e71aa..00000000 --- a/src/external/pdap/impl/meta_urls/enums.py +++ /dev/null @@ -1,7 +0,0 @@ -from enum import Enum - - -class SubmitMetaURLsStatus(Enum): - SUCCESS = "success" - FAILURE = "failure" - ALREADY_EXISTS = "already_exists" \ No newline at end of file diff --git a/src/external/pdap/impl/meta_urls/request.py b/src/external/pdap/impl/meta_urls/request.py deleted file mode 100644 index ac222aca..00000000 --- a/src/external/pdap/impl/meta_urls/request.py +++ /dev/null @@ -1,7 +0,0 @@ -from pydantic import BaseModel - - -class SubmitMetaURLsRequest(BaseModel): - url_id: int - url: str - agency_id: int diff --git a/src/external/pdap/impl/meta_urls/response.py b/src/external/pdap/impl/meta_urls/response.py deleted file mode 100644 index 96d5ece7..00000000 --- a/src/external/pdap/impl/meta_urls/response.py +++ /dev/null @@ -1,11 +0,0 @@ -from pydantic import BaseModel - -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus - - -class SubmitMetaURLsResponse(BaseModel): - url: str - status: SubmitMetaURLsStatus - meta_url_id: int | None = None - agency_id: int | None = None - error: str | None = None \ No newline at end of file diff --git a/src/external/pdap/impl/sync/__init__.py b/src/external/pdap/impl/sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/__init__.py 
b/src/external/pdap/impl/sync/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/_shared/__init__.py b/src/external/pdap/impl/sync/agencies/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/_shared/models/__init__.py b/src/external/pdap/impl/sync/agencies/_shared/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/_shared/models/content.py b/src/external/pdap/impl/sync/agencies/_shared/models/content.py new file mode 100644 index 00000000..124072a7 --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/_shared/models/content.py @@ -0,0 +1,15 @@ +from pydantic import Field, BaseModel + +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType + + +class AgencySyncContentModel(BaseModel): + # Required + name: str + jurisdiction_type: JurisdictionType + agency_type: AgencyType + location_ids: list[int] = Field(min_length=1) + + # Optional + no_web_presence: bool = False + defunct_year: int | None = None diff --git a/src/external/pdap/impl/sync/agencies/add/__init__.py b/src/external/pdap/impl/sync/agencies/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/add/core.py b/src/external/pdap/impl/sync/agencies/add/core.py new file mode 100644 index 00000000..a8f190ce --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/add/core.py @@ -0,0 +1,27 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel + + +class AddAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: AddAgenciesOuterRequest + ): + super().__init__() + self.request 
= request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/sync/agencies/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities + + + + diff --git a/src/external/pdap/impl/sync/agencies/add/request.py b/src/external/pdap/impl/sync/agencies/add/request.py new file mode 100644 index 00000000..575b4c42 --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/add/request.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, model_validator, Field + +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel + + +class AddAgenciesInnerRequest(BaseModel): + request_id: int + content: AgencySyncContentModel + + +class AddAgenciesOuterRequest(BaseModel): + agencies: list[AddAgenciesInnerRequest] = Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.agencies) != len( + set([agency.request_id for agency in self.agencies]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/agencies/delete/__init__.py b/src/external/pdap/impl/sync/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/delete/core.py b/src/external/pdap/impl/sync/agencies/delete/core.py new file mode 100644 index 00000000..f48e1b11 --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/delete/core.py @@ -0,0 +1,22 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = 
self.build_url("v3/sync/agencies/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/agencies/update/__init__.py b/src/external/pdap/impl/sync/agencies/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/agencies/update/core.py b/src/external/pdap/impl/sync/agencies/update/core.py new file mode 100644 index 00000000..6589c8b0 --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/update/core.py @@ -0,0 +1,19 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest + + +class UpdateAgenciesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateAgenciesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/sync/agencies/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/src/external/pdap/impl/sync/agencies/update/request.py b/src/external/pdap/impl/sync/agencies/update/request.py new file mode 100644 index 00000000..df43578e --- /dev/null +++ b/src/external/pdap/impl/sync/agencies/update/request.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel + + +class UpdateAgenciesInnerRequest(BaseModel): + app_id: int + content: AgencySyncContentModel + + +class UpdateAgenciesOuterRequest(BaseModel): + agencies: list[UpdateAgenciesInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/data_sources/__init__.py b/src/external/pdap/impl/sync/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/_shared/__init__.py 
b/src/external/pdap/impl/sync/data_sources/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/_shared/content.py b/src/external/pdap/impl/sync/data_sources/_shared/content.py new file mode 100644 index 00000000..59d0bcc6 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/_shared/content.py @@ -0,0 +1,43 @@ +from datetime import date + +from pydantic import BaseModel, Field + +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.external.pdap.enums import DataSourcesURLStatus +from src.external.pdap.impl.sync.data_sources._shared.enums import DetailLevel + + +class DataSourceSyncContentModel(BaseModel): + # Required + source_url: str + name: str + record_type: RecordType + + # Optional + description: str | None = None + + # Optional data source metadata + record_formats: list[str] = [] + data_portal_type: str | None = None + supplying_entity: str | None = None + coverage_start: date | None = None + coverage_end: date | None = None + detail_level: DetailLevel | None = None + agency_supplied: bool | None = None + agency_originated: bool | None = None + agency_aggregation: AgencyAggregationEnum | None = None + agency_described_not_in_database: str | None = None + update_method: UpdateMethodEnum | None = None + readme_url: str | None = None + originating_entity: str | None = None + retention_schedule: RetentionScheduleEnum | None = None + scraper_url: str | None = None + access_notes: str | None = None + access_types: list[AccessTypeEnum] = [] + data_portal_type_other: str | None = None + url_status: DataSourcesURLStatus = DataSourcesURLStatus.OK + internet_archives_url: str | None = None + + agency_ids: list[int] = [] diff --git a/src/external/pdap/impl/sync/data_sources/_shared/enums.py b/src/external/pdap/impl/sync/data_sources/_shared/enums.py new 
file mode 100644 index 00000000..bc7929a2 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/_shared/enums.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class DetailLevel(Enum): + """ + Correlates to the detail_level enum in the database + """ + + INDIVIDUAL = "Individual record" + AGGREGATED = "Aggregated records" + SUMMARIZED = "Summarized totals" diff --git a/src/external/pdap/impl/sync/data_sources/add/__init__.py b/src/external/pdap/impl/sync/data_sources/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/add/core.py b/src/external/pdap/impl/sync/data_sources/add/core.py new file mode 100644 index 00000000..c3576961 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/add/core.py @@ -0,0 +1,24 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel + + +class AddDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: AddDataSourcesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/sync/data-sources/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities + diff --git a/src/external/pdap/impl/sync/data_sources/add/request.py b/src/external/pdap/impl/sync/data_sources/add/request.py new file mode 100644 index 00000000..dfa7188f --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/add/request.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, Field, model_validator + +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel + + +class 
AddDataSourcesInnerRequest(BaseModel): + request_id: int + content: DataSourceSyncContentModel + + +class AddDataSourcesOuterRequest(BaseModel): + data_sources: list[AddDataSourcesInnerRequest] = Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.data_sources) != len( + set([data_source.request_id for data_source in self.data_sources]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/data_sources/delete/__init__.py b/src/external/pdap/impl/sync/data_sources/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/delete/core.py b/src/external/pdap/impl/sync/data_sources/delete/core.py new file mode 100644 index 00000000..e58d1741 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/delete/core.py @@ -0,0 +1,22 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/sync/data-sources/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/data_sources/request.py b/src/external/pdap/impl/sync/data_sources/request.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/update/__init__.py b/src/external/pdap/impl/sync/data_sources/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/data_sources/update/core.py b/src/external/pdap/impl/sync/data_sources/update/core.py new file mode 100644 index 00000000..491f1676 --- /dev/null 
+++ b/src/external/pdap/impl/sync/data_sources/update/core.py @@ -0,0 +1,19 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesOuterRequest + + +class UpdateDataSourcesRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateDataSourcesOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/sync/data-sources/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/src/external/pdap/impl/sync/data_sources/update/request.py b/src/external/pdap/impl/sync/data_sources/update/request.py new file mode 100644 index 00000000..97d95818 --- /dev/null +++ b/src/external/pdap/impl/sync/data_sources/update/request.py @@ -0,0 +1,15 @@ +from pydantic import BaseModel, Field + +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel + + +class UpdateDataSourcesInnerRequest(BaseModel): + class Config: + arbitrary_types_allowed = True + + app_id: int + content: DataSourceSyncContentModel + + +class UpdateDataSourcesOuterRequest(BaseModel): + data_sources: list[UpdateDataSourcesInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/meta_urls/__init__.py b/src/external/pdap/impl/sync/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/__init__.py b/src/external/pdap/impl/sync/meta_urls/_shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/_shared/content.py b/src/external/pdap/impl/sync/meta_urls/_shared/content.py new file mode 100644 index 00000000..5db804cd --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/_shared/content.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel + +from src.external.pdap.enums 
import DataSourcesURLStatus + + +class MetaURLSyncContentModel(BaseModel): + url: str + url_status: DataSourcesURLStatus = DataSourcesURLStatus.OK + internet_archives_url: str | None = None + agency_ids: list[int] = [] diff --git a/src/external/pdap/impl/sync/meta_urls/add/__init__.py b/src/external/pdap/impl/sync/meta_urls/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/add/core.py b/src/external/pdap/impl/sync/meta_urls/add/core.py new file mode 100644 index 00000000..8f1b3752 --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/add/core.py @@ -0,0 +1,25 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseInnerModel, \ + DSAppSyncAddResponseModel + + +class AddMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: AddMetaURLsOuterRequest + ): + super().__init__() + self.request = request + + async def inner_logic(self) -> list[DSAppSyncAddResponseInnerModel]: + url: str = self.build_url("v3/sync/meta-urls/add") + raw_results = await self.post( + url=url, + model=self.request, + ) + response = DSAppSyncAddResponseModel(**raw_results) + return response.entities + + diff --git a/src/external/pdap/impl/sync/meta_urls/add/request.py b/src/external/pdap/impl/sync/meta_urls/add/request.py new file mode 100644 index 00000000..109560a2 --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/add/request.py @@ -0,0 +1,20 @@ +from pydantic import BaseModel, Field, model_validator + +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel + + +class AddMetaURLsInnerRequest(BaseModel): + request_id: int + content: MetaURLSyncContentModel + + +class AddMetaURLsOuterRequest(BaseModel): + meta_urls: list[AddMetaURLsInnerRequest] = 
Field(max_length=1000) + + @model_validator(mode="after") + def all_request_ids_unique(self): + if len(self.meta_urls) != len( + set([meta_url.request_id for meta_url in self.meta_urls]) + ): + raise ValueError("All request_ids must be unique") + return self diff --git a/src/external/pdap/impl/sync/meta_urls/delete/__init__.py b/src/external/pdap/impl/sync/meta_urls/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/delete/core.py b/src/external/pdap/impl/sync/meta_urls/delete/core.py new file mode 100644 index 00000000..bde1dc8d --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/delete/core.py @@ -0,0 +1,22 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel + + +class DeleteMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + ds_app_ids: list[int] + ): + super().__init__() + self.ds_app_ids = ds_app_ids + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/sync/meta-urls/delete") + await self.post( + url=url, + model=DSAppSyncDeleteRequestModel( + ids=self.ds_app_ids + ) + ) + diff --git a/src/external/pdap/impl/sync/meta_urls/update/__init__.py b/src/external/pdap/impl/sync/meta_urls/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/meta_urls/update/core.py b/src/external/pdap/impl/sync/meta_urls/update/core.py new file mode 100644 index 00000000..0c917535 --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/update/core.py @@ -0,0 +1,19 @@ +from src.external.pdap._templates.request_builder import PDAPRequestBuilderBase +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest + + +class UpdateMetaURLsRequestBuilder(PDAPRequestBuilderBase): + + def __init__( + self, + request: UpdateMetaURLsOuterRequest + ): + super().__init__() 
+ self.request = request + + async def inner_logic(self) -> None: + url: str = self.build_url("v3/sync/meta-urls/update") + await self.post( + url=url, + model=self.request + ) \ No newline at end of file diff --git a/src/external/pdap/impl/sync/meta_urls/update/request.py b/src/external/pdap/impl/sync/meta_urls/update/request.py new file mode 100644 index 00000000..c38ae09e --- /dev/null +++ b/src/external/pdap/impl/sync/meta_urls/update/request.py @@ -0,0 +1,12 @@ +from pydantic import Field, BaseModel + +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel + + +class UpdateMetaURLsInnerRequest(BaseModel): + app_id: int + content: MetaURLSyncContentModel + + +class UpdateMetaURLsOuterRequest(BaseModel): + meta_urls: list[UpdateMetaURLsInnerRequest] = Field(max_length=1000) diff --git a/src/external/pdap/impl/sync/shared/__init__.py b/src/external/pdap/impl/sync/shared/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/__init__.py b/src/external/pdap/impl/sync/shared/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/add/__init__.py b/src/external/pdap/impl/sync/shared/models/add/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/add/response.py b/src/external/pdap/impl/sync/shared/models/add/response.py new file mode 100644 index 00000000..209139cf --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/add/response.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + +class DSAppSyncAddResponseInnerModel(BaseModel): + request_id: int + app_id: int + +class DSAppSyncAddResponseModel(BaseModel): + entities: list[DSAppSyncAddResponseInnerModel] \ No newline at end of file diff --git a/src/external/pdap/impl/sync/shared/models/delete/__init__.py b/src/external/pdap/impl/sync/shared/models/delete/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/src/external/pdap/impl/sync/shared/models/delete/request.py b/src/external/pdap/impl/sync/shared/models/delete/request.py new file mode 100644 index 00000000..c4e3bb8d --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/delete/request.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class DSAppSyncDeleteRequestModel(BaseModel): + ids: list[int] \ No newline at end of file diff --git a/src/external/pdap/impl/sync/shared/models/mapping.py b/src/external/pdap/impl/sync/shared/models/mapping.py new file mode 100644 index 00000000..fd22bca2 --- /dev/null +++ b/src/external/pdap/impl/sync/shared/models/mapping.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class DSSyncIDMapping(BaseModel): + ds_app_link_id: int + entity_id: int \ No newline at end of file diff --git a/src/external/url_request/core.py b/src/external/url_request/core.py index 7a6920fe..d49b2649 100644 --- a/src/external/url_request/core.py +++ b/src/external/url_request/core.py @@ -4,6 +4,7 @@ from src.external.url_request.probe.core import URLProbeManager from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper from src.external.url_request.request import fetch_urls +from src.util.models.full_url import FullURL class URLRequestInterface: @@ -15,7 +16,7 @@ async def make_requests_with_html( return await fetch_urls(urls) @staticmethod - async def probe_urls(urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: async with ClientSession(timeout=ClientTimeout(total=30)) as session: manager = URLProbeManager(session=session) return await manager.probe_urls(urls=urls) diff --git a/src/external/url_request/probe/convert.py b/src/external/url_request/probe/convert.py index 3b15268a..16258cdb 100644 --- a/src/external/url_request/probe/convert.py +++ b/src/external/url_request/probe/convert.py @@ -6,6 +6,7 @@ from 
src.external.url_request.probe.models.response import URLProbeResponse from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL def _process_client_response_history(history: Sequence[ClientResponse]) -> list[str]: @@ -29,7 +30,7 @@ def _extract_redirect_probe_response(cr: ClientResponse) -> URLProbeResponse | N first_url = all_urls[0] return URLProbeResponse( - url=first_url, + url=FullURL(first_url), status_code=HTTPStatus.FOUND.value, content_type=None, error=None, @@ -53,14 +54,14 @@ def _extract_destination_url(cr: ClientResponse) -> str: return str(cr.url) def convert_client_response_to_probe_response( - url: str, + url: FullURL, cr: ClientResponse ) -> URLProbeResponse | URLProbeRedirectResponsePair: error = _extract_error(cr) content_type = _extract_content_type(cr, error=error) if not _has_redirect(cr): return URLProbeResponse( - url=str(cr.url), + url=FullURL(str(cr.url)), status_code=cr.status, content_type=content_type, error=error, @@ -85,7 +86,7 @@ def convert_client_response_to_probe_response( destination_error = _extract_error(destination_cr) destination_content_type = _extract_content_type(destination_cr, error=destination_error) destination_probe_response = URLProbeResponse( - url=destination_url, + url=FullURL(destination_url), status_code=destination_cr.status, content_type=destination_content_type, error=destination_error, @@ -97,7 +98,7 @@ def convert_client_response_to_probe_response( ) def convert_to_error_response( - url: str, + url: FullURL, error: str, status_code: int | None = None ) -> URLProbeResponseOuterWrapper: diff --git a/src/external/url_request/probe/core.py b/src/external/url_request/probe/core.py index 48009381..120e1b66 100644 --- a/src/external/url_request/probe/core.py +++ b/src/external/url_request/probe/core.py @@ -9,6 +9,7 @@ from 
src.external.url_request.probe.convert import convert_client_response_to_probe_response, convert_to_error_response from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from src.util.progress_bar import get_progress_bar_disabled @@ -20,14 +21,14 @@ def __init__( ): self.session = session - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return await tqdm_asyncio.gather( *[self._probe(url) for url in urls], timeout=60 * 10, # 10 minutes, disable=get_progress_bar_disabled() ) - async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: + async def _probe(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: response = await self._head(url) if not response.is_redirect and response.response.status_code == HTTPStatus.OK: @@ -52,9 +53,9 @@ async def _probe(self, url: str) -> URLProbeResponseOuterWrapper: except ClientOSError as e: return convert_to_error_response(url, error=f"Client OS Error: {e.errno}. 
{str(e)}") - async def _head(self, url: str) -> URLProbeResponseOuterWrapper: + async def _head(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.head(url, allow_redirects=True) as response: + async with self.session.head(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( @@ -74,9 +75,9 @@ async def _head(self, url: str) -> URLProbeResponseOuterWrapper: status_code=e.status ) - async def _get(self, url: str) -> URLProbeResponseOuterWrapper: + async def _get(self, url: FullURL) -> URLProbeResponseOuterWrapper: try: - async with self.session.get(url, allow_redirects=True) as response: + async with self.session.get(str(url), allow_redirects=True) as response: return URLProbeResponseOuterWrapper( original_url=url, response=convert_client_response_to_probe_response( diff --git a/src/external/url_request/probe/models/response.py b/src/external/url_request/probe/models/response.py index 967f1c4f..ad6eb588 100644 --- a/src/external/url_request/probe/models/response.py +++ b/src/external/url_request/probe/models/response.py @@ -1,9 +1,13 @@ from pydantic import BaseModel, Field, model_validator +from src.util.models.full_url import FullURL class URLProbeResponse(BaseModel): - url: str + class Config: + arbitrary_types_allowed = True + + url: FullURL status_code: int | None = Field(le=999, ge=100) content_type: str | None error: str | None = None diff --git a/src/external/url_request/probe/models/wrapper.py b/src/external/url_request/probe/models/wrapper.py index 04dbc9c4..27fd7be8 100644 --- a/src/external/url_request/probe/models/wrapper.py +++ b/src/external/url_request/probe/models/wrapper.py @@ -2,10 +2,14 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse +from src.util.models.full_url import FullURL class 
URLProbeResponseOuterWrapper(BaseModel): - original_url: str + class Config: + arbitrary_types_allowed = True + + original_url: FullURL response: URLProbeResponse | URLProbeRedirectResponsePair @property diff --git a/src/security/enums.py b/src/security/enums.py index c10c346b..0090c3bc 100644 --- a/src/security/enums.py +++ b/src/security/enums.py @@ -2,5 +2,5 @@ class Permissions(Enum): - SOURCE_COLLECTOR = "source_collector" + SOURCE_COLLECTOR = "access_source_collector" SOURCE_COLLECTOR_FINAL_REVIEW = "source_collector_final_review" diff --git a/src/util/alembic_helpers.py b/src/util/alembic_helpers.py index cb9d8d67..f711136d 100644 --- a/src/util/alembic_helpers.py +++ b/src/util/alembic_helpers.py @@ -295,4 +295,35 @@ def remove_enum_value( f"ALTER TYPE {_q_ident(schema)}.{_q_ident(tmp_name)} " f"RENAME TO {_q_ident(enum_name)}" ) - ) \ No newline at end of file + ) + + +def create_updated_at_trigger(table_name: str) -> None: + """ + Adds a trigger to the given table that automatically updates the + 'updated_at' column to the current timestamp on UPDATE. + + Parameters: + table_name (str): Name of the table to attach the trigger to. 
+ """ + + # Step 1: Define the trigger function (only needs to exist once) + op.execute(""" + CREATE OR REPLACE FUNCTION set_updated_at() + RETURNS TRIGGER AS $$ + BEGIN + NEW.updated_at = NOW(); + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """) + + # Step 2: Create the trigger for this specific table + trigger_name = f"{table_name}_updated_at_trigger" + op.execute(f""" + DROP TRIGGER IF EXISTS {trigger_name} ON {table_name}; + CREATE TRIGGER {trigger_name} + BEFORE UPDATE ON {table_name} + FOR EACH ROW + EXECUTE FUNCTION set_updated_at(); + """) diff --git a/src/util/clean.py b/src/util/clean.py deleted file mode 100644 index 3c0a0f92..00000000 --- a/src/util/clean.py +++ /dev/null @@ -1,10 +0,0 @@ - - -def clean_url(url: str) -> str: - # Remove Non-breaking spaces - url = url.strip(" ") - - # Remove any fragments and everything after them - url = url.split("#")[0] - return url - diff --git a/src/util/models/__init__.py b/src/util/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/models/full_url.py b/src/util/models/full_url.py new file mode 100644 index 00000000..9b3fc694 --- /dev/null +++ b/src/util/models/full_url.py @@ -0,0 +1,88 @@ +from urllib.parse import urlparse + +from src.util.url import clean_url + + +class FullURL: + __slots__ = ( + "_full_url", + "_scheme", + "_url_without_scheme" + ) + + def __init__( + self, + full_url: str + ): + if not isinstance(full_url, str): + raise ValueError("full_url must be a string") + self._full_url = full_url + self._scheme = None + self._url_without_scheme = None + + @property + def full_url(self) -> str: + return self._full_url + + def __str__(self): + return self.full_url + + def __repr__(self): + return self.id_form + + def __hash__(self): + return hash(self.id_form) + + def __eq__(self, other): + return isinstance(other, FullURL) and self.id_form == other.id_form + + def _set_url_parts(self): + """ + Modifies: + self._scheme + self._url + + """ + parse_result = 
urlparse(self.full_url) + self._scheme = parse_result.scheme + if parse_result.scheme is not None: + self._url_without_scheme = self.full_url.replace(f"{parse_result.scheme}://", "", 1) + else: + self._url_without_scheme = self.full_url + + + @property + def scheme(self) -> str | None: + if self._scheme is None: + self._set_url_parts() + return self._scheme + + @property + def without_scheme(self) -> str: + if self._url_without_scheme is None: + self._set_url_parts() + return self._url_without_scheme + + @property + def id_form(self) -> str: + """Retrieves URL in 'Identification Form' + + These are meant to be used to compare URLs with one another. + + They have the following properties: + No Scheme + No Trailing Slash + Cleaned of fragments and query parameters. + """ + no_scheme: str = self.without_scheme + no_trailing_slash: str = no_scheme.rstrip("/") + clean: str = clean_url(no_trailing_slash) + return clean + + @property + def has_trailing_slash(self) -> bool: + return self.full_url.endswith("/") + + def clean(self) -> str: + return clean_url(self.full_url) + diff --git a/src/util/models/url_and_scheme.py b/src/util/models/url_and_scheme.py new file mode 100644 index 00000000..494acd49 --- /dev/null +++ b/src/util/models/url_and_scheme.py @@ -0,0 +1,6 @@ +from pydantic import BaseModel + + +class URLAndScheme(BaseModel): + url: str + scheme: str | None \ No newline at end of file diff --git a/src/util/url.py b/src/util/url.py new file mode 100644 index 00000000..88c8959d --- /dev/null +++ b/src/util/url.py @@ -0,0 +1,48 @@ +from urllib.parse import urlparse + +from src.util.models.url_and_scheme import URLAndScheme + + +def clean_url(url: str) -> str: + # Remove Non-breaking spaces + url = url.strip(" ") + + # Remove any fragments and everything after them + url = url.split("#")[0] + return url + +def get_url_and_scheme( + url: str +) -> URLAndScheme: + parsed = urlparse(url) + if parsed.scheme: + remainder = url.replace(f"{parsed.scheme}://", "", 1) + return 
URLAndScheme( + url=remainder, + scheme=parsed.scheme + ) + # Handle URLs without scheme + return URLAndScheme( + url=url, + scheme=None + ) + +def remove_url_scheme(url: str) -> str: + parsed = urlparse(url) + if parsed.scheme: + return url.replace(f"{parsed.scheme}://", "", 1) + return url + + +def is_valid_url(url: str) -> bool: + try: + result = urlparse(url) + # If scheme is missing, `netloc` will be empty, so we check path too + if result.scheme in ("http", "https") and result.netloc: + return True + if not result.scheme and result.path: + # no scheme, treat path as potential domain + return "." in result.path + return False + except ValueError: + return False diff --git a/src/util/url_mapper_/__init__.py b/src/util/url_mapper_/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/util/url_mapper_/full.py b/src/util/url_mapper_/full.py new file mode 100644 index 00000000..8f6272c2 --- /dev/null +++ b/src/util/url_mapper_/full.py @@ -0,0 +1,49 @@ +from src.db.dtos.url.mapping_.full import FullURLMapping +from src.util.models.full_url import FullURL + + +class FullURLMapper: + + def __init__(self, mappings: list[FullURLMapping]): + self._url_to_id = { + mapping.full_url.id_form: mapping.url_id + for mapping in mappings + } + self._id_to_url = { + mapping.url_id: mapping.full_url + for mapping in mappings + } + + def get_id(self, full_url: FullURL) -> int: + return self._url_to_id[full_url.id_form] + + def get_ids(self, full_urls: list[FullURL]) -> list[int]: + return [ + self._url_to_id[full_url.id_form] + for full_url in full_urls + ] + + def get_all_ids(self) -> list[int]: + return list(self._url_to_id.values()) + + def get_all_urls(self) -> list[FullURL]: + return list(self._id_to_url.values()) + + def get_url(self, url_id: int) -> FullURL: + return self._id_to_url[url_id] + + def get_mappings_by_url(self, full_urls: list[FullURL]) -> list[FullURLMapping]: + return [ + FullURLMapping( + url_id=self._url_to_id[full_url.id_form], + 
full_url=full_url + ) for full_url in full_urls + ] + + def add_mapping(self, mapping: FullURLMapping) -> None: + self._url_to_id[mapping.full_url.id_form] = mapping.url_id + self._id_to_url[mapping.url_id] = mapping.full_url + + def add_mappings(self, mappings: list[FullURLMapping]) -> None: + for mapping in mappings: + self.add_mapping(mapping) \ No newline at end of file diff --git a/src/util/url_mapper.py b/src/util/url_mapper_/simple.py similarity index 72% rename from src/util/url_mapper.py rename to src/util/url_mapper_/simple.py index 3a399d77..2a7f7353 100644 --- a/src/util/url_mapper.py +++ b/src/util/url_mapper_/simple.py @@ -1,9 +1,9 @@ -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping -class URLMapper: +class SimpleURLMapper: - def __init__(self, mappings: list[URLMapping]): + def __init__(self, mappings: list[SimpleURLMapping]): self._url_to_id = { mapping.url: mapping.url_id for mapping in mappings @@ -31,18 +31,18 @@ def get_all_urls(self) -> list[str]: def get_url(self, url_id: int) -> str: return self._id_to_url[url_id] - def get_mappings_by_url(self, urls: list[str]) -> list[URLMapping]: + def get_mappings_by_url(self, urls: list[str]) -> list[SimpleURLMapping]: return [ - URLMapping( + SimpleURLMapping( url_id=self._url_to_id[url], url=url ) for url in urls ] - def add_mapping(self, mapping: URLMapping) -> None: + def add_mapping(self, mapping: SimpleURLMapping) -> None: self._url_to_id[mapping.url] = mapping.url_id self._id_to_url[mapping.url_id] = mapping.url - def add_mappings(self, mappings: list[URLMapping]) -> None: + def add_mappings(self, mappings: list[SimpleURLMapping]) -> None: for mapping in mappings: self.add_mapping(mapping) \ No newline at end of file diff --git a/tests/automated/integration/api/_helpers/RequestValidator.py b/tests/automated/integration/api/_helpers/RequestValidator.py index 73293522..0db00cb3 100644 --- 
a/tests/automated/integration/api/_helpers/RequestValidator.py +++ b/tests/automated/integration/api/_helpers/RequestValidator.py @@ -10,7 +10,7 @@ from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.api.endpoints.batch.dtos.post.abort import MessageResponse +from src.api.shared.models.message_response import MessageResponse from src.api.endpoints.batch.duplicates.dto import GetDuplicatesByBatchResponse from src.api.endpoints.batch.urls.dto import GetURLsByBatchResponse from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO @@ -102,6 +102,24 @@ def open_v2( ) return response.json() + def open_v3( + self, + method: str, + url: str, + params: dict | None = None, + expected_model: type[BaseModel] | None = None, + **kwargs + ) -> BaseModel | dict: + response = self.open_v2( + method=method, + url=url, + params=params, + **kwargs + ) + if expected_model: + return expected_model(**response) + return response + def get( self, url: str, @@ -158,6 +176,66 @@ def get_v2( **kwargs ) + def get_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="GET", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def post_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="POST", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def put_v3( + self, + url: str, + params: dict | None = None, + expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="PUT", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + + def delete_v3( + self, + url: str, + params: dict | None = None, 
+ expected_model: BaseModel | None = None, + **kwargs + ): + return self.open_v3( + method="DELETE", + url=url, + params=params, + expected_model=expected_model, + **kwargs + ) + def put( self, diff --git a/tests/automated/integration/api/agencies/__init__.py b/tests/automated/integration/api/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/agencies/delete/__init__.py b/tests/automated/integration/api/agencies/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/agencies/delete/test_core.py b/tests/automated/integration/api/agencies/delete/test_core.py new file mode 100644 index 00000000..be8fb9fa --- /dev/null +++ b/tests/automated/integration/api/agencies/delete/test_core.py @@ -0,0 +1,75 @@ +import pytest + +from src.api.endpoints.agencies.by_id.put.request import AgencyPutRequest +from src.api.endpoints.agencies.root.post.request import AgencyPostRequest +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo + + +@pytest.mark.asyncio +async def test_agencies( + api_test_helper: APITestHelper, + california: USStateCreationInfo, + pennsylvania: USStateCreationInfo +): + ath = api_test_helper + rv = ath.request_validator + + rv.post_v3( + url=f"/agencies", + json=AgencyPostRequest( + name="Test Agency", + type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + location_ids=[california.location_id] + ).model_dump(mode="json") + ) + + agency: Agency = await ath.adb_client().one_or_none_model(model=Agency) + assert agency.name == "Test Agency" + assert agency.agency_type == AgencyType.LAW_ENFORCEMENT + assert 
agency.jurisdiction_type == JurisdictionType.STATE + + link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) + assert link is not None + assert link.agency_id == agency.id + assert link.location_id == california.location_id + + rv.delete_v3( + url=f"/agencies/{agency.id}/locations/{california.location_id}", + ) + + link: LinkAgencyLocation | None = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) + assert link is None + + rv.post_v3( + url=f"/agencies/{agency.id}/locations/{pennsylvania.location_id}", + ) + + link: LinkAgencyLocation = await ath.adb_client().one_or_none_model(model=LinkAgencyLocation) + assert link is not None + assert link.agency_id == agency.id + assert link.location_id == pennsylvania.location_id + + rv.put_v3( + url=f"/agencies/{agency.id}", + json=AgencyPutRequest( + name="Test Agency Updated", + ).model_dump(mode="json") + ) + + agency: Agency = await ath.adb_client().one_or_none_model(model=Agency) + assert agency.name == "Test Agency Updated" + assert agency.agency_type == AgencyType.LAW_ENFORCEMENT + assert agency.jurisdiction_type == JurisdictionType.STATE + + + rv.delete_v3( + url=f"/agencies/{agency.id}", + ) + + agency: Agency | None = await ath.adb_client().one_or_none_model(model=Agency) + assert agency is None diff --git a/tests/automated/integration/api/agencies/delete/test_ds_linked.py b/tests/automated/integration/api/agencies/delete/test_ds_linked.py new file mode 100644 index 00000000..0470c75e --- /dev/null +++ b/tests/automated/integration/api/agencies/delete/test_ds_linked.py @@ -0,0 +1,44 @@ +import pytest + +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.counter 
import next_int + + +@pytest.mark.asyncio +async def test_ds_linked( + api_test_helper: APITestHelper +): + """If an agency has been linked to the Data Sources App, + the deletion operation should include an agency flag for deletion. + """ + + agency = Agency( + name="Test Agency", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + agency_id: int = await api_test_helper.adb_client().add(agency, return_id=True) + + ds_agency_id: int = next_int() + # Add DS link + ds_link = DSAppLinkAgency( + agency_id=agency_id, + ds_agency_id=ds_agency_id, + ) + await api_test_helper.adb_client().add(ds_link) + + api_test_helper.request_validator.delete_v3( + url=f"/agencies/{agency.id}", + ) + + agency: Agency | None = await api_test_helper.adb_client().one_or_none_model(model=Agency) + assert agency is None + + flag: FlagDSDeleteAgency | None = await api_test_helper.adb_client().one_or_none_model(model=FlagDSDeleteAgency) + assert flag is not None + assert flag.ds_agency_id == ds_agency_id + diff --git a/tests/automated/integration/api/annotate/all/test_happy_path.py b/tests/automated/integration/api/annotate/all/test_happy_path.py index 48b60b8b..49d8bd97 100644 --- a/tests/automated/integration/api/annotate/all/test_happy_path.py +++ b/tests/automated/integration/api/annotate/all/test_happy_path.py @@ -1,7 +1,7 @@ import pytest -from src.api.endpoints.annotate.all.get.models.location import LocationAnnotationUserSuggestion from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from src.api.endpoints.annotate.all.get.models.suggestion import SuggestionModel from src.api.endpoints.annotate.all.get.queries.core import GetNextURLForAllAnnotationQueryBuilder from src.api.endpoints.annotate.all.post.models.agency import AnnotationPostAgencyInfo from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo @@ -10,11 +10,11 @@ from src.core.enums import RecordType from 
src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion -from src.db.models.impl.url.suggestion.relevant.user import UserURLTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo from tests.helpers.setup.final_review.core import setup_for_get_next_url_for_final_review @@ -23,7 +23,9 @@ async def test_annotate_all( api_test_helper, pennsylvania: USStateCreationInfo, + allegheny_county: USStateCreationInfo, california: USStateCreationInfo, + test_agency_id: int ): """ Test the happy path workflow for the all-annotations endpoint @@ -46,10 +48,10 @@ async def test_annotate_all( # Get a valid URL to annotate get_response_1 = await ath.request_validator.get_next_url_for_all_annotations() assert get_response_1.next_annotation is not None - assert len(get_response_1.next_annotation.name_suggestions) == 1 - name_suggestion = get_response_1.next_annotation.name_suggestions[0] - assert name_suggestion.name is not None - assert name_suggestion.endorsement_count == 0 + assert len(get_response_1.next_annotation.name_suggestions.suggestions) == 1 + name_suggestion = get_response_1.next_annotation.name_suggestions.suggestions[0] + assert name_suggestion.display_name is not None + assert name_suggestion.user_count == 0 # Apply the second batch id as a filter and see that a different URL is returned get_response_2 = await 
ath.request_validator.get_next_url_for_all_annotations( @@ -110,7 +112,7 @@ async def test_annotate_all( assert suggested_types == {URLType.DATA_SOURCE, URLType.NOT_RELEVANT} # Should be one agency - all_agency_suggestions = await adb_client.get_all(UserUrlAgencySuggestion) + all_agency_suggestions = await adb_client.get_all(UserURLAgencySuggestion) assert len(all_agency_suggestions) == 3 suggested_agency_ids: set[int] = {sugg.agency_id for sugg in all_agency_suggestions} assert agency_id in suggested_agency_ids @@ -140,20 +142,27 @@ async def test_annotate_all( user_id=99, ) ) - user_suggestions: list[LocationAnnotationUserSuggestion] = \ - response.next_annotation.location_suggestions.user.suggestions - assert len(user_suggestions) == 2 + suggestions: list[SuggestionModel] = response.next_annotation.location_suggestions.suggestions + assert len(suggestions) == 2 - response_location_ids: list[int] = [location_suggestion.location_id for location_suggestion in user_suggestions] - assert set(response_location_ids) == {california.location_id, pennsylvania.location_id} + response_location_ids: list[int] = [ + location_suggestion.id + for location_suggestion in suggestions] - response_location_names: list[str] = [location_suggestion.location_name for location_suggestion in user_suggestions] + assert set(response_location_ids) == { + california.location_id, + pennsylvania.location_id + } + + response_location_names: list[str] = [ + location_suggestion.display_name + for location_suggestion in suggestions] assert set(response_location_names) == { "California", "Pennsylvania" } - for user_suggestion in user_suggestions: + for user_suggestion in suggestions: assert user_suggestion.user_count == 1 # Confirm 3 name suggestions diff --git a/tests/automated/integration/api/annotate/anonymous/helper.py b/tests/automated/integration/api/annotate/anonymous/helper.py index ccfe518f..cb892091 100644 --- a/tests/automated/integration/api/annotate/anonymous/helper.py +++ 
b/tests/automated/integration/api/annotate/anonymous/helper.py @@ -1,23 +1,32 @@ -from src.api.endpoints.annotate.all.get.models.response import GetNextURLForAllAnnotationResponse +from uuid import UUID + from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from tests.automated.integration.api._helpers.RequestValidator import RequestValidator async def get_next_url_for_anonymous_annotation( request_validator: RequestValidator, -): + session_id: UUID | None = None +) -> GetNextURLForAnonymousAnnotationResponse: + url = "/annotate/anonymous" + if session_id is not None: + url += f"?session_id={session_id}" + data = request_validator.get( - url=f"/annotate/anonymous" + url=url ) - return GetNextURLForAllAnnotationResponse(**data) + return GetNextURLForAnonymousAnnotationResponse(**data) async def post_and_get_next_url_for_anonymous_annotation( request_validator: RequestValidator, url_id: int, all_annotation_post_info: AllAnnotationPostInfo, -): + session_id: UUID +) -> GetNextURLForAnonymousAnnotationResponse: + url = f"/annotate/anonymous/{url_id}?session_id={session_id}" data = request_validator.post( - url=f"/annotate/anonymous/{url_id}", + url=url, json=all_annotation_post_info.model_dump(mode='json') ) - return GetNextURLForAllAnnotationResponse(**data) \ No newline at end of file + return GetNextURLForAnonymousAnnotationResponse(**data) \ No newline at end of file diff --git a/tests/automated/integration/api/annotate/anonymous/test_core.py b/tests/automated/integration/api/annotate/anonymous/test_core.py index 4b747363..26516b16 100644 --- a/tests/automated/integration/api/annotate/anonymous/test_core.py +++ b/tests/automated/integration/api/annotate/anonymous/test_core.py @@ -1,3 +1,5 @@ +from uuid import UUID + import pytest from src.api.endpoints.annotate.all.get.models.name import NameAnnotationSuggestion @@ -6,8 +8,9 @@ 
from src.api.endpoints.annotate.all.post.models.location import AnnotationPostLocationInfo from src.api.endpoints.annotate.all.post.models.name import AnnotationPostNameInfo from src.api.endpoints.annotate.all.post.models.request import AllAnnotationPostInfo +from src.api.endpoints.annotate.anonymous.get.response import GetNextURLForAnonymousAnnotationResponse from src.core.enums import RecordType -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation @@ -26,6 +29,7 @@ async def test_annotate_anonymous( api_test_helper, pennsylvania: USStateCreationInfo, ): + ath = api_test_helper ddc = ath.db_data_creator rv = ath.request_validator @@ -34,22 +38,24 @@ async def test_annotate_anonymous( setup_info_1 = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_1: URLMapping = setup_info_1.url_mapping + url_mapping_1: SimpleURLMapping = setup_info_1.url_mapping setup_info_2: FinalReviewSetupInfo = await setup_for_get_next_url_for_final_review( db_data_creator=ath.db_data_creator, include_user_annotations=True ) - url_mapping_2: URLMapping = setup_info_2.url_mapping + url_mapping_2: SimpleURLMapping = setup_info_2.url_mapping - get_response_1: GetNextURLForAllAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + get_response_1: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + session_id: UUID = get_response_1.session_id + assert session_id is not None assert get_response_1.next_annotation is not None - assert len(get_response_1.next_annotation.name_suggestions) == 1 - name_suggestion: NameAnnotationSuggestion = 
get_response_1.next_annotation.name_suggestions[0] - assert name_suggestion.name is not None - assert name_suggestion.endorsement_count == 0 + assert len(get_response_1.next_annotation.name_suggestions.suggestions) == 1 + name_suggestion: NameAnnotationSuggestion = get_response_1.next_annotation.name_suggestions.suggestions[0] + assert name_suggestion.display_name is not None + assert name_suggestion.user_count == 0 agency_id: int = await ddc.agency() - post_response_1: GetNextURLForAllAnnotationResponse = await post_and_get_next_url_for_anonymous_annotation( + post_response_1: GetNextURLForAnonymousAnnotationResponse = await post_and_get_next_url_for_anonymous_annotation( rv, get_response_1.next_annotation.url_info.url_id, AllAnnotationPostInfo( @@ -64,8 +70,11 @@ async def test_annotate_anonymous( name_info=AnnotationPostNameInfo( new_name="New Name" ) - ) + ), + session_id=session_id ) + assert post_response_1.session_id == session_id + assert post_response_1.next_annotation is not None assert post_response_1.next_annotation.url_info.url_id != get_response_1.next_annotation.url_info.url_id @@ -81,3 +90,15 @@ async def test_annotate_anonymous( instance: model = instances[0] assert instance.url_id == get_response_1.next_annotation.url_info.url_id + # Run again without giving session ID, confirm original URL returned + get_response_2: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv) + assert get_response_2.session_id != session_id + assert get_response_2.next_annotation is not None + assert get_response_2.next_annotation.url_info.url_id == get_response_1.next_annotation.url_info.url_id + + # Run again while giving session ID, confirm second URL returned + get_response_3: GetNextURLForAnonymousAnnotationResponse = await get_next_url_for_anonymous_annotation(rv, session_id) + assert get_response_3.session_id == session_id + assert get_response_3.next_annotation is not None + assert 
get_response_3.next_annotation.url_info.url_id == post_response_1.next_annotation.url_info.url_id + diff --git a/tests/automated/integration/api/annotate/helpers.py b/tests/automated/integration/api/annotate/helpers.py index 39cfedab..92392ab1 100644 --- a/tests/automated/integration/api/annotate/helpers.py +++ b/tests/automated/integration/api/annotate/helpers.py @@ -1,10 +1,10 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping def check_url_mappings_match( - map_1: URLMapping, - map_2: URLMapping + map_1: SimpleURLMapping, + map_2: SimpleURLMapping ): assert map_1.url_id == map_2.url_id assert map_2.url == map_2.url diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py index f6e28238..6af9ce2b 100644 --- a/tests/automated/integration/api/batch/summaries/test_happy_path.py +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -29,10 +29,6 @@ async def test_get_batch_summaries(api_test_helper): count=4, status=URLCreationEnum.NOT_RELEVANT ), - TestURLCreationParameters( - count=3, - status=URLCreationEnum.ERROR - ) ] ), TestBatchCreationParameters( @@ -78,10 +74,10 @@ async def test_get_batch_summaries(api_test_helper): result_2 = results[1] assert result_2.id == batch_2_id counts_2 = result_2.url_counts - assert counts_2.total == 7 + assert counts_2.total == 4 assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 3 + assert counts_2.errored == 0 + assert counts_2.pending == 0 assert counts_2.submitted == 0 assert counts_2.duplicate == 0 diff --git a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py index c471b6fa..f4181629 100644 --- 
a/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py +++ b/tests/automated/integration/api/batch/summaries/test_pending_url_filter.py @@ -2,7 +2,7 @@ from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.views.batch_url_status.enums import BatchURLStatusEnum from tests.helpers.batch_creation_parameters.enums import URLCreationEnum from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with submitted URLs batch_submitted: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - submitted_url_mappings: list[URLMapping] = await dbdc.create_submitted_urls(count=2) + submitted_url_mappings: list[SimpleURLMapping] = await dbdc.create_submitted_urls(count=2) submitted_url_ids: list[int] = [url_mapping.url_id for url_mapping in submitted_url_mappings] await dbdc.create_batch_url_links( batch_id=batch_submitted, @@ -39,7 +39,7 @@ async def test_get_batch_summaries_pending_url_filter(api_test_helper): # Add a batch with validated URLs batch_validated: int = await dbdc.create_batch(status=BatchStatus.READY_TO_LABEL) - validated_url_mappings: list[URLMapping] = await dbdc.create_validated_urls( + validated_url_mappings: list[SimpleURLMapping] = await dbdc.create_validated_urls( count=2 ) validated_url_ids: list[int] = [url_mapping.url_id for url_mapping in validated_url_mappings] diff --git a/tests/automated/integration/api/batch/test_batch.py b/tests/automated/integration/api/batch/test_batch.py index f1e3d4f2..f34928d6 100644 --- a/tests/automated/integration/api/batch/test_batch.py +++ b/tests/automated/integration/api/batch/test_batch.py @@ -1,8 +1,5 @@ -from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary -from src.db.models.impl.batch.pydantic.info 
import BatchInfo from src.db.dtos.url.insert import InsertURLsInfo -from src.collectors.impl.example.dtos.input import ExampleInputDTO -from src.core.enums import BatchStatus + def test_get_batch_urls(api_test_helper): diff --git a/tests/automated/integration/api/conftest.py b/tests/automated/integration/api/conftest.py deleted file mode 100644 index fa019469..00000000 --- a/tests/automated/integration/api/conftest.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Generator, Any, AsyncGenerator -from unittest.mock import AsyncMock - -import pytest -import pytest_asyncio -from starlette.testclient import TestClient - -from src.api.main import app -from src.core.core import AsyncCore -from src.security.dtos.access_info import AccessInfo -from src.security.enums import Permissions -from src.security.manager import get_access_info -from tests.automated.integration.api._helpers.RequestValidator import RequestValidator -from tests.helpers.api_test_helper import APITestHelper - -MOCK_USER_ID = 1 - -def disable_task_trigger(ath: APITestHelper) -> None: - ath.async_core.collector_manager.post_collection_function_trigger = AsyncMock() - - - -async def fail_task_trigger() -> None: - raise Exception( - "Task Trigger is set to fail in tests by default, to catch unintentional calls." - "If this is not intended, either replace with a Mock or the expected task function." 
- ) - -def override_access_info() -> AccessInfo: - return AccessInfo( - user_id=MOCK_USER_ID, - permissions=[ - Permissions.SOURCE_COLLECTOR, - Permissions.SOURCE_COLLECTOR_FINAL_REVIEW - ] - ) - - -@pytest.fixture(scope="session") -def client(disable_task_flags) -> Generator[TestClient, None, None]: - with TestClient(app) as c: - app.dependency_overrides[get_access_info] = override_access_info - async_core: AsyncCore = c.app.state.async_core - - # Interfaces to the web should be mocked - task_manager = async_core.task_manager - task_manager.url_request_interface = AsyncMock() - task_manager.discord_poster = AsyncMock() - # Disable Logger - task_manager.logger.disabled = True - # Set trigger to fail immediately if called, to force it to be manually specified in tests - task_manager.task_trigger._func = fail_task_trigger - yield c - - # Reset environment variables back to original state - - -@pytest_asyncio.fixture -async def api_test_helper( - client: TestClient, - db_data_creator, - monkeypatch -) -> AsyncGenerator[APITestHelper, Any]: - yield APITestHelper( - request_validator=RequestValidator(client=client), - async_core=client.app.state.async_core, - db_data_creator=db_data_creator, - ) - await client.app.state.async_core.collector_manager.logger.clear_log_queue() diff --git a/tests/automated/integration/api/data_sources/__init__.py b/tests/automated/integration/api/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/data_sources/agencies/__init__.py b/tests/automated/integration/api/data_sources/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/data_sources/agencies/test_add_remove.py b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py new file mode 100644 index 00000000..42a82e11 --- /dev/null +++ b/tests/automated/integration/api/data_sources/agencies/test_add_remove.py @@ -0,0 +1,27 @@ +from 
src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from tests.helpers.api_test_helper import APITestHelper + + +async def test_agencies_add_remove( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_agency_id_2: int, + test_agency_id: int +): + api_test_helper.request_validator.post_v3( + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id_2}", + ) + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) + assert len(links) == 2 + assert {link.agency_id for link in links} == {test_agency_id_2, test_agency_id} + assert {link.url_id for link in links} == {test_url_data_source_id} + + api_test_helper.request_validator.delete_v3( + url=f"/data-sources/{test_url_data_source_id}/agencies/{test_agency_id_2}", + ) + + links: list[LinkURLAgency] = await adb_client.get_all(LinkURLAgency) + assert len(links) == 1 \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py b/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py new file mode 100644 index 00000000..54be1750 --- /dev/null +++ b/tests/automated/integration/api/data_sources/agencies/test_invalid_type.py @@ -0,0 +1,18 @@ +import pytest + +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_agency_id: int +): + for method in ['POST', 'DELETE']: + check_forbidden_url_type( + method=method, + route=f"/data-sources/{test_url_meta_url_id}/agencies/{test_agency_id}", + api_test_helper=api_test_helper, + ) \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/test_invalid_type.py 
b/tests/automated/integration/api/data_sources/test_invalid_type.py new file mode 100644 index 00000000..f415ee2b --- /dev/null +++ b/tests/automated/integration/api/data_sources/test_invalid_type.py @@ -0,0 +1,20 @@ +import pytest + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_meta_url_id: int +): + check_forbidden_url_type( + method="PUT", + route=f"/data-sources/{test_url_meta_url_id}", + api_test_helper=api_test_helper, + json=DataSourcePutRequest( + name="test" + ).model_dump(mode='json') + ) \ No newline at end of file diff --git a/tests/automated/integration/api/data_sources/test_put.py b/tests/automated/integration/api/data_sources/test_put.py new file mode 100644 index 00000000..c954b59c --- /dev/null +++ b/tests/automated/integration/api/data_sources/test_put.py @@ -0,0 +1,89 @@ +from datetime import date + +import pytest + +from src.api.endpoints.data_source.by_id.put.request import DataSourcePutRequest +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_put( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_batch_id: int +): + + api_test_helper.request_validator.put_v3( + 
url=f"/data-sources/{test_url_data_source_id}", + json=DataSourcePutRequest( + url="http://modified_url.com/", + name="Modified URL", + record_type=RecordType.OTHER, + + batch_id=test_batch_id, + description="Modified Description", + + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="Modified Supplying Entity", + coverage_start=date(year=2025, month=4, day=1), + coverage_end=date(year=2025, month=8, day=29), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="Modified Agency Not In DB", + update_method=UpdateMethodEnum.OVERWRITE, + readme_url="https://modified-readme.com", + originating_entity="Modified Originating Entity", + retention_schedule=RetentionScheduleEnum.FUTURE_ONLY, + scraper_url="https://modified-scraper.com", + submission_notes="Modified Submission Notes", + access_notes="Modified Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ).model_dump(mode='json') + + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + url: URL = (await adb_client.get_all(URL))[0] + assert url.url == "modified_url.com" + assert url.scheme == "http" + assert url.trailing_slash == True + assert url.description == "Modified Description" + + # Check Record Type + record_type: URLRecordType = (await adb_client.get_all(URLRecordType))[0] + assert record_type.record_type == RecordType.OTHER + + # Check Batch Link + link: LinkBatchURL = (await adb_client.get_all(LinkBatchURL))[0] + assert link.batch_id == test_batch_id + + # Check Optional Metadata + optional_metadata: URLOptionalDataSourceMetadata = (await adb_client.get_all(URLOptionalDataSourceMetadata))[0] + assert optional_metadata.record_formats == ["csv", "pdf"] + assert optional_metadata.data_portal_type == "CKAN" + assert optional_metadata.supplying_entity == "Modified Supplying Entity" + assert optional_metadata.coverage_start == date(year=2025, month=4, day=1) 
+ assert optional_metadata.coverage_end == date(year=2025, month=8, day=29) + assert optional_metadata.agency_supplied == False + assert optional_metadata.agency_originated == True + assert optional_metadata.agency_aggregation == AgencyAggregationEnum.LOCALITY + assert optional_metadata.agency_described_not_in_database == "Modified Agency Not In DB" + assert optional_metadata.update_method == UpdateMethodEnum.OVERWRITE + assert optional_metadata.readme_url == "https://modified-readme.com" + assert optional_metadata.originating_entity == "Modified Originating Entity" + assert optional_metadata.retention_schedule == RetentionScheduleEnum.FUTURE_ONLY + assert optional_metadata.scraper_url == "https://modified-scraper.com" + assert optional_metadata.submission_notes == "Modified Submission Notes" + assert optional_metadata.access_notes == "Modified Access Notes" + assert optional_metadata.access_types == [AccessTypeEnum.WEBPAGE, AccessTypeEnum.API] diff --git a/tests/automated/integration/api/locations/__init__.py b/tests/automated/integration/api/locations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/locations/post/__init__.py b/tests/automated/integration/api/locations/post/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/locations/post/test_locality.py b/tests/automated/integration/api/locations/post/test_locality.py new file mode 100644 index 00000000..6a1bc4b0 --- /dev/null +++ b/tests/automated/integration/api/locations/post/test_locality.py @@ -0,0 +1,38 @@ +import pytest + +from src.api.endpoints.locations.post.request import AddLocationRequestModel +from src.api.endpoints.locations.post.response import AddLocationResponseModel +from src.db import Locality, Location +from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.county import 
CountyCreationInfo + + +@pytest.mark.asyncio +async def test_add_locality( + allegheny_county: CountyCreationInfo, + adb_client_test: AsyncDatabaseClient, + api_test_helper: APITestHelper +): + # Add Locality + locality_response: dict = api_test_helper.request_validator.post_v3( + "/locations", + json=AddLocationRequestModel( + locality_name="Test Locality", + county_id=allegheny_county.county_id + ).model_dump(mode='json') + ) + response_model = AddLocationResponseModel( + **locality_response + ) + + # Confirm exists in database + localities: list[Locality] = await adb_client_test.get_all(Locality) + assert len(localities) == 1 + assert localities[0].name == "Test Locality" + assert localities[0].county_id == allegheny_county.county_id + + locations: list[Location] = await adb_client_test.get_all(Location) + assert len(locations) == 3 + location_ids = {location.id for location in locations} + assert response_model.location_id in location_ids diff --git a/tests/automated/integration/api/meta_urls/__init__.py b/tests/automated/integration/api/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/meta_urls/agencies/__init__.py b/tests/automated/integration/api/meta_urls/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py new file mode 100644 index 00000000..1bd90ea2 --- /dev/null +++ b/tests/automated/integration/api/meta_urls/agencies/test_add_remove.py @@ -0,0 +1,31 @@ +from src.api.endpoints.agencies.root.get.response import AgencyGetOuterResponse +from tests.helpers.api_test_helper import APITestHelper + + +async def test_agencies_add_remove( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_agency_id: int, + test_agency_id_2: int +): + api_test_helper.request_validator.post_v3( + 
url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id_2}", + ) + + raw_response: dict = api_test_helper.request_validator.get_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies", + ) + response = AgencyGetOuterResponse(**raw_response) + assert len(response.results) == 2 + assert {result.id for result in response.results} == {test_agency_id, test_agency_id_2} + + + api_test_helper.request_validator.delete_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies/{test_agency_id_2}", + ) + + raw_response: dict = api_test_helper.request_validator.get_v3( + url=f"/meta-urls/{test_url_meta_url_id}/agencies", + ) + response = AgencyGetOuterResponse(**raw_response) + assert len(response.results) == 1 diff --git a/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py b/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py new file mode 100644 index 00000000..4f3c6f4a --- /dev/null +++ b/tests/automated/integration/api/meta_urls/agencies/test_invalid_type.py @@ -0,0 +1,18 @@ +import pytest + +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_data_source_id: int, + test_agency_id: int +): + for method in ['POST', 'DELETE']: + check_forbidden_url_type( + method=method, + route=f"/meta-urls/{test_url_data_source_id}/agencies/{test_agency_id}", + api_test_helper=api_test_helper, + ) \ No newline at end of file diff --git a/tests/automated/integration/api/meta_urls/test_invalid_type.py b/tests/automated/integration/api/meta_urls/test_invalid_type.py new file mode 100644 index 00000000..b3e98a3d --- /dev/null +++ b/tests/automated/integration/api/meta_urls/test_invalid_type.py @@ -0,0 +1,20 @@ +import pytest + +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.check 
import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_invalid_type( + api_test_helper: APITestHelper, + test_url_data_source_id: int +): + check_forbidden_url_type( + method="PUT", + route=f"/meta-urls/{test_url_data_source_id}", + api_test_helper=api_test_helper, + json=UpdateMetaURLRequest( + name="test" + ).model_dump(mode='json') + ) \ No newline at end of file diff --git a/tests/automated/integration/api/meta_urls/test_put.py b/tests/automated/integration/api/meta_urls/test_put.py new file mode 100644 index 00000000..1c493009 --- /dev/null +++ b/tests/automated/integration/api/meta_urls/test_put.py @@ -0,0 +1,39 @@ +import pytest + +from src.api.endpoints.meta_url.by_id.put.request import UpdateMetaURLRequest +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_put( + api_test_helper: APITestHelper, + test_url_meta_url_id: int, + test_batch_id: int +): + api_test_helper.request_validator.put_v3( + url=f"/meta-urls/{test_url_meta_url_id}", + json=UpdateMetaURLRequest( + url="new-meta-url.com", + name="Modified name", + description="Modified description", + batch_id=test_batch_id + ).model_dump(mode='json') + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + # Check URL updated (including schema and trailing slash) + url: URL = (await adb_client.get_all(URL))[0] + assert url.url == "new-meta-url.com" + assert url.name == "Modified name" + assert url.scheme == "" + assert url.trailing_slash == False + assert url.description == "Modified description" + + # Check Batch ID + link: LinkBatchURL = (await adb_client.get_all(LinkBatchURL))[0] + assert link.batch_id == test_batch_id + diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py 
b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 090896e8..3d84d6d7 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -3,13 +3,11 @@ from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping -from src.db.helpers.connect import get_postgres_connection_string +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.data_creator.create import create_batch, create_url_data_sources, create_urls, \ create_batch_url_links, create_validated_flags -from tests.helpers.setup.wipe import wipe_database @pytest.mark.asyncio @@ -25,17 +23,17 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_error: list[URLMapping] = await create_urls( + url_mappings_broken: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, - status=URLStatus.ERROR, + status=URLStatus.BROKEN, count=4, ) - url_mappings_ok: list[URLMapping] = await create_urls( + url_mappings_ok: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, status=URLStatus.OK, count=11, ) - url_mappings_all: list[URLMapping] = url_mappings_error + url_mappings_ok + url_mappings_all: list[SimpleURLMapping] = url_mappings_broken + url_mappings_ok url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, @@ -90,5 +88,5 @@ async def test_get_batches_aggregated_metrics( assert inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 - assert 
inner_dto_manual.count_urls_errors == 12 + assert inner_dto_manual.count_urls_errors == 0 # TODO: Change by adding URL Task Errors assert inner_dto_manual.count_urls_validated == 30 diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index c6ef6e0b..6921c3c1 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -1,12 +1,11 @@ from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus from src.core.enums import BatchStatus from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.create import create_batch, create_urls, create_batch_url_links, create_validated_flags, \ create_url_data_sources @@ -23,7 +22,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_1: list[URLMapping] = await create_urls( + url_mappings_1: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=3, ) @@ -50,13 +49,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_mappings: list[URLMapping] = await create_urls( - adb_client=adb_client, - status=URLStatus.ERROR, - count=4, - ) - error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] - validated_url_mappings: list[URLMapping] = await create_urls( + validated_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=8, ) @@ -74,7 +67,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): await 
create_batch_url_links( adb_client=adb_client, batch_id=batch_id_3, - url_ids=error_url_ids + validated_url_ids, + url_ids=validated_url_ids, ) @@ -108,11 +101,11 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 5 + assert dto_batch_3.count_url_total == 8 + assert dto_batch_3.count_url_pending == 1 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 - assert dto_batch_3.count_url_error == 4 + assert dto_batch_3.count_url_error == 0 assert dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index da8dccd6..181c295e 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -2,7 +2,7 @@ import pytest from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -20,7 +20,7 @@ async def test_get_backlog_metrics(api_test_helper): # Ensure that multiple days in each month are added to the backlog table, with different values batch_1_id: int = await ddc.create_batch() - url_mappings_1: list[URLMapping] = await ddc.create_urls(count=3) + url_mappings_1: list[SimpleURLMapping] = await ddc.create_urls(count=3) url_ids_1: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1] await ddc.create_batch_url_links(url_ids=url_ids_1, batch_id=batch_1_id) submitted_url_ids_1: list[int] = url_ids_1[:2] @@ -39,19 +39,13 @@ 
async def test_get_backlog_metrics(api_test_helper): ) batch_2_id: int = await ddc.create_batch() - not_relevant_url_mappings_2: list[URLMapping] = await ddc.create_urls(count=6) + not_relevant_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls(count=6) not_relevant_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in not_relevant_url_mappings_2] await ddc.create_batch_url_links(url_ids=not_relevant_url_ids_2, batch_id=batch_2_id) await ddc.create_validated_flags( url_ids=not_relevant_url_ids_2[:4], validation_type=URLType.NOT_RELEVANT ) - error_url_mappings_2: list[URLMapping] = await ddc.create_urls( - status=URLStatus.ERROR, - count=2 - ) - error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings_2] - await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() @@ -62,7 +56,7 @@ async def test_get_backlog_metrics(api_test_helper): ) batch_3_id: int = await ddc.create_batch() - url_mappings_3: list[URLMapping] = await ddc.create_urls(count=12) + url_mappings_3: list[SimpleURLMapping] = await ddc.create_urls(count=12) url_ids_3: list[int] = [url_mapping.url_id for url_mapping in url_mappings_3] await ddc.create_batch_url_links(url_ids=url_ids_3, batch_id=batch_3_id) await ddc.create_validated_flags( diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 64ae5ae4..e203b722 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -1,10 +1,9 @@ -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta -import pendulum import pytest from src.collectors.enums import CollectorType, URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import 
SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.batch_creation_parameters.core import TestBatchCreationParameters from tests.helpers.batch_creation_parameters.enums import URLCreationEnum @@ -33,24 +32,23 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.MANUAL, date_generated=today - timedelta(days=1) ) - url_mappings_0: list[URLMapping] = await ddc.create_urls(batch_id=batch_0) + url_mappings_0: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_0) oldest_url_id: int = url_mappings_0[0].url_id batch_1: int = await ddc.create_batch( strategy=CollectorType.MANUAL, ) - url_mappings_1_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) - url_mappings_1_submitted: list[URLMapping] = await ddc.create_submitted_urls(count=2) + url_mappings_1_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_1, count=1) + url_mappings_1_submitted: list[SimpleURLMapping] = await ddc.create_submitted_urls(count=2) url_ids_1_submitted: list[int] = [url_mapping.url_id for url_mapping in url_mappings_1_submitted] await ddc.create_batch_url_links(url_ids=url_ids_1_submitted, batch_id=batch_1) batch_2: int = await ddc.create_batch( strategy=CollectorType.AUTO_GOOGLER, ) - url_mappings_2_ok: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_mappings_2_error: list[URLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) - url_mappings_2_validated: list[URLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) - url_mappings_2_not_relevant: list[URLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) + url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) + url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, 
validation_type=URLType.DATA_SOURCE) + url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] url_ids_2_not_relevant: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_not_relevant] await ddc.create_batch_url_links( diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index 3e906a8c..9bdf59ba 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -64,10 +64,6 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.OK, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index cbd30f8b..d0a25ab1 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -47,10 +47,6 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.VALIDATED diff --git a/tests/automated/integration/api/search/agency/test_search.py b/tests/automated/integration/api/search/agency/test_search.py index cc3fee19..f207b3ae 100644 --- a/tests/automated/integration/api/search/agency/test_search.py +++ b/tests/automated/integration/api/search/agency/test_search.py @@ -61,3 +61,13 @@ async def 
test_search_agency( } ) assert len(responses) == 3 + + # Test pagination + responses = api_test_helper.request_validator.get_v2( + url="/search/agency", + params={ + "query": "A Agency", + "location_id": allegheny_county.location_id, + "page": 2 + } + ) diff --git a/tests/automated/integration/api/submit/data_source/__init__.py b/tests/automated/integration/api/submit/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/submit/data_source/test_core.py b/tests/automated/integration/api/submit/data_source/test_core.py new file mode 100644 index 00000000..558327c3 --- /dev/null +++ b/tests/automated/integration/api/submit/data_source/test_core.py @@ -0,0 +1,154 @@ +from datetime import date +from uuid import UUID + +import pytest + +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from src.collectors.enums import URLStatus +from src.core.enums import RecordType, BatchStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.batch.sqlalchemy import Batch +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import 
AnonymousAnnotationURLType +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_submit_data_source( + api_test_helper: APITestHelper, + test_agency_id: int, + pittsburgh_locality: LocalityCreationInfo, +): + ath = api_test_helper + ath.request_validator.post_v3( + url="submit/data-source", + json=DataSourceSubmissionRequest( + source_url="https://example.com/", + name="Example name", + description="Example description", + record_type=RecordType.COMPLAINTS_AND_MISCONDUCT, + coverage_start=date(year=2025, month=8, day=9), + coverage_end=date(year=2025, month=8, day=10), + supplying_entity="Test supplying entity", + agency_supplied=True, + agency_originated=False, + agency_aggregation=AgencyAggregationEnum.STATE, + agency_described_not_in_database="Test agency described not in database", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://example.com/readme", + originating_entity="Test Originating Entity", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://example.com/scraper", + submission_notes="Test submission notes", + data_portal_type="Test data portal", + access_notes="Test access notes", + access_types=[ + AccessTypeEnum.API, + AccessTypeEnum.DOWNLOAD, + AccessTypeEnum.WEBPAGE + ], + record_formats=[ + "Test record format", + "Test record format 2" + ], + + agency_ids=[test_agency_id], + location_ids=[pittsburgh_locality.location_id] + + ).model_dump(mode='json') + ) + + adb_client: AsyncDatabaseClient = api_test_helper.adb_client() + + # Check URL + url: URL = await adb_client.one_or_none_model(URL) + assert url is not None + assert url.url == "example.com" + assert url.scheme == "https" + assert url.trailing_slash == True + assert url.source == URLSource.MANUAL + assert url.status == URLStatus.OK + 
assert url.description == "Example description" + + # Check for Batch + batch: Batch = await adb_client.one_or_none_model(Batch) + assert batch is not None + assert batch.user_id is None + assert batch.strategy == 'manual' + assert batch.status == BatchStatus.READY_TO_LABEL.value + assert batch.parameters == {} + + # Check for Batch URL Link + batch_url_link: LinkBatchURL = await adb_client.one_or_none_model(LinkBatchURL) + assert batch_url_link is not None + assert batch_url_link.batch_id == batch.id + assert batch_url_link.url_id == url.id + + # Check for anonymous annotations + url_type_suggestion: AnonymousAnnotationURLType = await adb_client.one_or_none_model(AnonymousAnnotationURLType) + assert url_type_suggestion is not None + assert url_type_suggestion.url_id == url.id + assert url_type_suggestion.url_type == URLType.DATA_SOURCE + session_id: UUID = url_type_suggestion.session_id + + # Check for Location Suggestion + location_suggestion: AnonymousAnnotationLocation = await adb_client.one_or_none_model(AnonymousAnnotationLocation) + assert location_suggestion is not None + assert location_suggestion.location_id == pittsburgh_locality.location_id + assert location_suggestion.session_id == session_id + + # Check for Agency Suggestion + agency_suggestion: AnonymousAnnotationAgency = await adb_client.one_or_none_model(AnonymousAnnotationAgency) + assert agency_suggestion is not None + assert agency_suggestion.agency_id == test_agency_id + assert agency_suggestion.session_id == session_id + + # Check for Name Suggestion + name_suggestion: URLNameSuggestion = await adb_client.one_or_none_model(URLNameSuggestion) + assert name_suggestion is not None + assert name_suggestion.suggestion == "Example name" + + # Check for Record Type Suggestion + record_type_suggestion: AnonymousAnnotationRecordType = await adb_client.one_or_none_model(AnonymousAnnotationRecordType) + assert record_type_suggestion.record_type == RecordType.COMPLAINTS_AND_MISCONDUCT + assert 
record_type_suggestion.session_id == session_id + + # Check for URL DS Optional Metadata + optional_ds: URLOptionalDataSourceMetadata = await adb_client.one_or_none_model(URLOptionalDataSourceMetadata) + assert optional_ds is not None + assert optional_ds.coverage_start == date(year=2025, month=8, day=9) + assert optional_ds.coverage_end == date(year=2025, month=8, day=10) + assert optional_ds.supplying_entity == "Test supplying entity" + assert optional_ds.agency_supplied + assert not optional_ds.agency_originated + assert optional_ds.agency_aggregation == AgencyAggregationEnum.STATE + assert optional_ds.agency_described_not_in_database == "Test agency described not in database" + assert optional_ds.data_portal_type == "Test data portal" + assert optional_ds.update_method == UpdateMethodEnum.NO_UPDATES + assert optional_ds.readme_url == "https://example.com/readme" + assert optional_ds.originating_entity == "Test Originating Entity" + assert optional_ds.retention_schedule == RetentionScheduleEnum.GT_10_YEARS + assert optional_ds.scraper_url == "https://example.com/scraper" + assert optional_ds.submission_notes == "Test submission notes" + assert optional_ds.access_notes == "Test access notes" + assert optional_ds.access_types == [ + AccessTypeEnum.API, + AccessTypeEnum.DOWNLOAD, + AccessTypeEnum.WEBPAGE + ] + assert optional_ds.record_formats == [ + "Test record format", + "Test record format 2" + ] + diff --git a/tests/automated/integration/api/submit/data_source/test_duplicate.py b/tests/automated/integration/api/submit/data_source/test_duplicate.py new file mode 100644 index 00000000..ea16e1ec --- /dev/null +++ b/tests/automated/integration/api/submit/data_source/test_duplicate.py @@ -0,0 +1,38 @@ +import pytest +from fastapi import HTTPException + +from src.api.endpoints.submit.data_source.models.response.duplicate import SubmitDataSourceURLDuplicateSubmissionResponse +from src.api.endpoints.submit.data_source.request import DataSourceSubmissionRequest +from 
src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.dtos.url.mapping_.simple import SimpleURLMapping +from src.db.models.impl.flag.url_validated.enums import URLType +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_submit_data_source_duplicate( + api_test_helper: APITestHelper, + test_agency_id: int, + pittsburgh_locality: LocalityCreationInfo, + test_url_data_source_mapping: SimpleURLMapping +): + + ath = api_test_helper + with pytest.raises(HTTPException) as exc_info: + ath.request_validator.post_v3( + url="submit/data-source", + json=DataSourceSubmissionRequest( + source_url=test_url_data_source_mapping.url, + name="Test Name", + record_type=RecordType.RECORDS_REQUEST_INFO + ).model_dump(mode='json') + ) + e = exc_info.value + response = e.detail['detail'] + model = SubmitDataSourceURLDuplicateSubmissionResponse(**response) + assert model.url_id == test_url_data_source_mapping.url_id + assert model.url_type == URLType.DATA_SOURCE + assert model.url_status == URLStatus.OK + assert model.message == "Duplicate URL found" diff --git a/tests/automated/integration/api/submit/test_duplicate.py b/tests/automated/integration/api/submit/test_duplicate.py index c1ccfd29..0bef1091 100644 --- a/tests/automated/integration/api/submit/test_duplicate.py +++ b/tests/automated/integration/api/submit/test_duplicate.py @@ -3,7 +3,7 @@ from src.api.endpoints.submit.url.enums import URLSubmissionStatus from src.api.endpoints.submit.url.models.request import URLSubmissionRequest from src.api.endpoints.submit.url.models.response import URLSubmissionResponse -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator @@ -13,7 +13,7 @@ async def test_duplicate( api_test_helper: 
APITestHelper, db_data_creator: DBDataCreator ): - url_mapping: URLMapping = (await db_data_creator.create_urls(count=1))[0] + url_mapping: SimpleURLMapping = (await db_data_creator.create_urls(count=1))[0] response: URLSubmissionResponse = await api_test_helper.request_validator.submit_url( request=URLSubmissionRequest( diff --git a/tests/automated/integration/api/submit/test_url_maximal.py b/tests/automated/integration/api/submit/test_url_maximal.py index 8d1930f5..e57770fb 100644 --- a/tests/automated/integration/api/submit/test_url_maximal.py +++ b/tests/automated/integration/api/submit/test_url_maximal.py @@ -8,7 +8,7 @@ from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.agency.user import UserUrlAgencySuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion @@ -32,6 +32,7 @@ async def test_maximal( request=URLSubmissionRequest( url="www.example.com", record_type=RecordType.INCARCERATION_RECORDS, + description="Example description", name="Example URL", location_id=pittsburgh_locality.location_id, agency_id=agency_id, @@ -48,15 +49,16 @@ async def test_maximal( url: URL = urls[0] assert url.id == url_id assert url.url == "www.example.com" + assert url.description == "Example description" links: list[LinkUserSubmittedURL] = await adb_client.get_all(LinkUserSubmittedURL) assert len(links) == 1 link: LinkUserSubmittedURL = links[0] assert link.url_id == url_id - agen_suggs: list[UserUrlAgencySuggestion] = await 
adb_client.get_all(UserUrlAgencySuggestion) + agen_suggs: list[UserURLAgencySuggestion] = await adb_client.get_all(UserURLAgencySuggestion) assert len(agen_suggs) == 1 - agen_sugg: UserUrlAgencySuggestion = agen_suggs[0] + agen_sugg: UserURLAgencySuggestion = agen_suggs[0] assert agen_sugg.url_id == url_id assert agen_sugg.agency_id == agency_id diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index dae5ee4f..fa3f7884 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -3,7 +3,7 @@ from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInnerInputDTO, ManualBatchInputDTO from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.batch.sqlalchemy import Batch from src.collectors.enums import CollectorType @@ -20,14 +20,14 @@ async def test_manual_batch(api_test_helper): dtos = [] for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i}", + url=f"example.com/{i}", ) dtos.append(dto) # Create 50 entries with URL and all optional fields for i in range(50): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+50}", + url=f"example.com/{i+50}", name=manual_batch_name, description=f"Description {i}", collector_metadata={ @@ -121,7 +121,10 @@ def check_url(url: URL, url_only: bool): def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: bool): assert metadata.url_id is not None - other_attributes = ["record_formats", "data_portal_type", "supplying_entity"] + other_attributes = [ + "data_portal_type", + "supplying_entity" + ] return check_attributes(metadata, 
other_attributes, no_optional) # Confirm 50 have nothing but URL id @@ -142,13 +145,13 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo more_dtos = [] for i in range(49): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+100}", + url=f"example.com/{i+100}", ) more_dtos.append(dto) for i in range(2): dto = ManualBatchInnerInputDTO( - url=f"https://example.com/{i+1}", + url=f"example.com/{i+1}", ) more_dtos.append(dto) @@ -162,7 +165,7 @@ def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: boo response = await ath.request_validator.submit_manual_batch(duplicate_input_dto) # Check duplicate URLs assert len(response.duplicate_urls) == 2 - assert response.duplicate_urls == ['https://example.com/1', 'https://example.com/2'] + assert response.duplicate_urls == ['example.com/1', 'example.com/2'] assert len(response.urls) == 49 # Check 149 URLs in database diff --git a/tests/automated/integration/api/url/by_id/delete/__init__.py b/tests/automated/integration/api/url/by_id/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/url/by_id/delete/setup.py b/tests/automated/integration/api/url/by_id/delete/setup.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/api/url/by_id/delete/test_any_url.py b/tests/automated/integration/api/url/by_id/delete/test_any_url.py new file mode 100644 index 00000000..50b3ca0c --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_any_url.py @@ -0,0 +1,458 @@ +from uuid import UUID + +import pytest +from sqlalchemy import select + +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.dtos.url.mapping_.simple import SimpleURLMapping +from src.db.enums import ChangeLogOperationType +from src.db.models.impl.change_log import ChangeLog +from src.db.models.impl.flag.checked_for_ia.sqlalchemy import 
FlagURLCheckedForInternetArchives +from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL +from src.db.models.impl.flag.url_suspended.sqlalchemy import FlagURLSuspended +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL +from src.db.models.impl.link.url_redirect_url.sqlalchemy import LinkURLRedirectURL +from src.db.models.impl.link.urls_root_url.sqlalchemy import LinkURLRootURL +from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.link.user_suggestion_not_found.agency.sqlalchemy import LinkUserSuggestionAgencyNotFound +from src.db.models.impl.link.user_suggestion_not_found.location.sqlalchemy import LinkUserSuggestionLocationNotFound +from src.db.models.impl.link.user_suggestion_not_found.users_submitted_url.sqlalchemy import LinkUserSubmittedURL +from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata +from src.db.models.impl.url.internet_archives.save.sqlalchemy import URLInternetArchivesSaveMetadata +from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot +from src.db.models.impl.url.suggestion.agency.subtask.enum import SubtaskDetailCode, AutoAgencyIDSubtaskType +from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask +from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from 
src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from src.db.models.impl.url.suggestion.location.auto.subtask.enums import LocationIDSubtaskType +from src.db.models.impl.url.suggestion.location.auto.subtask.sqlalchemy import AutoLocationIDSubtask +from src.db.models.impl.url.suggestion.location.auto.suggestion.sqlalchemy import LocationIDSubtaskSuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.models.impl.url.suggestion.record_type.auto import AutoRecordTypeSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_any_url( + pittsburgh_locality: LocalityCreationInfo, + db_data_creator: DBDataCreator, + test_agency_id: int, + api_test_helper: APITestHelper +): + """ + Test that deletion works properly for a URL that has all possible attributes + that any URL could have + """ + + 
url_id: int = await _setup( + ddc=db_data_creator, + pittsburgh_id=pittsburgh_locality.location_id, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results(url_id, dbc=db_data_creator.adb_client) + + + +async def _check_results( + url_id: int, + dbc: AsyncDatabaseClient +) -> None: + # There should be only two urls present in the database, neither matching URL id + urls: list[URL] = await dbc.get_all(URL) + assert len(urls) == 2 + assert url_id not in (url.id for url in urls) + + # For the following models, there should no longer be any entries in the database. + models = [ + # Batch Link + LinkBatchURL, + # MISCELLANEOUS + ## Flag Root URL + FlagRootURL, + ## URL Task Error + URLTaskError, + ## URL Checked for Duplicate + URLCheckedForDuplicate, + ## Flag URL Suspended + FlagURLSuspended, + # LINKS + ## Link URLs Redirect URL + LinkURLRedirectURL, + ## Link URLs Root URL + LinkURLRootURL, + ## Link User Submitted URLs + LinkUserSubmittedURL, + ## Link User Suggestion Agency Not Found + LinkUserSuggestionAgencyNotFound, + ## Link User Suggestion Location Not Found + LinkUserSuggestionLocationNotFound, + # WEB DATA + ## URL Compressed HTML + URLCompressedHTML, + ## URL HTML Content + URLHTMLContent, + ## URL Screenshot + URLScreenshot, + ## URL Web Metadata + URLWebMetadata, + # INTERNET ARCHIVES + ## Flag URL Checked for Internet Archives + FlagURLCheckedForInternetArchives, + ## URL Internet Archives Probe Metadata + URLInternetArchivesProbeMetadata, + ## URL Internet Archives Save Metadata + URLInternetArchivesSaveMetadata, + # ANNOTATIONS + ## AUTO + ### Agency + URLAutoAgencyIDSubtask, + AgencyIDSubtaskSuggestion, + ### Record Type + AutoRecordTypeSuggestion, + ### URL Type + AutoRelevantSuggestion, + ### Location + AutoLocationIDSubtask, + LocationIDSubtaskSuggestion, + ## USER + ### Agency + UserURLAgencySuggestion, + ### Record Type + UserRecordTypeSuggestion, + ### URL Type + 
UserURLTypeSuggestion, + ### Location + UserLocationSuggestion, + URLNameSuggestion, + ## ANONYMOUS + ### Agency + AnonymousAnnotationAgency, + ### Location + AnonymousAnnotationLocation, + ### Record Type + AnonymousAnnotationRecordType, + ### URL Type + AnonymousAnnotationURLType, + ] + for model in models: + assert await dbc.get_all(model) == [] + + # The Change Log should show, at minimum, the deletion of the URL + query = ( + select( + ChangeLog + ) + .where( + ChangeLog.table_name == "urls", + ChangeLog.operation_type == ChangeLogOperationType.DELETE + ) + ) + result = dbc.one_or_none(query) + assert result is not None + + +async def _setup( + ddc: DBDataCreator, + pittsburgh_id: int, + agency_id: int +) -> int: + dbc: AsyncDatabaseClient = ddc.adb_client + # URL & Batch Link + url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + + # MISCELLANEOUS + ## Flag Root URL + await ddc.flag_as_root(url_ids=[url.url_id]) + ## URL Task Error + ### Task + task_id: int = await ddc.task(url_ids=[url.url_id]) + ### Error + await ddc.task_errors(url_ids=[url.url_id], task_id=task_id) + ## URL Checked for Duplicate + await dbc.add( + URLCheckedForDuplicate( + url_id=url.url_id + ) + ) + ## Flag URL Suspended + await dbc.add( + FlagURLSuspended( + url_id=url.url_id + ) + ) + # LINKS + ## Link URLs Redirect URL + ### Additional url + additional_url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + ### Redirect url + await dbc.add( + LinkURLRedirectURL( + source_url_id=url.url_id, + destination_url_id=additional_url.url_id + ) + ) + ### (We will go in both directions even though this should technically not be legal) + await dbc.add( + LinkURLRedirectURL( + source_url_id=additional_url.url_id, + destination_url_id=url.url_id + ) + ) + ## Link URLs Root URL + ### (Again, will go in both directions despite this not being legal) + root_url: SimpleURLMapping = (await ddc.create_urls( + record_type=None + ))[0] + await dbc.add( + 
LinkURLRootURL( + url_id=url.url_id, + root_url_id=root_url.url_id + ) + ) + await dbc.add( + LinkURLRootURL( + url_id=root_url.url_id, + root_url_id=url.url_id + ) + ) + ## Link User Submitted URL + await dbc.add( + LinkUserSubmittedURL( + url_id=url.url_id, + user_id=1 + ) + ) + ## Link User Suggestion Agency Not Found + await dbc.add( + LinkUserSuggestionAgencyNotFound( + url_id=url.url_id, + user_id=1 + ) + ) + ## Link User Suggestion Location Not Found + await dbc.add( + LinkUserSuggestionLocationNotFound( + url_id=url.url_id, + user_id=1 + ) + ) + # WEB DATA + ## URL Compressed HTML + await ddc.add_compressed_html( + url_ids=[url.url_id] + ) + ## URL HTML Content + await dbc.add( + URLHTMLContent( + url_id=url.url_id, + content_type="Title", + content="Test Title" + ) + ) + ## URL Screenshot + await dbc.add( + URLScreenshot( + url_id=url.url_id, + content=b"Test Screenshot", + file_size=1024 + ) + ) + ## URL Web Metadata + await ddc.create_web_metadata( + url_ids=[url.url_id] + ) + # INTERNET ARCHIVES + ## Flag URL Checked for Internet Archives + await dbc.add( + FlagURLCheckedForInternetArchives( + url_id=url.url_id, + success=True + ) + ) + ## URL Internet Archives Probe Metadata + await dbc.add( + URLInternetArchivesProbeMetadata( + url_id=url.url_id, + archive_url="https://example.com", + digest="test_digest", + length=1024, + ) + ) + ## URL Internet Archives Save Metadata + await dbc.add( + URLInternetArchivesSaveMetadata( + url_id=url.url_id, + ) + ) + # ANNOTATIONS + ## AUTO + ### Agency + #### Subtask + agency_subtask_id: int = await dbc.add( + URLAutoAgencyIDSubtask( + url_id=url.url_id, + task_id=task_id, + agencies_found=True, + type=AutoAgencyIDSubtaskType.NLP_LOCATION_MATCH, + detail=SubtaskDetailCode.NO_DETAILS + ), + return_id=True + ) + ### Suggestion + await dbc.add( + AgencyIDSubtaskSuggestion( + subtask_id=agency_subtask_id, + agency_id=agency_id, + confidence=60 + ) + ) + ### Record Type + await dbc.add( + AutoRecordTypeSuggestion( + 
url_id=url.url_id, + record_type=RecordType.BOOKING_REPORTS.value + ) + ) + ### Relevant + await dbc.add( + AutoRelevantSuggestion( + url_id=url.url_id, + relevant=True, + confidence=0.5, + model_name="Test Model" + ) + ) + ### Location + #### Subtask + location_subtask_id: int = await dbc.add( + AutoLocationIDSubtask( + url_id=url.url_id, + task_id=task_id, + locations_found=True, + type=LocationIDSubtaskType.NLP_LOCATION_FREQUENCY, + ), + return_id=True + ) + #### Suggestion + await dbc.add( + LocationIDSubtaskSuggestion( + subtask_id=location_subtask_id, + location_id=pittsburgh_id, + confidence=50 + ) + ) + ## USER + ### Agency + await dbc.add( + UserURLAgencySuggestion( + url_id=url.url_id, + user_id=1, + agency_id=agency_id, + is_new=False + ) + ) + ### Record Type + await dbc.add( + UserRecordTypeSuggestion( + url_id=url.url_id, + user_id=1, + record_type=RecordType.BOOKING_REPORTS.value, + ) + ) + ### URL Type + await dbc.add( + UserURLTypeSuggestion( + url_id=url.url_id, + type=URLType.INDIVIDUAL_RECORD, + user_id=1 + ) + ) + ### Location + await dbc.add( + UserLocationSuggestion( + url_id=url.url_id, + location_id=pittsburgh_id, + user_id=1, + ) + ) + ### Name + name_suggestion_id: int = await dbc.add( + URLNameSuggestion( + url_id=url.url_id, + suggestion="Test Name", + source=NameSuggestionSource.USER, + ), + return_id=True + ) + await dbc.add( + LinkUserNameSuggestion( + suggestion_id=name_suggestion_id, + user_id=1, + ) + ) + session_id: UUID = await dbc.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) + ## ANONYMOUS + for model in [ + ### Agency + AnonymousAnnotationAgency( + url_id=url.url_id, + agency_id=agency_id, + session_id=session_id, + ), + ### Record Type + AnonymousAnnotationRecordType( + url_id=url.url_id, + record_type=RecordType.BOOKING_REPORTS.value, + session_id=session_id, + ), + ### URL Type + AnonymousAnnotationURLType( + url_id=url.url_id, + url_type=URLType.INDIVIDUAL_RECORD, + session_id=session_id, + ), + ### Location 
+ AnonymousAnnotationLocation( + url_id=url.url_id, + location_id=pittsburgh_id, + session_id=session_id + ) + ]: + await dbc.add(model) + + return url.url_id + + + + + + diff --git a/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py b/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py new file mode 100644 index 00000000..d551118b --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_data_source_url.py @@ -0,0 +1,115 @@ +from datetime import date + +import pytest + +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.enums import AccessTypeEnum, RetentionScheduleEnum, UpdateMethodEnum, \ + AgencyAggregationEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_data_source_url( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper, + test_agency_id: int +): + """ + Test that deletion works properly for a URL that is a validated data source + and has all data source-only attributes. 
+ """ + + url_id: int = await _setup( + ddc=db_data_creator, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + dbc=db_data_creator.adb_client + ) + +async def _check_results( + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## URL and all associated tables should be deleted + assert await dbc.has_no_rows(URL) + + ### Record Type should be deleted + assert await dbc.has_no_rows(URLOptionalDataSourceMetadata) + assert await dbc.has_no_rows(LinkURLAgency) + assert await dbc.has_no_rows(URLRecordType) + + ## DS App Link should not yet be deleted + app_link: DSAppLinkDataSource = await dbc.one_or_none_model(DSAppLinkDataSource) + assert app_link is not None + + ## DS App Data Source Deletion Flag should be added + flag: FlagDSDeleteDataSource = await dbc.one_or_none_model(FlagDSDeleteDataSource) + assert flag is not None + assert flag.ds_data_source_id == app_link.ds_data_source_id + + +async def _setup( + ddc: DBDataCreator, + agency_id: int +) -> int: + pass + # SETUP + ## Validated Flag - Data Source + ## Record Type + url_id: int = (await ddc.create_validated_urls( + validation_type=URLType.DATA_SOURCE, + record_type=RecordType.BOOKING_REPORTS, + count=1 + ))[0].url_id + + ## Link Agency + await ddc.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_id] + ) + + ## Optional DS Metadata + optional_ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + 
retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ) + await ddc.adb_client.add(optional_ds_metadata) + + ## DS App Link + app_link = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=1 + ) + await ddc.adb_client.add(app_link) + + return url_id diff --git a/tests/automated/integration/api/url/by_id/delete/test_meta_url.py b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py new file mode 100644 index 00000000..0fbee489 --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_meta_url.py @@ -0,0 +1,77 @@ +import pytest + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_meta_url( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper, + test_agency_id: int +): + """ + Test that deletion works properly for a URL that is a validated meta url + and has all data source-only attributes. 
+ """ + + url_id: int = await _setup( + ddc=db_data_creator, + agency_id=test_agency_id + ) + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + dbc=db_data_creator.adb_client + ) + + +async def _check_results( + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## URL and all associated tables should be deleted + assert await dbc.has_no_rows(URL) + + ## DS App Link should not yet be deleted + app_link: DSAppLinkMetaURL = await dbc.one_or_none_model(DSAppLinkMetaURL) + assert app_link is not None + + ## DS App Meta URL Deletion Flag should be added + flag: FlagDSDeleteMetaURL = await dbc.one_or_none_model(FlagDSDeleteMetaURL) + assert flag is not None + assert flag.ds_meta_url_id == app_link.ds_meta_url_id + + +async def _setup( + ddc: DBDataCreator, + agency_id: int +) -> int: + pass + # SETUP + ## Validated Flag - Meta URL + url_id: int = (await ddc.create_validated_urls( + validation_type=URLType.META_URL, + count=1 + ))[0].url_id + + ## Link Agency + await ddc.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_id] + ) + ## DS App Link + app_link = DSAppLinkMetaURL( + url_id=url_id, + ds_meta_url_id=1 + ) + await ddc.adb_client.add(app_link) + return url_id + diff --git a/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py b/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py new file mode 100644 index 00000000..6e6a738d --- /dev/null +++ b/tests/automated/integration/api/url/by_id/delete/test_validated_not_relevant.py @@ -0,0 +1,71 @@ +import pytest + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio 
+async def test_validated_not_relevant( + db_data_creator: DBDataCreator, + api_test_helper: APITestHelper +): + """ + Test that deletion works properly for a URL that is a validated + as any of the non-relevant URL types + (not relevant, broken, individual record) + """ + + url_ids: list[int] = await _setup( + ddc=db_data_creator + ) + for url_id in url_ids: + api_test_helper.request_validator.delete_v3( + f"url/{url_id}" + ) + await _check_results( + url_ids, + dbc=db_data_creator.adb_client + ) + + + +async def _check_results( + url_ids: list[int], + dbc: AsyncDatabaseClient +) -> None: + pass + # CHECK + ## Each URLs Validation Flags should be deleted + url_validation_flags: list[FlagURLValidated] = await dbc.get_all(FlagURLValidated) + assert len(url_validation_flags) == 0 + + ## Each URL should be deleted + urls: list[URL] = await dbc.get_all(URL) + assert len(urls) == 0 + +async def _setup( + ddc: DBDataCreator +) -> list[int]: + url_ids: list[int] = [] + # SETUP (3 URLs) + for validated_type in [ + ## Validated Flag - Individual Record + URLType.INDIVIDUAL_RECORD, + ## Validated Flag - Broken + URLType.BROKEN_PAGE, + ## Validated Flag - Not Relevant + URLType.NOT_RELEVANT + ]: + url_id: int = (await ddc.create_validated_urls( + validation_type=validated_type, + count=1 + ))[0].url_id + url_ids.append(url_id) + return url_ids + + + diff --git a/tests/automated/integration/api/url/by_id/snapshot/test_success.py b/tests/automated/integration/api/url/by_id/snapshot/test_success.py index e3ea9d73..3109706d 100644 --- a/tests/automated/integration/api/url/by_id/snapshot/test_success.py +++ b/tests/automated/integration/api/url/by_id/snapshot/test_success.py @@ -1,6 +1,6 @@ import pytest -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from tests.automated.integration.api._helpers.RequestValidator import RequestValidator from 
tests.helpers.api_test_helper import APITestHelper @@ -15,7 +15,7 @@ async def test_get_url_screenshot_success( ddc: DBDataCreator = api_test_helper.db_data_creator rv: RequestValidator = ath.request_validator - url_mapping: URLMapping = (await ddc.create_urls())[0] + url_mapping: SimpleURLMapping = (await ddc.create_urls())[0] url_id: int = url_mapping.url_id url_screenshot = URLScreenshot( diff --git a/tests/automated/integration/conftest.py b/tests/automated/integration/conftest.py index 574f35f4..19a9fe19 100644 --- a/tests/automated/integration/conftest.py +++ b/tests/automated/integration/conftest.py @@ -1,12 +1,27 @@ -from unittest.mock import MagicMock +from typing import Generator, AsyncGenerator, Any +from unittest.mock import MagicMock, AsyncMock import pytest import pytest_asyncio +from starlette.testclient import TestClient +from src.api.main import app +from src.collectors.enums import URLStatus from src.collectors.manager import AsyncCollectorManager from src.core.core import AsyncCore +from src.core.enums import RecordType from src.core.logger import AsyncCoreLogger from src.db.client.async_ import AsyncDatabaseClient +from src.db.client.sync import DatabaseClient +from src.db.dtos.url.mapping_.simple import SimpleURLMapping +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.security.dtos.access_info import AccessInfo +from src.security.enums import Permissions +from src.security.manager import get_access_info +from tests.automated.integration.api._helpers.RequestValidator import RequestValidator +from tests.helpers.api_test_helper import APITestHelper from tests.helpers.data_creator.core import DBDataCreator from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo @@ -93,4 +108,184 @@ async def 
los_angeles_locality( state_id=california.us_state_id, county_id=los_angeles_county.county_id, name="Los Angeles" - ) \ No newline at end of file + ) + + +MOCK_USER_ID = 1 + + +async def fail_task_trigger() -> None: + raise Exception( + "Task Trigger is set to fail in tests by default, to catch unintentional calls." + "If this is not intended, either replace with a Mock or the expected task function." + ) + + +def override_access_info() -> AccessInfo: + return AccessInfo( + user_id=MOCK_USER_ID, + permissions=[ + Permissions.SOURCE_COLLECTOR, + Permissions.SOURCE_COLLECTOR_FINAL_REVIEW + ] + ) + + +@pytest.fixture(scope="session") +def client(disable_task_flags) -> Generator[TestClient, None, None]: + with TestClient(app) as c: + app.dependency_overrides[get_access_info] = override_access_info + async_core: AsyncCore = c.app.state.async_core + + # Interfaces to the web should be mocked + task_manager = async_core.task_manager + task_manager.url_request_interface = AsyncMock() + task_manager.discord_poster = AsyncMock() + # Disable Logger + task_manager.logger.disabled = True + # Set trigger to fail immediately if called, to force it to be manually specified in tests + task_manager.task_trigger._func = fail_task_trigger + yield c + + # Reset environment variables back to original state + + +@pytest_asyncio.fixture +async def api_test_helper( + client: TestClient, + db_client_test: DatabaseClient, + adb_client_test: AsyncDatabaseClient + ) -> AsyncGenerator[APITestHelper, Any]: + yield APITestHelper( + request_validator=RequestValidator(client=client), + async_core=client.app.state.async_core, + db_data_creator=DBDataCreator( + db_client=db_client_test, + adb_client=adb_client_test + ), + ) + await client.app.state.async_core.collector_manager.logger.clear_log_queue() + +@pytest.fixture +def test_batch_id( + db_data_creator: DBDataCreator +) -> int: + return db_data_creator.batch() + +@pytest_asyncio.fixture +async def test_agency_id( + db_data_creator: 
DBDataCreator, + pittsburgh_locality: LocalityCreationInfo, + pennsylvania: USStateCreationInfo +) -> int: + """Test agency linked to two locations: Pittsburgh and Pennsylvania""" + agency_id: int = await db_data_creator.agency( + name="Test Agency" + ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pittsburgh_locality.location_id + ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pennsylvania.location_id + ) + return agency_id + +@pytest_asyncio.fixture +async def test_agency_id_2( + db_data_creator: DBDataCreator, + pittsburgh_locality: LocalityCreationInfo +) -> int: + agency_id: int = await db_data_creator.agency( + name="Test Agency 2" + ) + await db_data_creator.link_agencies_to_location( + agency_ids=[agency_id], + location_id=pittsburgh_locality.location_id + ) + return agency_id + +@pytest_asyncio.fixture +async def test_url_data_source_id( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> int: + url_id: int = (await db_data_creator.create_validated_urls( + record_type=RecordType.CRIME_STATISTICS, + validation_type=URLType.DATA_SOURCE, + ))[0].url_id + await db_data_creator.link_urls_to_agencies( + url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id + +@pytest_asyncio.fixture +async def test_url_data_source_id_2( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> int: + url_id: int = (await db_data_creator.create_validated_urls( + record_type=RecordType.CAR_GPS, + validation_type=URLType.DATA_SOURCE, + ))[0].url_id + await db_data_creator.link_urls_to_agencies( + url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id + +@pytest_asyncio.fixture +async def test_url_id( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + +@pytest_asyncio.fixture 
+async def test_url_id_2( + db_data_creator: DBDataCreator, +) -> int: + url = URL( + url="example.com/2", + source=URLSource.COLLECTOR, + trailing_slash=False, + status=URLStatus.OK + ) + return await db_data_creator.adb_client.add(url, return_id=True) + + +@pytest_asyncio.fixture +async def test_url_data_source_mapping( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> SimpleURLMapping: + url_mapping: SimpleURLMapping = (await db_data_creator.create_validated_urls( + record_type=RecordType.CRIME_STATISTICS, + validation_type=URLType.DATA_SOURCE, + ))[0] + await db_data_creator.link_urls_to_agencies( + url_ids=[url_mapping.url_id], + agency_ids=[test_agency_id] + ) + return url_mapping + +@pytest_asyncio.fixture +async def test_url_meta_url_id( + db_data_creator: DBDataCreator, + test_agency_id: int +) -> int: + url_id: int = (await db_data_creator.create_validated_urls( + validation_type=URLType.META_URL, + ))[0].url_id + await db_data_creator.link_urls_to_agencies( + url_ids=[url_id], + agency_ids=[test_agency_id] + ) + return url_id diff --git a/tests/automated/integration/db/client/approve_url/test_basic.py b/tests/automated/integration/db/client/approve_url/test_basic.py index c9eb62b1..f090a4ea 100644 --- a/tests/automated/integration/db/client/approve_url/test_basic.py +++ b/tests/automated/integration/db/client/approve_url/test_basic.py @@ -6,7 +6,7 @@ from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.reviewing_user import ReviewingUserURL from tests.helpers.setup.final_review.core import 
setup_for_get_next_url_for_final_review diff --git a/tests/automated/integration/db/client/test_insert_urls.py b/tests/automated/integration/db/client/test_insert_urls.py index f2d73f00..852da385 100644 --- a/tests/automated/integration/db/client/test_insert_urls.py +++ b/tests/automated/integration/db/client/test_insert_urls.py @@ -24,17 +24,17 @@ async def test_insert_urls( urls = [ URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_1"}, source=URLSource.COLLECTOR ), URLInfo( - url="https://example.com/2", + url="example.com/2", source=URLSource.COLLECTOR ), # Duplicate URLInfo( - url="https://example.com/1", + url="example.com/1", collector_metadata={"name": "example_duplicate"}, source=URLSource.COLLECTOR ) @@ -46,8 +46,8 @@ async def test_insert_urls( url_mappings = insert_urls_info.url_mappings assert len(url_mappings) == 2 - assert url_mappings[0].url == "https://example.com/1" - assert url_mappings[1].url == "https://example.com/2" + assert url_mappings[0].url == "example.com/1" + assert url_mappings[1].url == "example.com/2" assert insert_urls_info.original_count == 2 diff --git a/tests/automated/integration/db/structure/test_root_url.py b/tests/automated/integration/db/structure/test_root_url.py index 8f8be80b..62755b00 100644 --- a/tests/automated/integration/db/structure/test_root_url.py +++ b/tests/automated/integration/db/structure/test_root_url.py @@ -13,7 +13,7 @@ def test_root_url(db_data_creator: DBDataCreator): ColumnTester( column_name="url", type_=sa.String, - allowed_values=["https://example.com"] + allowed_values=["example.com"] ), ColumnTester( column_name="page_title", diff --git a/tests/automated/integration/db/structure/test_updated_at.py b/tests/automated/integration/db/structure/test_updated_at.py new file mode 100644 index 00000000..0a4c18a4 --- /dev/null +++ b/tests/automated/integration/db/structure/test_updated_at.py @@ -0,0 +1,40 @@ +import asyncio +from datetime import datetime + 
+import pytest + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.pydantic.upsert import URLUpsertModel +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_updated_at(db_data_creator: DBDataCreator): + + _ = await db_data_creator.create_urls( + count=1, + status=URLStatus.OK + ) + + urls: list[URL] = await db_data_creator.adb_client.get_all(URL) + url = urls[0] + assert url.updated_at is not None + updated_at: datetime = url.updated_at + + url_upsert = URLUpsertModel( + id=url.id, + name="New Name", + url=url.url, + trailing_slash=url.trailing_slash, + ) + + await db_data_creator.adb_client.bulk_update([url_upsert]) + + new_urls: list[URL] = await db_data_creator.adb_client.get_all(URL) + new_url = new_urls[0] + + new_updated_at = new_url.updated_at + assert new_updated_at > updated_at + + diff --git a/tests/automated/integration/db/structure/test_upsert_new_agencies.py b/tests/automated/integration/db/structure/test_upsert_new_agencies.py index 6b377974..6adb043b 100644 --- a/tests/automated/integration/db/structure/test_upsert_new_agencies.py +++ b/tests/automated/integration/db/structure/test_upsert_new_agencies.py @@ -46,13 +46,13 @@ async def test_upsert_new_agencies( await adb_client.upsert_new_agencies([update_suggestion]) - rows = await adb_client.get_all(Agency, order_by_attribute="agency_id") + rows: list[Agency] = await adb_client.get_all(Agency, order_by_attribute="id") assert len(rows) == 3 d = {} for row in rows: - d[row.agency_id] = row.name + d[row.id] = row.name assert d[0] == "Updated Test Agency" assert d[1] == "Test Agency 1" diff --git a/tests/automated/integration/readonly/README.md b/tests/automated/integration/readonly/README.md new file mode 100644 index 00000000..3c72830f --- /dev/null +++ b/tests/automated/integration/readonly/README.md @@ -0,0 +1 @@ +Read Only tests are tests that work on a variant of the 
database populated with static test data. These tests are designed to not modify the database in any way. diff --git a/tests/automated/integration/readonly/__init__.py b/tests/automated/integration/readonly/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/__init__.py b/tests/automated/integration/readonly/api/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/__init__.py b/tests/automated/integration/readonly/api/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/get/__init__.py b/tests/automated/integration/readonly/api/agencies/get/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/agencies/get/test_locations.py b/tests/automated/integration/readonly/api/agencies/get/test_locations.py new file mode 100644 index 00000000..34904057 --- /dev/null +++ b/tests/automated/integration/readonly/api/agencies/get/test_locations.py @@ -0,0 +1,16 @@ +import pytest + +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_agency_get_locations( + readonly_helper: ReadOnlyTestHelper, +) -> None: + + response_raw: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/agencies/{readonly_helper.agency_1_id}/locations", + ) + assert len(response_raw) == 1 + assert response_raw[0]["location_id"] == readonly_helper.agency_1_location_id + assert response_raw[0]["full_display_name"] == "Pittsburgh, Allegheny, Pennsylvania" diff --git a/tests/automated/integration/readonly/api/agencies/get/test_root.py b/tests/automated/integration/readonly/api/agencies/get/test_root.py new file mode 100644 index 00000000..412a9512 --- /dev/null +++ b/tests/automated/integration/readonly/api/agencies/get/test_root.py @@ -0,0 +1,20 @@ +import pytest + 
+from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_agency_get( + readonly_helper: ReadOnlyTestHelper +): + + responses_raw: list[dict] = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/agencies", + ) + assert len(responses_raw) == 2 + response_raw = responses_raw[0] + assert response_raw["id"] == readonly_helper.agency_1_id + assert response_raw["name"] == "Agency 1" + assert response_raw["type"] == AgencyType.LAW_ENFORCEMENT.value + assert response_raw["jurisdiction_type"] == JurisdictionType.STATE.value \ No newline at end of file diff --git a/tests/automated/integration/readonly/api/check/__init__.py b/tests/automated/integration/readonly/api/check/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/check/test_unique_url.py b/tests/automated/integration/readonly/api/check/test_unique_url.py new file mode 100644 index 00000000..12123b99 --- /dev/null +++ b/tests/automated/integration/readonly/api/check/test_unique_url.py @@ -0,0 +1,33 @@ +import pytest + +from src.api.endpoints.check.unique_url.response import CheckUniqueURLResponse +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.api_test_helper import APITestHelper + + +@pytest.mark.asyncio +async def test_check_unique_url( + readonly_helper: ReadOnlyTestHelper +): + + ath: APITestHelper = readonly_helper.api_test_helper + response_not_unique_url = ath.request_validator.get_v3( + url="/check/unique-url", + params={ + "url": "https://read-only-ds.com" + } + ) + model_not_unique_url = CheckUniqueURLResponse(**response_not_unique_url) + assert not model_not_unique_url.unique_url + assert model_not_unique_url.url_id == readonly_helper.maximal_data_source_url_id + + + response_unique_url = ath.request_validator.get_v3( + url="/check/unique-url", + 
params={ + "url": "https://nonexistent-url.com" + } + ) + model_unique_url = CheckUniqueURLResponse(**response_unique_url) + assert model_unique_url.unique_url + assert model_unique_url.url_id is None \ No newline at end of file diff --git a/tests/automated/integration/readonly/api/contributions/__init__.py b/tests/automated/integration/readonly/api/contributions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/contributions/test_leaderboard.py b/tests/automated/integration/readonly/api/contributions/test_leaderboard.py new file mode 100644 index 00000000..140cc777 --- /dev/null +++ b/tests/automated/integration/readonly/api/contributions/test_leaderboard.py @@ -0,0 +1,14 @@ +import pytest + +from src.api.endpoints.contributions.leaderboard.query import GetContributionsLeaderboardQueryBuilder +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_leaderboard( + readonly_helper: ReadOnlyTestHelper +): + await readonly_helper.adb_client.run_query_builder( + GetContributionsLeaderboardQueryBuilder() + ) + diff --git a/tests/automated/integration/readonly/api/contributions/test_user.py b/tests/automated/integration/readonly/api/contributions/test_user.py new file mode 100644 index 00000000..170797df --- /dev/null +++ b/tests/automated/integration/readonly/api/contributions/test_user.py @@ -0,0 +1,17 @@ +import pytest + +from src.api.endpoints.contributions.user.queries.core import GetUserContributionsQueryBuilder +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_user( + readonly_helper: ReadOnlyTestHelper +): + for user_id in [ + readonly_helper.user_1_id, + readonly_helper.user_2_id, + ]: + await readonly_helper.adb_client.run_query_builder( + GetUserContributionsQueryBuilder(user_id) + ) diff --git a/tests/automated/integration/readonly/api/data_sources/__init__.py 
b/tests/automated/integration/readonly/api/data_sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/__init__.py b/tests/automated/integration/readonly/api/data_sources/by_id/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/agencies/__init__.py b/tests/automated/integration/readonly/api/data_sources/by_id/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/agencies/test_forbid.py b/tests/automated/integration/readonly/api/data_sources/by_id/agencies/test_forbid.py new file mode 100644 index 00000000..85a54705 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/by_id/agencies/test_forbid.py @@ -0,0 +1,13 @@ +import pytest + +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_forbid(readonly_helper: ReadOnlyTestHelper): + check_forbidden_url_type( + route=f"/data-sources/{readonly_helper.url_meta_url_id}/agencies", + api_test_helper=readonly_helper.api_test_helper, + method="GET" + ) diff --git a/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py new file mode 100644 index 00000000..16c30869 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/by_id/test_get.py @@ -0,0 +1,12 @@ +import pytest + +from src.api.endpoints.data_source.get.response import DataSourceGetResponse +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + +@pytest.mark.asyncio +async def test_get_by_id(readonly_helper: ReadOnlyTestHelper): + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + 
url=f"/data-sources/{readonly_helper.maximal_data_source_url_id}", + ) + # Test response is in expected form. + DataSourceGetResponse(**raw_json) \ No newline at end of file diff --git a/tests/automated/integration/readonly/api/data_sources/test_get.py b/tests/automated/integration/readonly/api/data_sources/test_get.py new file mode 100644 index 00000000..c23d2177 --- /dev/null +++ b/tests/automated/integration/readonly/api/data_sources/test_get.py @@ -0,0 +1,57 @@ +from datetime import date + +import pytest +from deepdiff import DeepDiff + +from src.api.endpoints.data_source.get.response import DataSourceGetOuterResponse, DataSourceGetResponse +from src.core.enums import RecordType +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_get(readonly_helper: ReadOnlyTestHelper): + + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/data-sources", + ) + outer_response = DataSourceGetOuterResponse(**raw_json) + + assert len(outer_response.results) == 2 + response: DataSourceGetResponse = outer_response.results[0] + + diff = DeepDiff( + response.model_dump(mode='json'), + DataSourceGetResponse( + url_id=readonly_helper.maximal_data_source_url_id, + url="read-only-ds.com", + + name="Read only URL name", + record_type=RecordType.CRIME_STATISTICS, + agency_ids=[readonly_helper.agency_1_id], + + batch_id=None, + description="Read only URL", + + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + 
update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ).model_dump(mode='json'), + ) + + assert diff == {}, f"Differences found: {diff}" diff --git a/tests/automated/integration/readonly/api/meta_urls/__init__.py b/tests/automated/integration/readonly/api/meta_urls/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/__init__.py b/tests/automated/integration/readonly/api/meta_urls/agencies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py new file mode 100644 index 00000000..28d5e45e --- /dev/null +++ b/tests/automated/integration/readonly/api/meta_urls/agencies/test_forbid.py @@ -0,0 +1,15 @@ + +import pytest + +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.helpers.check import check_forbidden_url_type + + +@pytest.mark.asyncio +async def test_forbid(readonly_helper: ReadOnlyTestHelper): + check_forbidden_url_type( + route=f"/meta-urls/{readonly_helper.minimal_data_source_url_id}/agencies", + api_test_helper=readonly_helper.api_test_helper, + method="GET" + ) + diff --git a/tests/automated/integration/readonly/api/meta_urls/test_get.py b/tests/automated/integration/readonly/api/meta_urls/test_get.py new file mode 100644 index 00000000..8779a3fc --- /dev/null +++ b/tests/automated/integration/readonly/api/meta_urls/test_get.py @@ -0,0 +1,30 @@ +import pytest +from deepdiff import DeepDiff + +from src.api.endpoints.meta_url.get.response import 
MetaURLGetOuterResponse, MetaURLGetResponse +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper + + +@pytest.mark.asyncio +async def test_get(readonly_helper: ReadOnlyTestHelper): + + raw_json: dict = readonly_helper.api_test_helper.request_validator.get_v3( + url=f"/meta-urls", + ) + outer_response = MetaURLGetOuterResponse(**raw_json) + + assert len(outer_response.results) == 1 + response: MetaURLGetResponse = outer_response.results[0] + + diff = DeepDiff( + response.model_dump(mode='json'), + MetaURLGetResponse( + url_id=readonly_helper.url_meta_url_id, + url="read-only-meta-url.com", + name="Read only URL Name", + description="Read only URL", + batch_id=None, + agency_ids=[] + ).model_dump(mode='json'), + ) + assert diff == {}, f"Differences found: {diff}" \ No newline at end of file diff --git a/tests/automated/integration/readonly/conftest.py b/tests/automated/integration/readonly/conftest.py new file mode 100644 index 00000000..3fdd0598 --- /dev/null +++ b/tests/automated/integration/readonly/conftest.py @@ -0,0 +1,51 @@ +import asyncio +from typing import Any, AsyncGenerator + +import pytest +import pytest_asyncio +from sqlalchemy import Engine +from starlette.testclient import TestClient + +from tests.automated.integration.api._helpers.RequestValidator import RequestValidator +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.automated.integration.readonly.setup.core import setup_readonly_data +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo +from tests.helpers.setup.wipe import wipe_database + + +@pytest.fixture(scope="module") +def event_loop(): + loop = asyncio.new_event_loop() + yield loop + loop.close() + +@pytest_asyncio.fixture(scope='module') +async def california_readonly( +) -> USStateCreationInfo: + return await 
DBDataCreator().create_us_state( + name="California", + iso="CA" + ) + +@pytest_asyncio.fixture(scope="module") +async def readonly_helper( + event_loop, + client: TestClient, + engine: Engine + +) -> AsyncGenerator[ReadOnlyTestHelper, Any]: + wipe_database(engine) + db_data_creator = DBDataCreator() + api_test_helper = APITestHelper( + request_validator=RequestValidator(client=client), + async_core=client.app.state.async_core, + db_data_creator=db_data_creator, + ) + + helper: ReadOnlyTestHelper = await setup_readonly_data( + api_test_helper=api_test_helper + ) + + yield helper \ No newline at end of file diff --git a/tests/automated/integration/readonly/helper.py b/tests/automated/integration/readonly/helper.py new file mode 100644 index 00000000..b0ffcd9e --- /dev/null +++ b/tests/automated/integration/readonly/helper.py @@ -0,0 +1,31 @@ +from pydantic import BaseModel + +from src.db.client.async_ import AsyncDatabaseClient +from tests.helpers.api_test_helper import APITestHelper + + +class ReadOnlyTestHelper(BaseModel): + class Config: + arbitrary_types_allowed = True + + # Clients + adb_client: AsyncDatabaseClient + api_test_helper: APITestHelper + + # Agencies + agency_1_id: int + agency_1_location_id: int + agency_2_id: int + agency_2_location_id: int + + # URLs + minimal_data_source_url_id: int + maximal_data_source_url_id: int + url_meta_url_id: int + unvalidated_url_id: int + + # Users + user_1_id: int + user_2_id: int + + diff --git a/tests/automated/integration/readonly/setup/__init__.py b/tests/automated/integration/readonly/setup/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/readonly/setup/agency.py b/tests/automated/integration/readonly/setup/agency.py new file mode 100644 index 00000000..366bc43d --- /dev/null +++ b/tests/automated/integration/readonly/setup/agency.py @@ -0,0 +1,23 @@ +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.enums import AgencyType, 
JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation + + +async def add_agency( + adb_client: AsyncDatabaseClient, + location_id: int +) -> int: + agency_1 = Agency( + name="Agency 1", + agency_type=AgencyType.LAW_ENFORCEMENT, + jurisdiction_type=JurisdictionType.STATE, + ) + agency_id: int = await adb_client.add(agency_1, return_id=True) + # Add Agency location + agency_1_location = LinkAgencyLocation( + agency_id=agency_id, + location_id=location_id, + ) + await adb_client.add(agency_1_location) + return agency_id diff --git a/tests/automated/integration/readonly/setup/annotations.py b/tests/automated/integration/readonly/setup/annotations.py new file mode 100644 index 00000000..b07bbd9f --- /dev/null +++ b/tests/automated/integration/readonly/setup/annotations.py @@ -0,0 +1,72 @@ +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.models.impl.url.suggestion.agency.user import UserURLAgencySuggestion +from src.db.models.impl.url.suggestion.location.user.sqlalchemy import UserLocationSuggestion +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.models.impl.url.suggestion.record_type.user import UserRecordTypeSuggestion +from src.db.models.impl.url.suggestion.url_type.user import UserURLTypeSuggestion + + +async def add_full_data_sources_annotations( + url_id: int, + user_id: int, + agency_id: int, + location_id: int, + adb_client: AsyncDatabaseClient +) -> None: + name_suggestion = URLNameSuggestion( + url_id=url_id, + suggestion="Name suggestion", + source=NameSuggestionSource.USER + ) + name_suggestion_id: int = await adb_client.add( 
+ name_suggestion, + return_id=True + ) + url_type_suggestion = UserURLTypeSuggestion( + url_id=url_id, + user_id=user_id, + type=URLType.DATA_SOURCE + ) + record_type_suggestion = UserRecordTypeSuggestion( + user_id=user_id, + url_id=url_id, + record_type=RecordType.RECORDS_REQUEST_INFO.value + ) + user_name_suggestion = LinkUserNameSuggestion( + user_id=user_id, + suggestion_id=name_suggestion_id, + ) + agency_suggestion = UserURLAgencySuggestion( + agency_id=agency_id, + url_id=url_id, + user_id=user_id, + ) + location_suggestion = UserLocationSuggestion( + location_id=location_id, + url_id=url_id, + user_id=user_id, + ) + for suggestion in [ + url_type_suggestion, + record_type_suggestion, + user_name_suggestion, + agency_suggestion, + location_suggestion + ]: + await adb_client.add(suggestion) + +async def add_minimal_not_relevant_annotation( + url_id: int, + user_id: int, + adb_client: AsyncDatabaseClient +) -> None: + url_type_suggestion = UserURLTypeSuggestion( + url_id=url_id, + user_id=user_id, + type=URLType.NOT_RELEVANT + ) + await adb_client.add(url_type_suggestion) \ No newline at end of file diff --git a/tests/automated/integration/readonly/setup/core.py b/tests/automated/integration/readonly/setup/core.py new file mode 100644 index 00000000..c938b523 --- /dev/null +++ b/tests/automated/integration/readonly/setup/core.py @@ -0,0 +1,96 @@ +from tests.automated.integration.readonly.helper import ReadOnlyTestHelper +from tests.automated.integration.readonly.setup.agency import add_agency +from tests.automated.integration.readonly.setup.annotations import add_full_data_sources_annotations, \ + add_minimal_not_relevant_annotation +from tests.automated.integration.readonly.setup.data_source import add_maximal_data_source, add_minimal_data_source +from tests.automated.integration.readonly.setup.meta_url import add_meta_url +from tests.helpers.api_test_helper import APITestHelper +from tests.helpers.data_creator.models.creation_info.county import 
CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo + + +async def setup_readonly_data( + api_test_helper: APITestHelper +) -> ReadOnlyTestHelper: + db_data_creator = api_test_helper.db_data_creator + adb_client = db_data_creator.adb_client + + # Pennsylvania + pennsylvania: USStateCreationInfo = await db_data_creator.create_us_state( + name="Pennsylvania", + iso="PA" + ) + allegheny_county: CountyCreationInfo = await db_data_creator.create_county( + state_id=pennsylvania.us_state_id, + name="Allegheny" + ) + pittsburgh: LocalityCreationInfo = await db_data_creator.create_locality( + state_id=pennsylvania.us_state_id, + county_id=allegheny_county.county_id, + name="Pittsburgh" + ) + + # Add Agencies + agency_1_id: int = await add_agency(adb_client, pittsburgh.location_id) + agency_2_id: int = await add_agency(adb_client, allegheny_county.location_id) + + + # Add users with varying contributions + user_id_1: int = 1 + user_id_2: int = 2 + # Add unvalidated URL + unvalidated_url_id: int = (await db_data_creator.create_urls( + record_type=None, + count=1 + ))[0].url_id + # Have User 1 give a full set of data sources annotations + await add_full_data_sources_annotations( + url_id=unvalidated_url_id, + user_id=user_id_1, + agency_id=agency_1_id, + location_id=pittsburgh.location_id, + adb_client=adb_client + ) + # Have User 2 give a single rejected annotation + await add_minimal_not_relevant_annotation( + url_id=unvalidated_url_id, + user_id=user_id_2, + adb_client=adb_client + ) + + # Add Data Source With Linked Agency + maximal_data_source: int = await add_maximal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) + minimal_data_source: int = await add_minimal_data_source( + agency_1_id=agency_1_id, + db_data_creator=db_data_creator + ) + + # Add Meta URL with Linked Agency + url_meta_url_id: int 
= await add_meta_url(agency_1_id, db_data_creator) + + return ReadOnlyTestHelper( + adb_client=adb_client, + api_test_helper=api_test_helper, + + # Agencies + agency_1_id=agency_1_id, + agency_1_location_id=pittsburgh.location_id, + agency_2_id=agency_2_id, + agency_2_location_id=allegheny_county.location_id, + + # URLs + maximal_data_source_url_id=maximal_data_source, + minimal_data_source_url_id=minimal_data_source, + url_meta_url_id=url_meta_url_id, + unvalidated_url_id=unvalidated_url_id, + + # Users + user_1_id=user_id_1, + user_2_id=user_id_2, + ) + + diff --git a/tests/automated/integration/readonly/setup/data_source.py b/tests/automated/integration/readonly/setup/data_source.py new file mode 100644 index 00000000..e22929ee --- /dev/null +++ b/tests/automated/integration/readonly/setup/data_source.py @@ -0,0 +1,103 @@ +from datetime import date + +from src.collectors.enums import URLStatus +from src.core.enums import RecordType +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, UpdateMethodEnum, \ + RetentionScheduleEnum, AccessTypeEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.helpers.data_creator.core import DBDataCreator + + +async def add_maximal_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="read-only-ds.com", + name="Read only URL name", + trailing_slash=True, + description="Read only URL", + collector_metadata={ + "url": "https://read-only.com/" + }, + status=URLStatus.OK, + source=URLSource.COLLECTOR, + ) + url_id: int = 
await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.CRIME_STATISTICS + ) + await adb_client.add(record_type) + + optional_ds_metadata = URLOptionalDataSourceMetadata( + url_id=url_id, + record_formats=["csv", "pdf"], + data_portal_type="CKAN", + supplying_entity="ReadOnly Agency", + coverage_start=date(year=2025, month=6, day=1), + coverage_end=date(year=2025, month=8, day=20), + agency_supplied=False, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.LOCALITY, + agency_described_not_in_database="ReadOnly Agency Not In DB", + update_method=UpdateMethodEnum.NO_UPDATES, + readme_url="https://read-only-readme.com", + originating_entity="ReadOnly Agency Originating", + retention_schedule=RetentionScheduleEnum.GT_10_YEARS, + scraper_url="https://read-only-scraper.com", + submission_notes="Read Only Submission Notes", + access_notes="Read Only Access Notes", + access_types=[AccessTypeEnum.WEBPAGE, AccessTypeEnum.API], + ) + + await adb_client.add(optional_ds_metadata) + + await db_data_creator.create_url_agency_links( + url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id + + +async def add_minimal_data_source( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme="https", + url="minimal-ds.com", + name="Minimal name", + trailing_slash=False, + collector_metadata={}, + status=URLStatus.OK, + source=URLSource.ROOT_URL, + ) + url_id: int = await adb_client.add(url, return_id=True) + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.DATA_SOURCE + ) + record_type = URLRecordType( + url_id=url_id, + record_type=RecordType.POLICIES_AND_CONTRACTS + ) + await adb_client.add(record_type) + + await db_data_creator.create_url_agency_links( + 
url_ids=[url_id], + agency_ids=[agency_1_id] + ) + return url_id diff --git a/tests/automated/integration/readonly/setup/meta_url.py b/tests/automated/integration/readonly/setup/meta_url.py new file mode 100644 index 00000000..837274bb --- /dev/null +++ b/tests/automated/integration/readonly/setup/meta_url.py @@ -0,0 +1,33 @@ +from src.collectors.enums import URLStatus +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.url.core.enums import URLSource +from src.db.models.impl.url.core.sqlalchemy import URL +from tests.helpers.data_creator.core import DBDataCreator + + +async def add_meta_url( + agency_1_id: int, + db_data_creator: DBDataCreator +) -> int: + adb_client: AsyncDatabaseClient = db_data_creator.adb_client + url = URL( + scheme=None, + url="read-only-meta-url.com", + name="Read only URL Name", + trailing_slash=False, + description="Read only URL", + collector_metadata={ + "url": "https://read-only-meta-url.com/" + }, + status=URLStatus.OK, + source=URLSource.REDIRECT, + ) + url_id: int = await adb_client.add(url, return_id=True) + + await db_data_creator.create_validated_flags( + url_ids=[url_id], + validation_type=URLType.META_URL + ) + + return url_id diff --git a/tests/automated/integration/tasks/conftest.py b/tests/automated/integration/tasks/conftest.py index a06da58c..e14d3369 100644 --- a/tests/automated/integration/tasks/conftest.py +++ b/tests/automated/integration/tasks/conftest.py @@ -1,16 +1,17 @@ from unittest.mock import MagicMock, AsyncMock import pytest -from pdap_access_manager import AccessManager +from pdap_access_manager.access_manager.async_ import AccessManagerAsync from src.external.pdap.client import PDAPClient @pytest.fixture def mock_pdap_client() -> PDAPClient: - mock_access_manager = MagicMock( - spec=AccessManager + mock_access_manager = AsyncMock( + spec=AccessManagerAsync ) + mock_access_manager.data_sources_url = 
"http://example.com" mock_access_manager.build_url = MagicMock( return_value="http://example.com" ) diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py index 417677df..1d1085a5 100644 --- a/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/setup/queries/setup.py @@ -37,10 +37,12 @@ async def run(self, session: AsyncSession) -> list[int]: description = None url = URL( url=get_test_url(i), + scheme=None, status=URLStatus.OK, name=name, description=description, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False, ) session.add(url) await session.flush() diff --git a/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py new file mode 100644 index 00000000..be84ffd4 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/huggingface/test_duplicate_html_content_not_picked_up.py @@ -0,0 +1,38 @@ +import pytest + +from src.core.tasks.scheduled.impl.huggingface.operator import PushToHuggingFaceTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.html.compressed.sqlalchemy import URLCompressedHTML + + +@pytest.mark.asyncio +async def test_huggingface_task_duplicate_html_content_not_picked_up( + adb_client_test: AsyncDatabaseClient, + operator: PushToHuggingFaceTaskOperator, + test_url_data_source_id: int, + test_url_data_source_id_2: int +): + + # Add HTML content with the same hash + uch_1 = URLCompressedHTML( + url_id=test_url_data_source_id, + compressed_html=b"test" + ) + uch_2 = URLCompressedHTML( + url_id=test_url_data_source_id_2, + compressed_html=b"test" + ) + await adb_client_test.add_all([ + 
uch_1, + uch_2 + ]) + + # Confirm task meets prerequisites + assert await operator.meets_task_prerequisites() + + # Refresh materialized view + await adb_client_test.refresh_materialized_views() + + # Confirm task does not meet prerequisites + assert not await operator.meets_task_prerequisites() + diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py b/tests/automated/integration/tasks/scheduled/impl/integrity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py new file mode 100644 index 00000000..9106f1b7 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/conftest.py @@ -0,0 +1,13 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> IntegrityMonitorTaskOperator: + return IntegrityMonitorTaskOperator( + adb_client=adb_client_test, + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py new file mode 100644 index 00000000..2b617ca2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/helpers.py @@ -0,0 +1,12 @@ +from src.core.tasks.base.run_info import TaskOperatorRunInfo +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.core.tasks.url.enums import TaskOperatorOutcome + + +async def run_task_and_confirm_error( + operator: IntegrityMonitorTaskOperator, + expected_view: str +) -> None: + run_info: TaskOperatorRunInfo = await operator.run_task() + assert run_info.outcome == TaskOperatorOutcome.ERROR + assert expected_view in run_info.message \ No newline at 
end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py new file mode 100644 index 00000000..3381d7f0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_data_sources.py @@ -0,0 +1,66 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_url_id: int, + test_agency_id: int +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add URL as data source but without record type or validated flag + ## App Link + ds_app_link = DSAppLinkDataSource( + url_id=test_url_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add validated URL flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.DATA_SOURCE + ) + await operator.adb_client.add(flag) + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__incomplete_data_sources_view" + ) + + # Add record type to data source + record_type = URLRecordType( + 
url_id=test_url_id, + record_type=RecordType.INCARCERATION_RECORDS + ) + await operator.adb_client.add(record_type) + + # Check still meets prerequisites + assert await operator.meets_task_prerequisites() + + # Add agency to data source + agency = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(agency) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py new file mode 100644 index 00000000..9c3a147d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_incomplete_meta_urls.py @@ -0,0 +1,54 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.flag.url_validated.enums import URLType +from src.db.models.impl.flag.url_validated.sqlalchemy import FlagURLValidated +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_agency_id: int, + test_url_id: int +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add Meta URL without linking an agency to it + ## Validated Flag + flag = FlagURLValidated( + url_id=test_url_id, + type=URLType.META_URL + ) + await operator.adb_client.add(flag) + + ## App Link + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm 
produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__incomplete_meta_urls_view" + ) + + # Add agency to Meta URL + link = LinkURLAgency( + agency_id=test_agency_id, + url_id=test_url_id + ) + await operator.adb_client.add(link) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() + + diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py new file mode 100644 index 00000000..ee189f64 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_non_federal_agencies_no_location.py @@ -0,0 +1,55 @@ +import pytest + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + pittsburgh_locality: LocalityCreationInfo +): + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add federal agency + agency = Agency( + name="Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.FEDERAL + ) + await operator.adb_client.add(agency) + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add non-federal agency + agency = Agency( + name="Non-Federal Agency", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.LOCAL + ) + agency_id: int =await 
operator.adb_client.add(agency, return_id=True) + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__non_federal_agencies_no_location_view" + ) + + # Add location to non-federal agency + link = LinkAgencyLocation( + agency_id=agency_id, + location_id=pittsburgh_locality.location_id + ) + await operator.adb_client.add(link) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py new file mode 100644 index 00000000..fa36a269 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/integrity/test_url_both_data_source_and_meta_url.py @@ -0,0 +1,54 @@ +import pytest +from sqlalchemy import delete + +from src.core.tasks.scheduled.impl.integrity.operator import IntegrityMonitorTaskOperator +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.integrity.helpers import run_task_and_confirm_error + + +@pytest.mark.asyncio +async def test_core( + operator: IntegrityMonitorTaskOperator, + test_url_data_source_id: int +): + + # Check does not meet prerequisites + assert not await operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link_ds = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=1 + ) + await operator.adb_client.add(ds_app_link_ds) + + # Add same URL as Meta URL + ## App Link + ds_app_link_mu = DSAppLinkMetaURL( + url_id=test_url_data_source_id, + ds_meta_url_id=1 + ) + await operator.adb_client.add(ds_app_link_mu) + + # Check meets prerequisites + 
assert await operator.meets_task_prerequisites() + + # Run task and confirm produces error + await run_task_and_confirm_error( + operator=operator, + expected_view="integrity__url_both_data_source_and_meta_url_view" + ) + + # Delete data source link + statement = ( + delete( + DSAppLinkMetaURL + ).where( + DSAppLinkMetaURL.url_id == test_url_data_source_id + ) + ) + await operator.adb_client.execute(statement) + + # Check no longer meets task prerequisites + assert not await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py index d41ffb48..60f762e7 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/constants.py @@ -1,4 +1,4 @@ -TEST_URL_1 = "https://test-ia-metadata.com/1" -TEST_URL_2 = "https://test-ia-metadata.com/2" \ No newline at end of file +TEST_URL_1 = "test-ia-metadata.com/1" +TEST_URL_2 = "test-ia-metadata.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py index 59b2d77c..7bc33222 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/probe/setup.py @@ -11,11 +11,13 @@ async def add_urls(dbc: AsyncDatabaseClient) -> list[int]: insert_models: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] return await dbc.bulk_insert(insert_models, return_ids=True) diff --git 
a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py index bc1b5a2e..658d8cb9 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/constants.py @@ -1,5 +1,5 @@ -TEST_URL_1 = "https://ia-save-test.com/1" -TEST_URL_2 = "https://ia-save-test.com/2" \ No newline at end of file +TEST_URL_1 = "ia-save-test.com/1" +TEST_URL_2 = "ia-save-test.com/2" \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py index 36b1bcb9..836ee678 100644 --- a/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py +++ b/tests/automated/integration/tasks/scheduled/impl/internet_archives/save/setup.py @@ -72,11 +72,13 @@ async def add_test_urls(adb_client: AsyncDatabaseClient) -> list[int]: url_inserts: list[URLInsertModel] = [ URLInsertModel( url=TEST_URL_1, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ), URLInsertModel( url=TEST_URL_2, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) ] url_ids = await adb_client.bulk_insert(url_inserts, return_ids=True) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py 
b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py new file mode 100644 index 00000000..4cb7a3f2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/conftest.py @@ -0,0 +1,22 @@ +import pytest_asyncio + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_agency( + test_agency_id: int, + adb_client_test: AsyncDatabaseClient +) -> DSAppLinkInfoModel: + # Add DS App Link + ds_app_link = DSAppLinkAgency( + agency_id=test_agency_id, + ds_agency_id=67 + ) + await adb_client_test.add(ds_app_link) + return DSAppLinkInfoModel( + ds_app_id=67, + db_id=test_agency_id + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py new file mode 100644 index 00000000..6a1dc358 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_add.py @@ -0,0 +1,77 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.add.core import DSAppSyncAgenciesAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.add.request import AddAgenciesOuterRequest, AddAgenciesInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from 
tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_agency_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient, + pittsburgh_locality: LocalityCreationInfo, + pennsylvania: USStateCreationInfo, +): + operator = DSAppSyncAgenciesAddTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Mock make_request to return a false DS App id + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_agency_id + ) + ] + ) + ) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: AddAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/add", + expected_model=AddAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: AddAgenciesInnerRequest = request.agencies[0] + assert agency.request_id == test_agency_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert set(content.location_ids) == { + pittsburgh_locality.location_id, + pennsylvania.location_id + } + + # Check Presence of DS App Link + ds_app_link: DSAppLinkAgency = await 
adb_client_test.one_or_none_model(DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.agency_id == test_agency_id + + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py new file mode 100644 index 00000000..e311b886 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/test_delete.py @@ -0,0 +1,64 @@ +import pytest + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.delete.core import DSAppSyncAgenciesDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.flag.ds_delete.agency import FlagDSDeleteAgency +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + ds_app_linked_agency: DSAppLinkInfoModel, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + ds_agency_id: int = 67 + operator = DSAppSyncAgenciesDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + + # Check does not currently meet prerequisite + assert not await 
operator.meets_task_prerequisites() + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteAgency( + ds_agency_id=ds_agency_id + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_agency_id] + + # Check DS App Link Is Deleted + assert await adb_client_test.has_no_rows(DSAppLinkAgency) + + # Check DS App Agency Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteAgency) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py new file mode 100644 index 00000000..eafc4148 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/conftest.py @@ -0,0 +1,16 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncAgenciesUpdateTaskOperator: + return DSAppSyncAgenciesUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py new file mode 100644 index 00000000..4dfbaba7 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_add_location_link.py @@ -0,0 +1,76 @@ +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.agency.conftest import ds_app_linked_agency +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel 
+from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_location_link( + ds_app_linked_agency: DSAppLinkInfoModel, + pittsburgh_locality: LocalityCreationInfo, + allegheny_county: CountyCreationInfo, + pennsylvania: USStateCreationInfo, + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Add location link + link = LinkAgencyLocation( + agency_id=ds_app_linked_agency.db_id, + location_id=allegheny_county.location_id + ) + await adb_client_test.add(link) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: UpdateAgenciesInnerRequest = request.agencies[0] + assert agency.app_id == ds_app_linked_agency.ds_app_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert set(content.location_ids) == { + pittsburgh_locality.location_id, + pennsylvania.location_id, + allegheny_county.location_id + } + + + # Check DS App Link Is Updated + 
ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py new file mode 100644 index 00000000..7f0450fe --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_delete_location_link.py @@ -0,0 +1,76 @@ +from sqlalchemy import delete + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import JurisdictionType, AgencyType +from src.db.models.impl.link.agency_location.sqlalchemy import LinkAgencyLocation +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies._shared.models.content import AgencySyncContentModel +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest, UpdateAgenciesInnerRequest +from tests.automated.integration.conftest import pennsylvania +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.data_creator.models.creation_info.locality import LocalityCreationInfo +from tests.helpers.data_creator.models.creation_info.us_state import USStateCreationInfo +from tests.helpers.run import run_task_and_confirm_success + + +async def 
test_delete_location_link( + ds_app_linked_agency: DSAppLinkInfoModel, + pittsburgh_locality: LocalityCreationInfo, + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + pennsylvania: USStateCreationInfo, + adb_client_test: AsyncDatabaseClient +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Delete location link (pittsburgh) + statement = ( + delete( + LinkAgencyLocation + ) + .where( + LinkAgencyLocation.agency_id == ds_app_linked_agency.db_id, + LinkAgencyLocation.location_id == pittsburgh_locality.location_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateAgenciesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) + assert len(request.agencies) == 1 + agency: UpdateAgenciesInnerRequest = request.agencies[0] + assert agency.app_id == ds_app_linked_agency.ds_app_id + content: AgencySyncContentModel = agency.content + assert content.name == "Test Agency" + assert content.jurisdiction_type == JurisdictionType.LOCAL + assert content.agency_type == AgencyType.UNKNOWN + assert content.location_ids == [pennsylvania.location_id] + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py new file mode 100644 index 00000000..4749b0b0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/agency/update/test_update_agency.py @@ -0,0 +1,66 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.agencies.update.core import DSAppSyncAgenciesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.agency.ds_link.sqlalchemy import DSAppLinkAgency +from src.db.models.impl.agency.enums import AgencyType, JurisdictionType +from src.db.models.impl.agency.sqlalchemy import Agency +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.agencies.update.request import UpdateAgenciesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_agency( + ds_app_linked_agency: DSAppLinkInfoModel, + operator: DSAppSyncAgenciesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient +): + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Update agency table + statement = ( + update( + Agency + ) + .values( + name="Updated Agency Name", + agency_type=AgencyType.COURT, + jurisdiction_type=JurisdictionType.STATE + ) + .where( + Agency.id == ds_app_linked_agency.db_id + ) + ) + await 
adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + extract_and_validate_sync_request( + mock_pdap_client, + expected_path="agencies/update", + expected_model=UpdateAgenciesOuterRequest + ) + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkAgency | None = await adb_client_test.one_or_none_model(model=DSAppLinkAgency) + assert ds_app_link is not None + assert ds_app_link.ds_agency_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_agency.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py new file mode 100644 index 00000000..72b621b2 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/conftest.py @@ -0,0 +1,21 @@ +import pytest_asyncio + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_data_source_url( + test_url_data_source_id: int, + adb_client_test: AsyncDatabaseClient +) -> DSAppLinkInfoModel: + link = DSAppLinkDataSource( + ds_data_source_id=67, + url_id=test_url_data_source_id, + ) + await adb_client_test.add(link) + return DSAppLinkInfoModel( + db_id=test_url_data_source_id, + ds_app_id=67, + ) \ No newline at end of file diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py new file mode 100644 index 00000000..1987bc79 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/delete/test_delete.py @@ -0,0 +1,83 @@ +import pytest + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.delete.core import \ + DSAppSyncDataSourcesDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.data_source import FlagDSDeleteDataSource +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient, + test_url_data_source_id: int +): + ds_data_source_id: int = 67 + operator = DSAppSyncDataSourcesDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check does not currently meet 
prerequisite + assert not await operator.meets_task_prerequisites() + + # Add DS App Link for deleted URL + ds_app_link = DSAppLinkDataSource( + url_id=None, + ds_data_source_id=ds_data_source_id, + ) + await adb_client_test.add(ds_app_link) + + # Add DS App Link for extant URL + ds_app_link = DSAppLinkDataSource( + url_id=test_url_data_source_id, + ds_data_source_id=ds_data_source_id + 1, + ) + await adb_client_test.add(ds_app_link) + + # Check does not currently meet prerequisite + assert not await operator.meets_task_prerequisites() + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteDataSource( + ds_data_source_id=ds_data_source_id, + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_data_source_id] + + # Check DS App Link has only one row + assert len(await adb_client_test.get_all(DSAppLinkDataSource)) == 1 + + # Check DS App Data Source Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteDataSource) + + # Check one row in URLs table + assert len(await adb_client_test.get_all(URL)) == 1 diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py new file mode 100644 index 00000000..fa31dc40 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -0,0 +1,90 @@ +import pytest + +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.core import 
DSAppSyncDataSourcesAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.enums import DataSourcesURLStatus +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_url_data_source_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient, + test_agency_id: int +): + operator = DSAppSyncDataSourcesAddTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_url_data_source_id + ) + ] + ) + ) + + # Check meet task prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: AddDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/add", + expected_model=AddDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: AddDataSourcesInnerRequest = request.data_sources[0] + assert 
data_source.request_id == test_url_data_source_id + content: DataSourceSyncContentModel = data_source.content + assert content.source_url.startswith("https://example.com/") + assert content.name.startswith("Example ") + assert content.record_type == RecordType.CRIME_STATISTICS + assert content.description is None + assert content.record_formats == [] + assert content.data_portal_type is None + assert content.supplying_entity is None + assert content.coverage_start is None + assert content.coverage_end is None + assert content.detail_level is None + assert content.agency_supplied is None + assert content.agency_originated is None + assert content.agency_described_not_in_database is None + assert content.update_method is None + assert content.readme_url is None + assert content.originating_entity is None + assert content.retention_schedule is None + assert content.scraper_url is None + assert content.access_notes is None + assert content.access_types == [] + assert content.data_portal_type_other is None + assert content.url_status == DataSourcesURLStatus.OK + + assert content.agency_ids == [test_agency_id] + + # Check Presence of DS App Link + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.url_id == test_url_data_source_id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py new file mode 100644 index 00000000..8a6bbfc5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/conftest.py @@ 
-0,0 +1,17 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncDataSourcesUpdateTaskOperator: + return DSAppSyncDataSourcesUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py new file mode 100644 index 00000000..9852df7a --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_add_agency_link.py @@ -0,0 +1,68 @@ +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from 
src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_agency_link( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + test_agency_id: int, + test_agency_id_2: int, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Add additional agency link + link = LinkURLAgency( + url_id=ds_app_linked_data_source_url.db_id, + agency_id=test_agency_id_2 + ) + await adb_client_test.add(link) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name.startswith("Example") + assert set(content.agency_ids) == { + test_agency_id, + test_agency_id_2 + } + + # Check DS App 
Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py new file mode 100644 index 00000000..f0dbf204 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_delete_agency_link.py @@ -0,0 +1,72 @@ +from sqlalchemy import delete + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.conftest import adb_client_test +from tests.helpers.run import run_task_and_confirm_success + + +async def test_delete_agency_link( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + test_agency_id: int, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: 
AsyncDatabaseClient +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Delete agency ID link + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == ds_app_linked_data_source_url.db_id, + LinkURLAgency.agency_id == test_agency_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name.startswith("Example") + assert content.agency_ids == [] + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py new file mode 100644 index 00000000..94273019 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_optional_ds_metadata.py @@ -0,0 +1,105 @@ +from datetime import 
date + +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, UpdateMethodEnum, \ + RetentionScheduleEnum +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_optional_ds_metadata( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update url_optional_ds_metadata_table table + insert = URLOptionalDataSourceMetadata( + url_id=ds_app_linked_data_source_url.db_id, + record_formats=["Record Format 1", "Record Format 2"], + data_portal_type="Test Data Portal Type", + supplying_entity="Test Supplying Entity", + 
coverage_start=date(year=2025, month=5, day=1), + coverage_end=date(year=2025, month=5, day=31), + agency_supplied=True, + agency_originated=True, + agency_aggregation=AgencyAggregationEnum.FEDERAL, + update_method=UpdateMethodEnum.OVERWRITE, + readme_url="https://example.com/readme", + originating_entity="Test originating entity", + retention_schedule=RetentionScheduleEnum.FUTURE_ONLY, + scraper_url="https://example.com/scraper", + submission_notes="Test submission notes", + access_notes="Test Access notes", + access_types=[AccessTypeEnum.DOWNLOAD], + data_portal_type_other="Test data portal type other" + ) + await adb_client_test.add(insert) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.source_url.startswith("https://example.com/") + assert content.name.startswith("Example ") + assert content.record_type == RecordType.CRIME_STATISTICS + assert content.description is None + assert content.record_formats == ["Record Format 1", "Record Format 2"] + assert content.data_portal_type == "Test Data Portal Type" + assert content.supplying_entity == "Test Supplying Entity" + assert content.coverage_start == date(year=2025, month=5, day=1) + assert content.coverage_end == date(year=2025, month=5, day=31) + assert content.detail_level is None + assert content.agency_supplied == True + assert content.agency_originated == True + assert 
content.update_method == UpdateMethodEnum.OVERWRITE + assert content.readme_url == "https://example.com/readme" + assert content.originating_entity == "Test originating entity" + assert content.retention_schedule == RetentionScheduleEnum.FUTURE_ONLY + assert content.scraper_url == "https://example.com/scraper" + assert content.access_notes == "Test Access notes" + assert content.access_types == [AccessTypeEnum.DOWNLOAD] + assert content.data_portal_type_other == "Test data portal type other" + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py new file mode 100644 index 00000000..66fae2cb --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_record_type.py @@ -0,0 +1,78 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.enums import RecordType +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from 
tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_record_type( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update URL Record Type table + statement = ( + update( + URLRecordType + ) + .values( + record_type=RecordType.POLICIES_AND_CONTRACTS + ) + .where( + URLRecordType.url_id == ds_app_linked_data_source_url.db_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name.startswith("Example ") + assert content.record_type == RecordType.POLICIES_AND_CONTRACTS + assert content.agency_ids == [ + test_agency_id + ] + assert content.retention_schedule is None + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await 
adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py new file mode 100644 index 00000000..78c095c0 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/update/test_update_url.py @@ -0,0 +1,81 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.core import \ + DSAppSyncDataSourcesUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel +from src.external.pdap.impl.sync.data_sources.update.request import UpdateDataSourcesInnerRequest, \ + UpdateDataSourcesOuterRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_url( + ds_app_linked_data_source_url: DSAppLinkInfoModel, + operator: DSAppSyncDataSourcesUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check 
prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update URL table + statement = ( + update( + URL + ) + .values( + name="Updated URL Name", + scheme="http", + trailing_slash=True, + url="modified-example.com", + description="Updated URL Description", + ) + .where( + URL.id == ds_app_linked_data_source_url.db_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateDataSourcesOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="data-sources/update", + expected_model=UpdateDataSourcesOuterRequest + ) + assert len(request.data_sources) == 1 + data_source: UpdateDataSourcesInnerRequest = request.data_sources[0] + assert data_source.app_id == ds_app_linked_data_source_url.ds_app_id + content: DataSourceSyncContentModel = data_source.content + assert content.name == "Updated URL Name" + assert content.agency_ids == [ + test_agency_id + ] + assert content.source_url == "http://modified-example.com/" + assert content.description == "Updated URL Description" + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkDataSource | None = await adb_client_test.one_or_none_model(model=DSAppLinkDataSource) + assert ds_app_link is not None + assert ds_app_link.ds_data_source_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_data_source_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py new file mode 100644 index 00000000..f6d1bd68 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/helpers.py @@ -0,0 +1,40 @@ +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock + +from 
pdap_access_manager.enums import RequestType +from pdap_access_manager.models.request import RequestInfo +from pdap_access_manager.models.response import ResponseInfo +from pydantic import BaseModel + +from src.external.pdap.client import PDAPClient +from tests.helpers.mock import get_last_call_arguments + + +def get_last_request( + mock_pdap_client: PDAPClient +) -> RequestInfo: + return get_last_call_arguments(mock_pdap_client.access_manager.make_request)[0] + +def extract_and_validate_sync_request( + mock_pdap_client: PDAPClient, + expected_path: str, + expected_model: type[BaseModel] +) -> Any: + assert mock_pdap_client.access_manager.make_request.call_count == 1 + request_info: RequestInfo = get_last_request(mock_pdap_client) + assert request_info.type_ == RequestType.POST + full_expected_url: str = f"http://example.com/v3/sync/{expected_path}" + assert request_info.url == full_expected_url, f"Expected URL: {full_expected_url}, Actual URL: {request_info.url}" + return expected_model(**request_info.json_) + +def mock_make_request( + mock_pdap_client: PDAPClient, + data: BaseModel +) -> None: + mock_pdap_client.access_manager.make_request = AsyncMock( + return_value=ResponseInfo( + status_code=HTTPStatus.OK, + data=data.model_dump(mode='json') + ) + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py new file mode 100644 index 00000000..69bf1287 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/conftest.py @@ -0,0 +1,21 @@ +import pytest_asyncio + +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.ds_meta_url.sqlalchemy 
import DSAppLinkMetaURL +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel + + +@pytest_asyncio.fixture +async def ds_app_linked_meta_url( + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient +) -> DSAppLinkInfoModel: + ds_app_link = DSAppLinkMetaURL( + url_id=test_url_meta_url_id, + ds_meta_url_id=67 + ) + await adb_client_test.add(ds_app_link) + return DSAppLinkInfoModel( + ds_app_id=67, + db_id=test_url_meta_url_id + ) diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py new file mode 100644 index 00000000..e63e1496 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_add.py @@ -0,0 +1,67 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.core import DSAppSyncMetaURLsAddTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest +from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ + DSAppSyncAddResponseInnerModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_add( + db_data_creator: DBDataCreator, + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient, + test_agency_id: int +): + operator = DSAppSyncMetaURLsAddTaskOperator( 
+ adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=DSAppSyncAddResponseModel( + entities=[ + DSAppSyncAddResponseInnerModel( + app_id=67, + request_id=test_url_meta_url_id + ) + ] + ) + ) + + + # Check meets prerequisites + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: AddMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/add", + expected_model=AddMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: AddMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.request_id == test_url_meta_url_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url.startswith("https://example.com/") + assert content.agency_ids == [test_agency_id] + + # Check Presence of DS Meta URL App Link + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.url_id == test_url_meta_url_id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py new file mode 100644 index 00000000..8218759f --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/test_delete.py @@ -0,0 +1,68 @@ +import pytest + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.delete.core import DSAppSyncMetaURLsDeleteTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.flag.ds_delete.meta_url import FlagDSDeleteMetaURL +from 
src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.shared.models.delete.request import DSAppSyncDeleteRequestModel +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_delete( + db_data_creator: DBDataCreator, + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +): + ds_meta_url_id: int = 67 + operator = DSAppSyncMetaURLsDeleteTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) + + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check does not currently meet prerequisite + assert not await operator.meets_task_prerequisites() + + # Add DS App Link + ds_app_link = DSAppLinkMetaURL( + ds_meta_url_id=ds_meta_url_id, + url_id=None, + ) + await adb_client_test.add(ds_app_link) + + # Add Task Deletion Flag for App Link + flag = FlagDSDeleteMetaURL( + ds_meta_url_id=ds_meta_url_id + ) + await adb_client_test.add(flag) + + # Check meets prerequisite + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: DSAppSyncDeleteRequestModel = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/delete", + expected_model=DSAppSyncDeleteRequestModel + ) + assert request.ids == [ds_meta_url_id] + + # Check DS App Link Is Deleted + assert await adb_client_test.has_no_rows(DSAppLinkMetaURL) + + # Check DS App Meta URL Deletion Flag is deleted + assert await adb_client_test.has_no_rows(FlagDSDeleteMetaURL) diff --git 
a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py new file mode 100644 index 00000000..3b2e8e7b --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/conftest.py @@ -0,0 +1,16 @@ +import pytest + +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.external.pdap.client import PDAPClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient, + mock_pdap_client: PDAPClient +) -> DSAppSyncMetaURLsUpdateTaskOperator: + return DSAppSyncMetaURLsUpdateTaskOperator( + adb_client=adb_client_test, + pdap_client=mock_pdap_client + ) \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py new file mode 100644 index 00000000..7901bea5 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/helpers.py @@ -0,0 +1,7 @@ +from datetime import datetime + + +def check_ds_app_link_updated( + old_updated_at: datetime +) -> None: + raise NotImplementedError \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py new file mode 100644 index 00000000..1caa1eab --- /dev/null +++ 
b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_add_agency_link.py @@ -0,0 +1,63 @@ +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.conftest import adb_client_test +from tests.helpers.run import run_task_and_confirm_success + + +async def test_add_agency_link( + ds_app_linked_meta_url: DSAppLinkInfoModel, + test_agency_id: int, + test_agency_id_2: int, + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Add agency link + link = LinkURLAgency( + url_id=ds_app_linked_meta_url.db_id, + agency_id=test_agency_id_2 + ) + await adb_client_test.add(link) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: 
UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url.startswith("https://example.com/") + assert set(content.agency_ids) == {test_agency_id, test_agency_id_2} + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py new file mode 100644 index 00000000..11ef284d --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_delete_agency_link.py @@ -0,0 +1,66 @@ +from sqlalchemy import delete + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import 
extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_delete_agency_link( + ds_app_linked_meta_url: DSAppLinkInfoModel, + test_agency_id: int, + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + assert not await operator.meets_task_prerequisites() + + # Delete agency link + statement = ( + delete( + LinkURLAgency + ) + .where( + LinkURLAgency.url_id == ds_app_linked_meta_url.db_id, + LinkURLAgency.agency_id == test_agency_id + ) + ) + await adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.agency_ids == [] + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py 
b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py new file mode 100644 index 00000000..0342c388 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/meta_url/update/test_update_url.py @@ -0,0 +1,75 @@ +from sqlalchemy import update + +from src.api.shared.models.message_response import MessageResponse +from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.core import DSAppSyncMetaURLsUpdateTaskOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.ds_meta_url.sqlalchemy import DSAppLinkMetaURL +from src.external.pdap.client import PDAPClient +from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel +from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.helpers import extract_and_validate_sync_request, \ + mock_make_request +from tests.automated.integration.tasks.scheduled.impl.sync_to_ds.models.ds_app_link_info import DSAppLinkInfoModel +from tests.helpers.run import run_task_and_confirm_success + + +async def test_update_url( + ds_app_linked_meta_url: DSAppLinkInfoModel, + operator: DSAppSyncMetaURLsUpdateTaskOperator, + mock_pdap_client: PDAPClient, + adb_client_test: AsyncDatabaseClient, + test_agency_id: int +): + # Mock make_request + mock_make_request( + mock_pdap_client=mock_pdap_client, + data=MessageResponse(message="Success") + ) + + # Check prerequisites not met + assert not await operator.meets_task_prerequisites() + + # Update URL table + statement = ( + update( + URL + ) + .values( + name="Updated URL Name", + scheme="http", + trailing_slash=True, + url="modified-example.com", + description="Updated URL Description", + ) + .where( + URL.id == ds_app_linked_meta_url.db_id + ) + ) + await 
adb_client_test.execute(statement) + + # Check prerequisites are met + assert await operator.meets_task_prerequisites() + + # Run task and confirm runs without error + await run_task_and_confirm_success(operator) + + # Confirm expected method was called with expected parameters + request: UpdateMetaURLsOuterRequest = extract_and_validate_sync_request( + mock_pdap_client, + expected_path="meta-urls/update", + expected_model=UpdateMetaURLsOuterRequest + ) + assert len(request.meta_urls) == 1 + meta_url: UpdateMetaURLsInnerRequest = request.meta_urls[0] + assert meta_url.app_id == ds_app_linked_meta_url.ds_app_id + content: MetaURLSyncContentModel = meta_url.content + assert content.url == "http://modified-example.com/" + assert set(content.agency_ids) == {test_agency_id} + + # Check DS App Link Is Updated + ds_app_link: DSAppLinkMetaURL | None = await adb_client_test.one_or_none_model(model=DSAppLinkMetaURL) + assert ds_app_link is not None + assert ds_app_link.ds_meta_url_id == 67 + assert ds_app_link.last_synced_at > ds_app_linked_meta_url.updated_at + diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/__init__.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py new file mode 100644 index 00000000..36e86874 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/models/ds_app_link_info.py @@ -0,0 +1,9 @@ +from datetime import datetime + +from pydantic import BaseModel + + +class DSAppLinkInfoModel(BaseModel): + ds_app_id: int + db_id: int + updated_at: datetime = datetime.now() \ No newline at end of file diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/test_.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/test_.py 
new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py b/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py new file mode 100644 index 00000000..6b06fe31 --- /dev/null +++ b/tests/automated/integration/tasks/scheduled/impl/update_url_status/test_core.py @@ -0,0 +1,77 @@ +import pytest +from sqlalchemy import update + +from src.collectors.enums import URLStatus +from src.core.tasks.scheduled.impl.update_url_status.operator import UpdateURLStatusOperator +from src.db.client.async_ import AsyncDatabaseClient +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from tests.helpers.data_creator.core import DBDataCreator + + +@pytest.mark.asyncio +async def test_update_url_status_task( + test_url_data_source_id: int, + test_url_meta_url_id: int, + adb_client_test: AsyncDatabaseClient, + db_data_creator: DBDataCreator +): + + # Create Operator + operator = UpdateURLStatusOperator( + adb_client=adb_client_test, + ) + + # Add web metadata to URLs + ## Data Source URL: Add 404 + await db_data_creator.create_web_metadata( + url_ids=[test_url_data_source_id], + status_code=404 + ) + + ## Meta URL: Add 200 + await db_data_creator.create_web_metadata( + url_ids=[test_url_meta_url_id], + status_code=200 + ) + + # Run Task + await operator.run_task() + + # Check URLs + urls: list[URL] = await adb_client_test.get_all(URL) + id_status_set_tuple: set[tuple[int, URLStatus]] = { + (url.id, url.status) + for url in urls + } + ## Data Source URL: Status should now be broken + ## Meta URL: Status should be unchanged + assert id_status_set_tuple == { + (test_url_data_source_id, 
URLStatus.BROKEN), + (test_url_meta_url_id, URLStatus.OK) + } + + # Update Web Metadata for Data Source URL to be 200 + statement = update(URLWebMetadata).where( + URLWebMetadata.url_id == test_url_data_source_id, + ).values( + status_code=200 + ) + await adb_client_test.execute(statement) + + # Run Task + await operator.run_task() + + # Check URLs + urls: list[URL] = await adb_client_test.get_all(URL) + id_status_set_tuple: set[tuple[int, URLStatus]] = { + (url.id, url.status) + for url in urls + } + ## Data Source URL: Status should now be ok + ## Meta URL: Status should be unchanged + assert id_status_set_tuple == { + (test_url_data_source_id, URLStatus.OK), + (test_url_meta_url_id, URLStatus.OK) + } + diff --git a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py index f3402f4f..4e5bb551 100644 --- a/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/scheduled/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.scheduled.loader import ScheduledTaskOperatorLoader -NUMBER_OF_ENTRIES = 10 +NUMBER_OF_ENTRIES = 21 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py index 90aacfa5..4ec99967 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/ckan/test_core.py @@ -1,5 +1,3 @@ -from unittest.mock import AsyncMock - import pytest from src.collectors.enums import CollectorType @@ -9,11 +7,6 @@ from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from 
src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.enums import MatchAgencyResponseStatus -from src.core.tasks.url.operators.agency_identification.subtasks.impl.ckan_.core import CKANAgencyIDSubtaskOperator -from src.core.enums import SuggestionType -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -21,7 +14,9 @@ @pytest.mark.asyncio async def test_ckan_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): # Test that ckan subtask correctly sends agency id to # CKANAPIInterface, sends resultant agency name to @@ -53,25 +48,6 @@ async def test_ckan_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.CKAN - pdap_client_mock = operator.loader._pdap_client - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) - - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() assert_task_run_success(run_info) @@ -92,9 +68,9 @@ async def test_ckan_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert 
{suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - # Assert methods called as expected - pdap_client_mock.match_agency.assert_called_once_with(name="Test Agency") diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py index 2334aa17..a592002f 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/ineligible_cases/test_blacklist.py @@ -1,7 +1,7 @@ import pytest from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from tests.helpers.data_creator.core import DBDataCreator @@ -27,7 +27,7 @@ async def test_blacklist( await db_data_creator.link_urls_to_root([url_id], root_url_id=root_url_id) # Create Meta URLs - meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=3, validation_type=URLType.META_URL ) diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py index 10e3f711..7575f37e 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/homepage_match/test_happy_path.py @@ -5,7 +5,7 @@ from src.core.tasks.base.run_info import 
TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType, SubtaskDetailCode from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask @@ -26,7 +26,7 @@ async def test_homepage_match( """ # Create 2 root URLs - root_url_mappings: list[URLMapping] = ( + root_url_mappings: list[SimpleURLMapping] = ( await db_data_creator.create_urls(count=2) ) root_url_ids: list[int] = [url_mapping.url_id for url_mapping in root_url_mappings] @@ -60,7 +60,7 @@ async def test_homepage_match( # Create 2 Meta URLs and agencies for multi agency case - multi_meta_urls: list[URLMapping] = await db_data_creator.create_validated_urls( + multi_meta_urls: list[SimpleURLMapping] = await db_data_creator.create_validated_urls( count=2, validation_type=URLType.META_URL ) @@ -84,7 +84,7 @@ async def test_homepage_match( assert not await operator.meets_task_prerequisites() # Set up eligible URLs - eligible_urls: list[URLMapping] = await db_data_creator.create_urls( + eligible_urls: list[SimpleURLMapping] = await db_data_creator.create_urls( count=2, ) single_url_id: int = eligible_urls[0].url_id diff --git a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py index 7cf72c5e..af41354d 100644 --- a/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py +++ b/tests/automated/integration/tasks/url/impl/agency_identification/subtasks/muckrock/test_core.py @@ -1,24 +1,14 @@ -from unittest.mock import MagicMock - 
import pytest from src.collectors.enums import CollectorType -from src.collectors.impl.muckrock.api_interface.core import MuckrockAPIInterface from src.collectors.impl.muckrock.api_interface.lookup_response import AgencyLookupResponse from src.collectors.impl.muckrock.enums import AgencyLookupResponseType -from src.core.enums import SuggestionType from src.core.tasks.base.run_info import TaskOperatorRunInfo from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator -from src.core.tasks.url.operators.agency_identification.dtos.suggestion import URLAgencySuggestionInfo -from src.core.tasks.url.operators.agency_identification.subtasks.impl.muckrock_.core import MuckrockAgencyIDSubtaskOperator from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.suggestion.agency.subtask.enum import AutoAgencyIDSubtaskType from src.db.models.impl.url.suggestion.agency.subtask.sqlalchemy import URLAutoAgencyIDSubtask from src.db.models.impl.url.suggestion.agency.suggestion.sqlalchemy import AgencyIDSubtaskSuggestion -from src.external.pdap.client import PDAPClient -from src.external.pdap.dtos.match_agency.post import MatchAgencyInfo -from src.external.pdap.dtos.match_agency.response import MatchAgencyResponse -from src.external.pdap.enums import MatchAgencyResponseStatus from tests.helpers.asserts import assert_task_run_success from tests.helpers.data_creator.core import DBDataCreator @@ -26,7 +16,9 @@ @pytest.mark.asyncio async def test_muckrock_subtask( operator: AgencyIdentificationTaskOperator, - db_data_creator: DBDataCreator + db_data_creator: DBDataCreator, + test_agency_id: int, + test_agency_id_2: int ): adb_client: AsyncDatabaseClient = operator.adb_client @@ -81,38 +73,16 @@ async def test_muckrock_subtask( assert await operator.meets_task_prerequisites() assert operator._subtask == AutoAgencyIDSubtaskType.MUCKROCK - # Test that muckrock subtask correctly sends agency name to - # MatchAgenciesInterface and 
adds received suggestions to - # url_agency_suggestions - # Create mock instances for dependency injections muckrock_api_interface_mock = operator.loader._muckrock_api_interface - pdap_client_mock = operator.loader._pdap_client # Set up mock return values for method calls muckrock_api_interface_mock.lookup_agency.return_value = AgencyLookupResponse( type=AgencyLookupResponseType.FOUND, - name="Mock Agency Name", + name="Test Agency", error=None ) - # Create agencies - await db_data_creator.create_agency(1) - await db_data_creator.create_agency(2) - - pdap_client_mock.match_agency.return_value = MatchAgencyResponse( - status=MatchAgencyResponseStatus.PARTIAL_MATCH, - matches=[ - MatchAgencyInfo( - id=1, - submitted_name="Mock Agency Name", - ), - MatchAgencyInfo( - id=2, - submitted_name="Another Mock Agency Name", - ) - ] - ) # Run the operator run_info: TaskOperatorRunInfo = await operator.run_task() @@ -134,15 +104,8 @@ async def test_muckrock_subtask( AgencyIDSubtaskSuggestion ) assert len(suggestions) == 2 - assert {suggestion.confidence for suggestion in suggestions} == {50} - assert {suggestion.agency_id for suggestion in suggestions} == {1, 2} + assert {suggestion.agency_id for suggestion in suggestions} == { + test_agency_id, + test_agency_id_2 + } assert {suggestion.subtask_id for suggestion in suggestions} == {subtask_id} - - - # # Assert methods called as expected - muckrock_api_interface_mock.lookup_agency.assert_called_once_with( - muckrock_agency_id=123 - ) - pdap_client_mock.match_agency.assert_called_once_with( - name="Mock Agency Name" - ) diff --git a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py index 5de999ec..3f4873f4 100644 --- a/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py +++ b/tests/automated/integration/tasks/url/impl/auto_relevant/test_task.py @@ -5,7 +5,7 @@ from src.collectors.enums import URLStatus from 
src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.suggestion.relevant.auto.sqlalchemy import AutoRelevantSuggestion +from src.db.models.impl.url.suggestion.url_type.auto.sqlalchemy import AutoRelevantSuggestion from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from tests.automated.integration.tasks.url.impl.asserts import assert_prereqs_not_met, assert_prereqs_met from tests.automated.integration.tasks.url.impl.auto_relevant.setup import setup_operator, setup_urls diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 5615392c..a3a43f8b 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -10,7 +10,7 @@ # and their html should be stored TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://happy-path.com/pending", + url="happy-path.com/pending", status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -28,8 +28,8 @@ # and their web metadata status should be updated to 404 TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-found-path.com/submitted", - status=URLStatus.ERROR + url="not-found-path.com/submitted", + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -47,8 +47,8 @@ # URLs that give errors should be updated with the appropriate scrape status TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://error-path.com/submitted", - status=URLStatus.ERROR + url="error-path.com/submitted", + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -65,7 +65,7 @@ # URLs with non-200 web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://not-200-path.com/submitted", + url="not-200-path.com/submitted", 
status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( @@ -82,7 +82,7 @@ # URLs with no web metadata should not be processed TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( - url="https://no-web-metadata.com/submitted", + url="no-web-metadata.com/submitted", status=URLStatus.OK ), web_metadata_info=None, diff --git a/tests/automated/integration/tasks/url/impl/html/setup/manager.py b/tests/automated/integration/tasks/url/impl/html/setup/manager.py index 986a9f7e..e01f7b6d 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/manager.py @@ -33,7 +33,8 @@ async def _setup_urls(self) -> list[TestURLHTMLTaskSetupRecord]: url=entry.url_info.url, name=f"Test for {entry.url_info.url}", record_type=RecordType.RESOURCES, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_insert_models.append(url_insert_model) url_ids = await self.adb_client.bulk_insert(url_insert_models, return_ids=True) diff --git a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py index 766a7ca5..e3d39db5 100644 --- a/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py +++ b/tests/automated/integration/tasks/url/impl/location_identification/subtasks/nlp_location_frequency/end_to_end/conftest.py @@ -1,6 +1,6 @@ import pytest_asyncio -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.data_creator.core import DBDataCreator @@ -9,7 +9,7 @@ async def url_ids( db_data_creator: DBDataCreator, ) -> list[int]: # Create 2 URLs with compressed HTML - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) + url_mappings: 
list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) url_ids: list[int] = [url.url_id for url in url_mappings] await db_data_creator.html_data(url_ids=url_ids) return url_ids diff --git a/tests/automated/integration/tasks/url/impl/probe/check/manager.py b/tests/automated/integration/tasks/url/impl/probe/check/manager.py index a8d89ba5..200f428a 100644 --- a/tests/automated/integration/tasks/url/impl/probe/check/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/check/manager.py @@ -20,7 +20,9 @@ async def check_url( url_id: int, expected_status: URLStatus ): - url: URL = await self.adb_client.one_or_none(select(URL).where(URL.id == url_id)) + url: URL = await self.adb_client.one_or_none( + statement=select(URL).where(URL.id == url_id) + ) assert url is not None assert url.status == expected_status diff --git a/tests/automated/integration/tasks/url/impl/probe/constants.py b/tests/automated/integration/tasks/url/impl/probe/constants.py index 6c218e25..93988afb 100644 --- a/tests/automated/integration/tasks/url/impl/probe/constants.py +++ b/tests/automated/integration/tasks/url/impl/probe/constants.py @@ -1,6 +1,6 @@ from src.db.models.impl.url.core.enums import URLSource PATCH_ROOT = "src.external.url_request.core.URLProbeManager" -TEST_URL = "https://www.example.com" +TEST_URL = "www.example.com" TEST_DEST_URL = "https://www.example.com/redirect" TEST_SOURCE = URLSource.COLLECTOR diff --git a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py index cc493274..2eb6a5d7 100644 --- a/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py +++ b/tests/automated/integration/tasks/url/impl/probe/mocks/url_request_interface.py @@ -1,4 +1,5 @@ from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL class MockURLRequestInterface: @@ 
-13,10 +14,10 @@ def __init__( responses = response_or_responses self._url_to_response = { - response.original_url: response for response in responses + response.original_url.id_form: response for response in responses } - async def probe_urls(self, urls: list[str]) -> list[URLProbeResponseOuterWrapper]: + async def probe_urls(self, urls: list[FullURL]) -> list[URLProbeResponseOuterWrapper]: return [ - self._url_to_response[url] for url in urls + self._url_to_response[url.id_form] for url in urls ] diff --git a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py index cfd1f68f..c3b0c6c4 100644 --- a/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py +++ b/tests/automated/integration/tasks/url/impl/probe/no_redirect/test_two_urls.py @@ -12,8 +12,8 @@ async def test_two_urls( setup_manager: TestURLProbeSetupManager, check_manager: TestURLProbeCheckManager ): - url_1 = "https://example.com/1" - url_2 = "https://example.com/2" + url_1 = "example.com/1" + url_2 = "example.com/2" operator = setup_manager.setup_operator( response_or_responses=[ setup_manager.setup_no_redirect_probe_response( diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py index b52dce6b..7aeeb1f8 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_dest_exists_in_db.py @@ -30,7 +30,7 @@ async def test_url_probe_task_redirect_dest_exists_in_db( ) ) source_url_id = await setup_manager.setup_url(URLStatus.OK) - dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL) + dest_url_id = await setup_manager.setup_url(URLStatus.OK, url=TEST_DEST_URL.replace("https://", "")) # Add web metadata for destination URL, to 
prevent it from being pulled web_metadata = URLWebMetadataPydantic( url_id=dest_url_id, diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py new file mode 100644 index 00000000..a8cb51f7 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_functional_equivalent.py @@ -0,0 +1,46 @@ +import pytest + +from src.collectors.enums import URLStatus +from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.web_metadata.sqlalchemy import URLWebMetadata +from src.util.models.full_url import FullURL +from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager +from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL +from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_url_probe_task_functional_equivalent( + setup_manager: TestURLProbeSetupManager, + check_manager: TestURLProbeCheckManager +): + """ + If a URL: + - is functionally equivalent to the original URL + The existing URL should be updated to the functional equivalent + And web metadata should be added. 
+ """ + + operator = setup_manager.setup_operator( + response_or_responses=setup_manager.setup_redirect_probe_response( + redirect_status_code=303, + dest_status_code=303, + dest_content_type=None, + dest_error=None, + redirect_url=FullURL(TEST_URL + "/") + ) + ) + url_id = await setup_manager.setup_url(URLStatus.OK) + await run_task_and_confirm_success(operator) + + urls: list[URL] = await setup_manager.adb_client.get_all(URL) + assert len(urls) == 1 + url: URL = urls[0] + + assert url.url == TEST_URL + assert url.trailing_slash is True + + # Web metadata should be added + web_metadata: list[URLWebMetadata] = await setup_manager.adb_client.get_all(URLWebMetadata) + assert len(web_metadata) == 1 diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py deleted file mode 100644 index 5a66af3d..00000000 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_redirect_infinite.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest - -from src.collectors.enums import URLStatus -from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager -from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL -from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager - - -@pytest.mark.asyncio -async def test_url_probe_task_redirect_infinite( - setup_manager: TestURLProbeSetupManager, - check_manager: TestURLProbeCheckManager -): - """ - If a URL: - - returns a redirect response to itself - The task should add a link that points to itself - as well as web metadata response to the database URL - """ - - operator = setup_manager.setup_operator( - response_or_responses=setup_manager.setup_redirect_probe_response( - redirect_status_code=303, - dest_status_code=303, - dest_content_type=None, - dest_error=None, - redirect_url=TEST_URL - ) - ) - url_id = await 
setup_manager.setup_url(URLStatus.OK) - run_info = await operator.run_task() - await check_manager.check_url( - url_id=url_id, - expected_status=URLStatus.OK - ) - await check_manager.check_web_metadata( - url_id=url_id, - status_code=303, - content_type=None, - error=None, - accessed=True - ) - redirect_url_id = await check_manager.check_redirect( - source_url_id=url_id, - ) - assert redirect_url_id == url_id diff --git a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py index f0e113ff..1dcd98d9 100644 --- a/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py +++ b/tests/automated/integration/tasks/url/impl/probe/redirect/test_two_urls_same_dest.py @@ -1,6 +1,7 @@ import pytest from src.collectors.enums import URLStatus +from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.asserts import assert_task_ran_without_error from tests.automated.integration.tasks.url.impl.probe.check.manager import TestURLProbeCheckManager from tests.automated.integration.tasks.url.impl.probe.setup.manager import TestURLProbeSetupManager @@ -30,12 +31,12 @@ async def test_url_probe_task_redirect_two_urls_same_dest( dest_status_code=200, dest_content_type=None, dest_error=None, - source_url="https://example.com/2", + source_url=FullURL("example.com/2"), ), ] ) source_url_id_1 = await setup_manager.setup_url(URLStatus.OK) - source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="https://example.com/2") + source_url_id_2 = await setup_manager.setup_url(URLStatus.OK, url="example.com/2") run_info = await operator.run_task() assert_task_ran_without_error(run_info) await check_manager.check_url( diff --git a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py index 50405970..44b5bd54 100644 --- 
a/tests/automated/integration/tasks/url/impl/probe/setup/manager.py +++ b/tests/automated/integration/tasks/url/impl/probe/setup/manager.py @@ -8,6 +8,7 @@ from src.external.url_request.probe.models.redirect import URLProbeRedirectResponsePair from src.external.url_request.probe.models.response import URLProbeResponse from src.external.url_request.probe.models.wrapper import URLProbeResponseOuterWrapper +from src.util.models.full_url import FullURL from tests.automated.integration.tasks.url.impl.probe.constants import TEST_URL, TEST_DEST_URL, TEST_SOURCE from tests.automated.integration.tasks.url.impl.probe.mocks.url_request_interface import MockURLRequestInterface @@ -28,7 +29,8 @@ async def setup_url( url_insert_model = URLInsertModel( url=url, status=url_status, - source=TEST_SOURCE + source=TEST_SOURCE, + trailing_slash=False ) return ( await self.adb_client.bulk_insert( @@ -60,9 +62,9 @@ def setup_no_redirect_probe_response( url: str = TEST_URL ) -> URLProbeResponseOuterWrapper: return URLProbeResponseOuterWrapper( - original_url=url, + original_url=FullURL(url), response=URLProbeResponse( - url=url, + url=FullURL(url), status_code=status_code, content_type=content_type, error=error @@ -75,8 +77,8 @@ def setup_redirect_probe_response( dest_status_code: int, dest_content_type: str | None, dest_error: str | None, - source_url: str = TEST_URL, - redirect_url: str = TEST_DEST_URL + source_url: FullURL = FullURL(TEST_URL), + redirect_url: FullURL = FullURL(TEST_DEST_URL) ) -> URLProbeResponseOuterWrapper: if redirect_status_code not in (301, 302, 303, 307, 308): raise ValueError('Redirect response must be one of 301, 302, 303, 307, 308') diff --git a/tests/automated/integration/tasks/url/impl/root_url/constants.py b/tests/automated/integration/tasks/url/impl/root_url/constants.py index dc688797..d5e38e8f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/constants.py +++ b/tests/automated/integration/tasks/url/impl/root_url/constants.py @@ -1,5 +1,5 
@@ -ROOT_URL = "https://root.com" -BRANCH_URL = "https://root.com/branch" -SECOND_BRANCH_URL = "https://root.com/second-branch" \ No newline at end of file +ROOT_URL = "root.com" +BRANCH_URL = "root.com/branch" +SECOND_BRANCH_URL = "root.com/second-branch" \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py index 7e8af066..75b7f68f 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_in_db.py @@ -25,7 +25,8 @@ async def test_branch_root_url_in_db( # Add URL that is a root URL, and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) root_url_id = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -36,7 +37,8 @@ async def test_branch_root_url_in_db( # Add URL that is a branch of the root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py index 6c00f8f9..a0a43d3c 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_branch_root_url_not_in_db.py @@ -26,7 +26,8 @@ async def test_branch_root_url_not_in_db( # Add URL that is a branch of a root URL url_insert_model = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + 
source=URLSource.COLLECTOR, + trailing_slash=False ) branch_url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py index a6a56c7c..f129b582 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_is_root_url.py @@ -23,7 +23,8 @@ async def test_is_root_url( # Add URL that is a root URL url_insert_model = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id = (await operator.adb_client.bulk_insert([url_insert_model], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py index be67d23e..6fe57721 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db.py @@ -23,7 +23,8 @@ async def test_two_branches_one_root_in_db( # Add root URL and mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] root_model_flag_insert = FlagRootURLPydantic( @@ -34,13 +35,15 @@ async def test_two_branches_one_root_in_db( # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = 
URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py index 614796e9..8a40a476 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py +++ b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_in_db_not_flagged.py @@ -26,20 +26,23 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add root URL but do not mark as such url_insert_model_root = URLInsertModel( url=ROOT_URL, - source=URLSource.DATA_SOURCES + source=URLSource.DATA_SOURCES, + trailing_slash=False ) url_id_root = (await operator.adb_client.bulk_insert([url_insert_model_root], return_ids=True))[0] # Add two URLs that are branches of that root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=False ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py index f68786b9..8839905b 100644 --- a/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py +++ 
b/tests/automated/integration/tasks/url/impl/root_url/test_two_branches_one_root_not_in_db.py @@ -23,13 +23,15 @@ async def test_two_branches_one_root_in_db_not_flagged( # Add two URLs that are branches of a root URL url_insert_model_branch_1 = URLInsertModel( url=BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=BRANCH_URL.endswith('/') ) url_id_branch_1 = (await operator.adb_client.bulk_insert([url_insert_model_branch_1], return_ids=True))[0] url_insert_model_branch_2 = URLInsertModel( url=SECOND_BRANCH_URL, - source=URLSource.COLLECTOR + source=URLSource.COLLECTOR, + trailing_slash=SECOND_BRANCH_URL.endswith('/') ) url_id_branch_2 = (await operator.adb_client.bulk_insert([url_insert_model_branch_2], return_ids=True))[0] diff --git a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py index 6f54fbf9..9acffd0e 100644 --- a/tests/automated/integration/tasks/url/impl/screenshot/test_core.py +++ b/tests/automated/integration/tasks/url/impl/screenshot/test_core.py @@ -3,7 +3,7 @@ import pytest from src.core.tasks.url.operators.screenshot.core import URLScreenshotTaskOperator -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.url.screenshot.sqlalchemy import URLScreenshot from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.external.url_request.dtos.screenshot_response import URLScreenshotResponse @@ -24,9 +24,9 @@ async def test_core( assert not await operator.meets_task_prerequisites() # Add two URLs to database - url_mappings: list[URLMapping] = await db_data_creator.create_urls(count=2) - screenshot_mapping: URLMapping = url_mappings[0] - error_mapping: URLMapping = url_mappings[1] + url_mappings: list[SimpleURLMapping] = await db_data_creator.create_urls(count=2) + screenshot_mapping: SimpleURLMapping = url_mappings[0] + error_mapping: 
SimpleURLMapping = url_mappings[1] url_ids: list[int] = [url_mapping.url_id for url_mapping in url_mappings] # Add web metadata for 200 responses @@ -40,11 +40,11 @@ async def test_core( mock_get_screenshots = AsyncMock(return_value=[ URLScreenshotResponse( - url=screenshot_mapping.url, + url=f"https://{screenshot_mapping.url}", screenshot=bytes(124536), ), URLScreenshotResponse( - url=error_mapping.url, + url=f"https://{error_mapping.url}", screenshot=None, error="error", ) diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/mock.py b/tests/automated/integration/tasks/url/impl/submit_approved/mock.py deleted file mode 100644 index 0e631d5b..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/mock.py +++ /dev/null @@ -1,38 +0,0 @@ -from http import HTTPStatus -from unittest.mock import AsyncMock - -from pdap_access_manager import ResponseInfo - -from src.core.enums import SubmitResponseStatus -from src.external.pdap.client import PDAPClient - - -def mock_make_request(pdap_client: PDAPClient, urls: list[str]): - assert len(urls) == 3, "Expected 3 urls" - pdap_client.access_manager.make_request = AsyncMock( - return_value=ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "data_sources": [ - { - "url": urls[0], - "status": SubmitResponseStatus.SUCCESS, - "error": None, - "data_source_id": 21, - }, - { - "url": urls[1], - "status": SubmitResponseStatus.SUCCESS, - "error": None, - "data_source_id": 34, - }, - { - "url": urls[2], - "status": SubmitResponseStatus.FAILURE, - "error": "Test Error", - "data_source_id": None - } - ] - } - ) - ) diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/setup.py b/tests/automated/integration/tasks/url/impl/submit_approved/setup.py deleted file mode 100644 index 1f9d8915..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/setup.py +++ /dev/null @@ -1,49 +0,0 @@ -from src.api.endpoints.review.approve.dto import FinalReviewApprovalInfo -from 
src.core.enums import RecordType -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.data_creator.models.creation_info.batch.v1 import BatchURLCreationInfo - - -async def setup_validated_urls(db_data_creator: DBDataCreator, agency_id: int) -> list[str]: - creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls( - url_count=3, - with_html_content=True - ) - - url_1 = creation_info.url_ids[0] - url_2 = creation_info.url_ids[1] - url_3 = creation_info.url_ids[2] - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_1, - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[agency_id], - name="URL 1 Name", - description=None, - record_formats=["Record Format 1", "Record Format 2"], - data_portal_type="Data Portal Type 1", - supplying_entity="Supplying Entity 1" - ), - user_id=1 - ) - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_2, - record_type=RecordType.INCARCERATION_RECORDS, - agency_ids=[agency_id], - name="URL 2 Name", - description="URL 2 Description", - ), - user_id=2 - ) - await db_data_creator.adb_client.approve_url( - approval_info=FinalReviewApprovalInfo( - url_id=url_3, - record_type=RecordType.ACCIDENT_REPORTS, - agency_ids=[agency_id], - name="URL 3 Name", - description="URL 3 Description", - ), - user_id=3 - ) - return creation_info.urls diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py deleted file mode 100644 index 3d1aec23..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_submit_approved_url_task.py +++ /dev/null @@ -1,135 +0,0 @@ -import pytest -from deepdiff import DeepDiff -from pdap_access_manager import RequestInfo, RequestType, DataSourcesNamespaces - -from src.collectors.enums import URLStatus -from 
src.core.tasks.url.enums import TaskOperatorOutcome -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError -from src.external.pdap.client import PDAPClient -from tests.automated.integration.tasks.url.impl.submit_approved.mock import mock_make_request -from tests.automated.integration.tasks.url.impl.submit_approved.setup import setup_validated_urls - - -@pytest.mark.asyncio -async def test_submit_approved_url_task( - db_data_creator, - mock_pdap_client: PDAPClient, - monkeypatch -): - """ - The submit_approved_url_task should submit - all validated URLs to the PDAP Data Sources App - """ - - - # Get Task Operator - operator = SubmitApprovedURLTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - # Check Task Operator does not yet meet pre-requisites - assert not await operator.meets_task_prerequisites() - - # Create URLs with status 'validated' in database and all requisite URL values - # Ensure they have optional metadata as well - agency_id = await db_data_creator.agency() - urls: list[str] = await setup_validated_urls(db_data_creator, agency_id=agency_id) - mock_make_request(mock_pdap_client, urls) - - # Check Task Operator does meet pre-requisites - assert await operator.meets_task_prerequisites() - - # Run Task - run_info = await operator.run_task() - - # Check Task has been marked as completed - assert run_info.outcome == TaskOperatorOutcome.SUCCESS, run_info.message - - # Check Task Operator no longer meets pre-requisites - assert not await operator.meets_task_prerequisites() - - # Get URLs - urls: list[URL] = await db_data_creator.adb_client.get_all(URL, order_by_attribute="id") - url_1: URL = urls[0] - url_2: URL = urls[1] - url_3: URL = urls[2] - - # Get URL Data Source Links - 
url_data_sources = await db_data_creator.adb_client.get_all(URLDataSource) - assert len(url_data_sources) == 2 - - url_data_source_1 = url_data_sources[0] - url_data_source_2 = url_data_sources[1] - - assert url_data_source_1.url_id == url_1.id - assert url_data_source_1.data_source_id == 21 - - assert url_data_source_2.url_id == url_2.id - assert url_data_source_2.data_source_id == 34 - - # Check that errored URL has entry in url_error_info - url_errors = await db_data_creator.adb_client.get_all(URLTaskError) - assert len(url_errors) == 1 - url_error = url_errors[0] - assert url_error.url_id == url_3.id - assert url_error.error == "Test Error" - - # Check mock method was called expected parameters - access_manager = mock_pdap_client.access_manager - access_manager.make_request.assert_called_once() - access_manager.build_url.assert_called_with( - namespace=DataSourcesNamespaces.SOURCE_COLLECTOR, - subdomains=['data-sources'] - ) - - call_1 = access_manager.make_request.call_args_list[0][0][0] - expected_call_1 = RequestInfo( - type_=RequestType.POST, - url="http://example.com", - headers=access_manager.jwt_header.return_value, - json_={ - "data_sources": [ - { - "name": "URL 1 Name", - "source_url": url_1.url, - "record_type": "Accident Reports", - "description": None, - "record_formats": ["Record Format 1", "Record Format 2"], - "data_portal_type": "Data Portal Type 1", - "last_approval_editor": 1, - "supplying_entity": "Supplying Entity 1", - "agency_ids": [agency_id] - }, - { - "name": "URL 2 Name", - "source_url": url_2.url, - "record_type": "Incarceration Records", - "description": "URL 2 Description", - "last_approval_editor": 2, - "supplying_entity": None, - "record_formats": None, - "data_portal_type": None, - "agency_ids": [agency_id] - }, - { - "name": "URL 3 Name", - "source_url": url_3.url, - "record_type": "Accident Reports", - "description": "URL 3 Description", - "last_approval_editor": 3, - "supplying_entity": None, - "record_formats": None, - 
"data_portal_type": None, - "agency_ids": [agency_id] - } - ] - } - ) - assert call_1.type_ == expected_call_1.type_ - assert call_1.headers == expected_call_1.headers - diff = DeepDiff(call_1.json_, expected_call_1.json_, ignore_order=True) - assert diff == {}, f"Differences found: {diff}" diff --git a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py b/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py deleted file mode 100644 index 76754b29..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_approved/test_validated_meta_url.py +++ /dev/null @@ -1,41 +0,0 @@ -import pytest - -from src.core.tasks.base.run_info import TaskOperatorRunInfo -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.url.data_source.sqlalchemy import URLDataSource -from src.external.pdap.client import PDAPClient -from tests.helpers.asserts import assert_task_run_success - - -@pytest.mark.asyncio -async def test_validated_meta_url_not_included( - db_data_creator, - mock_pdap_client: PDAPClient, -): - """ - If a validated Meta URL is included in the database - This should not be included in the submit approved task - """ - - # Get Task Operator - operator = SubmitApprovedURLTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - dbdc = db_data_creator - url_1: int = (await dbdc.create_validated_urls( - validation_type=URLType.META_URL - ))[0].url_id - - # Test task operator does not meet prerequisites - assert not await operator.meets_task_prerequisites() - - # Run task and confirm runs without error - run_info: TaskOperatorRunInfo = await operator.run_task() - assert_task_run_success(run_info) - - # Confirm entry not included in database - ds_urls: list[URLDataSource] = await dbdc.adb_client.get_all(URLDataSource) - assert len(ds_urls) == 0 diff 
--git a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py b/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py deleted file mode 100644 index 37d6e00f..00000000 --- a/tests/automated/integration/tasks/url/impl/submit_meta_urls/test_core.py +++ /dev/null @@ -1,80 +0,0 @@ -from http import HTTPStatus -from unittest.mock import AsyncMock - -import pytest -from pdap_access_manager import ResponseInfo - -from src.collectors.enums import URLStatus -from src.core.enums import SubmitResponseStatus -from src.core.tasks.url.operators.submit_meta_urls.core import SubmitMetaURLsTaskOperator -from src.db.dtos.url.mapping import URLMapping -from src.db.models.impl.flag.url_validated.enums import URLType -from src.db.models.impl.url.core.sqlalchemy import URL -from src.db.models.impl.url.ds_meta_url.sqlalchemy import URLDSMetaURL -from src.external.pdap.client import PDAPClient -from src.external.pdap.impl.meta_urls.enums import SubmitMetaURLsStatus -from tests.helpers.data_creator.core import DBDataCreator -from tests.helpers.run import run_task_and_confirm_success - - -@pytest.mark.asyncio -async def test_submit_meta_urls( - db_data_creator: DBDataCreator, - mock_pdap_client: PDAPClient, -): - """ - Test Submit Meta URLs Task Operator - """ - - - operator = SubmitMetaURLsTaskOperator( - adb_client=db_data_creator.adb_client, - pdap_client=mock_pdap_client - ) - - assert not await operator.meets_task_prerequisites() - - # Create validated meta url - agency_id: int = (await db_data_creator.create_agencies(count=1))[0] - - mapping: URLMapping = (await db_data_creator.create_validated_urls( - validation_type=URLType.META_URL - ))[0] - await db_data_creator.link_urls_to_agencies( - url_ids=[mapping.url_id], - agency_ids=[agency_id] - ) - - mock_pdap_client.access_manager.make_request = AsyncMock( - return_value=ResponseInfo( - status_code=HTTPStatus.OK, - data={ - "meta_urls": [ - { - "url": mapping.url, - "agency_id": agency_id, - 
"status": SubmitMetaURLsStatus.SUCCESS.value, - "meta_url_id": 2, - "error": None, - }, - ] - } - ) - ) - - - assert await operator.meets_task_prerequisites() - - await run_task_and_confirm_success(operator) - - urls: list[URL] = await db_data_creator.adb_client.get_all(URL) - assert len(urls) == 1 - url: URL = urls[0] - assert url.status == URLStatus.OK - - url_ds_meta_urls: list[URLDSMetaURL] = await db_data_creator.adb_client.get_all(URLDSMetaURL) - assert len(url_ds_meta_urls) == 1 - url_ds_meta_url: URLDSMetaURL = url_ds_meta_urls[0] - assert url_ds_meta_url.url_id == url.id - assert url_ds_meta_url.ds_meta_url_id == 2 - assert url_ds_meta_url.agency_id == agency_id \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index 0af83bff..bc3f240d 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -3,7 +3,7 @@ import pytest from src.core.tasks.url.operators.misc_metadata.core import URLMiscellaneousMetadataTaskOperator -from src.db.models.impl.url.optional_data_source_metadata import URLOptionalDataSourceMetadata +from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata from src.db.models.impl.url.core.sqlalchemy import URL from src.collectors.enums import CollectorType from src.core.tasks.url.enums import TaskOperatorOutcome @@ -122,12 +122,12 @@ async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): assert url.description == expected_description, f"For url.id {url.id}, expected description {expected_description}, got {url.description}" expected_urls = { - common_crawler_url_id: (None, None, None), - auto_googler_url_id: (None, None, None), + common_crawler_url_id: ([], None, None), + auto_googler_url_id: ([], 
None, None), ckan_url_id: (["CSV", "JSON"], "Test Data Portal Type", "Test Supplying Entity"), - muckrock_simple_url_id: (None, None, None), - muckrock_county_url_id: (None, None, None), - muckrock_all_url_id: (None, None, None), + muckrock_simple_url_id: ([], None, None), + muckrock_county_url_id: ([], None, None), + muckrock_all_url_id: ([], None, None), } metadatas: list[URLOptionalDataSourceMetadata] = await db_data_creator.adb_client.get_all(URLOptionalDataSourceMetadata) diff --git a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py index 1373f3fa..57f41ded 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_record_type_task.py @@ -7,6 +7,7 @@ from src.core.tasks.url.enums import TaskOperatorOutcome from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.enums import RecordType +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from tests.helpers.data_creator.core import DBDataCreator from src.core.tasks.url.operators.record_type.llm_api.record_classifier.deepseek import DeepSeekRecordClassifier @@ -52,3 +53,9 @@ async def test_url_record_type_task(db_data_creator: DBDataCreator): for suggestion in suggestions: assert suggestion.record_type == RecordType.ACCIDENT_REPORTS.value + # Get URL Error Tasks + url_error_tasks: list[URLTaskError] = await db_data_creator.adb_client.get_all(URLTaskError) + assert len(url_error_tasks) == 1 + url_error_task = url_error_tasks[0] + assert url_error_task.url_id == url_ids[1] + assert url_error_task.task_type == TaskType.RECORD_TYPE \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/validate/helper.py b/tests/automated/integration/tasks/url/impl/validate/helper.py index 6ab44984..879fbc66 100644 --- 
a/tests/automated/integration/tasks/url/impl/validate/helper.py +++ b/tests/automated/integration/tasks/url/impl/validate/helper.py @@ -1,3 +1,5 @@ +from uuid import UUID + from src.api.endpoints.annotate.agency.post.dto import URLAgencyAnnotationPostInfo from src.core.enums import RecordType from src.db.client.async_ import AsyncDatabaseClient @@ -8,6 +10,7 @@ from src.db.models.impl.url.core.sqlalchemy import URL from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.queries.implementations.anonymous_session import MakeAnonymousSessionQueryBuilder from tests.conftest import db_data_creator from tests.helpers.counter import next_int from tests.helpers.data_creator.core import DBDataCreator @@ -95,6 +98,11 @@ async def add_agency_suggestions( ) ) + async def get_anonymous_session_id(self) -> UUID: + return await self.adb_client.run_query_builder( + MakeAnonymousSessionQueryBuilder() + ) + async def add_location_suggestions( self, count: int = 1, diff --git a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py index 82bed288..4fe0d444 100644 --- a/tests/automated/integration/tasks/url/impl/validate/test_data_source.py +++ b/tests/automated/integration/tasks/url/impl/validate/test_data_source.py @@ -6,12 +6,18 @@ - URL Type (DATA SOURCE) And confirm it is validated as DATA SOURCE """ +from uuid import UUID + import pytest from src.core.enums import RecordType from src.core.tasks.url.operators.validate.core import AutoValidateURLTaskOperator from src.db.models.impl.flag.url_validated.enums import URLType -from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper +from src.db.models.impl.url.suggestion.anonymous.agency.sqlalchemy import AnonymousAnnotationAgency +from src.db.models.impl.url.suggestion.anonymous.location.sqlalchemy 
import AnonymousAnnotationLocation +from src.db.models.impl.url.suggestion.anonymous.record_type.sqlalchemy import AnonymousAnnotationRecordType +from src.db.models.impl.url.suggestion.anonymous.url_type.sqlalchemy import AnonymousAnnotationURLType +from tests.automated.integration.tasks.url.impl.validate.helper import TestValidateTaskHelper, DEFAULT_RECORD_TYPE from tests.helpers.run import run_task_and_confirm_success @@ -27,20 +33,55 @@ async def test_data_source( assert not await operator.meets_task_prerequisites() - await helper.add_agency_suggestions(count=2) + await helper.add_agency_suggestions(count=1) assert not await operator.meets_task_prerequisites() - await helper.add_location_suggestions(count=2) + await helper.add_location_suggestions(count=1) assert not await operator.meets_task_prerequisites() - await helper.add_record_type_suggestions(count=2) + await helper.add_record_type_suggestions(count=1) assert not await operator.meets_task_prerequisites() await helper.add_name_suggestion(count=2) + assert not await operator.meets_task_prerequisites() + + # Add anonymous annotations + session_id_1: UUID = await helper.get_anonymous_session_id() + session_id_2: UUID = await helper.get_anonymous_session_id() + + for session_id in [session_id_1, session_id_2]: + anon_url_type = AnonymousAnnotationURLType( + url_type=URLType.DATA_SOURCE, + session_id=session_id, + url_id=helper.url_id + ) + anon_record_type = AnonymousAnnotationRecordType( + record_type=DEFAULT_RECORD_TYPE, + session_id=session_id, + url_id=helper.url_id + ) + anon_location = AnonymousAnnotationLocation( + location_id=helper.location_id, + session_id=session_id, + url_id=helper.url_id + ) + anon_agency = AnonymousAnnotationAgency( + agency_id=helper.agency_id, + session_id=session_id, + url_id=helper.url_id + ) + for model in [ + anon_url_type, + anon_record_type, + anon_location, + anon_agency + ]: + await helper.adb_client.add(model) + assert await operator.meets_task_prerequisites() # Add 
different record type suggestion @@ -52,8 +93,14 @@ async def test_data_source( # Assert no longer meets task prerequisites assert not await operator.meets_task_prerequisites() - # Add tiebreaker - await helper.add_record_type_suggestions() + # Add tiebreaker -- a single anonymous vote + session_id_3: UUID = await helper.get_anonymous_session_id() + anon_record_type = AnonymousAnnotationRecordType( + record_type=DEFAULT_RECORD_TYPE, + session_id=session_id_3, + url_id=helper.url_id + ) + await helper.adb_client.add(anon_record_type) assert await operator.meets_task_prerequisites() diff --git a/tests/automated/integration/tasks/url/loader/test_flags.py b/tests/automated/integration/tasks/url/loader/test_flags.py index f812c947..33014f5f 100644 --- a/tests/automated/integration/tasks/url/loader/test_flags.py +++ b/tests/automated/integration/tasks/url/loader/test_flags.py @@ -12,7 +12,6 @@ from src.core.tasks.url.operators.probe.core import URLProbeTaskOperator from src.core.tasks.url.operators.record_type.core import URLRecordTypeTaskOperator from src.core.tasks.url.operators.root_url.core import URLRootURLTaskOperator -from src.core.tasks.url.operators.submit_approved.core import SubmitApprovedURLTaskOperator class FlagTestParams(BaseModel): @@ -36,10 +35,6 @@ class Config: env_var="URL_AGENCY_IDENTIFICATION_TASK_FLAG", operator=AgencyIdentificationTaskOperator ), - FlagTestParams( - env_var="URL_SUBMIT_APPROVED_TASK_FLAG", - operator=SubmitApprovedURLTaskOperator - ), FlagTestParams( env_var="URL_MISC_METADATA_TASK_FLAG", operator=URLMiscellaneousMetadataTaskOperator diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index a7b02e89..0786cb24 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader 
-NUMBER_OF_TASK_OPERATORS: int = 14 +NUMBER_OF_TASK_OPERATORS: int = 21 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/conftest.py b/tests/conftest.py index 8ba93200..eddb7f2d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,18 +7,19 @@ import pytest_asyncio from aiohttp import ClientSession from alembic.config import Config -from sqlalchemy import create_engine, inspect, MetaData +from sqlalchemy import create_engine, inspect, MetaData, Engine +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine from sqlalchemy.orm import scoped_session, sessionmaker from src.core.env_var_manager import EnvVarManager +from src.db.client.async_ import AsyncDatabaseClient +from src.db.client.sync import DatabaseClient +from src.db.helpers.connect import get_postgres_connection_string +from src.db.models.impl.log.sqlalchemy import Log # noqa: F401 # Below are to prevent import errors from src.db.models.impl.missing import Missing # noqa: F401 -from src.db.models.impl.log.sqlalchemy import Log # noqa: F401 from src.db.models.impl.task.error import TaskError # noqa: F401 from src.db.models.impl.url.checked_for_duplicate import URLCheckedForDuplicate # noqa: F401 -from src.db.client.async_ import AsyncDatabaseClient -from src.db.client.sync import DatabaseClient -from src.db.helpers.connect import get_postgres_connection_string from src.util.helper_functions import load_from_environment from tests.helpers.alembic_runner import AlembicRunner from tests.helpers.data_creator.core import DBDataCreator @@ -99,33 +100,55 @@ def setup_and_teardown(): live_connection.close() engine.dispose() +@pytest.fixture(scope="session") +def engine(): + conn = get_postgres_connection_string() + engine = create_engine(conn) + yield engine + engine.dispose() + +@pytest.fixture(scope="session") +def async_engine(): + conn = get_postgres_connection_string(is_async=True) + engine = create_async_engine(conn) + yield engine + engine.dispose() + @pytest.fixture -def 
wiped_database(): +def wiped_database( + engine: Engine +): """Wipe all data from database.""" - wipe_database(get_postgres_connection_string()) + wipe_database(engine) @pytest.fixture -def db_client_test(wiped_database) -> Generator[DatabaseClient, Any, None]: +def db_client_test( + wiped_database, + engine +) -> Generator[DatabaseClient, Any, None]: # Drop pre-existing table - conn = get_postgres_connection_string() - db_client = DatabaseClient(db_url=conn) + db_client = DatabaseClient(engine) yield db_client db_client.engine.dispose() @pytest_asyncio.fixture -async def populated_database(wiped_database) -> None: - conn = get_postgres_connection_string(is_async=True) - adb_client = AsyncDatabaseClient(db_url=conn) +async def populated_database( + wiped_database, + async_engine: AsyncEngine +) -> None: + adb_client = AsyncDatabaseClient(async_engine) await populate_database(adb_client) @pytest_asyncio.fixture -async def adb_client_test(wiped_database) -> AsyncGenerator[AsyncDatabaseClient, Any]: - conn = get_postgres_connection_string(is_async=True) - adb_client = AsyncDatabaseClient(db_url=conn) +async def adb_client_test( + wiped_database, + async_engine: AsyncEngine +) -> AsyncGenerator[AsyncDatabaseClient, Any]: + adb_client = AsyncDatabaseClient(async_engine) yield adb_client - adb_client.engine.dispose() + await adb_client.engine.dispose() @pytest.fixture def db_data_creator( diff --git a/tests/helpers/awaitable_barrier.py b/tests/helpers/awaitable_barrier.py deleted file mode 100644 index 8bf65a11..00000000 --- a/tests/helpers/awaitable_barrier.py +++ /dev/null @@ -1,13 +0,0 @@ -import asyncio - - -class AwaitableBarrier: - def __init__(self): - self._event = asyncio.Event() - - async def __call__(self, *args, **kwargs): - await self._event.wait() - - def release(self): - self._event.set() - diff --git a/tests/helpers/check.py b/tests/helpers/check.py new file mode 100644 index 00000000..b9172151 --- /dev/null +++ b/tests/helpers/check.py @@ -0,0 +1,20 @@ 
+import pytest +from fastapi import HTTPException + +from tests.helpers.api_test_helper import APITestHelper + + +def check_forbidden_url_type( + route: str, + method: str, + api_test_helper: APITestHelper, + **kwargs +) -> None: + with pytest.raises(HTTPException) as e: + api_test_helper.request_validator.open_v3( + url=route, + method=method, + **kwargs + ) + assert e.value.status_code == 400, f"Expected status code 400, got {e.value.status_code}" + assert e.value.detail['detail'] == 'URL type does not match expected URL type' \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index c548eb5a..38ecb4bd 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,6 +1,7 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase @@ -22,16 +23,16 @@ async def run(self) -> None: scraper_info_list = [] for url_id in self.url_ids: html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.TITLE, + content_type=HTMLContentType.TITLE.value, content="test html content" ) ) html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.DESCRIPTION, + content_type=HTMLContentType.DESCRIPTION.value, content="test description" ) ) @@ -47,5 +48,5 @@ async def run(self) -> None: scraper_info_list.append(scraper_info) await self.adb_client.add_raw_html(raw_html_info_list) - await 
self.adb_client.add_html_content_infos(html_content_infos) + await self.adb_client.add_all(html_content_infos) diff --git a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py index 2e31491d..d85b5a1b 100644 --- a/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py +++ b/tests/helpers/data_creator/commands/impl/suggestion/auto/relevant.py @@ -1,4 +1,4 @@ -from src.db.models.impl.url.suggestion.relevant.auto.pydantic.input import AutoRelevancyAnnotationInput +from src.db.models.impl.url.suggestion.url_type.auto.pydantic.input import AutoRelevancyAnnotationInput from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py index 66747e6c..c1e2db31 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/convert.py +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -14,7 +14,7 @@ def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) case URLCreationEnum.NOT_RELEVANT: return URLStatus.OK case URLCreationEnum.ERROR: - return URLStatus.ERROR + raise ValueError("Invalid URL Status") case URLCreationEnum.DUPLICATE: return URLStatus.DUPLICATE case _: diff --git a/tests/helpers/data_creator/commands/impl/urls_/query.py b/tests/helpers/data_creator/commands/impl/urls_/query.py index 7587abfb..1123af8e 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/query.py +++ b/tests/helpers/data_creator/commands/impl/urls_/query.py @@ -1,6 +1,7 @@ from datetime import datetime -from src.core.tasks.url.operators.submit_approved.tdo import SubmittedURLInfo +from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource +from tests.helpers.data_creator.commands.impl.urls_.tdo import SubmittedURLInfo from src.db.dtos.url.insert import InsertURLsInfo from 
src.db.models.impl.url.core.enums import URLSource from src.db.models.impl.url.core.pydantic.info import URLInfo @@ -64,7 +65,21 @@ def run_sync(self) -> InsertURLsInfo: submitted_at=self.created_at ) submitted_url_infos.append(submitted_url_info) - self.db_client.mark_urls_as_submitted(submitted_url_infos) + + url_data_source_objects: list[DSAppLinkDataSource] = [] + for info in submitted_url_infos: + url_id = info.url_id + data_source_id = info.data_source_id + + url_data_source_object = DSAppLinkDataSource( + url_id=url_id, + ds_data_source_id=data_source_id + ) + if info.submitted_at is not None: + url_data_source_object.created_at = info.submitted_at + url_data_source_objects.append(url_data_source_object) + + self.db_client.add_all(url_data_source_objects) return url_insert_info \ No newline at end of file diff --git a/tests/helpers/data_creator/commands/impl/urls_/tdo.py b/tests/helpers/data_creator/commands/impl/urls_/tdo.py new file mode 100644 index 00000000..a8991dcd --- /dev/null +++ b/tests/helpers/data_creator/commands/impl/urls_/tdo.py @@ -0,0 +1,12 @@ +from datetime import datetime + +from pydantic import BaseModel + +from src.core.enums import RecordType + + +class SubmittedURLInfo(BaseModel): + url_id: int + data_source_id: int | None + request_error: str | None + submitted_at: datetime | None = None \ No newline at end of file diff --git a/tests/helpers/data_creator/core.py b/tests/helpers/data_creator/core.py index cbeb207f..dd08a178 100644 --- a/tests/helpers/data_creator/core.py +++ b/tests/helpers/data_creator/core.py @@ -10,9 +10,9 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.client.sync import DatabaseClient from src.db.dtos.url.insert import InsertURLsInfo -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.enums import TaskType -from src.db.models.impl.agency.enums import AgencyType +from src.db.models.impl.agency.enums import AgencyType, 
JurisdictionType from src.db.models.impl.agency.sqlalchemy import Agency from src.db.models.impl.duplicate.pydantic.insert import DuplicateInsertInfo from src.db.models.impl.flag.root_url.sqlalchemy import FlagRootURL @@ -68,12 +68,19 @@ class DBDataCreator: """ Assists in the creation of test data """ - def __init__(self, db_client: Optional[DatabaseClient] = None): + def __init__( + self, + db_client: DatabaseClient | None = None, + adb_client: AsyncDatabaseClient | None = None + ): if db_client is not None: self.db_client = db_client else: self.db_client = DatabaseClient() - self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() + if adb_client is not None: + self.adb_client = adb_client + else: + self.adb_client: AsyncDatabaseClient = AsyncDatabaseClient() self.clients = DBDataCreatorClientContainer( adb=self.adb_client, db=self.db_client @@ -398,8 +405,8 @@ async def create_validated_urls( record_type: RecordType = RecordType.RESOURCES, validation_type: URLType = URLType.DATA_SOURCE, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -414,8 +421,8 @@ async def create_submitted_urls( self, record_type: RecordType = RecordType.RESOURCES, count: int = 1 - ) -> list[URLMapping]: - url_mappings: list[URLMapping] = await self.create_urls( + ) -> list[SimpleURLMapping]: + url_mappings: list[SimpleURLMapping] = await self.create_urls( record_type=record_type, count=count ) @@ -436,9 +443,9 @@ async def create_urls( collector_metadata: dict | None = None, count: int = 1, batch_id: int | None = None - ) -> list[URLMapping]: + ) -> list[SimpleURLMapping]: - url_mappings: list[URLMapping] = await create_urls( + url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=self.adb_client, status=status, source=source, @@ -515,9 +522,10 @@ async def create_url_agency_links( 
async def create_agency(self, agency_id: int = 1) -> None: agency = Agency( - agency_id=agency_id, + id=agency_id, name=generate_test_name(agency_id), - agency_type=AgencyType.UNKNOWN + agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) await self.adb_client.add_all([agency]) @@ -527,9 +535,10 @@ async def create_agencies(self, count: int = 3) -> list[int]: for _ in range(count): agency_id = next_int() agency = Agency( - agency_id=agency_id, + id=agency_id, name=generate_test_name(agency_id), - agency_type=AgencyType.UNKNOWN + agency_type=AgencyType.UNKNOWN, + jurisdiction_type=JurisdictionType.LOCAL ) agencies.append(agency) agency_ids.append(agency_id) diff --git a/tests/helpers/data_creator/create.py b/tests/helpers/data_creator/create.py index 200a34cd..57c9f9da 100644 --- a/tests/helpers/data_creator/create.py +++ b/tests/helpers/data_creator/create.py @@ -4,7 +4,7 @@ from src.core.enums import BatchStatus, RecordType from src.db import County, Locality, USState from src.db.client.async_ import AsyncDatabaseClient -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from src.db.models.impl.batch.pydantic.insert import BatchInsertModel from src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.impl.flag.url_validated.pydantic import FlagURLValidatedPydantic @@ -13,7 +13,7 @@ from src.db.models.impl.url.core.pydantic.insert import URLInsertModel from src.db.models.impl.url.data_source.pydantic import URLDataSourcePydantic from src.db.models.impl.url.record_type.pydantic import URLRecordTypePydantic -from tests.helpers.counter import COUNTER, next_int +from tests.helpers.counter import next_int from tests.helpers.data_creator.generate import generate_batch, generate_urls, generate_validated_flags, \ generate_url_data_sources, generate_batch_url_links from tests.helpers.data_creator.models.creation_info.county import CountyCreationInfo @@ -37,7 +37,7 @@ async 
def create_urls( record_type: RecordType | None = RecordType.RESOURCES, collector_metadata: dict | None = None, count: int = 1 -) -> list[URLMapping]: +) -> list[SimpleURLMapping]: urls: list[URLInsertModel] = generate_urls( status=status, source=source, @@ -55,7 +55,7 @@ async def create_urls( ] await adb_client.bulk_insert(record_types) - return [URLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] + return [SimpleURLMapping(url_id=url_id, url=url.url) for url_id, url in zip(url_ids, urls)] async def create_validated_flags( adb_client: AsyncDatabaseClient, diff --git a/tests/helpers/data_creator/generate.py b/tests/helpers/data_creator/generate.py index 1cf0a806..b447888d 100644 --- a/tests/helpers/data_creator/generate.py +++ b/tests/helpers/data_creator/generate.py @@ -48,11 +48,13 @@ def generate_urls( for i in range(count): val: int = next_int() results.append(URLInsertModel( - url=f"http://example.com/{val}", + url=f"example.com/{val}", + scheme="https", status=status, source=source, name=f"Example {val}", collector_metadata=collector_metadata, + trailing_slash=False )) return results @@ -74,7 +76,7 @@ def generate_url_data_sources( return [ URLDataSourcePydantic( url_id=url_id, - data_source_id=url_id, + ds_data_source_id=url_id, ) for url_id in url_ids ] \ No newline at end of file diff --git a/tests/helpers/data_creator/models/creation_info/url.py b/tests/helpers/data_creator/models/creation_info/url.py index 16c45a0a..67e148c0 100644 --- a/tests/helpers/data_creator/models/creation_info/url.py +++ b/tests/helpers/data_creator/models/creation_info/url.py @@ -2,14 +2,13 @@ from pydantic import BaseModel -from src.collectors.enums import URLStatus -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping from tests.helpers.batch_creation_parameters.annotation_info import AnnotationInfo from tests.helpers.batch_creation_parameters.enums import URLCreationEnum class 
URLCreationInfo(BaseModel): - url_mappings: list[URLMapping] + url_mappings: list[SimpleURLMapping] outcome: URLCreationEnum annotation_info: Optional[AnnotationInfo] = None diff --git a/tests/helpers/mock.py b/tests/helpers/mock.py new file mode 100644 index 00000000..b761887b --- /dev/null +++ b/tests/helpers/mock.py @@ -0,0 +1,5 @@ +from unittest.mock import MagicMock, AsyncMock + + +def get_last_call_arguments(mock: MagicMock | AsyncMock) -> tuple: + return mock.call_args_list[-1].args \ No newline at end of file diff --git a/tests/helpers/patch_functions.py b/tests/helpers/patch_functions.py deleted file mode 100644 index 170a2062..00000000 --- a/tests/helpers/patch_functions.py +++ /dev/null @@ -1,10 +0,0 @@ -from tests.helpers.awaitable_barrier import AwaitableBarrier - - -async def block_sleep(monkeypatch) -> AwaitableBarrier: - barrier = AwaitableBarrier() - monkeypatch.setattr( - "src.collectors.impl.example.core.ExampleCollector.sleep", - barrier - ) - return barrier diff --git a/tests/helpers/setup/final_review/core.py b/tests/helpers/setup/final_review/core.py index ababae82..a3a3d42c 100644 --- a/tests/helpers/setup/final_review/core.py +++ b/tests/helpers/setup/final_review/core.py @@ -9,7 +9,7 @@ async def setup_for_get_next_url_for_final_review( db_data_creator: DBDataCreator, - annotation_count: Optional[int] = None, + annotation_count: int | None = None, include_user_annotations: bool = True, include_miscellaneous_metadata: bool = True ) -> FinalReviewSetupInfo: diff --git a/tests/helpers/setup/final_review/model.py b/tests/helpers/setup/final_review/model.py index a3e57a3c..1eac963e 100644 --- a/tests/helpers/setup/final_review/model.py +++ b/tests/helpers/setup/final_review/model.py @@ -1,12 +1,10 @@ -from typing import Optional - from pydantic import BaseModel -from src.db.dtos.url.mapping import URLMapping +from src.db.dtos.url.mapping_.simple import SimpleURLMapping class FinalReviewSetupInfo(BaseModel): batch_id: int - url_mapping: 
URLMapping + url_mapping: SimpleURLMapping user_agency_id: int | None name_suggestion_id: int | None diff --git a/tests/helpers/setup/populate.py b/tests/helpers/setup/populate.py index 02c364d6..d0ce5869 100644 --- a/tests/helpers/setup/populate.py +++ b/tests/helpers/setup/populate.py @@ -5,7 +5,8 @@ async def populate_database(adb_client: AsyncDatabaseClient) -> None: """Populate database with test data.""" url = URL( - url="https://www.test-data.com/static-test-data", + url="www.test-data.com/static-test-data", + scheme="https", name="Fake test data", description="Test data populated as a result of `reset_database`, " "which imitates a validated URL synchronized from the Data Sources App.", diff --git a/tests/helpers/setup/wipe.py b/tests/helpers/setup/wipe.py index e81c266d..f6cd3582 100644 --- a/tests/helpers/setup/wipe.py +++ b/tests/helpers/setup/wipe.py @@ -1,11 +1,10 @@ -from sqlalchemy import create_engine +from sqlalchemy import create_engine, Engine from src.db.models.templates_.base import Base -def wipe_database(connection_string: str) -> None: +def wipe_database(engine: Engine) -> None: """Wipe all data from database.""" - engine = create_engine(connection_string) with engine.connect() as connection: for table in reversed(Base.metadata.sorted_tables): if table.info == "view": diff --git a/tests/helpers/simple_test_data_functions.py b/tests/helpers/simple_test_data_functions.py index 4d321dc5..b250dc83 100644 --- a/tests/helpers/simple_test_data_functions.py +++ b/tests/helpers/simple_test_data_functions.py @@ -10,14 +10,14 @@ def generate_test_urls(count: int) -> list[str]: results = [] for i in range(count): - url = f"https://example.com/{uuid.uuid4().hex}" + url = f"example.com/{uuid.uuid4().hex}" results.append(url) return results def generate_test_url(i: int) -> str: - return f"https://test.com/{i}" + return f"test.com/{i}" def generate_test_name(i: int | None = None) -> str: if i is None: diff --git 
a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py index 30978a56..0786b830 100644 --- a/tests/manual/agency_identifier/test_nlp_processor.py +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -1,7 +1,6 @@ import pytest -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor SAMPLE_HTML: str = """ diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index bc9b5dfa..22203910 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -3,9 +3,8 @@ import dotenv from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.collectors import CollectorType +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion def test_auto_googler_collector_lifecycle(test_core): diff --git a/tests/manual/external/pdap/test_check_for_duplicate.py b/tests/manual/external/pdap/test_check_for_duplicate.py index 34bbc317..25a8bc52 100644 --- a/tests/manual/external/pdap/test_check_for_duplicate.py +++ b/tests/manual/external/pdap/test_check_for_duplicate.py @@ -4,6 +4,6 @@ @pytest.mark.asyncio async def test_check_for_duplicate(pdap_client): - response = await pdap_client.is_url_duplicate(url_to_check="https://example.com") + response = await pdap_client.is_url_duplicate(url_to_check="example.com") print(response) diff --git a/tests/manual/external/pdap/test_match_agency.py b/tests/manual/external/pdap/test_match_agency.py deleted file mode 100644 index a637dad0..00000000 --- a/tests/manual/external/pdap/test_match_agency.py +++ 
/dev/null @@ -1,6 +0,0 @@ -import pytest - - -@pytest.mark.asyncio -async def test_match_agency(pdap_client): - response = await pdap_client.match_agency(name="police") diff --git a/tests/manual/external/url_request/test_url_screenshot.py b/tests/manual/external/url_request/test_url_screenshot.py index b16535d6..3388c09f 100644 --- a/tests/manual/external/url_request/test_url_screenshot.py +++ b/tests/manual/external/url_request/test_url_screenshot.py @@ -12,7 +12,7 @@ async def test_url_screenshot(): """ urls: list[str] = [ - "https://www.example.com" + "www.example.com" ] responses: list[URLScreenshotResponse] = await get_screenshots(urls=urls) diff --git a/uv.lock b/uv.lock index e7f52cfd..50ae00e8 100644 --- a/uv.lock +++ b/uv.lock @@ -560,7 +560,7 @@ requires-dist = [ { name = "lxml", specifier = "~=5.1.0" }, { name = "marshmallow", specifier = "~=3.23.2" }, { name = "openai", specifier = "~=1.60.1" }, - { name = "pdap-access-manager", specifier = "==0.3.6" }, + { name = "pdap-access-manager", specifier = "==0.4.4" }, { name = "pillow", specifier = ">=11.3.0" }, { name = "pip", specifier = ">=25.2" }, { name = "playwright", specifier = "~=1.49.1" }, @@ -1591,7 +1591,7 @@ wheels = [ [[package]] name = "pdap-access-manager" -version = "0.3.6" +version = "0.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1599,9 +1599,9 @@ dependencies = [ { name = "pydantic" }, { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a4/14/d910483f08a0203a20fc2839738d9e27c83a66849fed422c3d4e804e15f5/pdap_access_manager-0.3.6.tar.gz", hash = "sha256:15c04f704e22116cd56b459e8a9d7f8514c75c36ca2c8a889b9ce2a308d88f6c", size = 4169, upload_time = "2025-06-12T20:14:55.942Z" } +sdist = { url = "https://files.pythonhosted.org/packages/75/60/743b8d5e2478e911c421f5cc4a8dec0f051542c12958776a4a96fef73ee5/pdap_access_manager-0.4.4.tar.gz", hash = 
"sha256:b824cf8014c6eb6ca29e797788f26d44d96ca0bc033309d5af2d878bb913d08f", size = 6004, upload_time = "2025-11-16T23:15:13.835Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f7/81/76803339fd732cd3eda7458d48e67487d9377197f9ea7d4583df098823b2/pdap_access_manager-0.3.6-py3-none-any.whl", hash = "sha256:a5910068f642f7548d037bcb98657ca1945997fae4e89dc4e1d47283da485b91", size = 5034, upload_time = "2025-06-12T20:14:48.452Z" }, + { url = "https://files.pythonhosted.org/packages/bd/48/4fe13370886bcc2ac8b6e35d508c733a6eae05e2fe684e6aacc26d7ddea2/pdap_access_manager-0.4.4-py3-none-any.whl", hash = "sha256:b65203daec4a5bffe0be5d3577b6e515c644b3a9ca583c39f912f32e0bca11ef", size = 10808, upload_time = "2025-11-16T23:15:12.406Z" }, ] [[package]]