From fe67257ec4a3e25a61b414430c208c4e5d47f6dc Mon Sep 17 00:00:00 2001 From: maxachis Date: Thu, 20 Nov 2025 16:01:10 -0500 Subject: [PATCH 1/2] Begin draft --- ...30-c4edeb795134_remove_url_error_status.py | 33 +++++++++++++++++++ .../aggregated/query/url_error/query.py | 6 +++- .../metrics/batches/breakdown/error/cte_.py | 11 ++++--- .../url_counts/cte/error.py | 4 ++- .../api/metrics/batches/test_aggregated.py | 8 ++--- .../tasks/url/impl/html/setup/data.py | 4 +-- .../commands/impl/urls_/convert.py | 2 +- 7 files changed, 55 insertions(+), 13 deletions(-) create mode 100644 alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py diff --git a/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py new file mode 100644 index 00000000..32c977e2 --- /dev/null +++ b/alembic/versions/2025_11_20_1530-c4edeb795134_remove_url_error_status.py @@ -0,0 +1,33 @@ +"""Remove URL Error Status + +Revision ID: c4edeb795134 +Revises: b8a68f4260a4 +Create Date: 2025-11-20 15:30:15.783191 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import remove_enum_value + +# revision identifiers, used by Alembic. +revision: str = 'c4edeb795134' +down_revision: Union[str, None] = 'b8a68f4260a4' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + remove_enum_value( + enum_name="url_status", + value_to_remove="error", + targets=[ + ("urls", "status") + ] + ) + + +def downgrade() -> None: + pass diff --git a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py index 9bcc3a57..a7b9e27a 100644 --- a/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py +++ b/src/api/endpoints/metrics/batches/aggregated/query/url_error/query.py @@ -5,10 +5,12 @@ from src.api.endpoints.metrics.batches.aggregated.query.models.strategy_count import CountByBatchStrategyResponse from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.helpers.session import session_helper as sh from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.base.builder import QueryBuilderBase @@ -23,7 +25,9 @@ async def run(self, session: AsyncSession) -> list[CountByBatchStrategyResponse] .select_from(Batch) .join(LinkBatchURL) .join(URL) - .where(URL.status == URLStatus.ERROR) + .where( + exists_url(URLTaskError) + ) .group_by(Batch.strategy, URL.status) ) diff --git a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py index ed2ff44f..6c54e45b 100644 --- a/src/api/endpoints/metrics/batches/breakdown/error/cte_.py +++ b/src/api/endpoints/metrics/batches/breakdown/error/cte_.py @@ -1,10 +1,11 @@ -from sqlalchemy import select, func, CTE, Column +from sqlalchemy import select, func -from src.collectors.enums import URLStatus +from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL -from src.api.endpoints.metrics.batches.breakdown.templates.cte_ import BatchesBreakdownURLCTE from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError URL_ERROR_CTE = BatchesBreakdownURLCTE( select( @@ -19,7 +20,9 @@ URL, URL.id == LinkBatchURL.url_id ) - .where(URL.status == URLStatus.ERROR) + .where( + exists_url(URLTaskError) + ) .group_by(Batch.id) .cte("error") ) diff --git a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py index b74020c4..953a5c0d 100644 --- a/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py +++ b/src/db/queries/implementations/core/get/recent_batch_summaries/url_counts/cte/error.py @@ -1,9 +1,11 @@ from sqlalchemy import select, func from src.collectors.enums import URLStatus +from src.db.helpers.query import exists_url from src.db.models.impl.batch.sqlalchemy import Batch from src.db.models.impl.link.batch_url.sqlalchemy import LinkBatchURL from src.db.models.impl.url.core.sqlalchemy import URL +from src.db.models.impl.url.task_error.sqlalchemy import URLTaskError from src.db.queries.implementations.core.get.recent_batch_summaries.url_counts.cte_container import \ URLCountsCTEContainer @@ -21,7 +23,7 @@ URL.id == LinkBatchURL.url_id, ) .where( - URL.status == URLStatus.ERROR + exists_url(URLTaskError) ) .group_by( Batch.id diff --git a/tests/automated/integration/api/metrics/batches/test_aggregated.py b/tests/automated/integration/api/metrics/batches/test_aggregated.py index 97cd805e..3d84d6d7 100644 --- a/tests/automated/integration/api/metrics/batches/test_aggregated.py +++ b/tests/automated/integration/api/metrics/batches/test_aggregated.py @@ -23,9 +23,9 @@ async def test_get_batches_aggregated_metrics( adb_client=adb_client, strategy=CollectorType.MANUAL, ) - url_mappings_error: list[SimpleURLMapping] = await create_urls( + url_mappings_broken: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, - status=URLStatus.ERROR, + status=URLStatus.BROKEN, count=4, ) url_mappings_ok: list[SimpleURLMapping] = await create_urls( @@ -33,7 +33,7 @@ async def test_get_batches_aggregated_metrics( status=URLStatus.OK, count=11, ) - url_mappings_all: list[SimpleURLMapping] = url_mappings_error + url_mappings_ok + url_mappings_all: list[SimpleURLMapping] = url_mappings_broken + url_mappings_ok url_ids_all: list[int] = [url_mapping.url_id for url_mapping in url_mappings_all] await create_batch_url_links( adb_client=adb_client, @@ -88,5 +88,5 @@ async def test_get_batches_aggregated_metrics( assert inner_dto_manual.count_urls_pending == 15 assert inner_dto_manual.count_urls_submitted == 6 assert inner_dto_manual.count_urls_rejected == 9 - assert inner_dto_manual.count_urls_errors == 12 + assert inner_dto_manual.count_urls_errors == 0 # TODO: Change by adding URL Task Errors assert inner_dto_manual.count_urls_validated == 30 diff --git a/tests/automated/integration/tasks/url/impl/html/setup/data.py b/tests/automated/integration/tasks/url/impl/html/setup/data.py index 203eb34b..a3a43f8b 100644 --- a/tests/automated/integration/tasks/url/impl/html/setup/data.py +++ b/tests/automated/integration/tasks/url/impl/html/setup/data.py @@ -29,7 +29,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="not-found-path.com/submitted", - status=URLStatus.ERROR + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, @@ -48,7 +48,7 @@ TestURLHTMLTaskSetupEntry( url_info=TestURLInfo( url="error-path.com/submitted", - status=URLStatus.ERROR + status=URLStatus.OK ), web_metadata_info=TestWebMetadataInfo( accessed=True, diff --git a/tests/helpers/data_creator/commands/impl/urls_/convert.py b/tests/helpers/data_creator/commands/impl/urls_/convert.py index 66747e6c..c1e2db31 100644 --- a/tests/helpers/data_creator/commands/impl/urls_/convert.py +++ b/tests/helpers/data_creator/commands/impl/urls_/convert.py @@ -14,7 +14,7 @@ def convert_url_creation_enum_to_url_status(url_creation_enum: URLCreationEnum) case URLCreationEnum.NOT_RELEVANT: return URLStatus.OK case URLCreationEnum.ERROR: - return URLStatus.ERROR + raise ValueError("Invalid URL Status") case URLCreationEnum.DUPLICATE: return URLStatus.DUPLICATE case _: From e10624cd36e01dd70971b7c3ebddf4abec669bfc Mon Sep 17 00:00:00 2001 From: Max Chis Date: Fri, 21 Nov 2025 17:46:34 -0500 Subject: [PATCH 2/2] Remove URL Error Status --- .../api/batch/summaries/test_happy_path.py | 10 +++------- .../api/metrics/batches/test_breakdown.py | 14 ++++---------- .../integration/api/metrics/test_backlog.py | 6 ------ .../api/metrics/urls/aggregated/test_core.py | 1 - .../api/metrics/urls/breakdown/test_pending.py | 4 ---- .../api/metrics/urls/breakdown/test_submitted.py | 4 ---- .../manual/agency_identifier/test_nlp_processor.py | 3 +-- .../core/lifecycle/test_auto_googler_lifecycle.py | 3 +-- 8 files changed, 9 insertions(+), 36 deletions(-) diff --git a/tests/automated/integration/api/batch/summaries/test_happy_path.py b/tests/automated/integration/api/batch/summaries/test_happy_path.py index f6e28238..6af9ce2b 100644 --- a/tests/automated/integration/api/batch/summaries/test_happy_path.py +++ b/tests/automated/integration/api/batch/summaries/test_happy_path.py @@ -29,10 +29,6 @@ async def test_get_batch_summaries(api_test_helper): count=4, status=URLCreationEnum.NOT_RELEVANT ), - TestURLCreationParameters( - count=3, - status=URLCreationEnum.ERROR - ) ] ), TestBatchCreationParameters( @@ -78,10 +74,10 @@ async def test_get_batch_summaries(api_test_helper): result_2 = results[1] assert result_2.id == batch_2_id counts_2 = result_2.url_counts - assert counts_2.total == 7 + assert counts_2.total == 4 assert counts_2.not_relevant == 4 - assert counts_2.errored == 3 - assert counts_2.pending == 3 + assert counts_2.errored == 0 + assert counts_2.pending == 0 assert counts_2.submitted == 0 assert counts_2.duplicate == 0 diff --git a/tests/automated/integration/api/metrics/batches/test_breakdown.py b/tests/automated/integration/api/metrics/batches/test_breakdown.py index ca05eaa1..6921c3c1 100644 --- a/tests/automated/integration/api/metrics/batches/test_breakdown.py +++ b/tests/automated/integration/api/metrics/batches/test_breakdown.py @@ -49,12 +49,6 @@ async def test_get_batches_breakdown_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, date_generated=today - timedelta(days=14) ) - error_url_mappings: list[SimpleURLMapping] = await create_urls( - adb_client=adb_client, - status=URLStatus.ERROR, - count=4, - ) - error_url_ids: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings] validated_url_mappings: list[SimpleURLMapping] = await create_urls( adb_client=adb_client, count=8, @@ -73,7 +67,7 @@ async def test_get_batches_breakdown_metrics(api_test_helper): await create_batch_url_links( adb_client=adb_client, batch_id=batch_id_3, - url_ids=error_url_ids + validated_url_ids, + url_ids=validated_url_ids, ) @@ -107,11 +101,11 @@ async def test_get_batches_breakdown_metrics(api_test_helper): assert dto_batch_3.batch_id == batch_id_3 assert dto_batch_3.status == BatchStatus.READY_TO_LABEL assert dto_batch_3.strategy == CollectorType.AUTO_GOOGLER - assert dto_batch_3.count_url_total == 12 - assert dto_batch_3.count_url_pending == 5 + assert dto_batch_3.count_url_total == 8 + assert dto_batch_3.count_url_pending == 1 assert dto_batch_3.count_url_submitted == 0 assert dto_batch_3.count_url_rejected == 3 - assert dto_batch_3.count_url_error == 4 + assert dto_batch_3.count_url_error == 0 assert dto_batch_3.count_url_validated == 7 dto_2 = await ath.request_validator.get_batches_breakdown_metrics( diff --git a/tests/automated/integration/api/metrics/test_backlog.py b/tests/automated/integration/api/metrics/test_backlog.py index 09f687f5..181c295e 100644 --- a/tests/automated/integration/api/metrics/test_backlog.py +++ b/tests/automated/integration/api/metrics/test_backlog.py @@ -46,12 +46,6 @@ async def test_get_backlog_metrics(api_test_helper): url_ids=not_relevant_url_ids_2[:4], validation_type=URLType.NOT_RELEVANT ) - error_url_mappings_2: list[SimpleURLMapping] = await ddc.create_urls( - status=URLStatus.ERROR, - count=2 - ) - error_url_ids_2: list[int] = [url_mapping.url_id for url_mapping in error_url_mappings_2] - await ddc.create_batch_url_links(url_ids=error_url_ids_2, batch_id=batch_2_id) await adb_client.populate_backlog_snapshot( dt=today.subtract(months=2).naive() diff --git a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py index 1d8eb947..e203b722 100644 --- a/tests/automated/integration/api/metrics/urls/aggregated/test_core.py +++ b/tests/automated/integration/api/metrics/urls/aggregated/test_core.py @@ -47,7 +47,6 @@ async def test_get_urls_aggregated_metrics(api_test_helper): strategy=CollectorType.AUTO_GOOGLER, ) url_mappings_2_ok: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=4, status=URLStatus.OK) - url_mappings_2_error: list[SimpleURLMapping] = await ddc.create_urls(batch_id=batch_2, count=2, status=URLStatus.ERROR) url_mappings_2_validated: list[SimpleURLMapping] = await ddc.create_validated_urls(count=1, validation_type=URLType.DATA_SOURCE) url_mappings_2_not_relevant: list[SimpleURLMapping] = await ddc.create_validated_urls(count=5, validation_type=URLType.NOT_RELEVANT) url_ids_2_validated: list[int] = [url_mapping.url_id for url_mapping in url_mappings_2_validated] diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py index 3e906a8c..9bdf59ba 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_pending.py @@ -64,10 +64,6 @@ async def test_get_urls_breakdown_pending_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.OK, diff --git a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py index cbd30f8b..d0a25ab1 100644 --- a/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py +++ b/tests/automated/integration/api/metrics/urls/breakdown/test_submitted.py @@ -47,10 +47,6 @@ async def test_get_urls_breakdown_submitted_metrics(api_test_helper): count=3, status=URLCreationEnum.SUBMITTED ), - TestURLCreationParameters( - count=4, - status=URLCreationEnum.ERROR - ), TestURLCreationParameters( count=5, status=URLCreationEnum.VALIDATED diff --git a/tests/manual/agency_identifier/test_nlp_processor.py b/tests/manual/agency_identifier/test_nlp_processor.py index 30978a56..0786b830 100644 --- a/tests/manual/agency_identifier/test_nlp_processor.py +++ b/tests/manual/agency_identifier/test_nlp_processor.py @@ -1,7 +1,6 @@ import pytest -from src.core.tasks.url.operators.agency_identification.subtasks.impl.nlp_location_match_.processor import \ - NLPProcessor +from src.core.tasks.url.operators.location_id.subtasks.impl.nlp_location_freq.processor.nlp.core import NLPProcessor SAMPLE_HTML: str = """ diff --git a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py index bc9b5dfa..22203910 100644 --- a/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py +++ b/tests/manual/core/lifecycle/test_auto_googler_lifecycle.py @@ -3,9 +3,8 @@ import dotenv from src.db.models.impl.batch.pydantic.info import BatchInfo -from src.collectors import CollectorType +from src.collectors.enums import CollectorType from src.core.enums import BatchStatus -from test_automated.integration.core.helpers.common_test_procedures import run_collector_and_wait_for_completion def test_auto_googler_collector_lifecycle(test_core):