diff --git a/alembic/versions/2025_10_04_1640-f708c6a8ae5d_remove_unused_batches_columns.py b/alembic/versions/2025_10_04_1640-f708c6a8ae5d_remove_unused_batches_columns.py new file mode 100644 index 00000000..83d8c441 --- /dev/null +++ b/alembic/versions/2025_10_04_1640-f708c6a8ae5d_remove_unused_batches_columns.py @@ -0,0 +1,36 @@ +"""Remove unused batches columns + +Revision ID: f708c6a8ae5d +Revises: 445d8858b23a +Create Date: 2025-10-04 16:40:11.064794 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f708c6a8ae5d' +down_revision: Union[str, None] = '445d8858b23a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +TABLE_NAME = "batches" + +def upgrade() -> None: + op.drop_column(TABLE_NAME, "strategy_success_rate") + op.drop_column(TABLE_NAME, "metadata_success_rate") + op.drop_column(TABLE_NAME, "agency_match_rate") + op.drop_column(TABLE_NAME, "record_type_match_rate") + op.drop_column(TABLE_NAME, "record_category_match_rate") + + +def downgrade() -> None: + """Re-create the dropped rate columns as nullable floats; the old values are not restored.""" + op.add_column(TABLE_NAME, sa.Column("strategy_success_rate", sa.Float(), nullable=True)) + op.add_column(TABLE_NAME, sa.Column("metadata_success_rate", sa.Float(), nullable=True)) + op.add_column(TABLE_NAME, sa.Column("agency_match_rate", sa.Float(), nullable=True)) + op.add_column(TABLE_NAME, sa.Column("record_type_match_rate", sa.Float(), nullable=True)) + op.add_column(TABLE_NAME, sa.Column("record_category_match_rate", sa.Float(), nullable=True)) diff --git a/pyproject.toml b/pyproject.toml index 2846bf88..70f54673 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ dev = [ "pytest-asyncio~=0.25.2", "pytest-mock==3.12.0", "pytest-timeout~=2.3.1", + "vulture>=2.14", ] diff --git a/src/core/tasks/url/operators/record_type/llm_api/record_classifier/base.py b/src/core/tasks/url/operators/record_type/llm_api/record_classifier/base.py index b995bda9..1268e4e5 100644 --- a/src/core/tasks/url/operators/record_type/llm_api/record_classifier/base.py +++ b/src/core/tasks/url/operators/record_type/llm_api/record_classifier/base.py @@ -70,8 +70,3 @@ async def classify_url(self, content_infos: list[URLHTMLContentInfo]) -> str: response_format=self.response_format ) return self.post_process_response(response) - - result_str = response.choices[0].message.content - - result_dict 
= json.loads(result_str) - return result_dict["record_type"] \ No newline at end of file diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 4e0c1dda..22e63ab5 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -722,11 +722,6 @@ async def insert_batch( status=batch_info.status.value, parameters=batch_info.parameters, compute_time=batch_info.compute_time, - strategy_success_rate=0, - metadata_success_rate=0, - agency_match_rate=0, - record_type_match_rate=0, - record_category_match_rate=0, ) if batch_info.date_generated is not None: batch.date_generated = batch_info.date_generated diff --git a/src/db/client/sync.py b/src/db/client/sync.py index 04ecc892..006d6f0e 100644 --- a/src/db/client/sync.py +++ b/src/db/client/sync.py @@ -72,11 +72,6 @@ def insert_batch(self, session: Session, batch_info: BatchInfo) -> int: status=batch_info.status.value, parameters=batch_info.parameters, compute_time=batch_info.compute_time, - strategy_success_rate=0, - metadata_success_rate=0, - agency_match_rate=0, - record_type_match_rate=0, - record_category_match_rate=0, ) if batch_info.date_generated is not None: batch.date_generated = batch_info.date_generated diff --git a/src/db/models/impl/batch/sqlalchemy.py b/src/db/models/impl/batch/sqlalchemy.py index b3c38ae9..564ce163 100644 --- a/src/db/models/impl/batch/sqlalchemy.py +++ b/src/db/models/impl/batch/sqlalchemy.py @@ -30,16 +30,7 @@ class Batch(WithIDBase): nullable=False ) date_generated = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT) - # How often URLs ended up approved in the database - strategy_success_rate = Column(Float) - # Percentage of metadata identified by models - metadata_success_rate = Column(Float) - # Rate of matching to agencies - agency_match_rate = Column(Float) - # Rate of matching to record types - record_type_match_rate = Column(Float) - # Rate of matching to record categories - record_category_match_rate = Column(Float) + # Time taken to 
generate the batch # TODO: Add means to update after execution compute_time = Column(Float) diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 8618fd84..0ae843b3 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -116,13 +116,3 @@ def user_suggestion_not_exists( @staticmethod def count_distinct(field, label): return func.count(func.distinct(field)).label(label) - - @staticmethod - def add_limit_and_page_offset(query: Select, page: int): - zero_offset_page = page - 1 - rows_offset = zero_offset_page * STANDARD_ROW_LIMIT - return query.offset( - rows_offset - ).limit( - STANDARD_ROW_LIMIT - ) diff --git a/uv.lock b/uv.lock index 739c9411..e7f52cfd 100644 --- a/uv.lock +++ b/uv.lock @@ -535,6 +535,7 @@ dev = [ { name = "pytest-asyncio" }, { name = "pytest-mock" }, { name = "pytest-timeout" }, + { name = "vulture" }, ] [package.metadata] @@ -587,6 +588,7 @@ dev = [ { name = "pytest-asyncio", specifier = "~=0.25.2" }, { name = "pytest-mock", specifier = "==3.12.0" }, { name = "pytest-timeout", specifier = "~=2.3.1" }, + { name = "vulture", specifier = ">=2.14" }, ] [[package]] @@ -2850,6 +2852,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/9a/0962b05b308494e3202d3f794a6e85abe471fe3cafdbcf95c2e8c713aabd/uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553", size = 4660018, upload_time = "2024-10-14T23:38:10.888Z" }, ] +[[package]] +name = "vulture" +version = "2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/25/925f35db758a0f9199113aaf61d703de891676b082bd7cf73ea01d6000f7/vulture-2.14.tar.gz", hash = "sha256:cb8277902a1138deeab796ec5bef7076a6e0248ca3607a3f3dee0b6d9e9b8415", size = 58823, upload_time = "2024-12-08T17:39:43.319Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/a0/56/0cc15b8ff2613c1d5c3dc1f3f576ede1c43868c1bc2e5ccaa2d4bcd7974d/vulture-2.14-py2.py3-none-any.whl", hash = "sha256:d9a90dba89607489548a49d557f8bac8112bd25d3cbc8aeef23e860811bd5ed9", size = 28915, upload_time = "2024-12-08T17:39:40.573Z" }, +] + [[package]] name = "wasabi" version = "1.1.3"