43 changes: 43 additions & 0 deletions alembic/versions/2025_06_03_0814-c1380f90f5de_remove_unused_batch_columns.py
@@ -0,0 +1,43 @@
"""Remove unused batch columns

Revision ID: c1380f90f5de
Revises: 00cc949e0347
Create Date: 2025-06-03 08:14:15.583777

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'c1380f90f5de'
down_revision: Union[str, None] = '00cc949e0347'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

TABLE_NAME = "batches"
TOTAL_URL_COUNT_COLUMN = "total_url_count"
ORIGINAL_URL_COUNT_COLUMN = "original_url_count"
DUPLICATE_URL_COUNT_COLUMN = "duplicate_url_count"

def upgrade() -> None:

[GitHub Actions / flake8] ./alembic/versions/2025_06_03_0814-c1380f90f5de_remove_unused_batch_columns.py:25:1: D103 Missing docstring in public function
for column in [
TOTAL_URL_COUNT_COLUMN,
ORIGINAL_URL_COUNT_COLUMN,
DUPLICATE_URL_COUNT_COLUMN,
]:
op.drop_column(TABLE_NAME, column)


def downgrade() -> None:

[GitHub Actions / flake8] ./alembic/versions/2025_06_03_0814-c1380f90f5de_remove_unused_batch_columns.py:34:1: D103 Missing docstring in public function
for column in [
TOTAL_URL_COUNT_COLUMN,
ORIGINAL_URL_COUNT_COLUMN,
DUPLICATE_URL_COUNT_COLUMN,
]:
op.add_column(
TABLE_NAME,
sa.Column(column, sa.Integer(), nullable=False, default=0),
)
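A caveat on the downgrade path: default=0 on sa.Column is a client-side default, so it does not backfill rows that already exist when a column is re-added, and most backends reject adding a NOT NULL column to a populated table without a server-side default. Below is a minimal sketch of a safer variant, not part of this PR, assuming the batches table may be non-empty at downgrade time.

# Sketch only: re-add the columns with a server-side default so existing
# rows are backfilled with 0 instead of violating the NOT NULL constraint.
def downgrade() -> None:
    """Restore the dropped URL-count columns with a server-side default."""
    for column in [
        TOTAL_URL_COUNT_COLUMN,
        ORIGINAL_URL_COUNT_COLUMN,
        DUPLICATE_URL_COUNT_COLUMN,
    ]:
        op.add_column(
            TABLE_NAME,
            sa.Column(
                column,
                sa.Integer(),
                nullable=False,
                server_default=sa.text("0"),
            ),
        )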
7 changes: 0 additions & 7 deletions src/api/endpoints/batch/dtos/get/status.py

This file was deleted.

Empty file.
10 changes: 10 additions & 0 deletions src/api/endpoints/batch/dtos/get/summaries/counts.py
@@ -0,0 +1,10 @@
from pydantic import BaseModel

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/counts.py:1:1: D100 Missing docstring in public module


class BatchSummaryURLCounts(BaseModel):

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/counts.py:4:1: D101 Missing docstring in public class
total: int
pending: int
duplicate: int
not_relevant: int
submitted: int
errored: int
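These per-status counts replace the dropped total_url_count, original_url_count, and duplicate_url_count columns with values derived from URL rows at read time. The aggregation itself lives in the query builder under src/db/queries (not shown in this diff); the sketch below shows one plausible way such counts could be derived, assuming URL.outcome stores URLStatus string values whose names mirror these fields (only PENDING is confirmed elsewhere in this PR).

# Illustrative sketch only; hypothetical helper, not the PR's query builder.
from sqlalchemy import func, select

from src.db.models.core import URL  # model referenced elsewhere in this diff


async def build_url_counts(session, batch_id: int) -> BatchSummaryURLCounts:
    # Count the batch's URLs grouped by their outcome/status value.
    rows = await session.execute(
        select(URL.outcome, func.count())
        .where(URL.batch_id == batch_id)
        .group_by(URL.outcome)
    )
    by_outcome = {outcome: count for outcome, count in rows.all()}
    return BatchSummaryURLCounts(
        total=sum(by_outcome.values()),
        pending=by_outcome.get("pending", 0),        # status strings assumed
        duplicate=by_outcome.get("duplicate", 0),
        not_relevant=by_outcome.get("not_relevant", 0),
        submitted=by_outcome.get("submitted", 0),
        errored=by_outcome.get("errored", 0),
    )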
7 changes: 7 additions & 0 deletions src/api/endpoints/batch/dtos/get/summaries/response.py
@@ -0,0 +1,7 @@
from pydantic import BaseModel

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/response.py:1:1: D100 Missing docstring in public module

from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary


class GetBatchSummariesResponse(BaseModel):

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/response.py:6:1: D101 Missing docstring in public class
results: list[BatchSummary]
18 changes: 18 additions & 0 deletions src/api/endpoints/batch/dtos/get/summaries/summary.py
@@ -0,0 +1,18 @@
import datetime

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/summary.py:1:1: D100 Missing docstring in public module
from typing import Optional

from pydantic import BaseModel

from src.api.endpoints.batch.dtos.get.summaries.counts import BatchSummaryURLCounts
from src.core.enums import BatchStatus


class BatchSummary(BaseModel):

[GitHub Actions / flake8] ./src/api/endpoints/batch/dtos/get/summaries/summary.py:10:1: D101 Missing docstring in public class
id: int
strategy: str
status: BatchStatus
parameters: dict
user_id: int
compute_time: Optional[float]
date_generated: datetime.datetime
url_counts: BatchSummaryURLCounts
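For orientation, a BatchSummary returned by the new endpoints carries data shaped roughly as below. All values are illustrative, and the BatchStatus member name is an assumption since the enum's members are not shown in this diff.

# Purely illustrative construction; values and the BatchStatus member are
# assumptions, not taken from the PR.
import datetime

example = BatchSummary(
    id=42,
    strategy="example_strategy",
    status=BatchStatus.COMPLETE,   # assumed member name
    parameters={"query": "police records"},
    user_id=1,
    compute_time=12.5,
    date_generated=datetime.datetime(2025, 6, 3, 8, 14),
    url_counts=BatchSummaryURLCounts(
        total=10,
        pending=2,
        duplicate=1,
        not_relevant=0,
        submitted=6,
        errored=1,
    ),
)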
13 changes: 6 additions & 7 deletions src/api/endpoints/batch/routes.py
@@ -6,15 +6,15 @@
from src.api.dependencies import get_async_core
from src.api.endpoints.batch.dtos.get.duplicates import GetDuplicatesByBatchResponse
from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse
from src.api.endpoints.batch.dtos.get.status import GetBatchStatusResponse
from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse
from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary
from src.api.endpoints.batch.dtos.get.urls import GetURLsByBatchResponse
from src.api.endpoints.batch.dtos.post.abort import MessageResponse
from src.db.dtos.batch_info import BatchInfo
from src.collectors.enums import CollectorType
from src.core.core import AsyncCore
from src.core.enums import BatchStatus
from src.security.manager import get_access_info
from src.security.dtos.access_info import AccessInfo
from src.security.manager import get_access_info

batch_router = APIRouter(
prefix="/batch",
@@ -43,7 +43,7 @@ async def get_batch_status(
),
core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
) -> GetBatchStatusResponse:
) -> GetBatchSummariesResponse:
"""
Get the status of recent batches
"""
@@ -60,9 +60,8 @@ async def get_batch_info(
batch_id: int = Path(description="The batch id"),
core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(get_access_info),
) -> BatchInfo:
result = await core.get_batch_info(batch_id)
return result
) -> BatchSummary:
return await core.get_batch_info(batch_id)

@batch_router.get("/{batch_id}/urls")
async def get_urls_by_batch(
24 changes: 15 additions & 9 deletions src/core/core.py
@@ -1,5 +1,7 @@
from http import HTTPStatus

[GitHub Actions / flake8] ./src/core/core.py:1:1: D100 Missing docstring in public module
from typing import Optional

from fastapi import HTTPException
from pydantic import BaseModel
from sqlalchemy.exc import IntegrityError

@@ -11,7 +13,8 @@
from src.api.endpoints.annotate.dtos.relevance.response import GetNextRelevanceAnnotationResponseOuterInfo
from src.api.endpoints.batch.dtos.get.duplicates import GetDuplicatesByBatchResponse
from src.api.endpoints.batch.dtos.get.logs import GetBatchLogsResponse
from src.api.endpoints.batch.dtos.get.status import GetBatchStatusResponse
from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse
from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary
from src.api.endpoints.batch.dtos.get.urls import GetURLsByBatchResponse
from src.api.endpoints.batch.dtos.post.abort import MessageResponse
from src.api.endpoints.collector.dtos.collector_start import CollectorStartInfo
@@ -62,8 +65,14 @@
await self.collector_manager.shutdown_all_collectors()

#region Batch
async def get_batch_info(self, batch_id: int) -> BatchInfo:
return await self.adb_client.get_batch_by_id(batch_id)
async def get_batch_info(self, batch_id: int) -> BatchSummary:

[GitHub Actions / flake8] ./src/core/core.py:68:1: D102 Missing docstring in public method
result = await self.adb_client.get_batch_by_id(batch_id)
if result is None:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST,
detail=f"Batch {batch_id} does not exist"
)
return result

async def get_urls_by_batch(self, batch_id: int, page: int = 1) -> GetURLsByBatchResponse:
url_infos = await self.adb_client.get_urls_by_batch(batch_id, page)
@@ -77,23 +86,20 @@
dup_infos = await self.adb_client.get_duplicates_by_batch_id(batch_id, page=page)
return GetDuplicatesByBatchResponse(duplicates=dup_infos)

async def get_batch_status(self, batch_id: int) -> BatchInfo:
return await self.adb_client.get_batch_by_id(batch_id)

async def get_batch_statuses(
self,
collector_type: Optional[CollectorType],
status: Optional[BatchStatus],
has_pending_urls: Optional[bool],
page: int
) -> GetBatchStatusResponse:
results = await self.adb_client.get_recent_batch_status_info(
) -> GetBatchSummariesResponse:
results = await self.adb_client.get_batch_summaries(
collector_type=collector_type,
status=status,
page=page,
has_pending_urls=has_pending_urls
)
return GetBatchStatusResponse(results=results)
return results

async def get_batch_logs(self, batch_id: int) -> GetBatchLogsResponse:
logs = await self.adb_client.get_logs_by_batch_id(batch_id)
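The new not-found handling in AsyncCore.get_batch_info above converts a missing batch into an HTTP 400. A hedged test sketch of that behavior, assuming pytest-asyncio is available and stubbing adb_client directly since AsyncCore's constructor is not shown in this diff:

# Test sketch only; the constructor is bypassed because its signature is not
# part of the visible diff.
import pytest
from fastapi import HTTPException

from src.core.core import AsyncCore


class _StubADBClient:
    async def get_batch_by_id(self, batch_id: int):
        return None  # simulate a batch that does not exist


@pytest.mark.asyncio
async def test_get_batch_info_missing_batch_raises_400():
    core = AsyncCore.__new__(AsyncCore)  # skip __init__ for the sketch
    core.adb_client = _StubADBClient()
    with pytest.raises(HTTPException) as exc_info:
        await core.get_batch_info(batch_id=999)
    assert exc_info.value.status_code == 400
    assert "does not exist" in exc_info.value.detail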
84 changes: 31 additions & 53 deletions src/db/client/async_.py
@@ -20,6 +20,8 @@
from src.api.endpoints.annotate.dtos.record_type.response import GetNextRecordTypeAnnotationResponseInfo
from src.api.endpoints.annotate.dtos.relevance.response import GetNextRelevanceAnnotationResponseInfo
from src.api.endpoints.annotate.dtos.shared.batch import AnnotationBatchInfo
from src.api.endpoints.batch.dtos.get.summaries.response import GetBatchSummariesResponse
from src.api.endpoints.batch.dtos.get.summaries.summary import BatchSummary
from src.api.endpoints.collector.dtos.manual_batch.post import ManualBatchInputDTO
from src.api.endpoints.collector.dtos.manual_batch.response import ManualBatchResponseDTO
from src.api.endpoints.metrics.dtos.get.backlog import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO
@@ -52,8 +54,9 @@
from src.db.dtos.url_html_content_info import URLHTMLContentInfo, HTMLContentType
from src.db.dtos.url_info import URLInfo
from src.db.dtos.url_mapping import URLMapping
from src.db.queries.implementations.core.get_recent_batch_summaries.builder import GetRecentBatchSummariesQueryBuilder
from src.db.statement_composer import StatementComposer
from src.db.constants import PLACEHOLDER_AGENCY_NAME
from src.db.constants import PLACEHOLDER_AGENCY_NAME, STANDARD_ROW_LIMIT

[GitHub Actions / flake8] ./src/db/client/async_.py:59:1: F401 'src.db.constants.STANDARD_ROW_LIMIT' imported but unused
from src.db.enums import TaskType
from src.db.models.templates import Base
from src.db.models.core import URL, URLErrorInfo, URLHTMLContent, \
@@ -1363,12 +1366,17 @@
session.add(rejecting_user_url)

@session_manager
async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchInfo]:
async def get_batch_by_id(self, session, batch_id: int) -> Optional[BatchSummary]:
"""Retrieve a batch by ID."""
query = Select(Batch).where(Batch.id == batch_id)
result = await session.execute(query)
batch = result.scalars().first()
return BatchInfo(**batch.__dict__)
builder = GetRecentBatchSummariesQueryBuilder(
batch_id=batch_id
)
summaries = await builder.run(session)
if len(summaries) == 0:
return None
batch_summary = summaries[0]
return batch_summary


@session_manager
async def get_urls_by_batch(self, session, batch_id: int, page: int = 1) -> List[URLInfo]:
@@ -1432,15 +1440,12 @@
user_id=batch_info.user_id,
status=batch_info.status.value,
parameters=batch_info.parameters,
total_url_count=batch_info.total_url_count,
original_url_count=batch_info.original_url_count,
duplicate_url_count=batch_info.duplicate_url_count,
compute_time=batch_info.compute_time,
strategy_success_rate=batch_info.strategy_success_rate,
metadata_success_rate=batch_info.metadata_success_rate,
agency_match_rate=batch_info.agency_match_rate,
record_type_match_rate=batch_info.record_type_match_rate,
record_category_match_rate=batch_info.record_category_match_rate,
strategy_success_rate=0,
metadata_success_rate=0,
agency_match_rate=0,
record_type_match_rate=0,
record_category_match_rate=0,
)
if batch_info.date_generated is not None:
batch.date_generated = batch_info.date_generated
@@ -1618,52 +1623,25 @@
return final_results

@session_manager
async def get_recent_batch_status_info(
async def get_batch_summaries(

[GitHub Actions / flake8] ./src/db/client/async_.py:1626:1: D102 Missing docstring in public method
self,
session,
page: int,
collector_type: Optional[CollectorType] = None,
status: Optional[BatchStatus] = None,
has_pending_urls: Optional[bool] = None
) -> List[BatchInfo]:
) -> GetBatchSummariesResponse:
# Get only the batch_id, collector_type, status, and created_at
limit = 100
query = Select(Batch)
if has_pending_urls is not None:
pending_url_subquery = Select(URL).where(
and_(
URL.batch_id == Batch.id,
URL.outcome == URLStatus.PENDING.value
)
)

if has_pending_urls:
# Query for all that have pending URLs
query = query.where(exists(
pending_url_subquery
))
else:
# Query for all that DO NOT have pending URLs
# (or that have no URLs at all)
query = query.where(
not_(
exists(
pending_url_subquery
)
)
)
if collector_type:
query = query.filter(Batch.strategy == collector_type.value)
if status:
query = query.filter(Batch.status == status.value)

query = (query.
order_by(Batch.date_generated.desc()).
limit(limit).
offset((page - 1) * limit))
raw_results = await session.execute(query)
batches = raw_results.scalars().all()
return [BatchInfo(**batch.__dict__) for batch in batches]
builder = GetRecentBatchSummariesQueryBuilder(
page=page,
collector_type=collector_type,
status=status,
has_pending_urls=has_pending_urls
)
summaries = await builder.run(session)
return GetBatchSummariesResponse(
results=summaries
)

@session_manager
async def get_logs_by_batch_id(self, session, batch_id: int) -> List[LogOutputInfo]:
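Both call sites above construct GetRecentBatchSummariesQueryBuilder with keyword arguments and await builder.run(session). The builder's implementation is not part of the visible diff; the stub below only records the interface those call sites imply, with the assumption that STANDARD_ROW_LIMIT takes over the pagination role of the removed hard-coded limit of 100.

# Interface stub inferred from the call sites in this file; not the real
# implementation, which lives under src/db/queries/implementations/core/.
from typing import Optional

from src.collectors.enums import CollectorType
from src.core.enums import BatchStatus
from src.db.constants import STANDARD_ROW_LIMIT


class RecentBatchSummariesQueryBuilderSketch:
    def __init__(
        self,
        batch_id: Optional[int] = None,
        page: int = 1,
        collector_type: Optional[CollectorType] = None,
        status: Optional[BatchStatus] = None,
        has_pending_urls: Optional[bool] = None,
    ):
        self.batch_id = batch_id
        self.collector_type = collector_type
        self.status = status
        self.has_pending_urls = has_pending_urls
        # Assumption: pagination mirrors the inline query this PR removes.
        self.limit = STANDARD_ROW_LIMIT
        self.offset = (page - 1) * STANDARD_ROW_LIMIT

    async def run(self, session) -> list:
        """Build and execute the summaries query, returning BatchSummary objects."""
        raise NotImplementedError("Sketch only; see the builder module in src/db/queries.")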
13 changes: 5 additions & 8 deletions src/db/client/sync.py
@@ -61,15 +61,12 @@ def insert_batch(self, session, batch_info: BatchInfo) -> int:
user_id=batch_info.user_id,
status=batch_info.status.value,
parameters=batch_info.parameters,
total_url_count=batch_info.total_url_count,
original_url_count=batch_info.original_url_count,
duplicate_url_count=batch_info.duplicate_url_count,
compute_time=batch_info.compute_time,
strategy_success_rate=batch_info.strategy_success_rate,
metadata_success_rate=batch_info.metadata_success_rate,
agency_match_rate=batch_info.agency_match_rate,
record_type_match_rate=batch_info.record_type_match_rate,
record_category_match_rate=batch_info.record_category_match_rate,
strategy_success_rate=0,
metadata_success_rate=0,
agency_match_rate=0,
record_type_match_rate=0,
record_category_match_rate=0,
)
if batch_info.date_generated is not None:
batch.date_generated = batch_info.date_generated
4 changes: 3 additions & 1 deletion src/db/constants.py
@@ -1,3 +1,5 @@


PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME"
PLACEHOLDER_AGENCY_NAME = "PLACEHOLDER_AGENCY_NAME"

STANDARD_ROW_LIMIT = 100

[GitHub Actions / flake8] ./src/db/constants.py:5:25: W292 no newline at end of file
9 changes: 1 addition & 8 deletions src/db/dtos/batch_info.py
@@ -12,13 +12,6 @@ class BatchInfo(BaseModel):
status: BatchStatus
parameters: dict
user_id: int
total_url_count: int = 0
original_url_count: int = 0
duplicate_url_count: int = 0
strategy_success_rate: Optional[float] = None
metadata_success_rate: Optional[float] = None
agency_match_rate: Optional[float] = None
record_type_match_rate: Optional[float] = None
record_category_match_rate: Optional[float] = None
total_url_count: Optional[int] = None
compute_time: Optional[float] = None
date_generated: Optional[datetime] = None
5 changes: 0 additions & 5 deletions src/db/models/core.py
@@ -43,11 +43,6 @@ class Batch(StandardModel):
batch_status_enum,
nullable=False
)
# The number of URLs in the batch
# TODO: Add means to update after execution
total_url_count = Column(Integer, nullable=False, default=0)
original_url_count = Column(Integer, nullable=False, default=0)
duplicate_url_count = Column(Integer, nullable=False, default=0)
date_generated = Column(TIMESTAMP, nullable=False, server_default=CURRENT_TIME_SERVER_DEFAULT)
# How often URLs ended up approved in the database
strategy_success_rate = Column(Float)
1 change: 1 addition & 0 deletions src/db/queries/README.md
@@ -0,0 +1 @@
This directory contains classes for building more complex queries.
Empty file added src/db/queries/__init__.py
Empty file.
Empty file.