Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""Update relevancy logic

- Add new status to URL Status outcome: `individual record`
- Change URL Status value `rejected` to `not relevant`, for specificity
- Create `suggested_status` enum
- Add `suggested_status` column to `user_relevant_suggestions`
- Migrate `user_relevant_suggestions:relevant` to `user_relevant_suggestions:suggested_status`

Revision ID: 00cc949e0347
Revises: b5f079b6b8cb
Create Date: 2025-05-16 10:31:04.417203

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '00cc949e0347'
down_revision: Union[str, None] = 'b5f079b6b8cb'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


# Enum type (database name: `suggested_status`) used for the
# `user_relevant_suggestions.suggested_status` column.
# It is created at the start of upgrade() and dropped at the end of downgrade().
suggested_status_enum = sa.Enum(
'relevant',
'not relevant',
'individual record',
'broken page/404 not found',
name='suggested_status'
)

def upgrade() -> None:
    """Apply the relevancy-logic schema changes.

    Steps, in order:

    1. Create the `suggested_status` enum type.
    2. Add `user_relevant_suggestions.suggested_status`, back-fill it from
       the boolean `relevant` column, make it NOT NULL, then drop `relevant`.
    3. Rename the `url_status` value `rejected` to `not relevant` and rebuild
       the enum so that it also contains `individual record`.
    4. Re-create the `url_name_not_null_when_validated` check constraint,
       which is dropped while the enum type is being switched.
    """
    # `checkfirst=True` mirrors the guarded drop in downgrade(), so the type
    # is only created when it does not already exist.
    suggested_status_enum.create(op.get_bind(), checkfirst=True)

    # Replace `relevant` column with `suggested_status` column.
    # The column starts out nullable so existing rows can be back-filled
    # before the NOT NULL constraint is applied.
    op.add_column(
        'user_relevant_suggestions',
        sa.Column(
            'suggested_status',
            suggested_status_enum,
            nullable=True
        )
    )

    # Migrate existing entries
    op.execute("""
        UPDATE user_relevant_suggestions
        SET suggested_status = 'relevant'
        WHERE relevant = true
    """)
    op.execute("""
        UPDATE user_relevant_suggestions
        SET suggested_status = 'not relevant'
        WHERE relevant = false
    """)
    op.alter_column(
        'user_relevant_suggestions',
        'suggested_status',
        nullable=False
    )
    op.drop_column(
        'user_relevant_suggestions',
        'relevant'
    )

    # Update `url_status` enum to include `individual record`
    # and change `rejected` to `not relevant`.
    op.execute("""
        ALTER TYPE url_status RENAME VALUE 'rejected' TO 'not relevant';
    """)
    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'not relevant',
            'error',
            '404 not found',
            'individual record'
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )

    # Restore the check constraint named in `check_constraints_to_drop` above.
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )


def downgrade() -> None:
    """Revert the relevancy-logic schema changes.

    Steps, in order:

    1. Rename the `url_status` value `not relevant` back to `rejected`,
       fold `individual record` URLs into `rejected`, and rebuild the enum
       without `individual record`.
    2. Re-create the `url_name_not_null_when_validated` check constraint,
       which is dropped while the enum type is being switched.
    3. Restore the boolean `user_relevant_suggestions.relevant` column from
       `suggested_status`, then drop `suggested_status` and its enum type.

    The downgrade is lossy: every suggested status other than `relevant`
    collapses to `relevant = false`.
    """
    # Update `url_status` enum to remove `individual record`
    # and change `not relevant` to `rejected`.
    op.execute("""
        ALTER TYPE url_status RENAME VALUE 'not relevant' TO 'rejected';
    """)
    # `individual record` will no longer exist in the enum, so move those
    # rows to `rejected` before the type is switched.
    op.execute("""
        UPDATE urls
        SET outcome = 'rejected'
        WHERE outcome = 'individual record'
    """)
    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'rejected',
            'error',
            '404 not found',
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )

    # Restore the check constraint named in `check_constraints_to_drop` above.
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )

    # Replace `suggested_status` column with `relevant` column.
    # The column starts out nullable so it can be back-filled first.
    op.add_column(
        'user_relevant_suggestions',
        sa.Column(
            'relevant',
            sa.BOOLEAN(),
            nullable=True
        )
    )
    # Back-fill EVERY row. Only `relevant` maps to true; `not relevant`,
    # `individual record` and `broken page/404 not found` all map to false.
    # Handling only the first two values would leave NULLs behind and make
    # the NOT NULL alteration below fail.
    op.execute("""
        UPDATE user_relevant_suggestions
        SET relevant = COALESCE(suggested_status = 'relevant', false)
    """)
    op.alter_column(
        'user_relevant_suggestions',
        'relevant',
        nullable=False
    )
    op.drop_column(
        'user_relevant_suggestions',
        'suggested_status'
    )
    suggested_status_enum.drop(op.get_bind(), checkfirst=True)
2 changes: 1 addition & 1 deletion api/routes/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ async def annotate_url_for_relevance_and_get_next_url(
await async_core.submit_url_relevance_annotation(
user_id=access_info.user_id,
url_id=url_id,
relevant=relevance_annotation_post_info.is_relevant
suggested_status=relevance_annotation_post_info.suggested_status
)
return await async_core.get_next_url_for_relevance_annotation(
user_id=access_info.user_id,
Expand Down
5 changes: 3 additions & 2 deletions api/routes/review.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from api.dependencies import get_async_core
from core.AsyncCore import AsyncCore
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo, FinalReviewRejectionInfo

Check warning on line 7 in api/routes/review.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] api/routes/review.py#L7 <401>

'core.DTOs.FinalReviewApprovalInfo.FinalReviewBaseInfo' imported but unused
Raw output
./api/routes/review.py:7:1: F401 'core.DTOs.FinalReviewApprovalInfo.FinalReviewBaseInfo' imported but unused
from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \
GetNextURLForFinalReviewOuterResponse
from security_manager.SecurityManager import AccessInfo, get_access_info, require_permission, Permissions
Expand Down Expand Up @@ -50,7 +50,7 @@
async def reject_source(
core: AsyncCore = Depends(get_async_core),
access_info: AccessInfo = Depends(requires_final_review_permission),
review_info: FinalReviewBaseInfo = FinalReviewBaseInfo,
review_info: FinalReviewRejectionInfo = FinalReviewRejectionInfo,
batch_id: Optional[int] = Query(
description="The batch id of the next URL to get. "
"If not specified, defaults to first qualifying URL",
Expand All @@ -59,6 +59,7 @@
await core.reject_url(
url_id=review_info.url_id,
access_info=access_info,
rejection_reason=review_info.rejection_reason
)
next_source = await core.get_next_source_for_review(batch_id=batch_id)
return GetNextURLForFinalReviewOuterResponse(next_source=next_source)
40 changes: 26 additions & 14 deletions collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
BacklogSnapshot, URLDataSource, URLCheckedForDuplicate, URLProbedFor404
from collector_manager.enums import URLStatus, CollectorType
from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, RejectionReason
from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO
from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO, \
GetMetricsBatchesAggregatedInnerResponseDTO
Expand Down Expand Up @@ -65,7 +65,7 @@
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo
from core.EnvVarManager import EnvVarManager
from core.enums import BatchStatus, SuggestionType, RecordType
from core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus
from html_tag_collector.DataClassTags import convert_to_response_html_info

# Type Hints
Expand Down Expand Up @@ -170,7 +170,7 @@ async def get_next_url_for_user_annotation(
select(UserRelevantSuggestion)
.where(
UserRelevantSuggestion.url_id == URL.id,
UserRelevantSuggestion.relevant == False
UserRelevantSuggestion.suggested_status != SuggestedStatus.RELEVANT.value
)
)
)
Expand All @@ -194,7 +194,7 @@ async def add_user_relevant_suggestion(
session: AsyncSession,
url_id: int,
user_id: int,
relevant: bool
suggested_status: SuggestedStatus
):
prior_suggestion = await self.get_user_suggestion(
session,
Expand All @@ -203,13 +203,13 @@ async def add_user_relevant_suggestion(
url_id=url_id
)
if prior_suggestion is not None:
prior_suggestion.relevant = relevant
prior_suggestion.suggested_status = suggested_status.value
return

suggestion = UserRelevantSuggestion(
url_id=url_id,
user_id=user_id,
relevant=relevant
suggested_status=suggested_status.value
)
session.add(suggestion)

Expand Down Expand Up @@ -881,7 +881,7 @@ async def get_next_url_agency_for_annotation(
where(
(UserRelevantSuggestion.user_id == user_id) &
(UserRelevantSuggestion.url_id == URL.id) &
(UserRelevantSuggestion.relevant == False)
(UserRelevantSuggestion.suggested_status != SuggestedStatus.RELEVANT.value)
).correlate(URL)
)
)
Expand Down Expand Up @@ -1288,7 +1288,8 @@ async def reject_url(
self,
session: AsyncSession,
url_id: int,
user_id: int
user_id: int,
rejection_reason: RejectionReason
) -> None:

query = (
Expand All @@ -1299,7 +1300,18 @@ async def reject_url(
url = await session.execute(query)
url = url.scalars().first()

url.outcome = URLStatus.REJECTED.value
match rejection_reason:
case RejectionReason.INDIVIDUAL_RECORD:
url.outcome = URLStatus.INDIVIDUAL_RECORD.value
case RejectionReason.BROKEN_PAGE_404:
url.outcome = URLStatus.NOT_FOUND.value
case RejectionReason.NOT_RELEVANT:
url.outcome = URLStatus.NOT_RELEVANT.value
case _:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Invalid rejection reason"
)

# Add rejecting user
rejecting_user_url = ReviewingUserURL(
Expand Down Expand Up @@ -1741,12 +1753,12 @@ async def add_all_annotations_to_url(
relevant_suggestion = UserRelevantSuggestion(
url_id=url_id,
user_id=user_id,
relevant=post_info.is_relevant
suggested_status=post_info.suggested_status.value
)
session.add(relevant_suggestion)

# If not relevant, do nothing else
if not post_info.is_relevant:
if not post_info.suggested_status == SuggestedStatus.RELEVANT:
return

record_type_suggestion = UserRecordTypeSuggestion(
Expand Down Expand Up @@ -1872,7 +1884,7 @@ def url_column(status: URLStatus, label):
url_column(URLStatus.ERROR, label="error_count"),
url_column(URLStatus.VALIDATED, label="validated_count"),
url_column(URLStatus.SUBMITTED, label="submitted_count"),
url_column(URLStatus.REJECTED, label="rejected_count"),
url_column(URLStatus.NOT_RELEVANT, label="rejected_count"),

).outerjoin(
Batch, Batch.id == URL.batch_id
Expand Down Expand Up @@ -1957,7 +1969,7 @@ def url_column(status: URLStatus, label):
sc.count_distinct(URL.id, label="count_total"),
url_column(URLStatus.PENDING, label="count_pending"),
url_column(URLStatus.SUBMITTED, label="count_submitted"),
url_column(URLStatus.REJECTED, label="count_rejected"),
url_column(URLStatus.NOT_RELEVANT, label="count_rejected"),
url_column(URLStatus.ERROR, label="count_error"),
url_column(URLStatus.VALIDATED, label="count_validated"),
).group_by(
Expand Down Expand Up @@ -2075,7 +2087,7 @@ def case_column(status: URLStatus, label):
case_column(URLStatus.PENDING, label="count_pending"),
case_column(URLStatus.SUBMITTED, label="count_submitted"),
case_column(URLStatus.VALIDATED, label="count_validated"),
case_column(URLStatus.REJECTED, label="count_rejected"),
case_column(URLStatus.NOT_RELEVANT, label="count_rejected"),
case_column(URLStatus.ERROR, label="count_error"),
)
raw_results = await session.execute(count_query)
Expand Down
2 changes: 1 addition & 1 deletion collector_db/DTOConverter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def final_review_annotation_relevant_info(
) -> FinalReviewAnnotationRelevantInfo:

auto_value = auto_suggestion.relevant if auto_suggestion else None
user_value = user_suggestion.relevant if user_suggestion else None
user_value = user_suggestion.suggested_status if user_suggestion else None
return FinalReviewAnnotationRelevantInfo(
auto=auto_value,
user=user_value
Expand Down
14 changes: 12 additions & 2 deletions collector_db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,11 @@ class URL(Base):
'pending',
'submitted',
'validated',
'rejected',
'not relevant',
'duplicate',
'error',
'404 not found',
'individual record',
name='url_status'
),
nullable=False
Expand Down Expand Up @@ -457,7 +458,16 @@ class UserRelevantSuggestion(Base):
id = Column(Integer, primary_key=True, autoincrement=True)
url_id = Column(Integer, ForeignKey("urls.id"), nullable=False)
user_id = Column(Integer, nullable=False)
relevant = Column(Boolean, nullable=False)
suggested_status = Column(
postgresql.ENUM(
'relevant',
'not relevant',
'individual record',
'broken page/404 not found',
name='suggested_status'
),
nullable=True
)
created_at = get_created_at_column()
updated_at = get_updated_at_column()

Expand Down
3 changes: 2 additions & 1 deletion collector_manager/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ class URLStatus(Enum):
VALIDATED = "validated"
ERROR = "error"
DUPLICATE = "duplicate"
REJECTED = "rejected"
NOT_RELEVANT = "not relevant"
NOT_FOUND = "404 not found"
INDIVIDUAL_RECORD = "individual record"
Loading