Skip to content

Commit 82ce183

Browse files
authored
Merge pull request #273 from Police-Data-Accessibility-Project/mc_223_relevancy_labeling
Mc 223 relevancy labeling
2 parents cd32524 + b72a810 commit 82ce183

21 files changed

+397
-107
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
"""Update relevancy logic
2+
3+
- Add new status to URL Status outcome: `individual record`
4+
- Change URL Status value `rejected` to `not relevant`, for specificity
5+
- Create `suggested_status` enum
6+
- Add `suggested_status` column to `user_relevant_suggestions`
7+
- Migrate `user_relevant_suggestions:relevant` to `user_relevant_suggestions:suggested_status`
8+
9+
Revision ID: 00cc949e0347
10+
Revises: b5f079b6b8cb
11+
Create Date: 2025-05-16 10:31:04.417203
12+
13+
"""
14+
from typing import Sequence, Union
15+
16+
from alembic import op
17+
import sqlalchemy as sa
18+
19+
from util.alembic_helpers import switch_enum_type
20+
21+
# revision identifiers, used by Alembic.
22+
revision: str = '00cc949e0347'
23+
down_revision: Union[str, None] = 'b5f079b6b8cb'
24+
branch_labels: Union[str, Sequence[str], None] = None
25+
depends_on: Union[str, Sequence[str], None] = None
26+
27+
28+
# Labels a user may suggest for a URL. They back the `suggested_status`
# column that this revision adds to `user_relevant_suggestions`.
_SUGGESTED_STATUS_LABELS = (
    'relevant',
    'not relevant',
    'individual record',
    'broken page/404 not found',
)

suggested_status_enum = sa.Enum(
    *_SUGGESTED_STATUS_LABELS,
    name='suggested_status',
)
35+
36+
def upgrade() -> None:
    """Apply the relevancy-labeling schema changes.

    Steps, in order:

    1. Create the ``suggested_status`` enum type and replace the boolean
       ``user_relevant_suggestions.relevant`` column with a non-nullable
       ``suggested_status`` column (``true`` -> ``'relevant'``,
       ``false`` -> ``'not relevant'``).
    2. Rename the ``url_status`` value ``'rejected'`` to ``'not relevant'``
       and rebuild the enum used by ``urls.outcome`` so that it also
       includes ``'individual record'``.
    """
    # checkfirst=True mirrors the drop in downgrade() and keeps a re-run
    # (e.g. after a partially applied upgrade) from failing on CREATE TYPE.
    suggested_status_enum.create(op.get_bind(), checkfirst=True)

    # Replace `relevant` column with `suggested_status` column.
    # The column starts out nullable so existing rows can be backfilled
    # before the NOT NULL constraint is applied.
    op.add_column(
        'user_relevant_suggestions',
        sa.Column(
            'suggested_status',
            suggested_status_enum,
            nullable=True
        )
    )
    # Migrate existing entries
    op.execute("""
    UPDATE user_relevant_suggestions
    SET suggested_status = 'relevant'
    WHERE relevant = true
    """)
    op.execute("""
    UPDATE user_relevant_suggestions
    SET suggested_status = 'not relevant'
    WHERE relevant = false
    """)
    op.alter_column(
        'user_relevant_suggestions',
        'suggested_status',
        nullable=False
    )
    op.drop_column(
        'user_relevant_suggestions',
        'relevant'
    )

    # Update `url_status` enum to include
    # `individual record`
    # And change `rejected` to `not relevant`
    op.execute("""
    ALTER TYPE url_status RENAME VALUE 'rejected' TO 'not relevant';
    """)
    # NOTE(review): switch_enum_type is a project helper; it is handed the
    # check constraint to drop, which is why the constraint is re-created
    # immediately afterwards.
    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'not relevant',
            'error',
            '404 not found',
            'individual record'
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )
97+
98+
99+
def downgrade() -> None:
    """Revert the relevancy-labeling schema changes.

    This downgrade is lossy, because the older schema cannot represent the
    values introduced by this revision:

    * ``urls.outcome = 'individual record'`` is folded into ``'rejected'``.
    * Every ``user_relevant_suggestions.suggested_status`` other than
      ``'relevant'`` (including ``'individual record'`` and
      ``'broken page/404 not found'``) becomes ``relevant = false``.
    """
    # Update `url_status` enum to remove
    # `individual record`
    # And change `not relevant` to `rejected`
    op.execute("""
    ALTER TYPE url_status RENAME VALUE 'not relevant' TO 'rejected';
    """)
    # Rewrite rows before the enum is rebuilt without 'individual record'.
    op.execute("""
    UPDATE urls
    SET outcome = 'rejected'
    WHERE outcome = 'individual record'
    """)
    # NOTE(review): switch_enum_type is a project helper; it is handed the
    # check constraint to drop, which is why the constraint is re-created
    # immediately afterwards.
    switch_enum_type(
        table_name='urls',
        column_name='outcome',
        enum_name='url_status',
        new_enum_values=[
            'pending',
            'submitted',
            'validated',
            'duplicate',
            'rejected',
            'error',
            '404 not found',
        ],
        check_constraints_to_drop=['url_name_not_null_when_validated']
    )
    op.execute(
        """
        ALTER TABLE urls
        ADD CONSTRAINT url_name_not_null_when_validated
        CHECK ((name IS NOT NULL) OR (outcome <> 'validated'::url_status))
        """
    )

    # Replace `suggested_status` column with `relevant` column.
    # The column starts out nullable so existing rows can be backfilled
    # before the NOT NULL constraint is applied.
    op.add_column(
        'user_relevant_suggestions',
        sa.Column(
            'relevant',
            sa.BOOLEAN(),
            nullable=True
        )
    )
    # Backfill every row in one statement. Mapping only 'relevant' and
    # 'not relevant' would leave NULLs for the other enum values and make
    # the NOT NULL alteration below fail. COALESCE also covers a NULL
    # suggested_status, should one exist.
    op.execute("""
    UPDATE user_relevant_suggestions
    SET relevant = COALESCE(suggested_status = 'relevant', false)
    """)
    op.alter_column(
        'user_relevant_suggestions',
        'relevant',
        nullable=False
    )
    op.drop_column(
        'user_relevant_suggestions',
        'suggested_status'
    )
    suggested_status_enum.drop(op.get_bind(), checkfirst=True)

api/routes/annotate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ async def annotate_url_for_relevance_and_get_next_url(
5555
await async_core.submit_url_relevance_annotation(
5656
user_id=access_info.user_id,
5757
url_id=url_id,
58-
relevant=relevance_annotation_post_info.is_relevant
58+
suggested_status=relevance_annotation_post_info.suggested_status
5959
)
6060
return await async_core.get_next_url_for_relevance_annotation(
6161
user_id=access_info.user_id,

api/routes/review.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from api.dependencies import get_async_core
66
from core.AsyncCore import AsyncCore
7-
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo
7+
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, FinalReviewBaseInfo, FinalReviewRejectionInfo
88
from core.DTOs.GetNextURLForFinalReviewResponse import GetNextURLForFinalReviewResponse, \
99
GetNextURLForFinalReviewOuterResponse
1010
from security_manager.SecurityManager import AccessInfo, get_access_info, require_permission, Permissions
@@ -50,7 +50,7 @@ async def approve_source(
5050
async def reject_source(
5151
core: AsyncCore = Depends(get_async_core),
5252
access_info: AccessInfo = Depends(requires_final_review_permission),
53-
review_info: FinalReviewBaseInfo = FinalReviewBaseInfo,
53+
review_info: FinalReviewRejectionInfo = FinalReviewRejectionInfo,
5454
batch_id: Optional[int] = Query(
5555
description="The batch id of the next URL to get. "
5656
"If not specified, defaults to first qualifying URL",
@@ -59,6 +59,7 @@ async def reject_source(
5959
await core.reject_url(
6060
url_id=review_info.url_id,
6161
access_info=access_info,
62+
rejection_reason=review_info.rejection_reason
6263
)
6364
next_source = await core.get_next_source_for_review(batch_id=batch_id)
6465
return GetNextURLForFinalReviewOuterResponse(next_source=next_source)

collector_db/AsyncDatabaseClient.py

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
BacklogSnapshot, URLDataSource, URLCheckedForDuplicate, URLProbedFor404
3434
from collector_manager.enums import URLStatus, CollectorType
3535
from core.DTOs.AllAnnotationPostInfo import AllAnnotationPostInfo
36-
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo
36+
from core.DTOs.FinalReviewApprovalInfo import FinalReviewApprovalInfo, RejectionReason
3737
from core.DTOs.GetMetricsBacklogResponse import GetMetricsBacklogResponseDTO, GetMetricsBacklogResponseInnerDTO
3838
from core.DTOs.GetMetricsBatchesAggregatedResponseDTO import GetMetricsBatchesAggregatedResponseDTO, \
3939
GetMetricsBatchesAggregatedInnerResponseDTO
@@ -65,7 +65,7 @@
6565
from core.DTOs.task_data_objects.URLDuplicateTDO import URLDuplicateTDO
6666
from core.DTOs.task_data_objects.URLMiscellaneousMetadataTDO import URLMiscellaneousMetadataTDO, URLHTMLMetadataInfo
6767
from core.EnvVarManager import EnvVarManager
68-
from core.enums import BatchStatus, SuggestionType, RecordType
68+
from core.enums import BatchStatus, SuggestionType, RecordType, SuggestedStatus
6969
from html_tag_collector.DataClassTags import convert_to_response_html_info
7070

7171
# Type Hints
@@ -170,7 +170,7 @@ async def get_next_url_for_user_annotation(
170170
select(UserRelevantSuggestion)
171171
.where(
172172
UserRelevantSuggestion.url_id == URL.id,
173-
UserRelevantSuggestion.relevant == False
173+
UserRelevantSuggestion.suggested_status != SuggestedStatus.RELEVANT.value
174174
)
175175
)
176176
)
@@ -194,7 +194,7 @@ async def add_user_relevant_suggestion(
194194
session: AsyncSession,
195195
url_id: int,
196196
user_id: int,
197-
relevant: bool
197+
suggested_status: SuggestedStatus
198198
):
199199
prior_suggestion = await self.get_user_suggestion(
200200
session,
@@ -203,13 +203,13 @@ async def add_user_relevant_suggestion(
203203
url_id=url_id
204204
)
205205
if prior_suggestion is not None:
206-
prior_suggestion.relevant = relevant
206+
prior_suggestion.suggested_status = suggested_status.value
207207
return
208208

209209
suggestion = UserRelevantSuggestion(
210210
url_id=url_id,
211211
user_id=user_id,
212-
relevant=relevant
212+
suggested_status=suggested_status.value
213213
)
214214
session.add(suggestion)
215215

@@ -881,7 +881,7 @@ async def get_next_url_agency_for_annotation(
881881
where(
882882
(UserRelevantSuggestion.user_id == user_id) &
883883
(UserRelevantSuggestion.url_id == URL.id) &
884-
(UserRelevantSuggestion.relevant == False)
884+
(UserRelevantSuggestion.suggested_status != SuggestedStatus.RELEVANT.value)
885885
).correlate(URL)
886886
)
887887
)
@@ -1288,7 +1288,8 @@ async def reject_url(
12881288
self,
12891289
session: AsyncSession,
12901290
url_id: int,
1291-
user_id: int
1291+
user_id: int,
1292+
rejection_reason: RejectionReason
12921293
) -> None:
12931294

12941295
query = (
@@ -1299,7 +1300,18 @@ async def reject_url(
12991300
url = await session.execute(query)
13001301
url = url.scalars().first()
13011302

1302-
url.outcome = URLStatus.REJECTED.value
1303+
match rejection_reason:
1304+
case RejectionReason.INDIVIDUAL_RECORD:
1305+
url.outcome = URLStatus.INDIVIDUAL_RECORD.value
1306+
case RejectionReason.BROKEN_PAGE_404:
1307+
url.outcome = URLStatus.NOT_FOUND.value
1308+
case RejectionReason.NOT_RELEVANT:
1309+
url.outcome = URLStatus.NOT_RELEVANT.value
1310+
case _:
1311+
raise HTTPException(
1312+
status_code=status.HTTP_400_BAD_REQUEST,
1313+
detail="Invalid rejection reason"
1314+
)
13031315

13041316
# Add rejecting user
13051317
rejecting_user_url = ReviewingUserURL(
@@ -1741,12 +1753,12 @@ async def add_all_annotations_to_url(
17411753
relevant_suggestion = UserRelevantSuggestion(
17421754
url_id=url_id,
17431755
user_id=user_id,
1744-
relevant=post_info.is_relevant
1756+
suggested_status=post_info.suggested_status.value
17451757
)
17461758
session.add(relevant_suggestion)
17471759

17481760
# If not relevant, do nothing else
1749-
if not post_info.is_relevant:
1761+
if not post_info.suggested_status == SuggestedStatus.RELEVANT:
17501762
return
17511763

17521764
record_type_suggestion = UserRecordTypeSuggestion(
@@ -1872,7 +1884,7 @@ def url_column(status: URLStatus, label):
18721884
url_column(URLStatus.ERROR, label="error_count"),
18731885
url_column(URLStatus.VALIDATED, label="validated_count"),
18741886
url_column(URLStatus.SUBMITTED, label="submitted_count"),
1875-
url_column(URLStatus.REJECTED, label="rejected_count"),
1887+
url_column(URLStatus.NOT_RELEVANT, label="rejected_count"),
18761888

18771889
).outerjoin(
18781890
Batch, Batch.id == URL.batch_id
@@ -1957,7 +1969,7 @@ def url_column(status: URLStatus, label):
19571969
sc.count_distinct(URL.id, label="count_total"),
19581970
url_column(URLStatus.PENDING, label="count_pending"),
19591971
url_column(URLStatus.SUBMITTED, label="count_submitted"),
1960-
url_column(URLStatus.REJECTED, label="count_rejected"),
1972+
url_column(URLStatus.NOT_RELEVANT, label="count_rejected"),
19611973
url_column(URLStatus.ERROR, label="count_error"),
19621974
url_column(URLStatus.VALIDATED, label="count_validated"),
19631975
).group_by(
@@ -2075,7 +2087,7 @@ def case_column(status: URLStatus, label):
20752087
case_column(URLStatus.PENDING, label="count_pending"),
20762088
case_column(URLStatus.SUBMITTED, label="count_submitted"),
20772089
case_column(URLStatus.VALIDATED, label="count_validated"),
2078-
case_column(URLStatus.REJECTED, label="count_rejected"),
2090+
case_column(URLStatus.NOT_RELEVANT, label="count_rejected"),
20792091
case_column(URLStatus.ERROR, label="count_error"),
20802092
)
20812093
raw_results = await session.execute(count_query)

collector_db/DTOConverter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def final_review_annotation_relevant_info(
2626
) -> FinalReviewAnnotationRelevantInfo:
2727

2828
auto_value = auto_suggestion.relevant if auto_suggestion else None
29-
user_value = user_suggestion.relevant if user_suggestion else None
29+
user_value = user_suggestion.suggested_status if user_suggestion else None
3030
return FinalReviewAnnotationRelevantInfo(
3131
auto=auto_value,
3232
user=user_value

collector_db/models.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,11 @@ class URL(Base):
9696
'pending',
9797
'submitted',
9898
'validated',
99-
'rejected',
99+
'not relevant',
100100
'duplicate',
101101
'error',
102102
'404 not found',
103+
'individual record',
103104
name='url_status'
104105
),
105106
nullable=False
@@ -457,7 +458,16 @@ class UserRelevantSuggestion(Base):
457458
id = Column(Integer, primary_key=True, autoincrement=True)
458459
url_id = Column(Integer, ForeignKey("urls.id"), nullable=False)
459460
user_id = Column(Integer, nullable=False)
460-
relevant = Column(Boolean, nullable=False)
461+
suggested_status = Column(
462+
postgresql.ENUM(
463+
'relevant',
464+
'not relevant',
465+
'individual record',
466+
'broken page/404 not found',
467+
name='suggested_status'
468+
),
469+
nullable=True
470+
)
461471
created_at = get_created_at_column()
462472
updated_at = get_updated_at_column()
463473

collector_manager/enums.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,6 @@ class URLStatus(Enum):
1616
VALIDATED = "validated"
1717
ERROR = "error"
1818
DUPLICATE = "duplicate"
19-
REJECTED = "rejected"
19+
NOT_RELEVANT = "not relevant"
2020
NOT_FOUND = "404 not found"
21+
INDIVIDUAL_RECORD = "individual record"

0 commit comments

Comments
 (0)