Skip to content

Commit f09d60d

Browse files
Authored: Merge pull request #266 from Police-Data-Accessibility-Project/mc_255_relevancy_task_error
fix(app): Address bug in agency identification
2 parents 1281e40 + bd27536 commit f09d60d

File tree

8 files changed

+53
-13
lines changed

8 files changed

+53
-13
lines changed

alembic/versions/2025_05_11_1054-9d4002437ebe_set_default_created_at_for_backlog_.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def upgrade() -> None:
3030

3131
def downgrade() -> None:
3232
op.alter_column(
33-
table_name='backlog_snapshots',
33+
table_name='backlog_snapshot',
3434
column_name='created_at',
3535
existing_type=sa.DateTime(),
3636
nullable=False,

collector_db/AsyncDatabaseClient.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,8 @@ async def has_urls_without_agency_suggestions(
776776
statement = (
777777
select(
778778
URL.id
779+
).where(
780+
URL.outcome == URLStatus.PENDING.value
779781
)
780782
)
781783

@@ -797,6 +799,7 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li
797799

798800
statement = (
799801
select(URL.id, URL.collector_metadata, Batch.strategy)
802+
.where(URL.outcome == URLStatus.PENDING.value)
800803
.join(Batch)
801804
)
802805
statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement)

core/TaskManager.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ async def get_task_operators(self) -> list[TaskOperatorBase]:
101101
await self.get_url_html_task_operator(),
102102
# await self.get_url_relevance_huggingface_task_operator(),
103103
await self.get_url_record_type_task_operator(),
104-
# await self.get_agency_identification_task_operator(),
104+
await self.get_agency_identification_task_operator(),
105105
await self.get_url_miscellaneous_metadata_task_operator(),
106106
await self.get_submit_approved_url_task_operator()
107107
]
@@ -122,10 +122,9 @@ async def run_tasks(self):
122122
while meets_prereq:
123123
print(f"Running {operator.task_type.value} Task")
124124
if count > TASK_REPEAT_THRESHOLD:
125-
self.discord_poster.post_to_discord(
126-
message=f"Task {operator.task_type.value} has been run"
127-
f" more than {TASK_REPEAT_THRESHOLD} times in a row. "
128-
f"Task loop terminated.")
125+
message = f"Task {operator.task_type.value} has been run more than {TASK_REPEAT_THRESHOLD} times in a row. Task loop terminated."
126+
print(message)
127+
self.discord_poster.post_to_discord(message=message)
129128
break
130129
task_id = await self.initiate_task_in_db(task_type=operator.task_type)
131130
run_info: TaskOperatorRunInfo = await operator.run_task(task_id)

hugging_face/HuggingFaceInterface.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import json
3+
import os
34
import sys
45
from typing import List
56

@@ -17,17 +18,22 @@ async def get_url_relevancy_async(urls_with_html: List[URLWithHTML]) -> List[boo
1718
stdin=asyncio.subprocess.PIPE,
1819
stdout=asyncio.subprocess.PIPE,
1920
stderr=asyncio.subprocess.PIPE,
21+
env=os.environ.copy(), # ⬅️ ensure env variables are inherited
2022
)
2123

2224
stdout, stderr = await proc.communicate(input=input_data.encode("utf-8"))
25+
print(stderr)
2326

2427
raw_output = stdout.decode("utf-8").strip()
2528

29+
if proc.returncode != 0:
30+
raise RuntimeError(f"Error running HuggingFace: {stderr}/{raw_output}")
31+
2632
# Try to extract the actual JSON line
2733
for line in raw_output.splitlines():
2834
try:
2935
return json.loads(line)
30-
except json.JSONDecodeError:
36+
except json.JSONDecodeError as e:
3137
continue
3238

3339
raise RuntimeError(f"Could not parse JSON from subprocess: {raw_output}")

hugging_face/relevancy_worker.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import sys
23
import json
34
from transformers import pipeline
@@ -7,6 +8,13 @@ def main():
78

89
pipe = pipeline("text-classification", model="PDAP/url-relevance")
910
results = pipe(urls)
11+
12+
print("Executable:", sys.executable, file=sys.stderr)
13+
print("sys.path:", sys.path, file=sys.stderr)
14+
print("PYTHONPATH:", os.getenv("PYTHONPATH"), file=sys.stderr)
15+
16+
if len(results) != len(urls):
17+
raise RuntimeError(f"Expected {len(urls)} results, got {len(results)}")
1018
bools = [r["score"] >= 0.5 for r in results]
1119

1220
print(json.dumps(bools))

local_database/classes/DockerContainer.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ def run_command(self, command: str):
1717
def stop(self):
1818
self.container.stop()
1919

20+
def log_to_file(self):
21+
logs = self.container.logs(stdout=True, stderr=True)
22+
container_name = self.container.name
23+
with open(f"{container_name}.log", "wb") as f:
24+
f.write(logs)
25+
2026
def wait_for_pg_to_be_ready(self):
2127
for i in range(30):
2228
exit_code, output = self.container.exec_run("pg_isready")

tests/helpers/DBDataCreator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
class URLCreationInfo(BaseModel):
2929
url_mappings: list[URLMapping]
3030
outcome: URLStatus
31-
annotation_info: AnnotationInfo
31+
annotation_info: Optional[AnnotationInfo] = None
3232

3333
class BatchURLCreationInfoV2(BaseModel):
3434
batch_id: int
@@ -109,7 +109,7 @@ async def batch_v2(
109109
d[url_parameters.status] = URLCreationInfo(
110110
url_mappings=iui.url_mappings,
111111
outcome=url_parameters.status,
112-
annotation_info=url_parameters.annotation_info
112+
annotation_info=url_parameters.annotation_info if url_parameters.annotation_info.has_annotations() else None
113113
)
114114
return BatchURLCreationInfoV2(
115115
batch_id=batch_id,

tests/test_automated/integration/tasks/test_agency_preannotation_task.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
import pytest
66
from aiohttp import ClientSession
77

8+
from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters
89
from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
910
from collector_db.models import Agency, AutomatedUrlAgencySuggestion
10-
from collector_manager.enums import CollectorType
11+
from collector_manager.enums import CollectorType, URLStatus
1112
from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
1213
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
1314
from core.classes.task_operators.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator
@@ -20,7 +21,7 @@
2021
from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo
2122
from pdap_api_client.PDAPClient import PDAPClient
2223
from pdap_api_client.enums import MatchAgencyResponseStatus
23-
from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo
24+
from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo, BatchURLCreationInfoV2
2425

2526
sample_agency_suggestions = [
2627
URLAgencySuggestionInfo(
@@ -103,8 +104,25 @@ async def mock_run_subtask(
103104
CollectorType.MUCKROCK_ALL_SEARCH,
104105
CollectorType.CKAN
105106
]:
106-
creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(strategy=strategy, url_count=1, with_html_content=True)
107-
d[strategy] = creation_info.url_ids[0]
107+
# Create two URLs for each, one pending and one errored
108+
creation_info: BatchURLCreationInfoV2 = await db_data_creator.batch_v2(
109+
parameters=TestBatchCreationParameters(
110+
strategy=strategy,
111+
urls=[
112+
TestURLCreationParameters(
113+
count=1,
114+
status=URLStatus.PENDING,
115+
with_html_content=True
116+
),
117+
TestURLCreationParameters(
118+
count=1,
119+
status=URLStatus.ERROR,
120+
with_html_content=True
121+
)
122+
]
123+
)
124+
)
125+
d[strategy] = creation_info.url_creation_infos[URLStatus.PENDING].url_mappings[0].url_id
108126

109127

110128
# Confirm meets prerequisites

0 commit comments

Comments (0)