Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def upgrade() -> None:

def downgrade() -> None:
op.alter_column(
table_name='backlog_snapshots',
table_name='backlog_snapshot',
column_name='created_at',
existing_type=sa.DateTime(),
nullable=False,
Expand Down
3 changes: 3 additions & 0 deletions collector_db/AsyncDatabaseClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,8 @@ async def has_urls_without_agency_suggestions(
statement = (
select(
URL.id
).where(
URL.outcome == URLStatus.PENDING.value
)
)

Expand All @@ -797,6 +799,7 @@ async def get_urls_without_agency_suggestions(self, session: AsyncSession) -> li

statement = (
select(URL.id, URL.collector_metadata, Batch.strategy)
.where(URL.outcome == URLStatus.PENDING.value)
.join(Batch)
)
statement = self.statement_composer.exclude_urls_with_agency_suggestions(statement)
Expand Down
9 changes: 4 additions & 5 deletions core/TaskManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ async def get_task_operators(self) -> list[TaskOperatorBase]:
await self.get_url_html_task_operator(),
# await self.get_url_relevance_huggingface_task_operator(),
await self.get_url_record_type_task_operator(),
# await self.get_agency_identification_task_operator(),
await self.get_agency_identification_task_operator(),
await self.get_url_miscellaneous_metadata_task_operator(),
await self.get_submit_approved_url_task_operator()
]
Expand All @@ -122,10 +122,9 @@ async def run_tasks(self):
while meets_prereq:
print(f"Running {operator.task_type.value} Task")
if count > TASK_REPEAT_THRESHOLD:
self.discord_poster.post_to_discord(
message=f"Task {operator.task_type.value} has been run"
f" more than {TASK_REPEAT_THRESHOLD} times in a row. "
f"Task loop terminated.")
message = f"Task {operator.task_type.value} has been run more than {TASK_REPEAT_THRESHOLD} times in a row. Task loop terminated."
print(message)
self.discord_poster.post_to_discord(message=message)
break
task_id = await self.initiate_task_in_db(task_type=operator.task_type)
run_info: TaskOperatorRunInfo = await operator.run_task(task_id)
Expand Down
8 changes: 7 additions & 1 deletion hugging_face/HuggingFaceInterface.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import json
import os
import sys
from typing import List

Expand All @@ -17,17 +18,22 @@
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
env=os.environ.copy(), # ⬅️ ensure env variables are inherited
)

stdout, stderr = await proc.communicate(input=input_data.encode("utf-8"))
print(stderr)

raw_output = stdout.decode("utf-8").strip()

if proc.returncode != 0:
raise RuntimeError(f"Error running HuggingFace: {stderr}/{raw_output}")

# Try to extract the actual JSON line
for line in raw_output.splitlines():
try:
return json.loads(line)
except json.JSONDecodeError:
except json.JSONDecodeError as e:

Check warning on line 36 in hugging_face/HuggingFaceInterface.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] hugging_face/HuggingFaceInterface.py#L36 <841>

local variable 'e' is assigned to but never used
Raw output
./hugging_face/HuggingFaceInterface.py:36:13: F841 local variable 'e' is assigned to but never used
continue

raise RuntimeError(f"Could not parse JSON from subprocess: {raw_output}")
Expand Down
8 changes: 8 additions & 0 deletions hugging_face/relevancy_worker.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os

Check warning on line 1 in hugging_face/relevancy_worker.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] hugging_face/relevancy_worker.py#L1 <100>

Missing docstring in public module
Raw output
./hugging_face/relevancy_worker.py:1:1: D100 Missing docstring in public module
import sys
import json
from transformers import pipeline
Expand All @@ -7,6 +8,13 @@

pipe = pipeline("text-classification", model="PDAP/url-relevance")
results = pipe(urls)

print("Executable:", sys.executable, file=sys.stderr)
print("sys.path:", sys.path, file=sys.stderr)
print("PYTHONPATH:", os.getenv("PYTHONPATH"), file=sys.stderr)

if len(results) != len(urls):
raise RuntimeError(f"Expected {len(urls)} results, got {len(results)}")
bools = [r["score"] >= 0.5 for r in results]

print(json.dumps(bools))
Expand Down
6 changes: 6 additions & 0 deletions local_database/classes/DockerContainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
def stop(self):
self.container.stop()

def log_to_file(self):

Check warning on line 20 in local_database/classes/DockerContainer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerContainer.py#L20 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerContainer.py:20:1: D102 Missing docstring in public method
logs = self.container.logs(stdout=True, stderr=True)
container_name = self.container.name
with open(f"{container_name}.log", "wb") as f:
f.write(logs)

def wait_for_pg_to_be_ready(self):
for i in range(30):
exit_code, output = self.container.exec_run("pg_isready")
Expand Down
4 changes: 2 additions & 2 deletions tests/helpers/DBDataCreator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
class URLCreationInfo(BaseModel):
url_mappings: list[URLMapping]
outcome: URLStatus
annotation_info: AnnotationInfo
annotation_info: Optional[AnnotationInfo] = None

class BatchURLCreationInfoV2(BaseModel):
batch_id: int
Expand Down Expand Up @@ -109,7 +109,7 @@ async def batch_v2(
d[url_parameters.status] = URLCreationInfo(
url_mappings=iui.url_mappings,
outcome=url_parameters.status,
annotation_info=url_parameters.annotation_info
annotation_info=url_parameters.annotation_info if url_parameters.annotation_info.has_annotations() else None
)
return BatchURLCreationInfoV2(
batch_id=batch_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import pytest
from aiohttp import ClientSession

from tests.helpers.test_batch_creation_parameters import TestBatchCreationParameters, TestURLCreationParameters
from source_collectors.muckrock.MuckrockAPIInterface import MuckrockAPIInterface, AgencyLookupResponseType, AgencyLookupResponse
from collector_db.models import Agency, AutomatedUrlAgencySuggestion
from collector_manager.enums import CollectorType
from collector_manager.enums import CollectorType, URLStatus
from core.DTOs.TaskOperatorRunInfo import TaskOperatorOutcome
from core.DTOs.URLAgencySuggestionInfo import URLAgencySuggestionInfo
from core.classes.task_operators.AgencyIdentificationTaskOperator import AgencyIdentificationTaskOperator
Expand All @@ -20,7 +21,7 @@
from pdap_api_client.DTOs import MatchAgencyResponse, MatchAgencyInfo
from pdap_api_client.PDAPClient import PDAPClient
from pdap_api_client.enums import MatchAgencyResponseStatus
from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo
from tests.helpers.DBDataCreator import DBDataCreator, BatchURLCreationInfo, BatchURLCreationInfoV2

Check warning on line 24 in tests/test_automated/integration/tasks/test_agency_preannotation_task.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/test_automated/integration/tasks/test_agency_preannotation_task.py#L24 <401>

'tests.helpers.DBDataCreator.BatchURLCreationInfo' imported but unused
Raw output
./tests/test_automated/integration/tasks/test_agency_preannotation_task.py:24:1: F401 'tests.helpers.DBDataCreator.BatchURLCreationInfo' imported but unused

sample_agency_suggestions = [
URLAgencySuggestionInfo(
Expand Down Expand Up @@ -103,8 +104,25 @@
CollectorType.MUCKROCK_ALL_SEARCH,
CollectorType.CKAN
]:
creation_info: BatchURLCreationInfo = await db_data_creator.batch_and_urls(strategy=strategy, url_count=1, with_html_content=True)
d[strategy] = creation_info.url_ids[0]
# Create two URLs for each, one pending and one errored
creation_info: BatchURLCreationInfoV2 = await db_data_creator.batch_v2(
parameters=TestBatchCreationParameters(
strategy=strategy,
urls=[
TestURLCreationParameters(
count=1,
status=URLStatus.PENDING,
with_html_content=True
),
TestURLCreationParameters(
count=1,
status=URLStatus.ERROR,
with_html_content=True
)
]
)
)
d[strategy] = creation_info.url_creation_infos[URLStatus.PENDING].url_mappings[0].url_id


# Confirm meets prerequisites
Expand Down