Skip to content

Commit fd9abbd

Browse files
authored
Merge pull request #438 from Police-Data-Accessibility-Project/mc_fix_annotation_bug
mc_fix_annotation_bug
2 parents d4a6e9d + 40a47fa commit fd9abbd

File tree

29 files changed

+385
-26
lines changed

29 files changed

+385
-26
lines changed

ENV.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
8585
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
8686
| `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. |
8787
| `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. |
88+
| `URL_AUTO_NAME_TASK_FLAG` | Automatically names URLs. |
8889

8990
### Agency ID Subtasks
9091

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Add URL naming logic
2+
3+
Revision ID: 3687026267fc
4+
Revises: e6a1a1b3bad4
5+
Create Date: 2025-09-24 17:39:55.353947
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
from src.util.alembic_helpers import id_column, url_id_column, created_at_column, user_id_column
14+
15+
# Alembic revision metadata: this migration's id and the revision it follows.
revision: str = '3687026267fc'
down_revision: Union[str, None] = 'e6a1a1b3bad4'
# No branch labels or cross-revision dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
20+
21+
22+
23+
def upgrade() -> None:
    """Apply this revision by running each schema step in order.

    The name-suggestion table is created before the user link table, which
    holds a foreign key to it.
    """
    steps = (
        _add_auto_name_task,
        _create_url_name_suggestion_table,
        _create_link_user_name_suggestion_table,
    )
    for step in steps:
        step()
27+
28+
def _add_auto_name_task():
    """Add the 'Auto Name' value to the ``task_type`` enum type."""
    add_value_sql = """ALTER TYPE task_type ADD VALUE 'Auto Name';"""
    op.execute(add_value_sql)
30+
31+
32+
def _create_url_name_suggestion_table():
    """Create the ``url_name_suggestions`` table.

    Each row stores one suggested name for a URL together with the source
    of the suggestion ("HTML Metadata Title" or "User").
    """
    # Suggested name text, capped at 100 characters and required.
    suggestion_column = sa.Column(
        'suggestion',
        sa.String(length=100),
        nullable=False,
    )

    # NOTE(review): no ``nullable=False`` is given here, so ``source`` may be
    # NULL -- confirm that is intended.
    source_enum = sa.Enum(
        "HTML Metadata Title",
        "User",
        name="suggestion_source_enum",
    )
    source_column = sa.Column('source', source_enum)

    # NOTE(review): the constraint name mentions "source" but the constrained
    # columns are (url_id, suggestion) -- confirm which one is intended.
    unique_suggestion_per_url = sa.UniqueConstraint(
        'url_id', 'suggestion', name='url_name_suggestions_url_id_source_unique'
    )

    op.create_table(
        'url_name_suggestions',
        id_column(),
        url_id_column(),
        suggestion_column,
        source_column,
        created_at_column(),
        unique_suggestion_per_url,
    )
52+
53+
54+
def _create_link_user_name_suggestion_table():
    """Create the ``link_user_name_suggestions`` table.

    Each row pairs a user id with a row of ``url_name_suggestions``; the pair
    (user_id, suggestion_id) is the composite primary key.
    """
    # Required reference to the suggestion being linked.
    suggestion_fk = sa.ForeignKey("url_name_suggestions.id")
    suggestion_id_column = sa.Column(
        "suggestion_id",
        sa.Integer(),
        suggestion_fk,
        nullable=False,
    )

    composite_key = sa.PrimaryKeyConstraint("user_id", "suggestion_id")

    op.create_table(
        'link_user_name_suggestions',
        user_id_column(),
        suggestion_id_column,
        created_at_column(),
        composite_key,
    )

src/core/tasks/url/loader.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from src.core.tasks.url.models.entry import URLTaskEntry
99
from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
1010
from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
11+
from src.core.tasks.url.operators.auto_name.core import AutoNameURLTaskOperator
1112
from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator
1213
from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator
1314
from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
@@ -213,6 +214,18 @@ def _get_auto_validate_task_operator(self) -> URLTaskEntry:
213214
)
214215
)
215216

217+
def _get_auto_name_task_operator(self) -> URLTaskEntry:
    """Build the task entry for the Auto Name URL task.

    The entry is enabled according to the ``URL_AUTO_NAME_TASK_FLAG``
    environment flag, which defaults to True when unset.
    """
    auto_name_operator = AutoNameURLTaskOperator(adb_client=self.adb_client)
    is_enabled = self.env.bool("URL_AUTO_NAME_TASK_FLAG", default=True)
    return URLTaskEntry(operator=auto_name_operator, enabled=is_enabled)
228+
216229

217230
async def load_entries(self) -> list[URLTaskEntry]:
218231
return [
@@ -227,5 +240,6 @@ async def load_entries(self) -> list[URLTaskEntry]:
227240
self._get_url_auto_relevance_task_operator(),
228241
self._get_url_screenshot_task_operator(),
229242
self._get_location_id_task_operator(),
230-
self._get_auto_validate_task_operator()
243+
self._get_auto_validate_task_operator(),
244+
self._get_auto_name_task_operator(),
231245
]

src/core/tasks/url/operators/auto_name/__init__.py

Whitespace-only changes.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH
2+
3+
4+
def clean_title(title: str) -> str:
5+
if len(title) > MAX_SUGGESTION_LENGTH:
6+
return title[:MAX_SUGGESTION_LENGTH-3] + "..."
7+
return title
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from src.core.tasks.url.operators.auto_name.clean import clean_title
2+
from src.core.tasks.url.operators.auto_name.input import AutoNamePrerequisitesInput
3+
from src.core.tasks.url.operators.auto_name.queries.get import AutoNameGetInputsQueryBuilder
4+
from src.core.tasks.url.operators.auto_name.queries.prereq import AutoNamePrerequisitesQueryBuilder
5+
from src.core.tasks.url.operators.base import URLTaskOperatorBase
6+
from src.db.enums import TaskType
7+
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
8+
from src.db.models.impl.url.suggestion.name.pydantic import URLNameSuggestionPydantic
9+
10+
11+
class AutoNameURLTaskOperator(URLTaskOperatorBase):
    """URL task operator that suggests names for URLs from their HTML titles."""

    @property
    def task_type(self) -> TaskType:
        """Return the task type of this operator (``TaskType.AUTO_NAME``)."""
        return TaskType.AUTO_NAME

    async def meets_task_prerequisites(self) -> bool:
        """Return the result of ``AutoNamePrerequisitesQueryBuilder``."""
        return await self.adb_client.run_query_builder(
            AutoNamePrerequisitesQueryBuilder()
        )

    async def inner_task_logic(self) -> None:
        """Create a name suggestion for every URL returned by the input query.

        Fetches the inputs, links their URLs to this task, then bulk-inserts
        one ``HTML_METADATA_TITLE`` suggestion per input, using the cleaned
        title as the suggestion text.
        """
        # Get URLs with HTML metadata title
        inputs: list[AutoNamePrerequisitesInput] = await self.adb_client.run_query_builder(
            AutoNameGetInputsQueryBuilder()
        )

        # Link URLs to task
        url_ids: list[int] = [input_.url_id for input_ in inputs]
        await self.link_urls_to_task(url_ids)

        # Add suggestions
        # NOTE(review): the prerequisite query only excludes URLs that already
        # have an HTML-title-sourced suggestion; if another source already
        # stored the same text for a URL, a uniqueness constraint on
        # (url_id, suggestion) could reject this insert -- confirm.
        suggestions: list[URLNameSuggestionPydantic] = [
            URLNameSuggestionPydantic(
                url_id=input_.url_id,
                suggestion=clean_title(input_.title),
                source=NameSuggestionSource.HTML_METADATA_TITLE,
            )
            for input_ in inputs
        ]

        await self.adb_client.bulk_insert(models=suggestions)
44+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from pydantic import BaseModel
2+
3+
4+
class AutoNamePrerequisitesInput(BaseModel):
    """One input row for the Auto Name task: a URL id and its title text."""

    # Identifier of the URL the title belongs to.
    url_id: int
    # Title text used as the basis for the name suggestion.
    title: str

src/core/tasks/url/operators/auto_name/queries/__init__.py

Whitespace-only changes.
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from sqlalchemy import select, exists, CTE, Column
2+
3+
from src.db.enums import URLHTMLContentType
4+
from src.db.models.impl.url.core.sqlalchemy import URL
5+
from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
6+
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
7+
from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion
8+
9+
10+
class AutoNamePrerequisiteCTEContainer:
    """Wraps the ``auto_name_prerequisites`` CTE and exposes its columns.

    The CTE selects each URL id with its TITLE-typed HTML content, limited to
    URLs that have no name suggestion sourced from the HTML metadata title.
    """

    def __init__(self):
        # Correlated subquery: suggestions for this URL that came from the
        # HTML metadata title.
        existing_title_suggestion = (
            select(URLNameSuggestion.id)
            .where(
                URLNameSuggestion.url_id == URL.id,
                URLNameSuggestion.source == NameSuggestionSource.HTML_METADATA_TITLE.value,
            )
        )

        titled_urls = (
            select(
                URL.id.label("url_id"),
                URLHTMLContent.content,
            )
            .join(URLHTMLContent, URLHTMLContent.url_id == URL.id)
            .where(
                URLHTMLContent.content_type == URLHTMLContentType.TITLE.value,
                ~exists(existing_title_suggestion),
            )
        )

        self._query = titled_urls.cte("auto_name_prerequisites")

    @property
    def cte(self) -> CTE:
        """The underlying CTE object."""
        return self._query

    @property
    def url_id(self) -> Column[int]:
        """The CTE's ``url_id`` column."""
        return self.cte.c.url_id

    @property
    def content(self) -> Column[str]:
        """The CTE's ``content`` column (the title text)."""
        return self.cte.c.content
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from typing import Sequence
2+
3+
from sqlalchemy import select, RowMapping
4+
from sqlalchemy.ext.asyncio import AsyncSession
5+
6+
from src.core.tasks.url.operators.auto_name.input import AutoNamePrerequisitesInput
7+
from src.core.tasks.url.operators.auto_name.queries.cte import AutoNamePrerequisiteCTEContainer
8+
from src.db.queries.base.builder import QueryBuilderBase
9+
10+
from src.db.helpers.session import session_helper as sh
11+
12+
class AutoNameGetInputsQueryBuilder(QueryBuilderBase):
    """Query builder returning the inputs for the Auto Name task."""

    async def run(self, session: AsyncSession) -> list[AutoNamePrerequisitesInput]:
        """Select ``url_id`` and ``content`` from the prerequisites CTE.

        Args:
            session: The async session the query is executed on.

        Returns:
            One ``AutoNamePrerequisitesInput`` per row, with the row's
            ``content`` value used as the title.
        """
        prerequisites = AutoNamePrerequisiteCTEContainer()
        statement = select(prerequisites.url_id, prerequisites.content)

        rows: Sequence[RowMapping] = await sh.mappings(session=session, query=statement)
        return [
            AutoNamePrerequisitesInput(
                url_id=row["url_id"],
                title=row["content"],
            )
            for row in rows
        ]

0 commit comments

Comments
 (0)