Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag
| `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. |
| `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. |
| `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. |
| `URL_AUTO_NAME_TASK_FLAG` | Automatically names URLs. |

### Agency ID Subtasks

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Add URL naming logic

Revision ID: 3687026267fc
Revises: e6a1a1b3bad4
Create Date: 2025-09-24 17:39:55.353947

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import id_column, url_id_column, created_at_column, user_id_column

# revision identifiers, used by Alembic.
revision: str = '3687026267fc'
down_revision: Union[str, None] = 'e6a1a1b3bad4'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None



def upgrade() -> None:
    """Apply this revision.

    Runs the three migration steps in order: add the ``'Auto Name'`` task
    type value, create ``url_name_suggestions``, then create
    ``link_user_name_suggestions`` (which has a foreign key to the former).
    """
    migration_steps = (
        _add_auto_name_task,
        _create_url_name_suggestion_table,
        _create_link_user_name_suggestion_table,
    )
    for run_step in migration_steps:
        run_step()

def _add_auto_name_task():
    """Add the ``'Auto Name'`` value to the ``task_type`` enum type."""
    statement = """ALTER TYPE task_type ADD VALUE 'Auto Name';"""
    op.execute(statement)


def _create_url_name_suggestion_table():
    """Create the ``url_name_suggestions`` table.

    Columns: the shared id, url_id and created_at helper columns, a required
    ``suggestion`` string of at most 100 characters, and a ``source`` enum
    (``suggestion_source_enum``) with the values ``"HTML Metadata Title"``
    and ``"User"``.
    """
    source_enum = sa.Enum(
        "HTML Metadata Title",
        "User",
        name="suggestion_source_enum",
    )
    suggestion_column = sa.Column(
        'suggestion',
        sa.String(length=100),
        nullable=False,
    )
    # NOTE(review): no nullable=False is passed here, so the column falls back
    # to SQLAlchemy's default nullability -- confirm that is intended.
    source_column = sa.Column('source', source_enum)
    # NOTE(review): the constraint name mentions "source", but the constrained
    # columns are url_id and suggestion.
    unique_suggestion_per_url = sa.UniqueConstraint(
        'url_id',
        'suggestion',
        name='url_name_suggestions_url_id_source_unique',
    )

    op.create_table(
        'url_name_suggestions',
        id_column(),
        url_id_column(),
        suggestion_column,
        source_column,
        created_at_column(),
        unique_suggestion_per_url,
    )


def _create_link_user_name_suggestion_table():
    """Create the ``link_user_name_suggestions`` link table.

    Each row pairs a user id with a row of ``url_name_suggestions``; the
    pair (``user_id``, ``suggestion_id``) forms the composite primary key.
    """
    suggestion_fk_column = sa.Column(
        "suggestion_id",
        sa.Integer(),
        sa.ForeignKey("url_name_suggestions.id"),
        nullable=False,
    )
    composite_key = sa.PrimaryKeyConstraint("user_id", "suggestion_id")

    op.create_table(
        'link_user_name_suggestions',
        user_id_column(),
        suggestion_fk_column,
        created_at_column(),
        composite_key,
    )
16 changes: 15 additions & 1 deletion src/core/tasks/url/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from src.core.tasks.url.models.entry import URLTaskEntry
from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator
from src.core.tasks.url.operators.agency_identification.subtasks.loader import AgencyIdentificationSubtaskLoader
from src.core.tasks.url.operators.auto_name.core import AutoNameURLTaskOperator
from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator
from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator
from src.core.tasks.url.operators.html.scraper.parser.core import HTMLResponseParser
Expand Down Expand Up @@ -213,6 +214,18 @@ def _get_auto_validate_task_operator(self) -> URLTaskEntry:
)
)

def _get_auto_name_task_operator(self) -> URLTaskEntry:
    """Build the task entry for the auto-name URL task.

    The operator is constructed with this loader's ``adb_client``. The entry
    is enabled according to the ``URL_AUTO_NAME_TASK_FLAG`` environment
    flag, which is read with a default of ``True``.
    """
    auto_name_operator = AutoNameURLTaskOperator(adb_client=self.adb_client)
    is_enabled = self.env.bool("URL_AUTO_NAME_TASK_FLAG", default=True)
    return URLTaskEntry(operator=auto_name_operator, enabled=is_enabled)


async def load_entries(self) -> list[URLTaskEntry]:
return [
Expand All @@ -227,5 +240,6 @@ async def load_entries(self) -> list[URLTaskEntry]:
self._get_url_auto_relevance_task_operator(),
self._get_url_screenshot_task_operator(),
self._get_location_id_task_operator(),
self._get_auto_validate_task_operator()
self._get_auto_validate_task_operator(),
self._get_auto_name_task_operator(),
]
Empty file.
7 changes: 7 additions & 0 deletions src/core/tasks/url/operators/auto_name/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH


def clean_title(title: str) -> str:
if len(title) > MAX_SUGGESTION_LENGTH:
return title[:MAX_SUGGESTION_LENGTH-3] + "..."
return title
44 changes: 44 additions & 0 deletions src/core/tasks/url/operators/auto_name/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from src.core.tasks.url.operators.auto_name.clean import clean_title
from src.core.tasks.url.operators.auto_name.input import AutoNamePrerequisitesInput
from src.core.tasks.url.operators.auto_name.queries.get import AutoNameGetInputsQueryBuilder
from src.core.tasks.url.operators.auto_name.queries.prereq import AutoNamePrerequisitesQueryBuilder
from src.core.tasks.url.operators.base import URLTaskOperatorBase
from src.db.enums import TaskType
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
from src.db.models.impl.url.suggestion.name.pydantic import URLNameSuggestionPydantic


class AutoNameURLTaskOperator(URLTaskOperatorBase):
    """URL task operator that stores name suggestions built from HTML titles.

    For every input returned by ``AutoNameGetInputsQueryBuilder`` it links
    the URL to the task and bulk-inserts one name suggestion whose source is
    ``NameSuggestionSource.HTML_METADATA_TITLE``.
    """

    @property
    def task_type(self) -> TaskType:
        """The task type this operator reports: ``TaskType.AUTO_NAME``."""
        return TaskType.AUTO_NAME

    async def meets_task_prerequisites(self) -> bool:
        """Return the result of running the prerequisites query builder."""
        prerequisites_query = AutoNamePrerequisitesQueryBuilder()
        return await self.adb_client.run_query_builder(prerequisites_query)

    async def inner_task_logic(self) -> None:
        """Link the eligible URLs to the task and insert their suggestions."""
        # Fetch the (url_id, title) pairs to process.
        inputs_query = AutoNameGetInputsQueryBuilder()
        title_rows: list[AutoNamePrerequisitesInput] = (
            await self.adb_client.run_query_builder(inputs_query)
        )

        # URLs are linked to the task before any suggestion is built.
        await self.link_urls_to_task([row.url_id for row in title_rows])

        # One suggestion per input; titles pass through clean_title first.
        name_suggestions: list[URLNameSuggestionPydantic] = []
        for row in title_rows:
            name_suggestions.append(
                URLNameSuggestionPydantic(
                    url_id=row.url_id,
                    suggestion=clean_title(row.title),
                    source=NameSuggestionSource.HTML_METADATA_TITLE,
                )
            )

        await self.adb_client.bulk_insert(models=name_suggestions)

6 changes: 6 additions & 0 deletions src/core/tasks/url/operators/auto_name/input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from pydantic import BaseModel


class AutoNamePrerequisitesInput(BaseModel):
    """One input row for the auto-name task: a URL id and its title text."""

    # Identifier of the URL the title belongs to.
    url_id: int
    # Title text that the name suggestion is derived from.
    title: str
Empty file.
46 changes: 46 additions & 0 deletions src/core/tasks/url/operators/auto_name/queries/cte.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from sqlalchemy import select, exists, CTE, Column

from src.db.enums import URLHTMLContentType
from src.db.models.impl.url.core.sqlalchemy import URL
from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion


class AutoNamePrerequisiteCTEContainer:
    """Wraps the ``auto_name_prerequisites`` CTE and exposes its columns.

    The CTE selects ``url_id`` and ``content`` for URLs that have an HTML
    content row of type TITLE and no existing name suggestion whose source is
    ``HTML_METADATA_TITLE``.
    """

    def __init__(self):
        # True when the URL already has a title-sourced name suggestion.
        title_suggestion_exists = exists(
            select(URLNameSuggestion.id).where(
                URLNameSuggestion.url_id == URL.id,
                URLNameSuggestion.source == NameSuggestionSource.HTML_METADATA_TITLE.value,
            )
        )

        titled_urls = (
            select(URL.id.label("url_id"), URLHTMLContent.content)
            .join(URLHTMLContent, URLHTMLContent.url_id == URL.id)
            .where(
                URLHTMLContent.content_type == URLHTMLContentType.TITLE.value,
                ~title_suggestion_exists,
            )
        )

        self._query = titled_urls.cte("auto_name_prerequisites")

    @property
    def cte(self) -> CTE:
        """The underlying CTE object."""
        return self._query

    @property
    def url_id(self) -> Column[int]:
        """The CTE's ``url_id`` column."""
        columns = self.cte.c
        return columns.url_id

    @property
    def content(self) -> Column[str]:
        """The CTE's ``content`` column (the title text)."""
        columns = self.cte.c
        return columns.content
27 changes: 27 additions & 0 deletions src/core/tasks/url/operators/auto_name/queries/get.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Sequence

from sqlalchemy import select, RowMapping
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.tasks.url.operators.auto_name.input import AutoNamePrerequisitesInput
from src.core.tasks.url.operators.auto_name.queries.cte import AutoNamePrerequisiteCTEContainer
from src.db.queries.base.builder import QueryBuilderBase

from src.db.helpers.session import session_helper as sh

class AutoNameGetInputsQueryBuilder(QueryBuilderBase):
    """Query builder returning every row of the auto-name prerequisites CTE."""

    async def run(self, session: AsyncSession) -> list[AutoNamePrerequisitesInput]:
        """Select ``url_id`` and ``content`` from the CTE and wrap each row.

        The ``content`` column is returned as the ``title`` field of each
        ``AutoNamePrerequisitesInput``.
        """
        prerequisites = AutoNamePrerequisiteCTEContainer()
        statement = select(prerequisites.url_id, prerequisites.content)

        rows: Sequence[RowMapping] = await sh.mappings(session=session, query=statement)
        return [
            AutoNamePrerequisitesInput(
                url_id=row["url_id"],
                title=row["content"],
            )
            for row in rows
        ]
16 changes: 16 additions & 0 deletions src/core/tasks/url/operators/auto_name/queries/prereq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from src.core.tasks.url.operators.auto_name.queries.cte import AutoNamePrerequisiteCTEContainer
from src.db.helpers.session import session_helper as sh
from src.db.queries.base.builder import QueryBuilderBase


class AutoNamePrerequisitesQueryBuilder(QueryBuilderBase):
    """Query builder reporting whether the auto-name task has any work."""

    async def run(self, session: AsyncSession) -> bool:
        """Return whether the prerequisites CTE yields any ``url_id`` row."""
        prerequisites = AutoNamePrerequisiteCTEContainer()
        statement = select(prerequisites.url_id)
        has_rows = await sh.results_exist(session, query=statement)
        return has_rows


1 change: 1 addition & 0 deletions src/db/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class TaskType(PyEnum):
SCREENSHOT = "Screenshot"
LOCATION_ID = "Location ID"
AUTO_VALIDATE = "Auto Validate"
AUTO_NAME = "Auto Name"

# Scheduled Tasks
PUSH_TO_HUGGINGFACE = "Push to Hugging Face"
Expand Down
Empty file.
12 changes: 12 additions & 0 deletions src/db/models/impl/link/user_name_suggestion/pydantic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion
from src.db.templates.markers.bulk.insert import BulkInsertableModel


class LinkUserNameSuggestionPydantic(BulkInsertableModel):
    """Bulk-insertable model for rows of ``LinkUserNameSuggestion``."""

    # Id of the linked name suggestion.
    suggestion_id: int
    # Id of the user linked to that suggestion.
    user_id: int

    @classmethod
    def sa_model(cls) -> type[LinkUserNameSuggestion]:
        """Return the SQLAlchemy model class that this model maps to."""
        return LinkUserNameSuggestion
25 changes: 25 additions & 0 deletions src/db/models/impl/link/user_name_suggestion/sqlalchemy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from sqlalchemy import Column, Integer, ForeignKey

from src.db.models.mixins import CreatedAtMixin
from src.db.models.templates_.base import Base


class LinkUserNameSuggestion(Base, CreatedAtMixin):
    """ORM model for the ``link_user_name_suggestions`` link table.

    The primary key is the pair (``suggestion_id``, ``user_id``).
    """

    __tablename__ = "link_user_name_suggestions"

    # References url_name_suggestions.id; part of the composite primary key.
    suggestion_id = Column(
        Integer,
        ForeignKey("url_name_suggestions.id"),
        primary_key=True,
        nullable=False,
    )

    # Plain integer user id; the other half of the composite primary key.
    user_id = Column(Integer, primary_key=True, nullable=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@


# Maximum number of characters allowed in a stored suggestion string.
MAX_SUGGESTION_LENGTH: int = 100
Empty file.
6 changes: 6 additions & 0 deletions src/db/models/impl/url/suggestion/name/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum


class NameSuggestionSource(Enum):
    """Where a URL name suggestion came from."""

    # Derived from the page's HTML metadata title.
    HTML_METADATA_TITLE = "HTML Metadata Title"
    # Submitted by a user.
    USER = "User"
17 changes: 17 additions & 0 deletions src/db/models/impl/url/suggestion/name/pydantic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from pydantic import Field

from src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion
from src.db.templates.markers.bulk.insert import BulkInsertableModel


class URLNameSuggestionPydantic(BulkInsertableModel):
    """Bulk-insertable model for rows of ``URLNameSuggestion``."""

    # Id of the URL the suggestion is for.
    url_id: int
    # Suggested name; validation rejects more than MAX_SUGGESTION_LENGTH chars.
    suggestion: str = Field(..., max_length=MAX_SUGGESTION_LENGTH)
    # Origin of the suggestion.
    source: NameSuggestionSource

    @classmethod
    def sa_model(cls) -> type[URLNameSuggestion]:
        """Return the SQLAlchemy model class that this model maps to."""
        return URLNameSuggestion
22 changes: 22 additions & 0 deletions src/db/models/impl/url/suggestion/name/sqlalchemy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from sqlalchemy import Column, String

from src.db.models.helpers import enum_column
from src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH
from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource
from src.db.models.mixins import URLDependentMixin, CreatedAtMixin
from src.db.models.templates_.with_id import WithIDBase


class URLNameSuggestion(WithIDBase, CreatedAtMixin, URLDependentMixin):
    """ORM model for the ``url_name_suggestions`` table."""

    __tablename__ = "url_name_suggestions"

    # Suggested name text, capped at MAX_SUGGESTION_LENGTH characters.
    suggestion = Column(String(MAX_SUGGESTION_LENGTH), nullable=False)

    # Origin of the suggestion, stored with the "suggestion_source_enum" type.
    source = enum_column(NameSuggestionSource, name="suggestion_source_enum")
14 changes: 4 additions & 10 deletions tests/alembic/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,11 @@ def alembic_runner(connection, alembic_config) -> Generator[AlembicRunner, Any,
connection=connection,
session=scoped_session(sessionmaker(bind=connection)),
)
try:
runner.downgrade("base")
except Exception as e:
runner.reset_schema()
runner.stamp("base")
runner.reset_schema()
runner.stamp("base")
print("Running test")
yield runner
print("Test complete")
runner.session.close()
try:
runner.downgrade("base")
except Exception as e:
runner.reset_schema()
runner.stamp("base")
runner.reset_schema()
runner.stamp("base")
Loading
Loading