diff --git a/ENV.md b/ENV.md index a6208f8b..525fb3f4 100644 --- a/ENV.md +++ b/ENV.md @@ -85,6 +85,7 @@ URL Task Flags are collectively controlled by the `RUN_URL_TASKS_TASK_FLAG` flag | `URL_ROOT_URL_TASK_FLAG` | Extracts and links Root URLs to URLs. | | `URL_SCREENSHOT_TASK_FLAG` | Takes screenshots of URLs. | | `URL_AUTO_VALIDATE_TASK_FLAG` | Automatically validates URLs. | +| `URL_AUTO_NAME_TASK_FLAG` | Automatically names URLs. | ### Agency ID Subtasks diff --git a/alembic/versions/2025_09_24_1739-3687026267fc_add_url_naming_logic.py b/alembic/versions/2025_09_24_1739-3687026267fc_add_url_naming_logic.py new file mode 100644 index 00000000..9e6a3821 --- /dev/null +++ b/alembic/versions/2025_09_24_1739-3687026267fc_add_url_naming_logic.py @@ -0,0 +1,69 @@ +"""Add URL naming logic + +Revision ID: 3687026267fc +Revises: e6a1a1b3bad4 +Create Date: 2025-09-24 17:39:55.353947 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from src.util.alembic_helpers import id_column, url_id_column, created_at_column, user_id_column + +# revision identifiers, used by Alembic. 
revision: str = '3687026267fc'
down_revision: Union[str, None] = 'e6a1a1b3bad4'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Object names shared by upgrade() and downgrade() so the two cannot drift apart.
_URL_NAME_SUGGESTIONS_TABLE = 'url_name_suggestions'
_LINK_USER_NAME_SUGGESTIONS_TABLE = 'link_user_name_suggestions'
_SUGGESTION_SOURCE_ENUM = 'suggestion_source_enum'


def upgrade() -> None:
    """Register the 'Auto Name' task type and create the name-suggestion tables."""
    _add_auto_name_task()
    _create_url_name_suggestion_table()
    _create_link_user_name_suggestion_table()


def downgrade() -> None:
    """Drop the tables and the enum type created by ``upgrade()``.

    The 'Auto Name' value stays in ``task_type``: PostgreSQL offers no
    ``ALTER TYPE ... DROP VALUE``. ``upgrade()`` adds the value with
    ``IF NOT EXISTS`` so a later re-upgrade still succeeds.
    """
    # The link table holds a foreign key to url_name_suggestions.id, so it goes first.
    op.drop_table(_LINK_USER_NAME_SUGGESTIONS_TABLE)
    op.drop_table(_URL_NAME_SUGGESTIONS_TABLE)
    # Dropping the table does not drop the enum type that was created for its column.
    op.execute(f"DROP TYPE IF EXISTS {_SUGGESTION_SOURCE_ENUM};")


def _add_auto_name_task():
    """Add the 'Auto Name' value to the ``task_type`` enum (no-op if present)."""
    op.execute("""ALTER TYPE task_type ADD VALUE IF NOT EXISTS 'Auto Name';""")


def _create_url_name_suggestion_table():
    """Create ``url_name_suggestions``, holding suggested names for a URL."""
    op.create_table(
        _URL_NAME_SUGGESTIONS_TABLE,
        id_column(),
        url_id_column(),
        sa.Column('suggestion', sa.String(length=100), nullable=False),
        sa.Column(
            'source',
            sa.Enum(
                "HTML Metadata Title",
                "User",
                name=_SUGGESTION_SOURCE_ENUM,
            ),
        ),
        created_at_column(),
        # NOTE(review): the constraint name says "source" but the constrained
        # columns are (url_id, suggestion) -- confirm which one is intended.
        sa.UniqueConstraint(
            'url_id', 'suggestion', name='url_name_suggestions_url_id_source_unique'
        ),
    )


def _create_link_user_name_suggestion_table():
    """Create ``link_user_name_suggestions``, keyed by (user_id, suggestion_id)."""
    op.create_table(
        _LINK_USER_NAME_SUGGESTIONS_TABLE,
        user_id_column(),
        sa.Column(
            "suggestion_id",
            sa.Integer(),
            sa.ForeignKey(f"{_URL_NAME_SUGGESTIONS_TABLE}.id"),
            nullable=False,
        ),
        created_at_column(),
        sa.PrimaryKeyConstraint(
            "user_id",
            "suggestion_id",
        ),
    )
def clean_title(title: str, max_length: "int | None" = None) -> str:
    """Return *title* shortened to at most ``max_length`` characters.

    Titles that already fit are returned unchanged. Longer titles are cut
    and end in ``"..."`` so the result is exactly ``max_length`` characters.

    Args:
        title: The raw title text.
        max_length: Maximum allowed length. ``None`` (the default) uses
            ``MAX_SUGGESTION_LENGTH``, which keeps existing callers unchanged.

    Returns:
        The title, truncated if necessary.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length is None:
        max_length = MAX_SUGGESTION_LENGTH
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    if len(title) <= max_length:
        return title

    ellipsis = "..."
    # With no room for any title text before the marker, hard-truncate instead;
    # a negative slice bound here would produce output longer than the limit.
    if max_length <= len(ellipsis):
        return title[:max_length]
    return title[:max_length - len(ellipsis)] + ellipsis
class AutoNamePrerequisiteCTEContainer:
    """Wraps the CTE that selects URLs eligible for automatic naming.

    A row is selected when the URL has HTML content of type TITLE and no
    name suggestion whose source is the HTML metadata title exists for it.
    """

    def __init__(self):
        # Subquery used in the NOT EXISTS test: title-sourced suggestions for the URL.
        existing_title_suggestion = (
            select(URLNameSuggestion.id)
            .where(
                URLNameSuggestion.url_id == URL.id,
                URLNameSuggestion.source == NameSuggestionSource.HTML_METADATA_TITLE.value,
            )
        )

        # URL ids paired with their stored TITLE content.
        unnamed_titles = (
            select(
                URL.id.label("url_id"),
                URLHTMLContent.content,
            )
            .join(URLHTMLContent, URLHTMLContent.url_id == URL.id)
            .where(
                URLHTMLContent.content_type == URLHTMLContentType.TITLE.value,
                ~exists(existing_title_suggestion),
            )
        )

        self._query = unnamed_titles.cte("auto_name_prerequisites")

    @property
    def cte(self) -> CTE:
        """The underlying CTE object."""
        return self._query

    @property
    def url_id(self) -> Column[int]:
        """The CTE's ``url_id`` column."""
        return self.cte.c.url_id

    @property
    def content(self) -> Column[str]:
        """The CTE's ``content`` column (the stored title text)."""
        return self.cte.c.content
class AutoNameGetInputsQueryBuilder(QueryBuilderBase):
    """Fetches the (url_id, title) pairs selected by the auto-name prerequisite CTE."""

    async def run(self, session: AsyncSession) -> list[AutoNamePrerequisitesInput]:
        """Run the query and return one input model per result row.

        Args:
            session: The async database session to execute against.

        Returns:
            One ``AutoNamePrerequisitesInput`` per row, with ``title`` taken
            from the CTE's ``content`` column.
        """
        cte = AutoNamePrerequisiteCTEContainer()
        query = select(cte.url_id, cte.content)

        mappings: Sequence[RowMapping] = await sh.mappings(session=session, query=query)
        return [
            AutoNamePrerequisitesInput(
                url_id=mapping["url_id"],
                title=mapping["content"],
            )
            for mapping in mappings
        ]
-50,6 +50,7 @@ class TaskType(PyEnum): SCREENSHOT = "Screenshot" LOCATION_ID = "Location ID" AUTO_VALIDATE = "Auto Validate" + AUTO_NAME = "Auto Name" # Scheduled Tasks PUSH_TO_HUGGINGFACE = "Push to Hugging Face" diff --git a/src/db/models/impl/link/user_name_suggestion/__init__.py b/src/db/models/impl/link/user_name_suggestion/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/link/user_name_suggestion/pydantic.py b/src/db/models/impl/link/user_name_suggestion/pydantic.py new file mode 100644 index 00000000..6e07989b --- /dev/null +++ b/src/db/models/impl/link/user_name_suggestion/pydantic.py @@ -0,0 +1,12 @@ +from src.db.models.impl.link.user_name_suggestion.sqlalchemy import LinkUserNameSuggestion +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class LinkUserNameSuggestionPydantic(BulkInsertableModel): + + suggestion_id: int + user_id: int + + @classmethod + def sa_model(cls) -> type[LinkUserNameSuggestion]: + return LinkUserNameSuggestion \ No newline at end of file diff --git a/src/db/models/impl/link/user_name_suggestion/sqlalchemy.py b/src/db/models/impl/link/user_name_suggestion/sqlalchemy.py new file mode 100644 index 00000000..316a8e3c --- /dev/null +++ b/src/db/models/impl/link/user_name_suggestion/sqlalchemy.py @@ -0,0 +1,25 @@ +from sqlalchemy import Column, Integer, ForeignKey + +from src.db.models.mixins import CreatedAtMixin +from src.db.models.templates_.base import Base + + +class LinkUserNameSuggestion( + Base, + CreatedAtMixin, +): + + __tablename__ = "link_user_name_suggestions" + + suggestion_id = Column( + Integer, + ForeignKey("url_name_suggestions.id"), + primary_key=True, + nullable=False, + ) + + user_id = Column( + Integer, + primary_key=True, + nullable=False, + ) \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/location/auto/subtask/constants.py b/src/db/models/impl/url/suggestion/location/auto/subtask/constants.py new file mode 100644 index 
00000000..d6b887c7 --- /dev/null +++ b/src/db/models/impl/url/suggestion/location/auto/subtask/constants.py @@ -0,0 +1,3 @@ + + +MAX_SUGGESTION_LENGTH: int = 100 \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/name/__init__.py b/src/db/models/impl/url/suggestion/name/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/db/models/impl/url/suggestion/name/enums.py b/src/db/models/impl/url/suggestion/name/enums.py new file mode 100644 index 00000000..89b570e6 --- /dev/null +++ b/src/db/models/impl/url/suggestion/name/enums.py @@ -0,0 +1,6 @@ +from enum import Enum + + +class NameSuggestionSource(Enum): + HTML_METADATA_TITLE = "HTML Metadata Title" + USER = "User" \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/name/pydantic.py b/src/db/models/impl/url/suggestion/name/pydantic.py new file mode 100644 index 00000000..244e02c2 --- /dev/null +++ b/src/db/models/impl/url/suggestion/name/pydantic.py @@ -0,0 +1,17 @@ +from pydantic import Field + +from src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from src.db.templates.markers.bulk.insert import BulkInsertableModel + + +class URLNameSuggestionPydantic(BulkInsertableModel): + + url_id: int + suggestion: str = Field(..., max_length=MAX_SUGGESTION_LENGTH) + source: NameSuggestionSource + + @classmethod + def sa_model(cls) -> type[URLNameSuggestion]: + return URLNameSuggestion \ No newline at end of file diff --git a/src/db/models/impl/url/suggestion/name/sqlalchemy.py b/src/db/models/impl/url/suggestion/name/sqlalchemy.py new file mode 100644 index 00000000..d06d7305 --- /dev/null +++ b/src/db/models/impl/url/suggestion/name/sqlalchemy.py @@ -0,0 +1,22 @@ +from sqlalchemy import Column, String + +from src.db.models.helpers import enum_column +from 
src.db.models.impl.url.suggestion.location.auto.subtask.constants import MAX_SUGGESTION_LENGTH +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin +from src.db.models.templates_.with_id import WithIDBase + + +class URLNameSuggestion( + WithIDBase, + CreatedAtMixin, + URLDependentMixin +): + + __tablename__ = "url_name_suggestions" + + suggestion = Column(String(MAX_SUGGESTION_LENGTH), nullable=False) + source = enum_column( + NameSuggestionSource, + name="suggestion_source_enum" + ) \ No newline at end of file diff --git a/tests/alembic/conftest.py b/tests/alembic/conftest.py index e8c5dc9f..f041e94a 100644 --- a/tests/alembic/conftest.py +++ b/tests/alembic/conftest.py @@ -43,17 +43,11 @@ def alembic_runner(connection, alembic_config) -> Generator[AlembicRunner, Any, connection=connection, session=scoped_session(sessionmaker(bind=connection)), ) - try: - runner.downgrade("base") - except Exception as e: - runner.reset_schema() - runner.stamp("base") + runner.reset_schema() + runner.stamp("base") print("Running test") yield runner print("Test complete") runner.session.close() - try: - runner.downgrade("base") - except Exception as e: - runner.reset_schema() - runner.stamp("base") + runner.reset_schema() + runner.stamp("base") diff --git a/tests/alembic/test_revisions.py b/tests/alembic/test_revisions.py index 19b5d046..94fa6c5e 100644 --- a/tests/alembic/test_revisions.py +++ b/tests/alembic/test_revisions.py @@ -6,4 +6,3 @@ def test_full_upgrade_downgrade(alembic_runner): # Both should run without error alembic_runner.upgrade("head") - alembic_runner.downgrade("base") \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/auto_name/__init__.py b/tests/automated/integration/tasks/url/impl/auto_name/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/automated/integration/tasks/url/impl/auto_name/conftest.py 
b/tests/automated/integration/tasks/url/impl/auto_name/conftest.py new file mode 100644 index 00000000..7dcb6683 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/auto_name/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from src.core.tasks.url.operators.auto_name.core import AutoNameURLTaskOperator +from src.db.client.async_ import AsyncDatabaseClient + + +@pytest.fixture +def operator( + adb_client_test: AsyncDatabaseClient +) -> AutoNameURLTaskOperator: + operator = AutoNameURLTaskOperator( + adb_client=adb_client_test, + ) + return operator \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/impl/auto_name/test_core.py b/tests/automated/integration/tasks/url/impl/auto_name/test_core.py new file mode 100644 index 00000000..c0500d99 --- /dev/null +++ b/tests/automated/integration/tasks/url/impl/auto_name/test_core.py @@ -0,0 +1,39 @@ +import pytest + +from src.core.tasks.url.operators.auto_name.core import AutoNameURLTaskOperator +from src.db.models.impl.url.suggestion.name.enums import NameSuggestionSource +from src.db.models.impl.url.suggestion.name.sqlalchemy import URLNameSuggestion +from tests.helpers.data_creator.core import DBDataCreator +from tests.helpers.run import run_task_and_confirm_success + + +@pytest.mark.asyncio +async def test_core( + operator: AutoNameURLTaskOperator, + db_data_creator: DBDataCreator +): + + assert not await operator.meets_task_prerequisites() + + # Create URL + url_id: int = (await db_data_creator.create_urls(count=1))[0].url_id + + assert not await operator.meets_task_prerequisites() + + # Add HTML content + + await db_data_creator.html_data(url_ids=[url_id]) + + assert await operator.meets_task_prerequisites() + + await run_task_and_confirm_success(operator) + + assert not await operator.meets_task_prerequisites() + + # Confirm suggestion was added + suggestions: list[URLNameSuggestion] = await db_data_creator.adb_client.get_all(URLNameSuggestion) + assert len(suggestions) == 1 + 
suggestion: URLNameSuggestion = suggestions[0] + assert suggestion.url_id == url_id + assert suggestion.suggestion == "test html content" + assert suggestion.source == NameSuggestionSource.HTML_METADATA_TITLE \ No newline at end of file diff --git a/tests/automated/integration/tasks/url/loader/test_flags.py b/tests/automated/integration/tasks/url/loader/test_flags.py index 43164d9e..777038b1 100644 --- a/tests/automated/integration/tasks/url/loader/test_flags.py +++ b/tests/automated/integration/tasks/url/loader/test_flags.py @@ -4,6 +4,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader from src.core.tasks.url.models.entry import URLTaskEntry from src.core.tasks.url.operators.agency_identification.core import AgencyIdentificationTaskOperator +from src.core.tasks.url.operators.auto_name.core import AutoNameURLTaskOperator from src.core.tasks.url.operators.auto_relevant.core import URLAutoRelevantTaskOperator from src.core.tasks.url.operators.base import URLTaskOperatorBase from src.core.tasks.url.operators.html.core import URLHTMLTaskOperator @@ -59,6 +60,10 @@ class Config: FlagTestParams( env_var="URL_ROOT_URL_TASK_FLAG", operator=URLRootURLTaskOperator + ), + FlagTestParams( + env_var="URL_AUTO_NAME_TASK_FLAG", + operator=AutoNameURLTaskOperator ) ] diff --git a/tests/automated/integration/tasks/url/loader/test_happy_path.py b/tests/automated/integration/tasks/url/loader/test_happy_path.py index 7ba76a79..61dbb8c1 100644 --- a/tests/automated/integration/tasks/url/loader/test_happy_path.py +++ b/tests/automated/integration/tasks/url/loader/test_happy_path.py @@ -2,7 +2,7 @@ from src.core.tasks.url.loader import URLTaskOperatorLoader -NUMBER_OF_TASK_OPERATORS = 12 +NUMBER_OF_TASK_OPERATORS: int = 13 @pytest.mark.asyncio async def test_happy_path( diff --git a/tests/conftest.py b/tests/conftest.py index 35a87275..8333529e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -94,16 +94,11 @@ def setup_and_teardown(): yield - try: - 
runner.downgrade("base") - except Exception as e: - print("Exception while downgrading: ", e) - print("Resetting schema") - runner.reset_schema() - runner.stamp("base") - finally: - live_connection.close() - engine.dispose() + + runner.reset_schema() + runner.stamp("base") + live_connection.close() + engine.dispose() @pytest.fixture def wiped_database(): diff --git a/tests/helpers/alembic_runner.py b/tests/helpers/alembic_runner.py index 53458109..dd1807ba 100644 --- a/tests/helpers/alembic_runner.py +++ b/tests/helpers/alembic_runner.py @@ -23,9 +23,6 @@ def upgrade(self, revision: str): command.upgrade(self.alembic_config, revision) self.reflect() - def downgrade(self, revision: str): - command.downgrade(self.alembic_config, revision) - def stamp(self, revision: str): command.stamp(self.alembic_config, revision)