diff --git a/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py new file mode 100644 index 00000000..34ae8506 --- /dev/null +++ b/alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py @@ -0,0 +1,240 @@ +"""Remove ID columns + +Revision ID: 5d6412540aba +Revises: d5f0cc2be6b6 +Create Date: 2025-11-29 07:17:32.794305 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5d6412540aba' +down_revision: Union[str, None] = 'd5f0cc2be6b6' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +TABLES = [ + "task_errors", # + "agency_id_subtask_suggestions", # + "auto_record_type_suggestions", # + "auto_relevant_suggestions", # + "duplicates", # + "flag_url_validated", # + "link_agencies__locations", # + "link_urls_redirect_url", # + "link_urls_root_url", # + "reviewing_user_url", # + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_html_content", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "user_record_type_suggestions", # + "user_url_type_suggestions", # +] + +URL_ONLY_PRIMARY_KEY_TABLES = [ + "url_checked_for_duplicate", # + "url_compressed_html", # + "url_internet_archives_probe_metadata", # + "url_internet_archives_save_metadata", # + "url_optional_data_source_metadata", # + "url_scrape_info", # + "url_web_metadata", # + "auto_relevant_suggestions", # + "auto_record_type_suggestions", # + "flag_url_validated" # +] + + + +USER_URL_ID_PRIMARY_KEY_TABLES = [ + "user_record_type_suggestions", # + "user_url_type_suggestions", # + "reviewing_user_url" # +] + +BESPOKE_UNIQUE_IDS: dict[str, list[str]] = { + "task_errors": ["task_id", "error"], # composite key, same columns as the TaskError model's PrimaryKeyConstraint + "agency_id_subtask_suggestions": 
["agency_id", "subtask_id"], # + "link_agencies__locations": ["agency_id", "location_id"], # + "link_urls_redirect_url": ["source_url_id", "destination_url_id"], # + "link_urls_root_url": ["url_id", "root_url_id"], # + "url_html_content": ["url_id", "content_type"], # + "duplicates": ["batch_id", "original_url_id"] +} + +def drop_views(): + op.execute("drop materialized view if exists url_status_mat_view") + op.execute("drop materialized view if exists batch_url_status_mat_view") + +def recreate_views(): + op.execute(""" + create materialized view url_status_mat_view as + WITH + urls_with_relevant_errors AS ( + SELECT + ute.url_id + FROM + url_task_error ute + WHERE + ute.task_type = ANY (ARRAY ['Screenshot'::task_type, 'HTML'::task_type, 'URL Probe'::task_type]) + ) + , status_text AS ( + SELECT + u.id AS url_id, + CASE + WHEN fuv.type = ANY + (ARRAY ['not relevant'::url_type, 'individual record'::url_type, 'not found'::url_type]) + THEN 'Accepted'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NULL THEN 'Awaiting Submission'::text + WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NOT NULL OR + fuv.type = 'meta url'::url_type AND udmu.url_id IS NOT NULL THEN 'Submitted'::text + WHEN uch.url_id IS NOT NULL AND uwm.url_id IS NOT NULL AND us.url_id IS NOT NULL + THEN 'Community Labeling'::text + WHEN uwre.url_id IS NOT NULL THEN 'Error'::text + ELSE 'Intake'::text + END AS status + FROM + urls u + LEFT JOIN urls_with_relevant_errors uwre + ON u.id = uwre.url_id + LEFT JOIN url_screenshot us + ON u.id = us.url_id + LEFT JOIN url_compressed_html uch + ON u.id = uch.url_id + LEFT JOIN url_web_metadata uwm + ON u.id = uwm.url_id + LEFT JOIN flag_url_validated fuv + ON u.id = fuv.url_id + LEFT JOIN ds_app_link_meta_url udmu + ON u.id = udmu.url_id + LEFT JOIN ds_app_link_data_source uds + ON u.id = uds.url_id + ) + SELECT + status_text.url_id, + status_text.status, + CASE 
status_text.status + WHEN 'Intake'::text THEN 100 + WHEN 'Error'::text THEN 110 + WHEN 'Community Labeling'::text THEN 200 + WHEN 'Accepted'::text THEN 300 + WHEN 'Awaiting Submission'::text THEN 380 + WHEN 'Submitted'::text THEN 390 + ELSE '-1'::integer + END AS code + FROM + status_text; + """) + + op.execute(""" + create materialized view batch_url_status_mat_view as + WITH + batches_with_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + WHERE + lbu.batch_id = b_1.id + )) + ) + , batches_with_only_validated_urls AS ( + SELECT + b_1.id + FROM + batches b_1 + WHERE + (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NOT NULL + )) + AND NOT (EXISTS ( + SELECT + 1 + FROM + link_batches__urls lbu + LEFT JOIN flag_url_validated fuv + ON fuv.url_id = lbu.url_id + WHERE + lbu.batch_id = b_1.id + AND fuv.url_id IS NULL + )) + ) + SELECT + b.id AS batch_id, + CASE + WHEN b.status = 'error'::batch_status THEN 'Error'::text + WHEN bwu.id IS NULL THEN 'No URLs'::text + WHEN bwovu.id IS NOT NULL THEN 'Labeling Complete'::text + ELSE 'Has Unlabeled URLs'::text + END AS batch_url_status + FROM + batches b + LEFT JOIN batches_with_urls bwu + ON bwu.id = b.id + LEFT JOIN batches_with_only_validated_urls bwovu + ON bwovu.id = b.id; + """) + + + +def upgrade() -> None: + drop_views() + + for table in TABLES: + op.drop_column(table, "id") + + # Add new primary keys + for table, columns in BESPOKE_UNIQUE_IDS.items(): + suffix = "_".join(columns) + op.create_primary_key( + f"pk_{table}_{suffix}", + table, + columns + ) + + for table in URL_ONLY_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["url_id"] + ) + + for table in USER_URL_ID_PRIMARY_KEY_TABLES: + op.create_primary_key( + f"pk_{table}", + table, + ["user_id", "url_id"] + ) + + recreate_views() + + + + + +def 
downgrade() -> None: + pass # NOTE(review): no-op downgrade; upgrade() drops the id columns, so the previous schema is not restored here. Confirm this is intentional. diff --git a/src/api/endpoints/annotate/all/get/queries/core.py b/src/api/endpoints/annotate/all/get/queries/core.py index 5b239db0..89975a08 100644 --- a/src/api/endpoints/annotate/all/get/queries/core.py +++ b/src/api/endpoints/annotate/all/get/queries/core.py @@ -61,14 +61,14 @@ async def run( URL.status == URLStatus.OK.value, # Must not have been previously annotated by user ~exists( - select(UserURLTypeSuggestion.id) + select(UserURLTypeSuggestion.url_id) .where( UserURLTypeSuggestion.url_id == URL.id, UserURLTypeSuggestion.user_id == self.user_id, ) ), ~exists( - select(UserURLAgencySuggestion.id) + select(UserURLAgencySuggestion.url_id) .where( UserURLAgencySuggestion.url_id == URL.id, UserURLAgencySuggestion.user_id == self.user_id, diff --git a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py index 17055d1a..7a15b67a 100644 --- a/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py +++ b/src/core/tasks/url/operators/agency_identification/subtasks/queries/survey/queries/ctes/subtask/impl/nlp_location.py @@ -29,7 +29,7 @@ # One of the locations must be linked to an agency exists( select( - LinkAgencyLocation.id + LinkAgencyLocation.location_id ) .join( LocationIDSubtaskSuggestion, diff --git a/src/core/tasks/url/operators/html/content_info_getter.py b/src/core/tasks/url/operators/html/content_info_getter.py index bee7183c..a2d554ff 100644 --- a/src/core/tasks/url/operators/html/content_info_getter.py +++ b/src/core/tasks/url/operators/html/content_info_getter.py @@ -1,6 +1,7 @@ from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import 
HTMLContentType +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent class HTMLContentInfoGetter: @@ -10,7 +11,7 @@ def __init__(self, response_html_info: ResponseHTMLInfo, url_id: int): self.url_id = url_id self.html_content_infos = [] - def get_all_html_content(self) -> list[URLHTMLContentInfo]: + def get_all_html_content(self) -> list[URLHTMLContent]: for content_type in HTMLContentType: self.add_html_content(content_type) return self.html_content_infos @@ -20,9 +21,9 @@ def add_html_content(self, content_type: HTMLContentType): val = getattr(self.response_html_info, lower_str) if val is None or val.strip() == "": return - uhci = URLHTMLContentInfo( + uhc = URLHTMLContent( url_id=self.url_id, - content_type=content_type, + content_type=content_type.value, content=val ) - self.html_content_infos.append(uhci) + self.html_content_infos.append(uhc) diff --git a/src/core/tasks/url/operators/html/queries/insert/convert.py b/src/core/tasks/url/operators/html/queries/insert/convert.py index ca827c7e..e55b9843 100644 --- a/src/core/tasks/url/operators/html/queries/insert/convert.py +++ b/src/core/tasks/url/operators/html/queries/insert/convert.py @@ -6,6 +6,7 @@ from src.db.enums import TaskType from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic @@ -33,8 +34,8 @@ def _convert_to_html_content_info_getter(tdo: UrlHtmlTDO) -> HTMLContentInfoGett url_id=tdo.url_info.id ) -def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]: - html_content_infos = [] +def convert_to_html_content_info_list(tdos: 
list[UrlHtmlTDO]) -> list[URLHTMLContent]: + html_content_infos: list[URLHTMLContent] = [] for tdo in tdos: if tdo.url_response_info.status != HTTPStatus.OK: continue diff --git a/src/core/tasks/url/operators/html/queries/insert/query.py b/src/core/tasks/url/operators/html/queries/insert/query.py index e0bff2e6..86f04e72 100644 --- a/src/core/tasks/url/operators/html/queries/insert/query.py +++ b/src/core/tasks/url/operators/html/queries/insert/query.py @@ -3,6 +3,10 @@ from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \ convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO +from src.db.dtos.url.html_content import URLHTMLContentInfo +from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent +from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from src.db.queries.base.builder import QueryBuilderBase from src.db.helpers.session import session_helper as sh @@ -14,17 +18,20 @@ def __init__(self, tdos: list[UrlHtmlTDO], task_id: int): self.task_id = task_id async def run(self, session: AsyncSession) -> None: - compressed_html_models = convert_to_compressed_html(self.tdos) - url_html_content_list = convert_to_html_content_info_list(self.tdos) - scrape_info_list = convert_to_scrape_infos(self.tdos) + compressed_html_models: list[URLCompressedHTMLPydantic] = convert_to_compressed_html(self.tdos) + url_html_content_list: list[URLHTMLContent] = convert_to_html_content_info_list(self.tdos) + scrape_info_list: list[URLScrapeInfoInsertModel] = convert_to_scrape_infos(self.tdos) url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id) for models in [ compressed_html_models, - url_html_content_list, scrape_info_list, url_errors ]: await sh.bulk_insert(session, models=models) + await 
sh.add_all(session=session, models=url_html_content_list) + + + diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py index 5954c197..087bef65 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/exists.py @@ -26,7 +26,7 @@ async def run(self, session: AsyncSession) -> bool: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ), no_url_task_error(TaskType.PROBE_URL) diff --git a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py index 7011a8de..e8eafd15 100644 --- a/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py +++ b/src/core/tasks/url/operators/probe/queries/urls/not_probed/get/query.py @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[FullURLMapping]: ) .where( or_( - URLWebMetadata.id.is_(None), + URLWebMetadata.url_id.is_(None), URLWebMetadata.updated_at < datetime.now() - timedelta(days=30) ) ) diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 0fb99f76..5ec64ad7 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -324,10 +324,6 @@ async def add_user_record_type_suggestion( # endregion record_type - @session_manager - async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]): - await self._add_models(session, URLHTMLContent, html_content_infos) - @session_manager async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool: statement = self.statement_composer.has_non_errored_urls_without_html_data() diff --git a/src/db/models/impl/duplicate/sqlalchemy.py b/src/db/models/impl/duplicate/sqlalchemy.py index 03c492e3..2b50409d 100644 --- 
a/src/db/models/impl/duplicate/sqlalchemy.py +++ b/src/db/models/impl/duplicate/sqlalchemy.py @@ -1,15 +1,19 @@ -from sqlalchemy import Column, Integer, ForeignKey +from sqlalchemy import Column, Integer, ForeignKey, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import BatchDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class Duplicate(BatchDependentMixin, WithIDBase): +class Duplicate(BatchDependentMixin, Base): """ Identifies duplicates which occur within a batch """ __tablename__ = 'duplicates' + __table_args__ = ( + PrimaryKeyConstraint("batch_id", "original_url_id"), # same columns as the key migration 5d6412540aba creates on duplicates + ) original_url_id = Column( Integer, diff --git a/src/db/models/impl/link/agency_location/sqlalchemy.py b/src/db/models/impl/link/agency_location/sqlalchemy.py index fb7f34da..c4203d44 100644 --- a/src/db/models/impl/link/agency_location/sqlalchemy.py +++ b/src/db/models/impl/link/agency_location/sqlalchemy.py @@ -1,10 +1,15 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.mixins import AgencyDependentMixin, LocationDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkAgencyLocation( - WithIDBase, + Base, AgencyDependentMixin, LocationDependentMixin, ): - __tablename__ = "link_agencies__locations" \ No newline at end of file + __tablename__ = "link_agencies__locations" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "location_id"), + ) \ No newline at end of file diff --git a/src/db/models/impl/link/url_agency/sqlalchemy.py b/src/db/models/impl/link/url_agency/sqlalchemy.py index c4ca6124..7111bc6d 100644 --- a/src/db/models/impl/link/url_agency/sqlalchemy.py +++ b/src/db/models/impl/link/url_agency/sqlalchemy.py @@ -1,19 +1,20 @@ -from sqlalchemy import UniqueConstraint +from sqlalchemy import UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from 
src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class LinkURLAgency(URLDependentMixin, WithIDBase): __tablename__ = "link_agencies__urls" + __table_args__ = ( + UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), # keep the constraint's existing name so model metadata matches the schema + ) agency_id: Mapped[int] = get_agency_id_foreign_column() url = relationship("URL") agency = relationship("Agency") - __table_args__ = ( - UniqueConstraint("url_id", "agency_id", name="uq_confirmed_url_agency"), - ) diff --git a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py index 534c7213..c470e323 100644 --- a/src/db/models/impl/link/url_redirect_url/sqlalchemy.py +++ b/src/db/models/impl/link/url_redirect_url/sqlalchemy.py @@ -1,12 +1,21 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.helpers import url_id_column -from src.db.models.templates_.standard import StandardBase +from src.db.models.mixins import CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base - -class LinkURLRedirectURL(StandardBase): +class LinkURLRedirectURL( + Base, + CreatedAtMixin, + UpdatedAtMixin +): __tablename__ = "link_urls_redirect_url" + __table_args__ = ( + PrimaryKeyConstraint("source_url_id", "destination_url_id"), + ) + source_url_id: Mapped[int] = url_id_column() destination_url_id: Mapped[int] = url_id_column() diff --git a/src/db/models/impl/link/urls_root_url/sqlalchemy.py b/src/db/models/impl/link/urls_root_url/sqlalchemy.py index 8dcd7085..d55a181f 100644 --- a/src/db/models/impl/link/urls_root_url/sqlalchemy.py +++ b/src/db/models/impl/link/urls_root_url/sqlalchemy.py @@ -1,16 +1,20 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.helpers import url_id_column from src.db.models.mixins import URLDependentMixin, 
CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class LinkURLRootURL( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "link_urls_root_url" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "root_url_id"), + ) root_url_id: Mapped[int] = url_id_column() \ No newline at end of file diff --git a/src/db/models/impl/task/error.py b/src/db/models/impl/task/error.py index 2de0c66a..cd04a2ea 100644 --- a/src/db/models/impl/task/error.py +++ b/src/db/models/impl/task/error.py @@ -1,11 +1,12 @@ -from sqlalchemy import Column, Text, UniqueConstraint +from sqlalchemy import Column, Text, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, TaskDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): +class TaskError(UpdatedAtMixin, TaskDependentMixin, Base): __tablename__ = 'task_errors' error = Column(Text, nullable=False) @@ -13,8 +14,8 @@ class TaskError(UpdatedAtMixin, TaskDependentMixin, WithIDBase): # Relationships task = relationship("Task") - __table_args__ = (UniqueConstraint( + __table_args__ = (PrimaryKeyConstraint( "task_id", "error", - name="uq_task_id_error"), + ), ) diff --git a/src/db/models/impl/url/checked_for_duplicate.py b/src/db/models/impl/url/checked_for_duplicate.py index bb7cf666..89192573 100644 --- a/src/db/models/impl/url/checked_for_duplicate.py +++ b/src/db/models/impl/url/checked_for_duplicate.py @@ -1,11 +1,13 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base -class URLCheckedForDuplicate(CreatedAtMixin, 
URLDependentMixin, WithIDBase): +class URLCheckedForDuplicate(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'url_checked_for_duplicate' + __table_args__ = (PrimaryKeyConstraint("url_id"),) # Relationships url = relationship("URL", uselist=False, back_populates="checked_for_duplicate") diff --git a/src/db/models/impl/url/core/sqlalchemy.py b/src/db/models/impl/url/core/sqlalchemy.py index 56681e3d..8ee51a43 100644 --- a/src/db/models/impl/url/core/sqlalchemy.py +++ b/src/db/models/impl/url/core/sqlalchemy.py @@ -75,7 +75,7 @@ def full_url(cls): uselist=False, ) duplicates = relationship("Duplicate", back_populates="original_url") - html_content = relationship("URLHTMLContent", back_populates="url", cascade="all, delete-orphan") + html_content = relationship("URLHTMLContent", cascade="all, delete-orphan") task_errors = relationship( URLTaskError, cascade="all, delete-orphan" diff --git a/src/db/models/impl/url/html/compressed/sqlalchemy.py b/src/db/models/impl/url/html/compressed/sqlalchemy.py index 995c5b25..4974e5f0 100644 --- a/src/db/models/impl/url/html/compressed/sqlalchemy.py +++ b/src/db/models/impl/url/html/compressed/sqlalchemy.py @@ -1,16 +1,20 @@ -from sqlalchemy import Column, LargeBinary +from sqlalchemy import Column, LargeBinary, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLCompressedHTML( CreatedAtMixin, URLDependentMixin, - WithIDBase + Base ): __tablename__ = 'url_compressed_html' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) compressed_html: Mapped[bytes] = Column(LargeBinary, nullable=False) diff --git a/src/db/models/impl/url/html/content/sqlalchemy.py b/src/db/models/impl/url/html/content/sqlalchemy.py index 63e4da76..ded0957b 100644 --- a/src/db/models/impl/url/html/content/sqlalchemy.py +++ 
b/src/db/models/impl/url/html/content/sqlalchemy.py @@ -1,21 +1,20 @@ -from sqlalchemy import UniqueConstraint, Column, Text +from sqlalchemy import UniqueConstraint, Column, Text, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.enums import PGEnum from src.db.models.mixins import UpdatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase class URLHTMLContent( UpdatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = 'url_html_content' - __table_args__ = (UniqueConstraint( - "url_id", - "content_type", - name="uq_url_id_content_type"), + __table_args__ = ( + PrimaryKeyConstraint("url_id", "content_type"), ) content_type = Column( diff --git a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py index 122905a7..ca9d1b0a 100644 --- a/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py +++ b/src/db/models/impl/url/internet_archives/probe/sqlalchemy.py @@ -1,14 +1,18 @@ +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import Mapped from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.templates_.base import Base class URLInternetArchivesProbeMetadata( - StandardBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_probe_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) archive_url: Mapped[str] digest: Mapped[str] diff --git a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py index 791f4077..f0aff36f 100644 --- a/src/db/models/impl/url/internet_archives/save/sqlalchemy.py +++ b/src/db/models/impl/url/internet_archives/save/sqlalchemy.py @@ -1,14 +1,17 @@ -from sqlalchemy import Column, DateTime, func +from sqlalchemy import Column, DateTime, func, PrimaryKeyConstraint from 
src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLInternetArchivesSaveMetadata( - WithIDBase, + Base, URLDependentMixin ): __tablename__ = 'url_internet_archives_save_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) created_at = Column(DateTime, nullable=False, server_default=func.now()) last_uploaded_at = Column(DateTime, nullable=False, server_default=func.now()) diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 32156a38..04541ad6 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -1,19 +1,22 @@ -from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum +from sqlalchemy import Column, ARRAY, String, Date, Boolean, Enum, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from src.db.models.impl.url.optional_ds_metadata.enums import AgencyAggregationEnum, AccessTypeEnum, \ RetentionScheduleEnum, UpdateMethodEnum from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLOptionalDataSourceMetadata( URLDependentMixin, - WithIDBase, + Base, UpdatedAtMixin ): __tablename__ = 'url_optional_data_source_metadata' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) record_formats = Column(ARRAY(String), nullable=False, default=[]) data_portal_type = Column(String, nullable=True) diff --git a/src/db/models/impl/url/reviewing_user.py b/src/db/models/impl/url/reviewing_user.py index 9213a157..379cfee5 100644 --- a/src/db/models/impl/url/reviewing_user.py +++ b/src/db/models/impl/url/reviewing_user.py @@ -1,16 +1,17 @@ -from sqlalchemy import UniqueConstraint, Column, Integer +from 
sqlalchemy import UniqueConstraint, Column, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, WithIDBase): +class ReviewingUserURL(CreatedAtMixin, URLDependentMixin, Base): __tablename__ = 'reviewing_user_url' __table_args__ = ( - UniqueConstraint( - "url_id", - name="approving_user_url_uq_user_id_url_id"), + PrimaryKeyConstraint( + "url_id", # NOTE(review): migration 5d6412540aba lists reviewing_user_url under USER_URL_ID_PRIMARY_KEY_TABLES, i.e. key (user_id, url_id); confirm url_id alone is intended here + ), ) user_id = Column(Integer, nullable=False) diff --git a/src/db/models/impl/url/scrape_info/sqlalchemy.py b/src/db/models/impl/url/scrape_info/sqlalchemy.py index b50f2903..bd59c6ff 100644 --- a/src/db/models/impl/url/scrape_info/sqlalchemy.py +++ b/src/db/models/impl/url/scrape_info/sqlalchemy.py @@ -1,15 +1,22 @@ +from sqlalchemy import PrimaryKeyConstraint + from src.db.models.helpers import enum_column from src.db.models.impl.url.scrape_info.enums import ScrapeStatus -from src.db.models.mixins import URLDependentMixin -from src.db.models.templates_.standard import StandardBase +from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin +from src.db.models.templates_.base import Base class URLScrapeInfo( - StandardBase, + Base, + CreatedAtMixin, + UpdatedAtMixin, URLDependentMixin ): __tablename__ = 'url_scrape_info' + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) status = enum_column( enum_type=ScrapeStatus, diff --git a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py index de6ee029..fea75df8 100644 --- a/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/agency/suggestion/sqlalchemy.py @@ -1,16 +1,20 @@ import sqlalchemy as sa +from sqlalchemy import PrimaryKeyConstraint from sqlalchemy.orm import 
relationship from src.db.models.mixins import CreatedAtMixin, AgencyDependentMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class AgencyIDSubtaskSuggestion( - WithIDBase, + Base, CreatedAtMixin, AgencyDependentMixin, ): __tablename__ = "agency_id_subtask_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "subtask_id"), + ) subtask_id = sa.Column( sa.Integer, diff --git a/src/db/models/impl/url/suggestion/agency/user.py b/src/db/models/impl/url/suggestion/agency/user.py index 2cd18851..79fa933c 100644 --- a/src/db/models/impl/url/suggestion/agency/user.py +++ b/src/db/models/impl/url/suggestion/agency/user.py @@ -1,13 +1,17 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, Integer +from sqlalchemy import Column, Boolean, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import get_agency_id_foreign_column from src.db.models.mixins import URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase -class UserURLAgencySuggestion(URLDependentMixin, WithIDBase): +class UserURLAgencySuggestion(URLDependentMixin, Base): __tablename__ = "user_url_agency_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("agency_id", "url_id", "user_id"), # NOTE(review): agency_id is declared nullable=True below yet is part of this key, and the TABLES list in migration 5d6412540aba omits user_url_agency_suggestions; confirm the schema matches + ) agency_id: Mapped[int] = get_agency_id_foreign_column(nullable=True) user_id = Column(Integer, nullable=False) @@ -15,7 +19,3 @@ class UserURLAgencySuggestion(URLDependentMixin, WithIDBase): agency = relationship("Agency") url = relationship("URL") - - __table_args__ = ( - UniqueConstraint("agency_id", "url_id", "user_id", name="uq_user_url_agency_suggestions"), - ) diff --git a/src/db/models/impl/url/suggestion/record_type/auto.py b/src/db/models/impl/url/suggestion/record_type/auto.py index 2aaed526..1c2c68d1 100644 --- a/src/db/models/impl/url/suggestion/record_type/auto.py +++ 
b/src/db/models/impl/url/suggestion/record_type/auto.py @@ -1,8 +1,9 @@ -from sqlalchemy import Column, UniqueConstraint +from sqlalchemy import Column, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import URLDependentMixin, UpdatedAtMixin, CreatedAtMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values @@ -11,13 +12,13 @@ class AutoRecordTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "auto_record_type_suggestions" record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) __table_args__ = ( - UniqueConstraint("url_id", name="auto_record_type_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/record_type/user.py b/src/db/models/impl/url/suggestion/record_type/user.py index 5b9dde8c..4e271225 100644 --- a/src/db/models/impl/url/suggestion/record_type/user.py +++ b/src/db/models/impl/url/suggestion/record_type/user.py @@ -1,22 +1,27 @@ -from sqlalchemy import Column, Integer, UniqueConstraint +from sqlalchemy import Column, Integer, UniqueConstraint, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase from src.db.models.types import record_type_values -class UserRecordTypeSuggestion(UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, WithIDBase): +class UserRecordTypeSuggestion( + UpdatedAtMixin, + CreatedAtMixin, + URLDependentMixin, + Base, +): __tablename__ = "user_record_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) 
user_id = Column(Integer, nullable=False) record_type = Column(postgresql.ENUM(*record_type_values, name='record_type'), nullable=False) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_record_type_suggestions"), - ) # Relationships - url = relationship("URL", back_populates="user_record_type_suggestions") diff --git a/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py index dd109269..19b5dc09 100644 --- a/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py +++ b/src/db/models/impl/url/suggestion/url_type/auto/sqlalchemy.py @@ -1,7 +1,8 @@ -from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float +from sqlalchemy import Column, Boolean, UniqueConstraint, String, Float, PrimaryKeyConstraint from sqlalchemy.orm import relationship from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase @@ -9,7 +10,7 @@ class AutoRelevantSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "auto_relevant_suggestions" @@ -19,6 +20,7 @@ class AutoRelevantSuggestion( __table_args__ = ( UniqueConstraint("url_id", name="auto_relevant_suggestions_uq_url_id"), + PrimaryKeyConstraint("url_id"), ) # Relationships diff --git a/src/db/models/impl/url/suggestion/url_type/user.py b/src/db/models/impl/url/suggestion/url_type/user.py index c7070b5e..52bbc4eb 100644 --- a/src/db/models/impl/url/suggestion/url_type/user.py +++ b/src/db/models/impl/url/suggestion/url_type/user.py @@ -1,10 +1,11 @@ -from sqlalchemy import Column, UniqueConstraint, Integer +from sqlalchemy import Column, UniqueConstraint, Integer, PrimaryKeyConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import relationship, Mapped from src.db.models.helpers import enum_column from 
src.db.models.impl.flag.url_validated.enums import URLType from src.db.models.mixins import UpdatedAtMixin, CreatedAtMixin, URLDependentMixin +from src.db.models.templates_.base import Base from src.db.models.templates_.with_id import WithIDBase @@ -12,9 +13,12 @@ class UserURLTypeSuggestion( UpdatedAtMixin, CreatedAtMixin, URLDependentMixin, - WithIDBase + Base, ): __tablename__ = "user_url_type_suggestions" + __table_args__ = ( + PrimaryKeyConstraint("url_id", "user_id"), + ) user_id = Column(Integer, nullable=False) type: Mapped[URLType | None] = enum_column( @@ -23,10 +27,6 @@ class UserURLTypeSuggestion( nullable=True ) - __table_args__ = ( - UniqueConstraint("url_id", "user_id", name="uq_user_relevant_suggestions"), - ) - # Relationships url = relationship("URL", back_populates="user_relevant_suggestions") diff --git a/src/db/models/impl/url/web_metadata/sqlalchemy.py b/src/db/models/impl/url/web_metadata/sqlalchemy.py index 45f5233c..3170a189 100644 --- a/src/db/models/impl/url/web_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/web_metadata/sqlalchemy.py @@ -1,17 +1,20 @@ -from sqlalchemy import Column, Text, Boolean, Integer +from sqlalchemy import Column, Text, Boolean, Integer, PrimaryKeyConstraint from src.db.models.mixins import URLDependentMixin, CreatedAtMixin, UpdatedAtMixin -from src.db.models.templates_.with_id import WithIDBase +from src.db.models.templates_.base import Base class URLWebMetadata( - WithIDBase, + Base, URLDependentMixin, CreatedAtMixin, UpdatedAtMixin ): """Contains information about the web page.""" __tablename__ = "url_web_metadata" + __table_args__ = ( + PrimaryKeyConstraint("url_id"), + ) accessed = Column( Boolean(), diff --git a/src/db/statement_composer.py b/src/db/statement_composer.py index 31d6c7f9..faa965a8 100644 --- a/src/db/statement_composer.py +++ b/src/db/statement_composer.py @@ -38,7 +38,7 @@ def has_non_errored_urls_without_html_data() -> Select: .join(URLWebMetadata) .outerjoin(URLScrapeInfo) .where( - 
URLScrapeInfo.id == None, + URLScrapeInfo.url_id == None, ~exists(exclude_subquery), URLWebMetadata.status_code == HTTPStatus.OK.value, URLWebMetadata.content_type.like("%html%"), diff --git a/tests/helpers/data_creator/commands/impl/html_data.py b/tests/helpers/data_creator/commands/impl/html_data.py index c548eb5a..38ecb4bd 100644 --- a/tests/helpers/data_creator/commands/impl/html_data.py +++ b/tests/helpers/data_creator/commands/impl/html_data.py @@ -1,6 +1,7 @@ from src.db.dtos.url.html_content import URLHTMLContentInfo from src.db.models.impl.url.html.content.enums import HTMLContentType from src.db.dtos.url.raw_html import RawHTMLInfo +from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent from src.db.models.impl.url.scrape_info.enums import ScrapeStatus from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel from tests.helpers.data_creator.commands.base import DBDataCreatorCommandBase @@ -22,16 +23,16 @@ async def run(self) -> None: scraper_info_list = [] for url_id in self.url_ids: html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.TITLE, + content_type=HTMLContentType.TITLE.value, content="test html content" ) ) html_content_infos.append( - URLHTMLContentInfo( + URLHTMLContent( url_id=url_id, - content_type=HTMLContentType.DESCRIPTION, + content_type=HTMLContentType.DESCRIPTION.value, content="test description" ) ) @@ -47,5 +48,5 @@ async def run(self) -> None: scraper_info_list.append(scraper_info) await self.adb_client.add_raw_html(raw_html_info_list) - await self.adb_client.add_html_content_infos(html_content_infos) + await self.adb_client.add_all(html_content_infos)