Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 240 additions & 0 deletions alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""Remove ID columns

Revision ID: 5d6412540aba
Revises: d5f0cc2be6b6
Create Date: 2025-11-29 07:17:32.794305

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

Check warning on line 11 in alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py#L11 <401>

'sqlalchemy as sa' imported but unused
Raw output
./alembic/versions/2025_11_29_0717-5d6412540aba_remove_id_columns.py:11:1: F401 'sqlalchemy as sa' imported but unused


# revision identifiers, used by Alembic.
revision: str = '5d6412540aba'
down_revision: Union[str, None] = 'd5f0cc2be6b6'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# All tables from which the surrogate "id" column is dropped by this migration.
# Each of these tables gets a new natural primary key (see the lists/dict below).
TABLES = [
    "task_errors",
    "agency_id_subtask_suggestions",
    "auto_record_type_suggestions",
    "auto_relevant_suggestions",
    "duplicates",
    "flag_url_validated",
    "link_agencies__locations",
    "link_urls_redirect_url",
    "link_urls_root_url",
    "reviewing_user_url",
    "url_checked_for_duplicate",
    "url_compressed_html",
    "url_html_content",
    "url_internet_archives_probe_metadata",
    "url_internet_archives_save_metadata",
    "url_optional_data_source_metadata",
    "url_scrape_info",
    "url_web_metadata",
    "user_record_type_suggestions",
    "user_url_type_suggestions",
]

# Tables whose rows are uniquely identified by the URL they describe:
# their new primary key is the single column (url_id).
URL_ONLY_PRIMARY_KEY_TABLES = [
    "url_checked_for_duplicate",
    "url_compressed_html",
    "url_internet_archives_probe_metadata",
    "url_internet_archives_save_metadata",
    "url_optional_data_source_metadata",
    "url_scrape_info",
    "url_web_metadata",
    "auto_relevant_suggestions",
    "auto_record_type_suggestions",
    "flag_url_validated",
]


# Per-user suggestion/review tables: one row per (user, url) pair,
# so their new primary key is the composite (user_id, url_id).
USER_URL_ID_PRIMARY_KEY_TABLES = [
    "user_record_type_suggestions",
    "user_url_type_suggestions",
    "reviewing_user_url",
]

# Tables whose natural key is neither (url_id) nor (user_id, url_id):
# maps table name -> ordered list of columns forming its new primary key.
BESPOKE_UNIQUE_IDS: dict[str, list[str]] = {
    "task_errors": ["task_id"],
    "agency_id_subtask_suggestions": ["agency_id", "subtask_id"],
    "link_agencies__locations": ["agency_id", "location_id"],
    "link_urls_redirect_url": ["source_url_id", "destination_url_id"],
    "link_urls_root_url": ["url_id", "root_url_id"],
    "url_html_content": ["url_id", "content_type"],
    "duplicates": ["batch_id", "original_url_id"],
}

def drop_views():
    """Drop the materialized views that depend on the columns being removed.

    PostgreSQL refuses to drop a column that a materialized view selects
    from, so both views are dropped up front and rebuilt afterwards by
    ``recreate_views()``. ``if exists`` makes the drops idempotent.
    """
    op.execute("drop materialized view if exists url_status_mat_view")
    op.execute("drop materialized view if exists batch_url_status_mat_view")

def recreate_views():
    """Recreate the materialized views dropped by ``drop_views()``.

    NOTE(review): the view bodies are presumably identical to the ones that
    existed before this migration (they are dropped/recreated only so that
    columns can be removed) — confirm against the previous revision.
    """
    # url_status_mat_view: one row per URL with a human-readable pipeline
    # status plus a numeric ordering code.
    op.execute("""
        create materialized view url_status_mat_view as
        WITH urls_with_relevant_errors AS (
            SELECT ute.url_id
            FROM url_task_error ute
            WHERE ute.task_type = ANY
                (ARRAY ['Screenshot'::task_type, 'HTML'::task_type, 'URL Probe'::task_type])
        )
        , status_text AS (
            SELECT
                u.id AS url_id,
                CASE
                    WHEN fuv.type = ANY
                        (ARRAY ['not relevant'::url_type, 'individual record'::url_type, 'not found'::url_type])
                        THEN 'Accepted'::text
                    WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NULL OR
                         fuv.type = 'meta url'::url_type AND udmu.url_id IS NULL THEN 'Awaiting Submission'::text
                    WHEN fuv.type = 'data source'::url_type AND uds.url_id IS NOT NULL OR
                         fuv.type = 'meta url'::url_type AND udmu.url_id IS NOT NULL THEN 'Submitted'::text
                    WHEN uch.url_id IS NOT NULL AND uwm.url_id IS NOT NULL AND us.url_id IS NOT NULL
                        THEN 'Community Labeling'::text
                    WHEN uwre.url_id IS NOT NULL THEN 'Error'::text
                    ELSE 'Intake'::text
                END AS status
            FROM urls u
            LEFT JOIN urls_with_relevant_errors uwre ON u.id = uwre.url_id
            LEFT JOIN url_screenshot us ON u.id = us.url_id
            LEFT JOIN url_compressed_html uch ON u.id = uch.url_id
            LEFT JOIN url_web_metadata uwm ON u.id = uwm.url_id
            LEFT JOIN flag_url_validated fuv ON u.id = fuv.url_id
            LEFT JOIN ds_app_link_meta_url udmu ON u.id = udmu.url_id
            LEFT JOIN ds_app_link_data_source uds ON u.id = uds.url_id
        )
        SELECT
            status_text.url_id,
            status_text.status,
            CASE status_text.status
                WHEN 'Intake'::text THEN 100
                WHEN 'Error'::text THEN 110
                WHEN 'Community Labeling'::text THEN 200
                WHEN 'Accepted'::text THEN 300
                WHEN 'Awaiting Submission'::text THEN 380
                WHEN 'Submitted'::text THEN 390
                ELSE '-1'::integer
            END AS code
        FROM status_text;
    """)

    # batch_url_status_mat_view: one row per batch summarizing whether its
    # URLs exist and are fully validated.
    op.execute("""
        create materialized view batch_url_status_mat_view as
        WITH batches_with_urls AS (
            SELECT b_1.id
            FROM batches b_1
            WHERE (EXISTS (
                SELECT 1
                FROM link_batches__urls lbu
                WHERE lbu.batch_id = b_1.id
            ))
        )
        , batches_with_only_validated_urls AS (
            SELECT b_1.id
            FROM batches b_1
            WHERE (EXISTS (
                SELECT 1
                FROM link_batches__urls lbu
                LEFT JOIN flag_url_validated fuv ON fuv.url_id = lbu.url_id
                WHERE lbu.batch_id = b_1.id
                  AND fuv.url_id IS NOT NULL
            ))
            AND NOT (EXISTS (
                SELECT 1
                FROM link_batches__urls lbu
                LEFT JOIN flag_url_validated fuv ON fuv.url_id = lbu.url_id
                WHERE lbu.batch_id = b_1.id
                  AND fuv.url_id IS NULL
            ))
        )
        SELECT
            b.id AS batch_id,
            CASE
                WHEN b.status = 'error'::batch_status THEN 'Error'::text
                WHEN bwu.id IS NULL THEN 'No URLs'::text
                WHEN bwovu.id IS NOT NULL THEN 'Labeling Complete'::text
                ELSE 'Has Unlabeled URLs'::text
            END AS batch_url_status
        FROM batches b
        LEFT JOIN batches_with_urls bwu ON bwu.id = b.id
        LEFT JOIN batches_with_only_validated_urls bwovu ON bwovu.id = b.id;
    """)


def upgrade() -> None:
    """Replace surrogate ``id`` primary keys with natural composite keys.

    Steps:
      1. Drop the materialized views that reference the affected tables.
      2. Drop the ``id`` column from every table in ``TABLES``.
      3. Create the new primary keys (bespoke column sets, ``url_id``-only,
         or ``(user_id, url_id)`` depending on the table).
      4. Recreate the materialized views.
    """
    drop_views()

    for table in TABLES:
        op.drop_column(table, "id")

    # Add new primary keys
    for table, columns in BESPOKE_UNIQUE_IDS.items():
        suffix = "_".join(columns)
        op.create_primary_key(
            f"pk_{table}_{suffix}",
            table,
            columns
        )

    for table in URL_ONLY_PRIMARY_KEY_TABLES:
        op.create_primary_key(
            f"pk_{table}",
            table,
            ["url_id"]
        )

    for table in USER_URL_ID_PRIMARY_KEY_TABLES:
        op.create_primary_key(
            f"pk_{table}",
            table,
            ["user_id", "url_id"]
        )

    recreate_views()


def downgrade() -> None:
    """No-op: this migration is not reversible.

    NOTE(review): downgrading past this revision silently leaves the schema
    in the upgraded state (dropped ``id`` columns are not restored).
    Consider raising ``NotImplementedError`` here if an explicit failure is
    preferred over a silent no-op.
    """
    pass
4 changes: 2 additions & 2 deletions src/api/endpoints/annotate/all/get/queries/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ async def run(
URL.status == URLStatus.OK.value,
# Must not have been previously annotated by user
~exists(
select(UserURLTypeSuggestion.id)
select(UserURLTypeSuggestion.url_id)
.where(
UserURLTypeSuggestion.url_id == URL.id,
UserURLTypeSuggestion.user_id == self.user_id,
)
),
~exists(
select(UserURLAgencySuggestion.id)
select(UserURLAgencySuggestion.url_id)
.where(
UserURLAgencySuggestion.url_id == URL.id,
UserURLAgencySuggestion.user_id == self.user_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# One of the locations must be linked to an agency
exists(
select(
LinkAgencyLocation.id
LinkAgencyLocation.location_id
)
.join(
LocationIDSubtaskSuggestion,
Expand Down
9 changes: 5 additions & 4 deletions src/core/tasks/url/operators/html/content_info_getter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from src.core.tasks.url.operators.html.scraper.parser.dtos.response_html import ResponseHTMLInfo
from src.db.dtos.url.html_content import URLHTMLContentInfo
from src.db.models.impl.url.html.content.enums import HTMLContentType
from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent


class HTMLContentInfoGetter:
Expand All @@ -10,7 +11,7 @@
self.url_id = url_id
self.html_content_infos = []

def get_all_html_content(self) -> list[URLHTMLContentInfo]:
def get_all_html_content(self) -> list[URLHTMLContent]:

Check warning on line 14 in src/core/tasks/url/operators/html/content_info_getter.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/html/content_info_getter.py#L14 <102>

Missing docstring in public method
Raw output
./src/core/tasks/url/operators/html/content_info_getter.py:14:1: D102 Missing docstring in public method
for content_type in HTMLContentType:
self.add_html_content(content_type)
return self.html_content_infos
Expand All @@ -20,9 +21,9 @@
val = getattr(self.response_html_info, lower_str)
if val is None or val.strip() == "":
return
uhci = URLHTMLContentInfo(
uhc = URLHTMLContent(
url_id=self.url_id,
content_type=content_type,
content_type=content_type.value,
content=val
)
self.html_content_infos.append(uhci)
self.html_content_infos.append(uhc)
5 changes: 3 additions & 2 deletions src/core/tasks/url/operators/html/queries/insert/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from src.db.enums import TaskType
from src.db.models.impl.url.error_info.pydantic import URLErrorInfoPydantic
from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic
from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
from src.db.models.impl.url.scrape_info.enums import ScrapeStatus
from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel
from src.db.models.impl.url.task_error.pydantic_.insert import URLTaskErrorPydantic
Expand Down Expand Up @@ -33,8 +34,8 @@
url_id=tdo.url_info.id
)

def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContentInfo]:
html_content_infos = []
def convert_to_html_content_info_list(tdos: list[UrlHtmlTDO]) -> list[URLHTMLContent]:

Check warning on line 37 in src/core/tasks/url/operators/html/queries/insert/convert.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/html/queries/insert/convert.py#L37 <103>

Missing docstring in public function
Raw output
./src/core/tasks/url/operators/html/queries/insert/convert.py:37:1: D103 Missing docstring in public function
html_content_infos: list[URLHTMLContent] = []
for tdo in tdos:
if tdo.url_response_info.status != HTTPStatus.OK:
continue
Expand Down
15 changes: 11 additions & 4 deletions src/core/tasks/url/operators/html/queries/insert/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from src.core.tasks.url.operators.html.queries.insert.convert import convert_to_compressed_html, \
convert_to_html_content_info_list, convert_to_scrape_infos, convert_to_url_errors
from src.core.tasks.url.operators.html.tdo import UrlHtmlTDO
from src.db.dtos.url.html_content import URLHTMLContentInfo

Check warning on line 6 in src/core/tasks/url/operators/html/queries/insert/query.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] src/core/tasks/url/operators/html/queries/insert/query.py#L6 <401>

'src.db.dtos.url.html_content.URLHTMLContentInfo' imported but unused
Raw output
./src/core/tasks/url/operators/html/queries/insert/query.py:6:1: F401 'src.db.dtos.url.html_content.URLHTMLContentInfo' imported but unused
from src.db.models.impl.url.html.compressed.pydantic import URLCompressedHTMLPydantic
from src.db.models.impl.url.html.content.sqlalchemy import URLHTMLContent
from src.db.models.impl.url.scrape_info.pydantic import URLScrapeInfoInsertModel
from src.db.queries.base.builder import QueryBuilderBase
from src.db.helpers.session import session_helper as sh

Expand All @@ -14,17 +18,20 @@
self.task_id = task_id

async def run(self, session: AsyncSession) -> None:
compressed_html_models = convert_to_compressed_html(self.tdos)
url_html_content_list = convert_to_html_content_info_list(self.tdos)
scrape_info_list = convert_to_scrape_infos(self.tdos)
compressed_html_models: list[URLCompressedHTMLPydantic] = convert_to_compressed_html(self.tdos)
url_html_content_list: list[URLHTMLContent] = convert_to_html_content_info_list(self.tdos)
scrape_info_list: list[URLScrapeInfoInsertModel] = convert_to_scrape_infos(self.tdos)
url_errors = convert_to_url_errors(self.tdos, task_id=self.task_id)

for models in [
compressed_html_models,
url_html_content_list,
scrape_info_list,
url_errors
]:
await sh.bulk_insert(session, models=models)

await sh.add_all(session=session, models=url_html_content_list)




Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ async def run(self, session: AsyncSession) -> bool:
)
.where(
or_(
URLWebMetadata.id.is_(None),
URLWebMetadata.url_id.is_(None),
URLWebMetadata.updated_at < datetime.now() - timedelta(days=30)
),
no_url_task_error(TaskType.PROBE_URL)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ async def run(self, session: AsyncSession) -> list[FullURLMapping]:
)
.where(
or_(
URLWebMetadata.id.is_(None),
URLWebMetadata.url_id.is_(None),
URLWebMetadata.updated_at < datetime.now() - timedelta(days=30)
)
)
Expand Down
4 changes: 0 additions & 4 deletions src/db/client/async_.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,10 +324,6 @@ async def add_user_record_type_suggestion(
# endregion record_type


@session_manager
async def add_html_content_infos(self, session: AsyncSession, html_content_infos: list[URLHTMLContentInfo]):
await self._add_models(session, URLHTMLContent, html_content_infos)

@session_manager
async def has_non_errored_urls_without_html_data(self, session: AsyncSession) -> bool:
statement = self.statement_composer.has_non_errored_urls_without_html_data()
Expand Down
Loading