Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
124ca7d
Add test environment variable for INTERNET_ARCHIVE_S3_KEYS
maxachis Aug 19, 2025
aa1822f
Continue draft
maxachis Aug 21, 2025
e32c8ec
Progress draft
maxachis Aug 25, 2025
85b134f
Fix last tests
maxachis Aug 25, 2025
f47dbea
Continue draft
maxachis Aug 25, 2025
12eee24
Continue draft
maxachis Aug 26, 2025
2f08da1
Continue draft
maxachis Aug 26, 2025
497be00
/
maxachis Aug 28, 2025
fa63ec5
.
maxachis Aug 28, 2025
4968ab1
Add draft of Meta URL sync logic
maxachis Aug 29, 2025
b8749a4
Continue draft
maxachis Aug 30, 2025
7ae95c9
Continue draft
maxachis Aug 30, 2025
8bbefe5
Continue draft
maxachis Aug 30, 2025
0c760e2
Finish automated tests
maxachis Aug 30, 2025
01f7a50
Update draft
maxachis Sep 1, 2025
2bdaf1d
Continue draft
maxachis Sep 4, 2025
a8acbda
Update Draft
maxachis Sep 4, 2025
0dfb272
Continue Draft
maxachis Sep 4, 2025
db770be
Update Draft
maxachis Sep 5, 2025
e86e589
Resolve existing tests
maxachis Sep 6, 2025
e36bf18
Continue draft
maxachis Sep 6, 2025
2ac254e
Begin setting up Homepage CTE and additional views
maxachis Sep 6, 2025
fd16c86
Continue Draft
maxachis Sep 6, 2025
cd48315
Continue Draft
maxachis Sep 6, 2025
d07dfe5
Finish auto tests for homepage match
maxachis Sep 7, 2025
ef12a5c
Add framework of test for nlp
maxachis Sep 8, 2025
0471f15
Continue Draft
maxachis Sep 8, 2025
0346817
Continue draft
maxachis Sep 9, 2025
e3af970
Continue draft
maxachis Sep 9, 2025
008ab74
Continue Draft
maxachis Sep 11, 2025
f07b388
Continue draft
maxachis Sep 11, 2025
dd21a9c
Continue Draft
maxachis Sep 11, 2025
52abc9c
Finish draft
maxachis Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev
# Must call from the root directory because uv does not add playwright to path
RUN playwright install-deps chromium
RUN playwright install chromium
# Download Spacy Model
RUN python -m spacy download en_core_web_sm

# Copy project files
COPY src ./src
Expand Down
5 changes: 4 additions & 1 deletion ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ The following flags are available:
| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. |
| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archive metadata to URLs. |
| `IA_SAVE_TASK_FLAG` | Saves URLs to the Internet Archive. |

| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. |
| `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. |
| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. |
| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. |


## Foreign Data Wrapper (FDW)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

Check warning on line 11 in alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py#L11 <401>

'sqlalchemy as sa' imported but unused
Raw output
./alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py:11:1: F401 'sqlalchemy as sa' imported but unused


# revision identifiers, used by Alembic.
Expand All @@ -33,7 +33,7 @@
AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME = 'automated_url_agency_suggestions_url_id_fkey'


def upgrade() -> None:

Check warning on line 36 in alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py#L36 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py:36:1: D103 Missing docstring in public function
_add_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME)
_add_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME)
_add_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME)
Expand All @@ -46,13 +46,13 @@
_add_constraint_forbidding_fragments()


def downgrade() -> None:

Check warning on line 49 in alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py#L49 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py:49:1: D103 Missing docstring in public function
_remove_constraint_forbidding_fragments()
_remove_constraint_forbidding_nbsp()
_remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME)
_remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME)
_remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME)
_remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)
# _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)

def _delete_duplicate_urls() -> None:
op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)')
Expand Down Expand Up @@ -87,13 +87,13 @@
delete from urls
where source = 'data_sources_app'
"""
)

Check failure on line 90 in alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py#L90 <124>

closing bracket does not match visual indentation
Raw output
./alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py:90:5: E124 closing bracket does not match visual indentation

def _reset_data_sources_sync_state() -> None:
    """Delete every row from the ``data_sources_sync_state`` table."""
    wipe_sync_state_sql = """
        delete from data_sources_sync_state
        """
    op.execute(wipe_sync_state_sql)

Check failure on line 96 in alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py#L96 <124>

closing bracket does not match visual indentation
Raw output
./alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py:96:4: E124 closing bracket does not match visual indentation

def _add_constraint_forbidding_nbsp() -> None:
op.create_check_constraint(
Expand Down Expand Up @@ -121,3 +121,3 @@
update urls
set url = substring(url from 1 for position('#' in url) - 1)
where url like '%#%'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
"""Augment auto_agency_suggestions

Revision ID: b741b65a1431
Revises: 8a70ee509a74
Create Date: 2025-08-19 08:03:12.106575

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import created_at_column, updated_at_column, id_column, url_id_column, switch_enum_type

# revision identifiers, used by Alembic.
revision: str = 'b741b65a1431'
down_revision: Union[str, None] = '8a70ee509a74'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Name of the automated agency suggestions table before and after the rename
# that upgrade() performs (downgrade() renames it back).
OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "automated_url_agency_suggestions"
NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "url_auto_agency_suggestions"

# Name of the URL<->agency link table before and after the rename.
OLD_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agencies"
NEW_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agency"

# Enum type used by the new `method` column added to the auto agency
# suggestions table.
AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.Enum(
    "homepage_match",
    "nlp_location_match",
    "muckrock_match",
    "ckan_match",
    name="agency_auto_suggestion_method",
)

# Table created by this migration; it holds one `type` value per URL.
FLAG_URL_VALIDATED_TABLE_NAME = "flag_url_validated"

# Enum type used by the `type` column of the flag_url_validated table.
VALIDATED_URL_TYPE_ENUM = sa.Enum(
    "data source",
    "meta url",
    "not relevant",
    "individual record",
    name="validated_url_type"
)




def upgrade() -> None:

Check warning on line 48 in alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py#L48 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py:48:1: D103 Missing docstring in public function

Check failure on line 48 in alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py#L48 <303>

too many blank lines (4)
Raw output
./alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py:48:1: E303 too many blank lines (4)
op.rename_table(OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME)
op.rename_table(OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME)
_alter_auto_agency_suggestions_table()
_create_flag_url_validated_table()
_add_urls_to_flag_url_validated_table()
_remove_validated_and_submitted_url_statuses()
_reset_agencies_sync_state()


def downgrade() -> None:

Check warning on line 58 in alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py#L58 <103>

Missing docstring in public function
Raw output
./alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py:58:1: D103 Missing docstring in public function
op.rename_table(NEW_LINK_URLS_AGENCY_TABLE_NAME, OLD_LINK_URLS_AGENCY_TABLE_NAME)
_revert_auto_agency_suggestions_table()
op.rename_table(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME)
_revert_url_statuses()
_update_validated_and_submitted_url_statuses()
op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME)
_drop_validated_url_type_enum()

def _reset_agencies_sync_state():
    """Null out the progress columns of ``agencies_sync_state``.

    Sets ``last_full_sync_at``, ``current_cutoff_date`` and ``current_page``
    to NULL on every row of the table.
    """
    clear_progress_sql = """
        UPDATE agencies_sync_state
        set
            last_full_sync_at = null,
            current_cutoff_date = null,
            current_page = null
        """
    op.execute(clear_progress_sql)

def _remove_validated_and_submitted_url_statuses():
    """Reduce the ``url_status`` enum on ``urls.status`` to four values.

    Delegates to ``switch_enum_type``, asking it to drop the
    ``url_name_not_null_when_validated`` check constraint and passing a
    conversion mapping that sends every retired status to ``'ok'``.
    """
    remaining_statuses = ['ok', 'duplicate', 'error', '404 not found']
    # Statuses that no longer exist after this migration; each maps to 'ok'.
    retired_statuses = (
        'validated',
        'submitted',
        'pending',
        'not relevant',
        'individual record',
    )
    switch_enum_type(
        table_name="urls",
        column_name="status",
        enum_name="url_status",
        new_enum_values=remaining_statuses,
        check_constraints_to_drop=['url_name_not_null_when_validated'],
        conversion_mappings=dict.fromkeys(retired_statuses, 'ok'),
    )

def _add_urls_to_flag_url_validated_table():
    """Backfill ``flag_url_validated`` from the current ``urls.status`` values.

    One row is inserted per URL whose status is 'validated', 'submitted',
    'individual record' or 'not relevant'.  'validated' and 'submitted' are
    both recorded as 'data source'; the other two statuses keep their text
    and are cast to ``validated_url_type``.
    """
    backfill_sql = """
        INSERT INTO flag_url_validated (url_id, type)
        SELECT
            urls.id,
            CASE urls.status::text
                WHEN 'validated' THEN 'data source'
                WHEN 'submitted' THEN 'data source'
                ELSE urls.status::text
            END::validated_url_type
        FROM urls
        WHERE urls.status in ('validated', 'submitted', 'individual record', 'not relevant')"""
    op.execute(backfill_sql)

def _revert_url_statuses():
    """Restore the legacy ``url_status`` enum and its check constraint.

    Delegates to ``switch_enum_type`` with the eight legacy values and a
    conversion mapping of ``'ok'`` to ``'pending'``, then re-creates the
    ``url_name_not_null_when_validated`` check constraint on ``urls``.
    """
    legacy_statuses = [
        'pending',
        'validated',
        'submitted',
        'duplicate',
        'not relevant',
        'error',
        '404 not found',
        'individual record',
    ]
    switch_enum_type(
        table_name="urls",
        column_name="status",
        enum_name="url_status",
        new_enum_values=legacy_statuses,
        conversion_mappings={'ok': 'pending'},
    )

    # A URL may only carry the 'validated' status when it has a name.
    name_required_condition = "(name IS NOT NULL) OR (status <> 'validated'::url_status)"
    op.create_check_constraint(
        "url_name_not_null_when_validated",
        "urls",
        name_required_condition,
    )

def _update_validated_and_submitted_url_statuses():
    """Restore legacy ``urls.status`` values from ``flag_url_validated`` rows.

    Each flag row is matched to its URL through ``flag_url_validated.url_id``,
    the column the upgrade fills with ``urls.id``.  The flag table's own
    ``id`` column is not a URL id and must not be used for the join.

    * ``'not relevant'`` and ``'individual record'`` flags are copied back
      to ``urls.status`` unchanged.
    * ``'data source'`` flags become ``'validated'`` when the URL has no row
      in ``url_data_source`` and ``'submitted'`` when it has one.  This
      assumes a ``url_data_source`` row means the URL was submitted
      (TODO confirm against the table's definition).

    NOTE(review): ``downgrade()`` calls ``_revert_url_statuses()`` before this
    function, so the ``url_name_not_null_when_validated`` check is already in
    place.  A 'data source' URL with a NULL name and no ``url_data_source``
    row would make the 'validated' update fail; confirm no such rows exist.
    """
    op.execute("""
        UPDATE urls
        SET status = 'not relevant'
        FROM flag_url_validated
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'not relevant'
        """)

    op.execute("""
        UPDATE urls
        SET status = 'individual record'
        FROM flag_url_validated
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'individual record'
        """)

    # Data-source URLs without a url_data_source row: validated only.
    op.execute("""
        UPDATE urls
        SET status = 'validated'
        FROM flag_url_validated
        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'data source'
        AND url_data_source.url_id is NULL
        """)

    # Data-source URLs with a url_data_source row: restored as submitted.
    op.execute("""
        UPDATE urls
        SET status = 'submitted'
        FROM flag_url_validated
        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'data source'
        AND url_data_source.url_id is not NULL
        """)


def _create_flag_url_validated_table():
    """Create the ``flag_url_validated`` table.

    The table gets the columns produced by ``id_column()``,
    ``url_id_column()``, ``created_at_column()`` and ``updated_at_column()``,
    a non-nullable ``type`` column of the ``validated_url_type`` enum, and a
    unique constraint on ``url_id``.
    """
    type_column = sa.Column('type', VALIDATED_URL_TYPE_ENUM, nullable=False)
    # At most one flag row per URL.
    unique_url_constraint = sa.UniqueConstraint(
        'url_id',
        name='uq_flag_url_validated_url_id',
    )
    op.create_table(
        FLAG_URL_VALIDATED_TABLE_NAME,
        id_column(),
        url_id_column(),
        type_column,
        created_at_column(),
        updated_at_column(),
        unique_url_constraint,
    )

def _drop_validated_url_type_enum():
    """Drop the ``validated_url_type`` enum type via the migration's bind."""
    connection = op.get_bind()
    VALIDATED_URL_TYPE_ENUM.drop(connection)

def _alter_auto_agency_suggestions_table():
    """Add the new columns to the renamed auto agency suggestions table.

    Creates the ``agency_auto_suggestion_method`` enum type, then adds:

    * the columns produced by ``created_at_column()`` and
      ``updated_at_column()``;
    * a nullable ``method`` column of that enum type;
    * a non-nullable float ``confidence`` column with server default 0.0;
    * a check constraint requiring ``confidence BETWEEN 0 AND 1``.
    """
    table_name = NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME

    # The enum type is created explicitly before the column that uses it.
    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.create(op.get_bind())

    op.add_column(table_name, created_at_column())
    op.add_column(table_name, updated_at_column())

    method_column = sa.Column(
        'method',
        AGENCY_AUTO_SUGGESTION_METHOD_ENUM,
        nullable=True,
    )
    op.add_column(table_name, method_column)

    confidence_column = sa.Column(
        'confidence',
        sa.Float(),
        server_default=sa.text('0.0'),
        nullable=False,
    )
    op.add_column(table_name, confidence_column)

    op.create_check_constraint(
        "auto_url_agency_suggestions_check_confidence_between_0_and_1",
        table_name,
        "confidence BETWEEN 0 AND 1",
    )


def _revert_auto_agency_suggestions_table():
    """Remove the columns this migration added to the suggestions table.

    Drops ``created_at``, ``updated_at``, ``method`` and ``confidence`` (in
    that order) from the table under its new name, then drops the
    ``agency_auto_suggestion_method`` enum type.
    """
    added_columns = ('created_at', 'updated_at', 'method', 'confidence')
    for column_name in added_columns:
        op.drop_column(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, column_name)

    # The enum type is dropped only after the column using it is gone.
    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.drop(op.get_bind())

Check warning on line 254 in alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py#L254 <391>

blank line at end of file
Raw output
./alembic/versions/2025_08_19_0803-b741b65a1431_augment_auto_agency_suggestions.py:254:1: W391 blank line at end of file
Loading
Loading