Skip to content

Commit 833f493

Browse files
authored
Merge pull request #404 from Police-Data-Accessibility-Project/mc_381_url_agency_identifier
mc_381_url_agency_identifier
2 parents 28937d5 + 52abc9c commit 833f493

File tree

486 files changed

+11051
-4269
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

486 files changed

+11051
-4269
lines changed

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ RUN uv sync --locked --no-dev
1414
# Must call from the root directory because uv does not add playwright to path
1515
RUN playwright install-deps chromium
1616
RUN playwright install chromium
17+
# Download Spacy Model
18+
RUN python -m spacy download en_core_web_sm
1719

1820
# Copy project files
1921
COPY src ./src

ENV.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@ The following flags are available:
5353
| `RUN_URL_TASKS_TASK_FLAG` | Runs URL tasks. |
5454
| `IA_PROBE_TASK_FLAG` | Extracts and links Internet Archives metadata to URLs. |
5555
| `IA_SAVE_TASK_FLAG` | Saves URLs to Internet Archives. |
56-
56+
| `AGENCY_ID_HOMEPAGE_MATCH_FLAG` | Enables the homepage match subtask for agency identification. |
57+
| `AGENCY_ID_NLP_LOCATION_MATCH_FLAG` | Enables the NLP location match subtask for agency identification. |
58+
| `AGENCY_ID_CKAN_FLAG` | Enables the CKAN subtask for agency identification. |
59+
| `AGENCY_ID_MUCKROCK_FLAG` | Enables the MuckRock subtask for agency identification. |
5760

5861

5962
## Foreign Data Wrapper (FDW)

alembic/versions/2025_08_09_2031-8cd5aa7670ff_remove_functional_duplicates.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def downgrade() -> None:
5252
_remove_cascade_foreign_key(URL_ERROR_INFO_TABLE_NAME, foreign_key_name=URL_ERROR_INFO_FOREIGN_KEY_NAME)
5353
_remove_cascade_foreign_key(COMPRESSED_HTML_TABLE_NAME, foreign_key_name=COMPRESSED_HTML_FOREIGN_KEY_NAME)
5454
_remove_cascade_foreign_key(URL_HTML_CONTENT_TABLE_NAME, foreign_key_name=URL_HTML_CONTENT_FOREIGN_KEY_NAME)
55-
_remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)
55+
# _remove_cascade_foreign_key(AUTOMATED_URL_AGENCY_SUGGESTION_TABLE_NAME, foreign_key_name=AUTOMATED_URL_AGENCY_SUGGESTION_FOREIGN_KEY_NAME)
5656

5757
def _delete_duplicate_urls() -> None:
5858
op.execute('delete from urls where id in (2341,2343,2344,2347,2348,2349,2354,2359,2361,2501,2504,2505,2506,2507)')
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
"""Augment auto_agency_suggestions
2+
3+
Revision ID: b741b65a1431
4+
Revises: 8a70ee509a74
5+
Create Date: 2025-08-19 08:03:12.106575
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
from src.util.alembic_helpers import created_at_column, updated_at_column, id_column, url_id_column, switch_enum_type
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = 'b741b65a1431'
down_revision: Union[str, None] = '8a70ee509a74'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Table renames performed by this revision (OLD_* -> NEW_* on upgrade,
# reversed on downgrade).
OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "automated_url_agency_suggestions"
NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME = "url_auto_agency_suggestions"

OLD_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agencies"
NEW_LINK_URLS_AGENCY_TABLE_NAME = "link_urls_agency"

# Enum type backing the `method` column added to the auto-suggestions table.
AGENCY_AUTO_SUGGESTION_METHOD_ENUM = sa.Enum(
    "homepage_match",
    "nlp_location_match",
    "muckrock_match",
    "ckan_match",
    name="agency_auto_suggestion_method",
)

# Table created by this revision; it holds at most one row per URL
# (unique constraint on url_id).
FLAG_URL_VALIDATED_TABLE_NAME = "flag_url_validated"

# Enum type backing `flag_url_validated.type`.
VALIDATED_URL_TYPE_ENUM = sa.Enum(
    "data source",
    "meta url",
    "not relevant",
    "individual record",
    name="validated_url_type"
)
44+
45+
46+
47+
48+
def upgrade() -> None:
    """Apply this revision.

    Renames the auto-suggestion and URL/agency link tables, then runs the
    schema and data steps in a fixed order. The validated-flag table is
    populated from ``urls.status`` before the old status values are removed
    from the ``url_status`` enum.
    """
    table_renames = (
        (OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME),
        (OLD_LINK_URLS_AGENCY_TABLE_NAME, NEW_LINK_URLS_AGENCY_TABLE_NAME),
    )
    for old_name, new_name in table_renames:
        op.rename_table(old_name, new_name)

    # Order matters: create the flag table, fill it, then collapse statuses.
    migration_steps = (
        _alter_auto_agency_suggestions_table,
        _create_flag_url_validated_table,
        _add_urls_to_flag_url_validated_table,
        _remove_validated_and_submitted_url_statuses,
        _reset_agencies_sync_state,
    )
    for run_step in migration_steps:
        run_step()
56+
57+
58+
def downgrade() -> None:
    """Revert this revision.

    Restores the old table names, removes the columns added to the
    auto-suggestions table, rebuilds the original ``url_status`` values from
    ``flag_url_validated``, and finally drops that table and its enum type.
    """
    # Link table goes back to its old name first.
    op.rename_table(
        NEW_LINK_URLS_AGENCY_TABLE_NAME,
        OLD_LINK_URLS_AGENCY_TABLE_NAME,
    )

    # The added columns are dropped while the table still has its new name.
    _revert_auto_agency_suggestions_table()
    op.rename_table(
        NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
        OLD_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME,
    )

    # Widen the status enum again, then restore statuses from the flag table.
    _revert_url_statuses()
    _update_validated_and_submitted_url_statuses()

    # The flag table must be gone before its enum type can be dropped.
    op.drop_table(FLAG_URL_VALIDATED_TABLE_NAME)
    _drop_validated_url_type_enum()
66+
67+
def _reset_agencies_sync_state():
    """Null out the sync bookkeeping columns on every ``agencies_sync_state`` row."""
    reset_statement = """
        UPDATE agencies_sync_state
        set
            last_full_sync_at = null,
            current_cutoff_date = null,
            current_page = null
        """
    op.execute(reset_statement)
77+
78+
def _remove_validated_and_submitted_url_statuses():
    """Shrink the ``url_status`` enum on ``urls.status`` to four values.

    Every status that is removed from the enum is converted to ``'ok'``, and
    the ``url_name_not_null_when_validated`` check constraint is dropped as
    part of the switch.
    """
    remaining_statuses = [
        'ok',
        'duplicate',
        'error',
        '404 not found',
    ]
    # Old status value -> replacement value in the reduced enum.
    retired_to_replacement = {
        'validated': 'ok',
        'submitted': 'ok',
        'pending': 'ok',
        'not relevant': 'ok',
        'individual record': 'ok'
    }
    switch_enum_type(
        table_name="urls",
        column_name="status",
        enum_name="url_status",
        new_enum_values=remaining_statuses,
        check_constraints_to_drop=['url_name_not_null_when_validated'],
        conversion_mappings=retired_to_replacement,
    )
98+
99+
def _add_urls_to_flag_url_validated_table():
    """Seed ``flag_url_validated`` from the current ``urls.status`` values.

    URLs whose status is 'validated' or 'submitted' are recorded as
    'data source'; 'individual record' and 'not relevant' keep their own
    name. URLs with any other status are not inserted.
    """
    insert_statement = """
        INSERT INTO flag_url_validated (url_id, type)
        SELECT
            urls.id,
            CASE urls.status::text
                WHEN 'validated' THEN 'data source'
                WHEN 'submitted' THEN 'data source'
                ELSE urls.status::text
            END::validated_url_type
        FROM urls
        WHERE urls.status in ('validated', 'submitted', 'individual record', 'not relevant')"""
    op.execute(insert_statement)
111+
112+
def _revert_url_statuses():
    """Restore the eight-value ``url_status`` enum and its check constraint.

    Rows currently at ``'ok'`` are converted to ``'pending'``. The
    ``url_name_not_null_when_validated`` check constraint is then re-created
    on ``urls``.
    """
    original_statuses = [
        'pending',
        'validated',
        'submitted',
        'duplicate',
        'not relevant',
        'error',
        '404 not found',
        'individual record'
    ]
    switch_enum_type(
        table_name="urls",
        column_name="status",
        enum_name="url_status",
        new_enum_values=original_statuses,
        conversion_mappings={'ok': 'pending'},
    )

    constraint_name = "url_name_not_null_when_validated"
    constraint_sql = "(name IS NOT NULL) OR (status <> 'validated'::url_status)"
    op.create_check_constraint(constraint_name, "urls", constraint_sql)
136+
137+
def _update_validated_and_submitted_url_statuses():
    """Restore ``urls.status`` from ``flag_url_validated`` during downgrade.

    Each flag row is matched to its URL through ``flag_url_validated.url_id``
    (the column the upgrade populates with ``urls.id``), not through the flag
    table's own ``id`` primary key.

    Mapping applied:
      * 'not relevant'      -> 'not relevant'
      * 'individual record' -> 'individual record'
      * 'data source' with no ``url_data_source`` row -> 'validated'
      * 'data source' with a ``url_data_source`` row  -> 'submitted'

    NOTE(review): the upgrade folds both 'validated' and 'submitted' into
    'data source'. Restoring 'submitted' for URLs that have a
    ``url_data_source`` row is inferred from this function's name and from
    the otherwise redundant split on ``url_data_source`` -- confirm.
    """
    op.execute("""
        UPDATE urls
        SET status = 'not relevant'
        FROM flag_url_validated
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'not relevant'
    """)

    op.execute("""
        UPDATE urls
        SET status = 'individual record'
        FROM flag_url_validated
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'individual record'
    """)

    # Data sources without a url_data_source row.
    op.execute("""
        UPDATE urls
        SET status = 'validated'
        FROM flag_url_validated
        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'data source'
        AND url_data_source.url_id is NULL
    """)

    # Data sources that do have a url_data_source row.
    op.execute("""
        UPDATE urls
        SET status = 'submitted'
        FROM flag_url_validated
        left join url_data_source on flag_url_validated.url_id = url_data_source.url_id
        WHERE urls.id = flag_url_validated.url_id
        AND flag_url_validated.type = 'data source'
        AND url_data_source.url_id is not NULL
    """)
173+
174+
175+
def _create_flag_url_validated_table():
    """Create the ``flag_url_validated`` table.

    Columns come from the shared alembic helpers (id, url_id, created_at,
    updated_at) plus a non-null ``type`` column of the validated-URL enum.
    A unique constraint on ``url_id`` allows at most one flag row per URL.
    """
    type_column = sa.Column('type', VALIDATED_URL_TYPE_ENUM, nullable=False)
    one_flag_per_url = sa.UniqueConstraint(
        'url_id',
        name='uq_flag_url_validated_url_id',
    )
    op.create_table(
        FLAG_URL_VALIDATED_TABLE_NAME,
        id_column(),
        url_id_column(),
        type_column,
        created_at_column(),
        updated_at_column(),
        one_flag_per_url,
    )
189+
190+
def _drop_validated_url_type_enum():
    """Drop the ``validated_url_type`` enum type using the migration's connection."""
    connection = op.get_bind()
    VALIDATED_URL_TYPE_ENUM.drop(connection)
192+
193+
def _alter_auto_agency_suggestions_table():
    """Add timestamp, method and confidence columns to the renamed suggestions table.

    Also adds a check constraint that keeps ``confidence`` between 0 and 1.
    """
    table = NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME

    # Create the enum type first; the `method` column added below uses it.
    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.create(op.get_bind())

    # Timestamps
    op.add_column(table, created_at_column())
    op.add_column(table, updated_at_column())

    # Which suggestion method produced the row; nullable.
    method_column = sa.Column(
        'method',
        AGENCY_AUTO_SUGGESTION_METHOD_ENUM,
        nullable=True,
    )
    op.add_column(table, method_column)

    # Confidence score; non-null with a server-side default of 0.0.
    confidence_column = sa.Column(
        'confidence',
        sa.Float(),
        server_default=sa.text('0.0'),
        nullable=False,
    )
    op.add_column(table, confidence_column)

    op.create_check_constraint(
        "auto_url_agency_suggestions_check_confidence_between_0_and_1",
        table,
        "confidence BETWEEN 0 AND 1",
    )
230+
231+
232+
def _revert_auto_agency_suggestions_table():
    """Drop the columns added by the upgrade, then drop the method enum type.

    Operates on the table under its new name, so it must run before the
    table is renamed back.
    """
    added_columns = ('created_at', 'updated_at', 'method', 'confidence')
    for column_name in added_columns:
        op.drop_column(NEW_AUTO_URL_AGENCY_SUGGESTIONS_TABLE_NAME, column_name)

    # The `method` column is gone by now, so the enum type can be dropped.
    AGENCY_AUTO_SUGGESTION_METHOD_ENUM.drop(op.get_bind())
254+

0 commit comments

Comments
 (0)