Skip to content

Commit 073b247

Browse files
authored
Merge pull request #355 from Police-Data-Accessibility-Project/mc_89_add_scraping_logic_for_non_pending_urls
mc_89_add_scraping_logic_for_non_pending_urls
2 parents 0d4b0ff + 284eb66 commit 073b247

File tree

232 files changed

+2008
-751
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

232 files changed

+2008
-751
lines changed

alembic/env.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import logging
21
from datetime import datetime
32
from logging.config import fileConfig
43

@@ -7,7 +6,7 @@
76
from sqlalchemy import pool
87

98
from src.db.helpers.connect import get_postgres_connection_string
10-
from src.db.models.templates import Base
9+
from src.db.models.templates_.base import Base
1110

1211
# this is the Alembic Config object, which provides
1312
# access to the values within the .ini file in use.
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Add HTML Status Info table
2+
3+
Revision ID: 99eceed6e614
4+
Revises: 637de6eaa3ab
5+
Create Date: 2025-07-31 15:36:40.966605
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
from src.util.alembic_helpers import id_column, created_at_column, updated_at_column, url_id_column, switch_enum_type
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = '99eceed6e614'
17+
down_revision: Union[str, None] = '637de6eaa3ab'
18+
branch_labels: Union[str, Sequence[str], None] = None
19+
depends_on: Union[str, Sequence[str], None] = None
20+
21+
# Enum for recording the outcome of probing a URL over the web.
# NOTE(review): this enum is declared but never attached to any column in this
# migration (url_web_metadata stores a raw integer status_code instead), and
# upgrade() never creates it. downgrade() drops it with checkfirst=True, so
# the drop is a no-op — confirm whether web_status is still needed.
WEB_STATUS_ENUM = sa.Enum(
    "not_attempted",
    "success",
    "error",
    "404_not_found",
    name="web_status"
)
# Enum backing url_scrape_info.status — outcome of the most recent scrape.
SCRAPE_STATUS_ENUM = sa.Enum(
    "success",
    "error",
    name="scrape_status",
)

# Table names used throughout this migration.
URL_WEB_METADATA_TABLE_NAME = 'url_web_metadata'
URL_SCRAPE_INFO = 'url_scrape_info'
36+
37+
38+
39+
40+
41+
def upgrade() -> None:
    """Apply the migration.

    Order matters: url_scrape_info must be created before the final
    backfill step inserts rows into it.
    """
    # NOTE(review): despite its name, this helper creates the
    # url_web_metadata table (see its definition below).
    _create_url_html_info_table()
    _add_url_probe_task_type_enum()
    _set_up_scrape_info_table()
    _use_existing_html_data_to_add_scrape_info()
46+
47+
def _use_existing_html_data_to_add_scrape_info() -> None:
    """Backfill url_scrape_info with a 'success' row for every URL that
    already has stored HTML (compressed or uncompressed).

    url_scrape_info.url_id carries a unique constraint
    (uq_url_scrape_info_url_id), so each INSERT must yield at most one row
    per url_id.
    """
    # Fix: DISTINCT guards the unique constraint in case url_compressed_html
    # ever holds more than one row per URL — the original only deduplicated
    # the second INSERT, which was inconsistent and would abort the migration
    # on duplicate url_id values here.
    op.execute(
        f"""
        INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
        SELECT DISTINCT url_id, 'success'::scrape_status
        FROM url_compressed_html
        """
    )
    # Anti-join: pick up URLs that have uncompressed HTML but were NOT
    # already inserted from url_compressed_html above.
    op.execute(
        f"""
        INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
        SELECT distinct(url_id), 'success'::scrape_status
        FROM url_html_content
        LEFT JOIN URL_COMPRESSED_HTML USING (url_id)
        WHERE URL_COMPRESSED_HTML.url_id IS NULL
        """
    )
64+
65+
def downgrade() -> None:
    """Revert the migration (roughly the reverse of upgrade())."""
    # NOTE(review): despite its name, this helper drops the url_web_metadata
    # table, not url_scrape_info (that is _tear_down_scrape_info_table's job).
    _drop_scrape_info_table()
    # Drop Enums
    # checkfirst=True makes this a no-op if web_status was never created —
    # upgrade() never attaches WEB_STATUS_ENUM to any table in this migration.
    WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)
    _drop_url_probe_task_type_enum()
    _tear_down_scrape_info_table()
71+
72+
73+
def _set_up_scrape_info_table():
    """Create the url_scrape_info table recording the most recent scrape
    outcome for each URL (one row per URL)."""
    status_column = sa.Column(
        'status',
        SCRAPE_STATUS_ENUM,
        nullable=False,
        comment='The status of the most recent scrape attempt.'
    )
    # One row per URL — enforced by the unique constraint below.
    unique_url = sa.UniqueConstraint('url_id', name='uq_url_scrape_info_url_id')
    op.create_table(
        URL_SCRAPE_INFO,
        id_column(),
        url_id_column(),
        status_column,
        created_at_column(),
        updated_at_column(),
        unique_url,
    )
88+
89+
90+
91+
92+
def _tear_down_scrape_info_table():
    """Drop url_scrape_info and its backing enum.

    The table must go first — its status column references the enum type.
    """
    op.drop_table(URL_SCRAPE_INFO)
    # Drop enum
    SCRAPE_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)
96+
97+
98+
def _add_url_probe_task_type_enum() -> None:
    """Recreate the tasks.task_type enum with 'URL Probe' appended.

    Mirror image of _drop_url_probe_task_type_enum(): the two value lists
    must stay identical except for the trailing 'URL Probe' entry.
    """
    values_including_url_probe = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
        'URL Probe',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_including_url_probe,
    )
118+
119+
def _drop_url_probe_task_type_enum() -> None:
    """Recreate the tasks.task_type enum without the 'URL Probe' value.

    Mirror image of _add_url_probe_task_type_enum(): the two value lists
    must stay identical except for the 'URL Probe' entry removed here.
    """
    values_without_url_probe = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_without_url_probe,
    )
138+
139+
def _create_url_html_info_table() -> None:
    """Create the url_web_metadata table holding per-URL web probe results.

    NOTE(review): the function name says "html_info" and the constraint names
    say "url_web_status_info", while the table is url_web_metadata — worth
    aligning in a follow-up migration (the names below are kept byte-for-byte
    so already-applied databases stay consistent).
    """
    columns = [
        id_column(),
        url_id_column(),
        sa.Column('accessed', sa.Boolean(), nullable=False),
        sa.Column('status_code', sa.Integer(), nullable=True),
        sa.Column('content_type', sa.Text(), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=True),
        created_at_column(),
        updated_at_column(),
    ]
    constraints = [
        sa.UniqueConstraint('url_id', name='uq_url_web_status_info_url_id'),
        # HTTP status codes are three digits — bound the stored value.
        sa.CheckConstraint('status_code >= 100', name='ck_url_web_status_info_status_code_min'),
        sa.CheckConstraint('status_code <= 999', name='ck_url_web_status_info_status_code_max'),
    ]
    op.create_table(URL_WEB_METADATA_TABLE_NAME, *columns, *constraints)
154+
155+
def _drop_scrape_info_table() -> None:
    # NOTE(review): misnomer — this drops the url_web_metadata table, not
    # url_scrape_info (which _tear_down_scrape_info_table handles).
    op.drop_table(URL_WEB_METADATA_TABLE_NAME)

local_database/classes/DockerClient.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import docker
22
from docker.errors import NotFound, APIError
3+
from docker.models.containers import Container
4+
from docker.models.networks import Network
35

46
from local_database.DTOs import DockerfileInfo, DockerInfo
57

@@ -9,7 +11,7 @@ class DockerClient:
911
def __init__(self):
1012
self.client = docker.from_env()
1113

12-
def run_command(self, command: str, container_id: str):
14+
def run_command(self, command: str, container_id: str) -> None:
1315
exec_id = self.client.api.exec_create(
1416
container_id,
1517
cmd=command,
@@ -20,7 +22,7 @@ def run_command(self, command: str, container_id: str):
2022
for line in output_stream:
2123
print(line.decode().rstrip())
2224

23-
def start_network(self, network_name):
25+
def start_network(self, network_name) -> Network:
2426
try:
2527
self.client.networks.create(network_name, driver="bridge")
2628
except APIError as e:
@@ -30,14 +32,14 @@ def start_network(self, network_name):
3032
print("Network already exists")
3133
return self.client.networks.get(network_name)
3234

33-
def stop_network(self, network_name):
35+
def stop_network(self, network_name) -> None:
3436
self.client.networks.get(network_name).remove()
3537

3638
def get_image(
3739
self,
3840
dockerfile_info: DockerfileInfo,
3941
force_rebuild: bool = False
40-
):
42+
) -> None:
4143
if dockerfile_info.dockerfile_directory:
4244
# Build image from Dockerfile
4345
self.client.images.build(
@@ -58,7 +60,7 @@ def get_image(
5860
except NotFound:
5961
self.client.images.pull(dockerfile_info.image_tag)
6062

61-
def get_existing_container(self, docker_info_name: str):
63+
def get_existing_container(self, docker_info_name: str) -> Container | None:
6264
try:
6365
return self.client.containers.get(docker_info_name)
6466
except NotFound:

local_database/classes/DockerContainer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,19 @@ def __init__(self, dc: DockerClient, container: Container):
1111
self.dc = dc
1212
self.container = container
1313

14-
    def run_command(self, command: str) -> None:
        """Execute *command* inside this container, delegating to the shared DockerClient."""
        self.dc.run_command(command, self.container.id)
1616

17-
    def stop(self) -> None:
        """Stop the underlying Docker container."""
        self.container.stop()
1919

20-
    def log_to_file(self) -> None:
        """Dump the container's combined stdout+stderr logs to '<container name>.log'.

        The file is written in binary mode in the current working directory,
        overwriting any previous dump.
        """
        logs = self.container.logs(stdout=True, stderr=True)
        container_name = self.container.name
        with open(f"{container_name}.log", "wb") as f:
            f.write(logs)
2525

26-
def wait_for_pg_to_be_ready(self):
26+
def wait_for_pg_to_be_ready(self) -> None:
2727
for i in range(30):
2828
exit_code, output = self.container.exec_run("pg_isready")
2929
print(output)

local_database/classes/DockerManager.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
import docker
66
from docker.errors import APIError
7+
from docker.models.containers import Container
8+
from docker.models.networks import Network
79

810
from local_database.DTOs import DockerfileInfo, DockerInfo
911
from local_database.classes.DockerClient import DockerClient
@@ -20,7 +22,7 @@ def __init__(self):
2022
self.network = self.start_network()
2123

2224
@staticmethod
23-
def start_docker_engine():
25+
def start_docker_engine() -> None:
2426
system = platform.system()
2527

2628
match system:
@@ -41,7 +43,7 @@ def start_docker_engine():
4143
sys.exit(1)
4244

4345
@staticmethod
44-
def is_docker_running():
46+
def is_docker_running() -> bool:
4547
try:
4648
client = docker.from_env()
4749
client.ping()
@@ -50,16 +52,23 @@ def is_docker_running():
5052
print(f"Docker is not running: {e}")
5153
return False
5254

53-
    def run_command(
        self,
        command: str,
        container_id: str
    ) -> None:
        """Run *command* inside the container with the given id (delegates to DockerClient)."""
        self.client.run_command(command, container_id)
5561

56-
    def start_network(self) -> Network:
        """Create the manager's named bridge network if needed and return it."""
        return self.client.start_network(self.network_name)
5864

59-
    def stop_network(self) -> None:
        """Remove the manager's named network via the Docker client."""
        self.client.stop_network(self.network_name)
6167

62-
    def get_image(
        self,
        dockerfile_info: DockerfileInfo
    ) -> None:
        """Ensure the image described by *dockerfile_info* exists locally (delegates to DockerClient)."""
        self.client.get_image(dockerfile_info)
6473

6574
def run_container(
@@ -74,5 +83,5 @@ def run_container(
7483
)
7584
return DockerContainer(self.client, raw_container)
7685

77-
    def get_containers(self) -> list[Container]:
        """Return containers reported by the Docker daemon.

        NOTE(review): containers.list() returns running containers only by
        default — pass all=True if stopped containers are also wanted.
        """
        return self.client.client.containers.list()
Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,31 @@
1-
import datetime
21
import os
3-
from typing import Optional
2+
from datetime import datetime, timedelta
43

54

65
class TimestampChecker:
76
def __init__(self):
8-
self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time()
7+
self.last_run_time: datetime | None = self.load_last_run_time()
98

10-
def load_last_run_time(self) -> Optional[datetime.datetime]:
9+
def load_last_run_time(self) -> datetime | None:
1110
# Check if file `last_run.txt` exists
1211
# If it does, load the last run time
1312
if os.path.exists("local_state/last_run.txt"):
1413
with open("local_state/last_run.txt", "r") as f:
15-
return datetime.datetime.strptime(
14+
return datetime.strptime(
1615
f.read(),
1716
"%Y-%m-%d %H:%M:%S"
1817
)
1918
return None
2019

21-
def last_run_within_24_hours(self):
20+
def last_run_within_24_hours(self) -> bool:
2221
if self.last_run_time is None:
2322
return False
24-
return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1)
23+
return datetime.now() - self.last_run_time < timedelta(days=1)
2524

2625
def set_last_run_time(self):
2726
# If directory `local_state` doesn't exist, create it
2827
if not os.path.exists("local_state"):
2928
os.makedirs("local_state")
3029

3130
with open("local_state/last_run.txt", "w") as f:
32-
f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
31+
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

local_database/create_database.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616

1717
# Connect to the default 'postgres' database to create other databases
18-
def connect(database="postgres", autocommit=True):
18+
def connect(database="postgres", autocommit=True) -> psycopg2.extensions.connection:
1919
conn = psycopg2.connect(
2020
dbname=database,
2121
user=POSTGRES_USER,
@@ -27,7 +27,7 @@ def connect(database="postgres", autocommit=True):
2727
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
2828
return conn
2929

30-
def create_database(db_name):
30+
def create_database(db_name: str) -> None:
3131
conn = connect("postgres")
3232
with conn.cursor() as cur:
3333
cur.execute(sql.SQL("""
@@ -48,7 +48,7 @@ def create_database(db_name):
4848
except Exception as e:
4949
print(f"❌ Failed to create {db_name}: {e}")
5050

51-
def main():
51+
def main() -> None:
5252
print("Creating databases...")
5353
create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME)
5454

0 commit comments

Comments
 (0)