Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions alembic/env.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
from datetime import datetime
from logging.config import fileConfig

Expand All @@ -7,7 +6,7 @@
from sqlalchemy import pool

from src.db.helpers.connect import get_postgres_connection_string
from src.db.models.templates import Base
from src.db.models.templates_.base import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""Add HTML Status Info table

Revision ID: 99eceed6e614
Revises: 637de6eaa3ab
Create Date: 2025-07-31 15:36:40.966605

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from src.util.alembic_helpers import id_column, created_at_column, updated_at_column, url_id_column, switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '99eceed6e614'
down_revision: Union[str, None] = '637de6eaa3ab'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Enum type `web_status`. Within this migration it is referenced only by
# downgrade(), which drops it with checkfirst=True; no column declared in
# this file uses it.
WEB_STATUS_ENUM = sa.Enum(
    "not_attempted",
    "success",
    "error",
    "404_not_found",
    name="web_status"
)
# Enum type `scrape_status`, used for the `status` column of the
# url_scrape_info table and dropped again when that table is torn down.
SCRAPE_STATUS_ENUM = sa.Enum(
    "success",
    "error",
    name="scrape_status",
)

# Names of the two tables created by this migration.
URL_WEB_METADATA_TABLE_NAME = 'url_web_metadata'
URL_SCRAPE_INFO = 'url_scrape_info'





def upgrade() -> None:
    """Apply this revision.

    Runs four steps in order: create the url_web_metadata table, switch the
    task_type enum to the value list that includes 'URL Probe', create the
    url_scrape_info table, and backfill url_scrape_info from the existing
    HTML tables. The backfill must come after the table it inserts into has
    been created.
    """
    _create_url_html_info_table()
    _add_url_probe_task_type_enum()
    _set_up_scrape_info_table()
    _use_existing_html_data_to_add_scrape_info()

def _use_existing_html_data_to_add_scrape_info():
    """Backfill url_scrape_info with 'success' rows from existing HTML data.

    Two inserts are issued: one row per url_id found in url_compressed_html,
    then one row per distinct url_id in url_html_content that has no match
    in url_compressed_html.
    """
    # The SQL text is kept flush-left so the string contents stay unchanged.
    from_compressed_html = f"""
INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
SELECT url_id, 'success'::scrape_status
FROM url_compressed_html
"""
    # The LEFT JOIN ... IS NULL filter keeps only url_ids absent from
    # url_compressed_html, i.e. those not covered by the first insert.
    from_html_content_only = f"""
INSERT INTO {URL_SCRAPE_INFO} (url_id, status)
SELECT distinct(url_id), 'success'::scrape_status
FROM url_html_content
LEFT JOIN URL_COMPRESSED_HTML USING (url_id)
WHERE URL_COMPRESSED_HTML.url_id IS NULL
"""
    op.execute(from_compressed_html)
    op.execute(from_html_content_only)

def downgrade() -> None:
    """Revert this revision.

    Drops the url_web_metadata table, drops the web_status enum if it
    exists, switches the task_type enum back to the value list without
    'URL Probe', then drops the url_scrape_info table together with its
    scrape_status enum.
    """
    # NOTE(review): despite its name, this helper drops the url_web_metadata
    # table; url_scrape_info is removed by _tear_down_scrape_info_table().
    _drop_scrape_info_table()
    # Drop Enums
    WEB_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)
    _drop_url_probe_task_type_enum()
    _tear_down_scrape_info_table()


def _set_up_scrape_info_table():
    """Create the url_scrape_info table, limited to one row per url_id."""
    # Built in the same order the items are handed to create_table.
    table_items = [
        id_column(),
        url_id_column(),
        sa.Column(
            'status',
            SCRAPE_STATUS_ENUM,
            nullable=False,
            comment='The status of the most recent scrape attempt.',
        ),
        created_at_column(),
        updated_at_column(),
        # The unique constraint on url_id enforces a single row per URL.
        sa.UniqueConstraint('url_id', name='uq_url_scrape_info_url_id'),
    ]
    op.create_table(URL_SCRAPE_INFO, *table_items)




def _tear_down_scrape_info_table() -> None:
    """Drop the url_scrape_info table, then its scrape_status enum type."""
    # The table is dropped first because its `status` column uses the enum.
    op.drop_table(URL_SCRAPE_INFO)
    # Drop enum
    # checkfirst=True skips the drop when the type does not exist.
    SCRAPE_STATUS_ENUM.drop(op.get_bind(), checkfirst=True)


def _add_url_probe_task_type_enum() -> None:
    """Switch tasks.task_type to the enum value list that includes 'URL Probe'."""
    # Full value list passed to the helper; 'URL Probe' is the new last entry.
    values_with_url_probe = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
        'URL Probe',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_with_url_probe,
    )

def _drop_url_probe_task_type_enum() -> None:
    """Switch tasks.task_type back to the enum value list without 'URL Probe'."""
    # Same list as the upgrade step, minus the trailing 'URL Probe' entry.
    values_without_url_probe = [
        'HTML',
        'Relevancy',
        'Record Type',
        'Agency Identification',
        'Misc Metadata',
        'Submit Approved URLs',
        'Duplicate Detection',
        '404 Probe',
        'Sync Agencies',
        'Sync Data Sources',
        'Push to Hugging Face',
    ]
    switch_enum_type(
        table_name='tasks',
        column_name='task_type',
        enum_name='task_type',
        new_enum_values=values_without_url_probe,
    )

def _create_url_html_info_table() -> None:
    """Create the url_web_metadata table.

    The table is limited to one row per url_id, and a non-null status_code
    must lie between 100 and 999 inclusive.
    """
    # Built in the same order the items are handed to create_table.
    table_items = [
        id_column(),
        url_id_column(),
        sa.Column('accessed', sa.Boolean(), nullable=False),
        sa.Column('status_code', sa.Integer(), nullable=True),
        sa.Column('content_type', sa.Text(), nullable=True),
        sa.Column('error_message', sa.Text(), nullable=True),
        created_at_column(),
        updated_at_column(),
        sa.UniqueConstraint('url_id', name='uq_url_web_status_info_url_id'),
        # The two checks together bound status_code to the range 100..999.
        sa.CheckConstraint('status_code >= 100', name='ck_url_web_status_info_status_code_min'),
        sa.CheckConstraint('status_code <= 999', name='ck_url_web_status_info_status_code_max'),
    ]
    op.create_table(URL_WEB_METADATA_TABLE_NAME, *table_items)

def _drop_scrape_info_table() -> None:
    """Drop the url_web_metadata table.

    NOTE(review): the function name mentions "scrape info", but the table
    dropped here is URL_WEB_METADATA_TABLE_NAME; the name is kept so existing
    callers continue to work.
    """
    table_to_drop = URL_WEB_METADATA_TABLE_NAME
    op.drop_table(table_to_drop)
12 changes: 7 additions & 5 deletions local_database/classes/DockerClient.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import docker
from docker.errors import NotFound, APIError
from docker.models.containers import Container
from docker.models.networks import Network

from local_database.DTOs import DockerfileInfo, DockerInfo

Expand All @@ -9,7 +11,7 @@
def __init__(self):
self.client = docker.from_env()

def run_command(self, command: str, container_id: str):
def run_command(self, command: str, container_id: str) -> None:

Check warning on line 14 in local_database/classes/DockerClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerClient.py#L14 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerClient.py:14:1: D102 Missing docstring in public method
exec_id = self.client.api.exec_create(
container_id,
cmd=command,
Expand All @@ -20,7 +22,7 @@
for line in output_stream:
print(line.decode().rstrip())

def start_network(self, network_name):
def start_network(self, network_name) -> Network:

Check warning on line 25 in local_database/classes/DockerClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerClient.py#L25 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerClient.py:25:1: D102 Missing docstring in public method
try:
self.client.networks.create(network_name, driver="bridge")
except APIError as e:
Expand All @@ -30,14 +32,14 @@
print("Network already exists")
return self.client.networks.get(network_name)

def stop_network(self, network_name):
def stop_network(self, network_name) -> None:

Check warning on line 35 in local_database/classes/DockerClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerClient.py#L35 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerClient.py:35:1: D102 Missing docstring in public method
self.client.networks.get(network_name).remove()

def get_image(
self,
dockerfile_info: DockerfileInfo,
force_rebuild: bool = False
):
) -> None:
if dockerfile_info.dockerfile_directory:
# Build image from Dockerfile
self.client.images.build(
Expand All @@ -58,7 +60,7 @@
except NotFound:
self.client.images.pull(dockerfile_info.image_tag)

def get_existing_container(self, docker_info_name: str):
def get_existing_container(self, docker_info_name: str) -> Container | None:

Check warning on line 63 in local_database/classes/DockerClient.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerClient.py#L63 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerClient.py:63:1: D102 Missing docstring in public method
try:
return self.client.containers.get(docker_info_name)
except NotFound:
Expand Down
8 changes: 4 additions & 4 deletions local_database/classes/DockerContainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@
self.dc = dc
self.container = container

def run_command(self, command: str):
def run_command(self, command: str) -> None:

Check warning on line 14 in local_database/classes/DockerContainer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerContainer.py#L14 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerContainer.py:14:1: D102 Missing docstring in public method
self.dc.run_command(command, self.container.id)

def stop(self):
def stop(self) -> None:

Check warning on line 17 in local_database/classes/DockerContainer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerContainer.py#L17 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerContainer.py:17:1: D102 Missing docstring in public method
self.container.stop()

def log_to_file(self):
def log_to_file(self) -> None:

Check warning on line 20 in local_database/classes/DockerContainer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerContainer.py#L20 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerContainer.py:20:1: D102 Missing docstring in public method
logs = self.container.logs(stdout=True, stderr=True)
container_name = self.container.name
with open(f"{container_name}.log", "wb") as f:
f.write(logs)

def wait_for_pg_to_be_ready(self):
def wait_for_pg_to_be_ready(self) -> None:

Check warning on line 26 in local_database/classes/DockerContainer.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerContainer.py#L26 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerContainer.py:26:1: D102 Missing docstring in public method
for i in range(30):
exit_code, output = self.container.exec_run("pg_isready")
print(output)
Expand Down
23 changes: 16 additions & 7 deletions local_database/classes/DockerManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import docker
from docker.errors import APIError
from docker.models.containers import Container
from docker.models.networks import Network

from local_database.DTOs import DockerfileInfo, DockerInfo
from local_database.classes.DockerClient import DockerClient
Expand All @@ -20,7 +22,7 @@
self.network = self.start_network()

@staticmethod
def start_docker_engine():
def start_docker_engine() -> None:

Check warning on line 25 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L25 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:25:1: D102 Missing docstring in public method
system = platform.system()

match system:
Expand All @@ -41,7 +43,7 @@
sys.exit(1)

@staticmethod
def is_docker_running():
def is_docker_running() -> bool:

Check warning on line 46 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L46 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:46:1: D102 Missing docstring in public method
try:
client = docker.from_env()
client.ping()
Expand All @@ -50,16 +52,23 @@
print(f"Docker is not running: {e}")
return False

def run_command(self, command: str, container_id: str):
def run_command(

Check warning on line 55 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L55 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:55:1: D102 Missing docstring in public method
self,
command: str,
container_id: str
) -> None:
self.client.run_command(command, container_id)

def start_network(self):
def start_network(self) -> Network:

Check warning on line 62 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L62 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:62:1: D102 Missing docstring in public method
return self.client.start_network(self.network_name)

def stop_network(self):
def stop_network(self) -> None:

Check warning on line 65 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L65 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:65:1: D102 Missing docstring in public method
self.client.stop_network(self.network_name)

def get_image(self, dockerfile_info: DockerfileInfo):
def get_image(

Check warning on line 68 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L68 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:68:1: D102 Missing docstring in public method
self,
dockerfile_info: DockerfileInfo
) -> None:
self.client.get_image(dockerfile_info)

def run_container(
Expand All @@ -74,5 +83,5 @@
)
return DockerContainer(self.client, raw_container)

def get_containers(self):
def get_containers(self) -> list[Container]:

Check warning on line 86 in local_database/classes/DockerManager.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/DockerManager.py#L86 <102>

Missing docstring in public method
Raw output
./local_database/classes/DockerManager.py:86:1: D102 Missing docstring in public method
return self.client.client.containers.list()
15 changes: 7 additions & 8 deletions local_database/classes/TimestampChecker.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,31 @@
import datetime
import os
from typing import Optional
from datetime import datetime, timedelta


class TimestampChecker:
def __init__(self):
self.last_run_time: Optional[datetime.datetime] = self.load_last_run_time()
self.last_run_time: datetime | None = self.load_last_run_time()

def load_last_run_time(self) -> Optional[datetime.datetime]:
def load_last_run_time(self) -> datetime | None:

Check warning on line 9 in local_database/classes/TimestampChecker.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/TimestampChecker.py#L9 <102>

Missing docstring in public method
Raw output
./local_database/classes/TimestampChecker.py:9:1: D102 Missing docstring in public method
# Check if file `last_run.txt` exists
# If it does, load the last run time
if os.path.exists("local_state/last_run.txt"):
with open("local_state/last_run.txt", "r") as f:
return datetime.datetime.strptime(
return datetime.strptime(
f.read(),
"%Y-%m-%d %H:%M:%S"
)
return None

def last_run_within_24_hours(self):
def last_run_within_24_hours(self) -> bool:

Check warning on line 20 in local_database/classes/TimestampChecker.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/classes/TimestampChecker.py#L20 <102>

Missing docstring in public method
Raw output
./local_database/classes/TimestampChecker.py:20:1: D102 Missing docstring in public method
if self.last_run_time is None:
return False
return datetime.datetime.now() - self.last_run_time < datetime.timedelta(days=1)
return datetime.now() - self.last_run_time < timedelta(days=1)

def set_last_run_time(self):
# If directory `local_state` doesn't exist, create it
if not os.path.exists("local_state"):
os.makedirs("local_state")

with open("local_state/last_run.txt", "w") as f:
f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
6 changes: 3 additions & 3 deletions local_database/create_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


# Connect to the default 'postgres' database to create other databases
def connect(database="postgres", autocommit=True):
def connect(database="postgres", autocommit=True) -> psycopg2.extensions.connection:

Check warning on line 18 in local_database/create_database.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/create_database.py#L18 <103>

Missing docstring in public function
Raw output
./local_database/create_database.py:18:1: D103 Missing docstring in public function
conn = psycopg2.connect(
dbname=database,
user=POSTGRES_USER,
Expand All @@ -27,7 +27,7 @@
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
return conn

def create_database(db_name):
def create_database(db_name: str) -> None:

Check warning on line 30 in local_database/create_database.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/create_database.py#L30 <103>

Missing docstring in public function
Raw output
./local_database/create_database.py:30:1: D103 Missing docstring in public function
conn = connect("postgres")
with conn.cursor() as cur:
cur.execute(sql.SQL("""
Expand All @@ -48,7 +48,7 @@
except Exception as e:
print(f"❌ Failed to create {db_name}: {e}")

def main():
def main() -> None:

Check warning on line 51 in local_database/create_database.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] local_database/create_database.py#L51 <103>

Missing docstring in public function
Raw output
./local_database/create_database.py:51:1: D103 Missing docstring in public function
print("Creating databases...")
create_database(LOCAL_SOURCE_COLLECTOR_DB_NAME)

Expand Down
Loading