Commits
18 commits
baf7550
feat(benchmark): add annotation load time baseline benchmarks (#566)
labradorite-dev Feb 25, 2026
6b60ec7
feat(benchmark): add 10k-URL scale seeder and scale benchmark tests (…
labradorite-dev Feb 25, 2026
1cea1da
fix(benchmark): replace per-URL suggestion loops with bulk inserts (#…
labradorite-dev Feb 25, 2026
e9db7d3
fix(benchmark): use confidence=1.0 for location suggestions (#566)
labradorite-dev Feb 25, 2026
51596ca
feat(db): materialize url_annotation_count_view and url_annotation_fl…
labradorite-dev Feb 26, 2026
2c414e5
chore(deps): upgrade pytest-benchmark from 4.0 to 5.2.3
labradorite-dev Feb 26, 2026
825eabb
fix(annotate): use LEFT JOINs on materialized views; refresh in sort …
labradorite-dev Feb 27, 2026
90118a0
style: fix lint issues in our changes (#566)
labradorite-dev Feb 27, 2026
b9a0c27
fix(migration): rebase onto upstream 1fb2286a016c to resolve multiple…
labradorite-dev Feb 27, 2026
f826efd
ci(benchmark): post results to GHA job summary (#566)
labradorite-dev Feb 27, 2026
d0861d0
ci(benchmark): capture per-phase timings in GHA job summary (#566)
labradorite-dev Feb 27, 2026
864a8bd
fix(benchmark): use removesuffix to avoid mangling phase labels (#566)
labradorite-dev Feb 27, 2026
72a1810
docs(benchmark): simplify README; remove stale diagram and hardcoded …
labradorite-dev Feb 27, 2026
ed5a7e5
style: fix flake8 warnings in benchmark test and timing module (#566)
labradorite-dev Feb 27, 2026
9aae5a4
style: add missing docstrings to benchmark package, conftest, and sca…
labradorite-dev Feb 27, 2026
00f24ec
style: fix flake8 annotations from PR check (#566)
labradorite-dev Feb 27, 2026
35248e0
refactor(benchmark): extract summary heredoc to scripts/post_benchmar…
labradorite-dev Feb 28, 2026
80b77b8
docs(development): replace hardcoded dev db password with reference t…
labradorite-dev Feb 28, 2026
61 changes: 61 additions & 0 deletions .github/workflows/benchmark.yml
@@ -0,0 +1,61 @@
name: Annotation Benchmark

on:
workflow_dispatch:
pull_request:
branches: [dev]
@maxachis (Collaborator) commented on Mar 2, 2026:

I like the idea of having a benchmark that we can repeatedly review! But I'm not convinced we should have it run on every PR to dev -- main perhaps (which is not listed here), but I want to be sensitive to GitHub action usage limits. We may be nowhere near those limits (I'll ask @josh-chamberlain for deets on that), but it's something I want to flag for closer inspection.

The other thing is that I am assuming relatively few code changes will impact annotation benchmarking, so it may not be efficient to have a GitHub workflow dedicated solely to one (albeit important) part of the app run on every code change, since most of the time it will pass. The lint and test actions, by contrast, are always relevant: even when most of the code they cover is unchanged, they can conceivably fail on any change. That's less probable with this action.

That might be an argument for a wider set of benchmarks, which would be a separate issue.

My first instinct is I like the benchmark, and I'm open to the idea of having GitHub actions for a more broad set of benchmarks. But I want to keep scope relatively narrow in this PR. I would either cut it for now or set it to workflow dispatch only.


jobs:
benchmark:
runs-on: ubuntu-latest
timeout-minutes: 30
container: python:3.11.9

services:
postgres:
image: postgres:15
env:
POSTGRES_PASSWORD: postgres
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5

env:
POSTGRES_PASSWORD: postgres
POSTGRES_USER: postgres
POSTGRES_DB: postgres
POSTGRES_HOST: postgres
POSTGRES_PORT: 5432
GOOGLE_API_KEY: TEST
GOOGLE_CSE_ID: TEST

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install uv and set the python version
uses: astral-sh/setup-uv@v5

- name: Install the project
run: uv sync --locked --all-extras --dev

- name: Run benchmark tests
run: |
uv run pytest tests/automated/integration/benchmark \
-m "manual and benchmark" \
--benchmark-json=benchmark-results.json \
-v

- name: Post benchmark summary
run: uv run python scripts/post_benchmark_summary.py

- name: Upload benchmark results
uses: actions/upload-artifact@v4
with:
name: benchmark-results-${{ github.sha }}
path: |
benchmark-results.json
per-phase-results.json
retention-days: 90
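The `scripts/post_benchmark_summary.py` script invoked above is not shown in this diff. A minimal sketch of the same idea, rendering pytest-benchmark JSON as a Markdown table and appending it to the GitHub Actions job summary, might look like the following; the function names and the exact table columns are assumptions, not the repository's actual implementation:

```python
import json
import os


def summary_markdown(results: dict) -> str:
    """Render pytest-benchmark JSON output as a Markdown table."""
    lines = [
        "| Benchmark | Mean (s) | Stddev (s) | Rounds |",
        "| --- | --- | --- | --- |",
    ]
    for bench in results.get("benchmarks", []):
        stats = bench["stats"]
        lines.append(
            f"| {bench['name']} | {stats['mean']:.4f} "
            f"| {stats['stddev']:.4f} | {stats['rounds']} |"
        )
    return "\n".join(lines)


def post_summary(results_path: str = "benchmark-results.json") -> None:
    """Append the table to the GHA job summary, or print it locally.

    GITHUB_STEP_SUMMARY is set by GitHub Actions runners; Markdown
    appended to that file renders in the job summary pane.
    """
    with open(results_path) as f:
        results = json.load(f)
    md = "## Annotation Benchmark\n\n" + summary_markdown(results) + "\n"
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        with open(summary_path, "a") as f:
            f.write(md)
    else:
        print(md)
```

Keeping the Markdown rendering as a pure function makes it easy to unit-test without touching the filesystem or the runner environment.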
@@ -5,20 +5,21 @@
Create Date: 2026-02-15 12:57:34.550327

"""
from typing import Sequence, Union
from typing import Optional, Sequence, Union

from alembic import op

from src.util.alembic_helpers import switch_enum_type

# revision identifiers, used by Alembic.
revision: str = '1fb2286a016c'
down_revision: Union[str, None] = '759ce7d0772b'
down_revision: Optional[str] = '759ce7d0772b'
@maxachis (Collaborator) commented on Mar 2, 2026:

I've no objections to this in principle, but do keep an eye on keeping scope focused to what's relevant to this issue! We're affecting a number of files as-is, and tighter PRs make things simpler to review.

branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
"""Add internet_archive to batch_strategy enum."""
switch_enum_type(
table_name="batches",
column_name="strategy",
@@ -38,6 +39,7 @@ def upgrade() -> None:


def downgrade() -> None:
"""Remove internet_archive from batch_strategy enum."""
op.execute("""
DELETE FROM BATCHES
WHERE STRATEGY = 'internet_archive'
@@ -0,0 +1,246 @@
"""Materialize url_annotation_count_view and url_annotation_flags

Revision ID: c8e4f1a2b3d5
Revises: 759ce7d0772b
Create Date: 2026-02-26 00:00:00.000000

"""
from typing import Optional, Sequence, Union

from alembic import op


# revision identifiers, used by Alembic.
revision: str = 'c8e4f1a2b3d5'
down_revision: Optional[str] = '1fb2286a016c'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

_URL_ANNOTATION_COUNT_VIEW_SQL = """
WITH
auto_location_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__location__auto__subtasks anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, auto_agency_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__agency__auto__subtasks anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, auto_url_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__url_type__auto anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, auto_record_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__record_type__auto anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, user_location_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__location__user anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, user_agency_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__agency__user anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, user_url_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__url_type__user anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, user_record_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__record_type__user anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, anon_location_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__location__anon anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, anon_agency_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__agency__anon anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, anon_url_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__url_type__anon anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
, anon_record_type_count AS (
SELECT
u_1.id,
count(anno.url_id) AS cnt
FROM
urls u_1
JOIN annotation__record_type__anon anno
ON u_1.id = anno.url_id
GROUP BY
u_1.id
)
SELECT
u.id AS url_id,
COALESCE(auto_ag.cnt, 0::bigint) AS auto_agency_count,
COALESCE(auto_loc.cnt, 0::bigint) AS auto_location_count,
COALESCE(auto_rec.cnt, 0::bigint) AS auto_record_type_count,
COALESCE(auto_typ.cnt, 0::bigint) AS auto_url_type_count,
COALESCE(user_ag.cnt, 0::bigint) AS user_agency_count,
COALESCE(user_loc.cnt, 0::bigint) AS user_location_count,
COALESCE(user_rec.cnt, 0::bigint) AS user_record_type_count,
COALESCE(user_typ.cnt, 0::bigint) AS user_url_type_count,
COALESCE(anon_ag.cnt, 0::bigint) AS anon_agency_count,
COALESCE(anon_loc.cnt, 0::bigint) AS anon_location_count,
COALESCE(anon_rec.cnt, 0::bigint) AS anon_record_type_count,
COALESCE(anon_typ.cnt, 0::bigint) AS anon_url_type_count,
COALESCE(auto_ag.cnt, 0::bigint) + COALESCE(auto_loc.cnt, 0::bigint) + COALESCE(auto_rec.cnt, 0::bigint) +
COALESCE(auto_typ.cnt, 0::bigint) + COALESCE(user_ag.cnt, 0::bigint) + COALESCE(user_loc.cnt, 0::bigint) +
COALESCE(user_rec.cnt, 0::bigint) + COALESCE(user_typ.cnt, 0::bigint) + COALESCE(anon_ag.cnt, 0::bigint) +
COALESCE(anon_loc.cnt, 0::bigint) + COALESCE(anon_rec.cnt, 0::bigint) + COALESCE(anon_typ.cnt, 0::bigint) AS total_anno_count

FROM
urls u
LEFT JOIN auto_agency_count auto_ag
ON auto_ag.id = u.id
LEFT JOIN auto_location_count auto_loc
ON auto_loc.id = u.id
LEFT JOIN auto_record_type_count auto_rec
ON auto_rec.id = u.id
LEFT JOIN auto_url_type_count auto_typ
ON auto_typ.id = u.id
LEFT JOIN user_agency_count user_ag
ON user_ag.id = u.id
LEFT JOIN user_location_count user_loc
ON user_loc.id = u.id
LEFT JOIN user_record_type_count user_rec
ON user_rec.id = u.id
LEFT JOIN user_url_type_count user_typ
ON user_typ.id = u.id
LEFT JOIN anon_agency_count anon_ag
ON anon_ag.id = u.id
LEFT JOIN anon_location_count anon_loc
ON anon_loc.id = u.id
LEFT JOIN anon_record_type_count anon_rec
ON anon_rec.id = u.id
LEFT JOIN anon_url_type_count anon_typ
ON anon_typ.id = u.id
"""

_URL_ANNOTATION_FLAGS_SQL = """
SELECT u.id as url_id,
EXISTS (SELECT 1 FROM public.annotation__record_type__auto a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion,
EXISTS (SELECT 1 FROM public.annotation__url_type__auto a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion,
EXISTS (SELECT 1 FROM public.annotation__agency__auto__subtasks a WHERE a.url_id = u.id) AS has_auto_agency_suggestion,
EXISTS (SELECT 1 FROM public.annotation__location__auto__subtasks a WHERE a.url_id = u.id) AS has_auto_location_suggestion,
EXISTS (SELECT 1 FROM public.annotation__record_type__user a WHERE a.url_id = u.id) AS has_user_record_type_suggestion,
EXISTS (SELECT 1 FROM public.annotation__url_type__user a WHERE a.url_id = u.id) AS has_user_relevant_suggestion,
EXISTS (SELECT 1 FROM public.annotation__agency__user a WHERE a.url_id = u.id) AS has_user_agency_suggestion,
EXISTS (SELECT 1 FROM public.annotation__location__user a WHERE a.url_id = u.id) AS has_user_location_suggestion,
EXISTS (SELECT 1 FROM public.link_agencies__urls a WHERE a.url_id = u.id) AS has_confirmed_agency,
EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed
FROM urls u
"""


def upgrade() -> None:
"""Convert url_annotation_count_view and url_annotation_flags to materialized views."""
# Drop regular views
op.execute("DROP VIEW IF EXISTS url_annotation_count_view")
op.execute("DROP VIEW IF EXISTS url_annotation_flags")

# Recreate as materialized views
op.execute(
f"CREATE MATERIALIZED VIEW url_annotation_count_view AS {_URL_ANNOTATION_COUNT_VIEW_SQL}"
)
op.execute(
f"CREATE MATERIALIZED VIEW url_annotation_flags AS {_URL_ANNOTATION_FLAGS_SQL}"
)

# Unique indexes required for REFRESH MATERIALIZED VIEW CONCURRENTLY
op.execute("CREATE UNIQUE INDEX ON url_annotation_count_view (url_id)")
op.execute("CREATE UNIQUE INDEX ON url_annotation_flags (url_id)")


def downgrade() -> None:
"""Revert url_annotation_count_view and url_annotation_flags to regular views."""
op.execute("DROP MATERIALIZED VIEW IF EXISTS url_annotation_count_view")
op.execute("DROP MATERIALIZED VIEW IF EXISTS url_annotation_flags")

# Recreate as regular views
op.execute(
f"CREATE VIEW url_annotation_count_view AS {_URL_ANNOTATION_COUNT_VIEW_SQL}"
)
op.execute(
f"CREATE OR REPLACE VIEW url_annotation_flags AS ({_URL_ANNOTATION_FLAGS_SQL})"
)
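The commit messages mention refreshing these views in the sort path. With the unique indexes on `url_id` created in `upgrade()`, `REFRESH MATERIALIZED VIEW CONCURRENTLY` becomes legal. A hedged sketch of a refresh helper follows; only the view names come from this migration, while the function names and the `connection.execute()` calling convention are assumptions about how the application might wire it up:

```python
# Sketch, not the repository's actual code: a helper the sort path
# might call to refresh the two materialized views created above.
ANNOTATION_VIEWS = ("url_annotation_count_view", "url_annotation_flags")


def refresh_sql(view: str, concurrently: bool = True) -> str:
    """Build a REFRESH statement for one materialized view.

    CONCURRENTLY avoids taking an exclusive lock (readers keep
    working during the refresh), but it requires a unique index on
    the view and may not run inside a transaction block.
    """
    keyword = "CONCURRENTLY " if concurrently else ""
    return f"REFRESH MATERIALIZED VIEW {keyword}{view}"


def refresh_annotation_views(connection, concurrently: bool = True) -> None:
    """Refresh both views on any connection exposing execute()."""
    for view in ANNOTATION_VIEWS:
        connection.execute(refresh_sql(view, concurrently))
```

The trade-off behind materializing is that reads become index lookups instead of twelve aggregating joins, at the cost of staleness between refreshes.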
4 changes: 2 additions & 2 deletions docs/development.md
@@ -35,14 +35,14 @@ At minimum, you need the database connection variables:

```dotenv
POSTGRES_USER=test_source_collector_user
POSTGRES_PASSWORD=HanviliciousHamiltonHilltops
POSTGRES_PASSWORD=<see local_database/docker-compose.yml>
POSTGRES_DB=source_collector_test_db
POSTGRES_HOST=127.0.0.1
POSTGRES_PORT=5432
DEV=true
```

These match the defaults in `local_database/docker-compose.yml`.
The password and other defaults are defined in `local_database/docker-compose.yml`.

### API Keys

1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ dev = [
"pytest>=7.2.2",
"pytest-asyncio~=0.25.2",
"pytest-mock==3.12.0",
"pytest-benchmark~=5.2",
"pytest-timeout~=2.3.1",
"vulture>=2.14",
]
1 change: 1 addition & 0 deletions pytest.ini
@@ -3,4 +3,5 @@ timeout = 300
asyncio_default_fixture_loop_scope=function
markers =
manual: mark test as manual-only (excluded from default test runs)
benchmark: mark test as a performance benchmark (subset of manual)
asyncio_mode = auto