From 845d8002d53a10041a1ec1842479295f5be081f2 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 13:53:32 +0100 Subject: [PATCH 1/9] fix: update Grafana root URL; pass admin user/password via GitHub secrets instead of hardcoding them in plaintext --- .github/workflows/release-deploy.yml | 7 ++++++- backend/grafana/grafana.ini | 8 -------- docker-compose.yaml | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release-deploy.yml b/.github/workflows/release-deploy.yml index 30c9f168..3bb636db 100644 --- a/.github/workflows/release-deploy.yml +++ b/.github/workflows/release-deploy.yml @@ -122,11 +122,13 @@ jobs: GHCR_TOKEN: ${{ secrets.DEPLOY_GHCR_TOKEN }} GHCR_USER: ${{ github.repository_owner }} IMAGE_TAG: ${{ needs.release.outputs.version }} + GRAFANA_ADMIN_USER: ${{ secrets.GRAFANA_ADMIN_USER }} + GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }} with: host: ${{ secrets.DEPLOY_HOST }} username: ${{ secrets.DEPLOY_USER }} key: ${{ secrets.DEPLOY_SSH_KEY }} - envs: GHCR_TOKEN,GHCR_USER,IMAGE_TAG + envs: GHCR_TOKEN,GHCR_USER,IMAGE_TAG,GRAFANA_ADMIN_USER,GRAFANA_ADMIN_PASSWORD command_timeout: 10m script: | set -e @@ -138,6 +140,9 @@ jobs: export IMAGE_TAG="$IMAGE_TAG" export COMPOSE_PROFILES=observability + export GRAFANA_ROOT_URL="https://grafana.integr8scode.cc/" + export GRAFANA_ADMIN_USER="$GRAFANA_ADMIN_USER" + export GRAFANA_ADMIN_PASSWORD="$GRAFANA_ADMIN_PASSWORD" docker compose pull docker compose up -d --remove-orphans --no-build --wait --wait-timeout 180 diff --git a/backend/grafana/grafana.ini b/backend/grafana/grafana.ini index bf9130c9..b98be700 100644 --- a/backend/grafana/grafana.ini +++ b/backend/grafana/grafana.ini @@ -1,11 +1,3 @@ -[server] -root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ -serve_from_sub_path = true - -[security] -admin_user = admin -admin_password = admin123 - [users] allow_sign_up = false allow_org_create = false diff --git a/docker-compose.yaml b/docker-compose.yaml index c456492f..45d0f098 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -173,6 +173,9 @@ services: - app-network environment: - GF_LOG_LEVEL=warn + - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000/} + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123} # Kafka Infrastructure for Event-Driven Design # Certificate generator for Zookeeper/Kafka SSL From b525f70fe24e53bd701eb0a7c2b8169e386cd77b Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 13:53:52 +0100 Subject: [PATCH 2/9] fix: nginx rewrite for grafana path --- frontend/nginx.conf.template | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template index 88df162c..532977d0 100644 --- a/frontend/nginx.conf.template +++ b/frontend/nginx.conf.template @@ -94,6 +94,7 @@ server { location /grafana/ { resolver 127.0.0.11 valid=30s ipv6=off; set $grafana_upstream http://grafana:3000; + rewrite ^/grafana/(.*) /$1 break; proxy_pass $grafana_upstream; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; From fecc94ed4dd87b8469bd0100826fb7888d798d35 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 16:58:54 +0100 Subject: [PATCH 3/9] feat: add contract tests for metric names/labels between the metrics backend and Grafana dashboards; document the approach --- .../dashboards/dlq-monitoring.json | 16 +- backend/pyproject.toml | 1 + backend/tests/contract/__init__.py | 0
.../tests/contract/test_grafana_metrics.py | 178 ++++++++++++++++++ docs/testing/contract-testing.md | 70 +++++++ mkdocs.yml | 1 + 6 files changed, 258 insertions(+), 8 deletions(-) create mode 100644 backend/tests/contract/__init__.py create mode 100644 backend/tests/contract/test_grafana_metrics.py create mode 100644 docs/testing/contract-testing.md diff --git a/backend/grafana/provisioning/dashboards/dlq-monitoring.json b/backend/grafana/provisioning/dashboards/dlq-monitoring.json index 5030ade0..74e95eab 100644 --- a/backend/grafana/provisioning/dashboards/dlq-monitoring.json +++ b/backend/grafana/provisioning/dashboards/dlq-monitoring.json @@ -591,17 +591,17 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p90", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p99", "refId": "C" } @@ -678,12 +678,12 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(dlq_processing_duration_bucket[5m])) by (le, operation))", + "expr": "histogram_quantile(0.50, sum(rate(dlq_processing_duration_seconds_bucket[5m])) by (le, operation))", "legendFormat": "p50 {{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum(rate(dlq_processing_duration_bucket[5m])) by (le, operation))", + "expr": "histogram_quantile(0.95, sum(rate(dlq_processing_duration_seconds_bucket[5m])) by (le, operation))", "legendFormat": "p95 {{operation}}", "refId": "B" } @@ -755,7 +755,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "avg by (original_topic) (dlq_retry_attempts)", + "expr": "sum by (original_topic) (dlq_retry_attempts_sum) / sum by (original_topic) (dlq_retry_attempts_count)", "legendFormat": "{{original_topic}}", "refId": "A" } @@ -909,7 +909,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "sum by (original_topic) (dlq_throughput_rate)", + "expr": "sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_sum[5m])) / sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_count[5m]))", "legendFormat": "{{original_topic}}", "refId": "A" } @@ -1494,7 +1494,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "refId": "A" } ], diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b1a96720..459a718c 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -210,6 +210,7 @@ markers = [ "k8s: marks tests as requiring Kubernetes", "performance: marks tests as performance tests", "admin: marks tests as admin-only functionality tests", + "grafana_contract: marks tests as Grafana dashboard contract tests", ] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "session" diff --git a/backend/tests/contract/__init__.py b/backend/tests/contract/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/contract/test_grafana_metrics.py 
b/backend/tests/contract/test_grafana_metrics.py new file mode 100644 index 00000000..41dcec5b --- /dev/null +++ b/backend/tests/contract/test_grafana_metrics.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import importlib +import json +import os +import pkgutil +import re +from pathlib import Path +from unittest.mock import MagicMock + +import app.core.metrics as metrics_pkg +import pytest +from app.core.metrics.base import BaseMetrics +from app.core.middlewares.metrics import MetricsMiddleware, create_system_metrics +from opentelemetry import metrics as otel_api +from opentelemetry.exporter.prometheus import PrometheusMetricReader +from opentelemetry.sdk.metrics import MeterProvider + +BACKEND_ROOT = Path(__file__).resolve().parent.parent.parent +DASHBOARDS_DIR = BACKEND_ROOT / "grafana" / "provisioning" / "dashboards" + +PROMQL_BUILTINS = frozenset({ + "sum", "avg", "min", "max", "count", "stddev", "stdvar", "group", + "count_values", "topk", "bottomk", "quantile", + "by", "without", "on", "ignoring", "group_left", "group_right", "bool", + "sum_over_time", "avg_over_time", "min_over_time", "max_over_time", + "count_over_time", "stddev_over_time", "stdvar_over_time", + "last_over_time", "present_over_time", "quantile_over_time", + "rate", "irate", "increase", "delta", "idelta", "deriv", "predict_linear", + "histogram_quantile", "histogram_avg", "histogram_count", "histogram_sum", + "histogram_fraction", "histogram_stddev", "histogram_stdvar", + "holt_winters", + "changes", "resets", + "vector", "scalar", "time", "timestamp", + "absent", "absent_over_time", "sgn", + "sort", "sort_desc", "sort_by_label", "sort_by_label_desc", + "label_replace", "label_join", + "round", "ceil", "floor", "clamp", "clamp_min", "clamp_max", + "abs", "sqrt", "ln", "log2", "log10", "exp", "exp2", + "acos", "asin", "atan", "atan2", "cos", "sin", "tan", + "acosh", "asinh", "atanh", "cosh", "sinh", "tanh", + "deg", "rad", "pi", + "day_of_month", "day_of_week", "day_of_year", "days_in_month", + "hour", "minute", "month", "year", + "or", "and", "unless", + "offset", "inf", "nan", + "le", "result", "status", "type", "format", "table", "instant", +}) + + +@pytest.fixture(scope="module") +def prometheus_families() -> dict[str, set[str]]: + """Instantiate every metric class through the real OTel -> Prometheus pipeline. + + Returns: + Mapping of family name to set of sample names produced by that family. + """ + # pytest-env sets OTEL_SDK_DISABLED=true; override so the real SDK is active. + os.environ.pop("OTEL_SDK_DISABLED", None) + + reader = PrometheusMetricReader() + provider = MeterProvider(metric_readers=[reader]) + otel_api.set_meter_provider(provider) + + for _, mod_name, _ in pkgutil.iter_modules(metrics_pkg.__path__): + importlib.import_module(f"app.core.metrics.{mod_name}") + + for cls in BaseMetrics.__subclasses__(): + cls(MagicMock()) + MetricsMiddleware(MagicMock()) + create_system_metrics() + + # Trigger every synchronous instrument via the SDK meter registry. + # Duck-typed getattr dispatch — no isinstance, works for any instrument type. 
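+    # NOTE: provider._meters and meter._instrument_id_instrument are private
+    # opentelemetry-sdk attributes; revisit this walk when upgrading the SDK.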
+ for meter in provider._meters.values(): + for instrument in meter._instrument_id_instrument.values(): + for method_name in ("add", "record", "set"): + method = getattr(instrument, method_name, None) + if method is not None: + method(1) + break + + families: dict[str, set[str]] = {} + for family in reader._collector.collect(): + sample_names: set[str] = set() + for sample in family.samples: + sample_names.add(sample.name) + if sample_names: + families[family.name] = sample_names + return families + + +def _collect_exprs(obj: object, out: list[str]) -> None: + """Recursively extract ``expr`` field values from a Grafana dashboard JSON.""" + if isinstance(obj, dict): + for key, value in obj.items(): + if key == "expr" and isinstance(value, str): + out.append(value) + else: + _collect_exprs(value, out) + elif isinstance(obj, list): + for item in obj: + _collect_exprs(item, out) + + +def _extract_metric_names(expr: str) -> set[str]: + """Extract Prometheus metric names from a PromQL expression.""" + expr = re.sub(r'"[^"]*"', "", expr) + expr = re.sub(r"\{[^}]*\}", "", expr) + expr = re.sub(r"\[[^\]]*\]", "", expr) + expr = re.sub(r"\b(?:by|without)\s*\([^)]*\)", "", expr, flags=re.IGNORECASE) + tokens = re.findall(r"[a-zA-Z_:][a-zA-Z0-9_:]*", expr) + return {t for t in tokens if t.lower() not in PROMQL_BUILTINS and ("_" in t or len(t) > 3)} + + +def _dashboard_metrics() -> dict[str, set[str]]: + """Return ``{dashboard_filename: {metric_name, ...}}`` for all dashboards.""" + result: dict[str, set[str]] = {} + for path in sorted(DASHBOARDS_DIR.glob("*.json")): + data = json.loads(path.read_text()) + exprs: list[str] = [] + _collect_exprs(data, exprs) + metrics: set[str] = set() + for expr in exprs: + metrics |= _extract_metric_names(expr) + if metrics: + result[path.name] = metrics + return result + + +@pytest.mark.grafana_contract +def test_dashboard_metrics_defined_in_code( + prometheus_families: dict[str, set[str]], +) -> None: + """Every metric in Grafana dashboards must map to a Python OTel definition.""" + prom_names: set[str] = set() + for samples in prometheus_families.values(): + prom_names |= samples + + orphaned: dict[str, set[str]] = {} + for dashboard, metrics in _dashboard_metrics().items(): + missing = metrics - prom_names + if missing: + orphaned[dashboard] = missing + + if orphaned: + lines = ["Grafana dashboards reference metrics not found in code:\n"] + for dashboard, metrics in sorted(orphaned.items()): + lines.append(f" {dashboard}:") + for m in sorted(metrics): + lines.append(f" - {m}") + pytest.fail("\n".join(lines)) + + +@pytest.mark.grafana_contract +def test_code_metrics_used_in_dashboards( + prometheus_families: dict[str, set[str]], +) -> None: + """Every metric defined in Python OTel code must appear in a Grafana dashboard.""" + all_dashboard_metrics: set[str] = set() + for metrics in _dashboard_metrics().values(): + all_dashboard_metrics |= metrics + + auto_generated = {"target"} + unused: dict[str, set[str]] = {} + for family_name, sample_names in sorted(prometheus_families.items()): + if family_name in auto_generated: + continue + if not sample_names & all_dashboard_metrics: + unused[family_name] = sample_names + + if unused: + lines = ["Code defines metrics not used in any Grafana dashboard:\n"] + for family, samples in sorted(unused.items()): + lines.append(f" {family}:") + for s in sorted(samples): + lines.append(f" - {s}") + pytest.fail("\n".join(lines)) diff --git a/docs/testing/contract-testing.md b/docs/testing/contract-testing.md new file mode 100644 index 
00000000..a7e4360e --- /dev/null +++ b/docs/testing/contract-testing.md @@ -0,0 +1,70 @@ +# Contract testing + +Contract tests sit between unit tests and integration tests. They verify that two parts of the system agree on a shared +interface without actually running those parts together. In this project the main contract boundary is between Python +OTel metric definitions and Grafana dashboard JSON files — both reference the same Prometheus metric names, but neither +knows about the other at runtime. + +## Grafana metrics contract + +The test file lives at `backend/tests/contract/test_grafana_metrics.py`. It uses a real OTel-to-Prometheus export +pipeline so that metric name conversion (dots to underscores, unit suffixes, `_total` / `_bucket` / etc.) is handled by +the SDK, not hand-rolled. + +The setup works like this: a `PrometheusMetricReader` is attached to a `MeterProvider`, then every `BaseMetrics` +subclass, the `MetricsMiddleware`, and the system metrics are instantiated so that their instruments get registered in +the SDK. After that, the fixture walks `MeterProvider._meters` and triggers every synchronous instrument through +duck-typed `getattr` — it tries `add`, `record`, `set` in order, calls the first one that exists, and moves on. +Observable instruments (like the system CPU/memory gauges) fire their callbacks automatically during collection, so they +need no explicit trigger. The result is a `dict[str, set[str]]` mapping each Prometheus family name to the set of sample +names it produces. + +There are two tests that share this fixture: + +`test_dashboard_metrics_defined_in_code` is the forward check. It parses every `*.json` dashboard in +`backend/grafana/provisioning/dashboards/`, extracts `expr` fields, tokenizes the PromQL, filters out known builtins +(`rate`, `sum`, `by`, etc.), and checks that every remaining metric name exists in the Prometheus sample set. If a +dashboard references `foo_bar_total` but no Python code defines that metric, the test fails and lists the offending +dashboard and metric names. + +`test_code_metrics_used_in_dashboards` is the reverse check. It flattens every dashboard metric into one set, then +iterates over the Prometheus families from the fixture. For each family it checks whether any of its sample names appear +in that dashboard set. Because a single histogram like `execution_duration` produces `_bucket`, `_count`, `_sum`, and +`_created` samples, the dashboard only needs to reference one of them for the family to pass. The `target` family is +skipped since it's auto-generated by the OTel SDK and not something you'd panel in Grafana. If a metric family has no +dashboard coverage at all, the test fails with a list of unused families and their samples. + +## Why duck-typed instrument triggering + +The earlier version of this test had an `isinstance` chain — check for `Counter`, call `add`; check for `Histogram`, +call `record`; check for `UpDownCounter`, call `add`. This breaks silently when a new instrument type shows up (the +project already uses `ObservableGauge`, which the old code didn't handle). The current approach iterates the SDK's +internal meter registry and calls whichever method the instrument exposes. If OTel adds a new synchronous instrument +type tomorrow that has an `add` or `record` method, the test picks it up with zero changes. 
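+
+For reference, a minimal sketch of that dispatch loop (it leans on the SDK's private `_meters` and
+`_instrument_id_instrument` registries, so treat the attribute names as implementation details that may shift between
+SDK releases):
+
+```python
+for meter in provider._meters.values():
+    for instrument in meter._instrument_id_instrument.values():
+        # Try the synchronous recording methods in order; the first hit wins.
+        for method_name in ("add", "record", "set"):
+            method = getattr(instrument, method_name, None)
+            if method is not None:
+                method(1)  # the value is irrelevant; one sample suffices
+                break
+```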
+ +## Running the tests + +```bash +cd backend +uv run pytest tests/contract/test_grafana_metrics.py -v -o "addopts=" +``` + +The `-o "addopts="` override is needed because `pyproject.toml` sets `-n auto --dist=loadfile` for xdist, which +interferes with the module-scoped fixture (the OTel `MeterProvider` can only be set once per process). Running without +xdist is fine since these tests finish in under a second. + +Both tests use the `@pytest.mark.grafana_contract` marker, so you can also run them via: + +```bash +uv run pytest -m grafana_contract -o "addopts=" +``` + +## Adding a new metric + +When you add a metric to a `BaseMetrics` subclass, the forward test will keep passing (dashboards don't reference it +yet). But the reverse test will fail, telling you exactly which family has no dashboard coverage. At that point either +add a panel to an existing dashboard or create a new one. Conversely, if you add a PromQL expression to a dashboard that +references a metric that doesn't exist in code, the forward test catches it. + +The goal is to keep the two sides in sync so you don't end up with dead panels pointing at metrics that were renamed +three months ago, or metrics that nobody ever looks at because they were never wired into a dashboard. diff --git a/mkdocs.yml b/mkdocs.yml index f9aba928..150558d7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -188,6 +188,7 @@ nav: - Error Handling: frontend/error-handling.md - Testing: + - Contract Testing: testing/contract-testing.md - Load Testing: testing/load-testing.md - Frontend Testing: testing/frontend-testing.md - Kafka Test Stability: testing/kafka-test-stability.md From 22fe234ce9f868d46b5a64b6180379e0832dbd84 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:09:22 +0100 Subject: [PATCH 4/9] feat: updated metrics system --- backend/app/api/routes/auth.py | 152 +--- backend/app/core/exceptions/handlers.py | 3 + backend/app/core/metrics/__init__.py | 2 - backend/app/core/metrics/connections.py | 35 +- backend/app/core/metrics/dlq.py | 22 - backend/app/core/metrics/events.py | 89 +- backend/app/core/metrics/execution.py | 24 - backend/app/core/metrics/health.py | 109 --- backend/app/core/metrics/kubernetes.py | 38 +- backend/app/core/metrics/notifications.py | 79 +- backend/app/core/metrics/replay.py | 32 +- backend/app/core/metrics/security.py | 164 +--- backend/app/core/middlewares/csrf.py | 17 +- backend/app/core/providers.py | 27 +- backend/app/core/security.py | 7 +- backend/app/dlq/manager.py | 1 + backend/app/domain/exceptions.py | 6 + backend/app/domain/user/__init__.py | 2 + backend/app/domain/user/user_models.py | 11 + .../app/services/admin/admin_user_service.py | 4 + backend/app/services/auth_service.py | 200 +++- .../services/event_replay/replay_service.py | 7 + backend/app/services/k8s_worker/worker.py | 4 +- backend/app/services/notification_service.py | 8 +- backend/app/services/sse/sse_service.py | 3 + .../dashboards/coordinator-execution.json | 275 ++++++ .../dashboards/dlq-monitoring.json | 236 ++--- .../provisioning/dashboards/event-replay.json | 387 ++++++++ .../dashboards/event-stream-monitoring.json | 861 +++--------------- .../dashboards/http-middleware.json | 387 ++++++++ .../provisioning/dashboards/integr8scode.json | 148 ++- .../dashboards/kafka-events-monitoring.json | 239 ++--- .../dashboards/kubernetes-pods.json | 350 +++++++ .../dashboards/notifications.json | 500 ++++++++++ .../dashboards/security-auth.json | 355 ++++++++ backend/tests/unit/conftest.py | 6 - 
...est_connections_and_coordinator_metrics.py | 5 - .../metrics/test_database_and_dlq_metrics.py | 2 - .../test_execution_and_events_metrics.py | 14 - .../test_health_and_rate_limit_metrics.py | 17 - ...st_kubernetes_and_notifications_metrics.py | 14 +- .../unit/core/metrics/test_metrics_classes.py | 14 +- .../test_replay_and_security_metrics.py | 20 +- backend/tests/unit/core/test_csrf.py | 80 +- backend/tests/unit/core/test_security.py | 17 +- 45 files changed, 2942 insertions(+), 2031 deletions(-) delete mode 100644 backend/app/core/metrics/health.py create mode 100644 backend/grafana/provisioning/dashboards/coordinator-execution.json create mode 100644 backend/grafana/provisioning/dashboards/event-replay.json create mode 100644 backend/grafana/provisioning/dashboards/http-middleware.json create mode 100644 backend/grafana/provisioning/dashboards/kubernetes-pods.json create mode 100644 backend/grafana/provisioning/dashboards/notifications.json create mode 100644 backend/grafana/provisioning/dashboards/security-auth.json diff --git a/backend/app/api/routes/auth.py b/backend/app/api/routes/auth.py index 679ea939..3aeb9de7 100644 --- a/backend/app/api/routes/auth.py +++ b/backend/app/api/routes/auth.py @@ -1,16 +1,10 @@ -from datetime import timedelta - import structlog from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute -from fastapi import APIRouter, Depends, HTTPException, Request, Response +from fastapi import APIRouter, Depends, Request, Response from fastapi.security import OAuth2PasswordRequestForm -from app.core.security import SecurityService from app.core.utils import get_client_ip -from app.db.repositories import UserRepository -from app.domain.enums import UserRole -from app.domain.user import DomainUserCreate from app.schemas_pydantic.common import ErrorResponse from app.schemas_pydantic.user import ( LoginResponse, @@ -19,8 +13,6 @@ UserResponse, ) from app.services.auth_service import AuthService -from app.services.login_lockout import LoginLockoutService -from app.services.runtime_settings import RuntimeSettingsLoader router = APIRouter(prefix="/auth", tags=["authentication"], route_class=DishkaRoute) @@ -36,10 +28,7 @@ async def login( request: Request, response: Response, - user_repo: FromDishka[UserRepository], - security_service: FromDishka[SecurityService], - runtime_settings: FromDishka[RuntimeSettingsLoader], - lockout_service: FromDishka[LoginLockoutService], + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], form_data: OAuth2PasswordRequestForm = Depends(), ) -> LoginResponse: @@ -52,75 +41,18 @@ async def login( user_agent=request.headers.get("user-agent"), ) - if await lockout_service.check_locked(form_data.username): - raise HTTPException( - status_code=423, - detail="Account temporarily locked due to too many failed attempts", - ) - - user = await user_repo.get_user(form_data.username) - - if not user: - logger.warning( - "Login failed - user not found", - username=form_data.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - locked = await lockout_service.record_failed_attempt(form_data.username) - if locked: - raise HTTPException( - status_code=423, - detail="Account locked due to too many failed attempts", - ) - raise HTTPException( - status_code=401, - detail="Invalid credentials", - headers={"WWW-Authenticate": "Bearer"}, - ) - - if not security_service.verify_password(form_data.password, user.hashed_password): - logger.warning( - "Login 
failed - invalid password", - username=form_data.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - locked = await lockout_service.record_failed_attempt(form_data.username) - if locked: - raise HTTPException( - status_code=423, - detail="Account locked due to too many failed attempts", - ) - raise HTTPException( - status_code=401, - detail="Invalid credentials", - headers={"WWW-Authenticate": "Bearer"}, - ) - - await lockout_service.clear_attempts(form_data.username) - - effective = await runtime_settings.get_effective_settings() - session_timeout = effective.session_timeout_minutes - - logger.info( - "Login successful", - username=user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - token_expires_in_minutes=session_timeout, + result = await auth_service.login( + form_data.username, + form_data.password, + get_client_ip(request), + request.headers.get("user-agent"), ) - access_token_expires = timedelta(minutes=session_timeout) - access_token = security_service.create_access_token(data={"sub": user.username}, expires_delta=access_token_expires) - - csrf_token = security_service.generate_csrf_token(access_token) - # --8<-- [start:login_cookies] response.set_cookie( key="access_token", - value=access_token, - max_age=session_timeout * 60, # Convert to seconds + value=result.access_token, + max_age=result.session_timeout_minutes * 60, httponly=True, secure=True, # HTTPS only samesite="strict", # CSRF protection @@ -129,8 +61,8 @@ async def login( response.set_cookie( key="csrf_token", - value=csrf_token, - max_age=session_timeout * 60, + value=result.csrf_token, + max_age=result.session_timeout_minutes * 60, httponly=False, # JavaScript needs to read this secure=True, samesite="strict", @@ -143,9 +75,9 @@ async def login( return LoginResponse( message="Login successful", - username=user.username, - role=user.role, - csrf_token=csrf_token, + username=result.username, + role=result.role, + csrf_token=result.csrf_token, ) @@ -160,9 +92,7 @@ async def login( async def register( request: Request, user: UserCreate, - user_repo: FromDishka[UserRepository], - security_service: FromDishka[SecurityService], - runtime_settings: FromDishka[RuntimeSettingsLoader], + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], ) -> UserResponse: """Register a new user account.""" @@ -174,37 +104,12 @@ async def register( user_agent=request.headers.get("user-agent"), ) - effective = await runtime_settings.get_effective_settings() - min_len = effective.password_min_length - if len(user.password) < min_len: - raise HTTPException(status_code=400, detail=f"Password must be at least {min_len} characters") - - db_user = await user_repo.get_user(user.username) - if db_user: - logger.warning( - "Registration failed - username taken", - username=user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - raise HTTPException(status_code=409, detail="Username already registered") - - hashed_password = security_service.get_password_hash(user.password) - create_data = DomainUserCreate( - username=user.username, - email=user.email, - hashed_password=hashed_password, - role=UserRole.USER, - is_active=True, - is_superuser=False, - ) - created_user = await user_repo.create_user(create_data) - - logger.info( - "Registration successful", - username=created_user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), + 
created_user = await auth_service.register( + user.username, + user.email, + user.password, + get_client_ip(request), + request.headers.get("user-agent"), ) return UserResponse.model_validate(created_user) @@ -238,6 +143,7 @@ async def get_current_user_profile( async def logout( request: Request, response: Response, + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], ) -> MessageResponse: """Log out and clear session cookies.""" @@ -248,17 +154,11 @@ async def logout( user_agent=request.headers.get("user-agent"), ) - # Clear the httpOnly cookie - response.delete_cookie( - key="access_token", - path="/", - ) + token = request.cookies.get("access_token") + await auth_service.publish_logout_event(token) - # Clear the CSRF cookie - response.delete_cookie( - key="csrf_token", - path="/", - ) + response.delete_cookie(key="access_token", path="/") + response.delete_cookie(key="csrf_token", path="/") logger.info( "Logout successful", diff --git a/backend/app/core/exceptions/handlers.py b/backend/app/core/exceptions/handlers.py index 94cc6437..7453817e 100644 --- a/backend/app/core/exceptions/handlers.py +++ b/backend/app/core/exceptions/handlers.py @@ -2,6 +2,7 @@ from fastapi.responses import JSONResponse from app.domain.exceptions import ( + AccountLockedError, ConflictError, DomainError, ForbiddenError, @@ -40,6 +41,8 @@ def _map_to_status_code(exc: DomainError) -> int: return 403 if isinstance(exc, InvalidStateError): return 400 + if isinstance(exc, AccountLockedError): + return 423 if isinstance(exc, InfrastructureError): return 500 return 500 diff --git a/backend/app/core/metrics/__init__.py b/backend/app/core/metrics/__init__.py index 77d1687d..497229ec 100644 --- a/backend/app/core/metrics/__init__.py +++ b/backend/app/core/metrics/__init__.py @@ -5,7 +5,6 @@ from app.core.metrics.dlq import DLQMetrics from app.core.metrics.events import EventMetrics from app.core.metrics.execution import ExecutionMetrics -from app.core.metrics.health import HealthMetrics from app.core.metrics.kubernetes import KubernetesMetrics from app.core.metrics.notifications import NotificationMetrics from app.core.metrics.rate_limit import RateLimitMetrics @@ -20,7 +19,6 @@ "DLQMetrics", "EventMetrics", "ExecutionMetrics", - "HealthMetrics", "KubernetesMetrics", "NotificationMetrics", "RateLimitMetrics", diff --git a/backend/app/core/metrics/connections.py b/backend/app/core/metrics/connections.py index 3ca3c04f..aa40143a 100644 --- a/backend/app/core/metrics/connections.py +++ b/backend/app/core/metrics/connections.py @@ -2,7 +2,7 @@ class ConnectionMetrics(BaseMetrics): - """Metrics for SSE connections and event bus.""" + """Metrics for SSE connections.""" def _create_instruments(self) -> None: self.sse_active_connections = self._meter.create_up_down_counter( @@ -23,19 +23,6 @@ def _create_instruments(self) -> None: unit="1", ) - self.sse_shutdown_duration = self._meter.create_histogram( - name="sse.shutdown.duration", description="Time taken for SSE shutdown phases in seconds", unit="s" - ) - - # Event bus metrics - self.event_bus_subscribers = self._meter.create_up_down_counter( - name="event.bus.subscribers", description="Number of active event bus subscribers by pattern", unit="1" - ) - - self.event_bus_subscriptions = self._meter.create_up_down_counter( - name="event.bus.subscriptions.total", description="Total number of event bus subscriptions", unit="1" - ) - def increment_sse_connections(self, endpoint: str = "default") -> None: self.sse_active_connections.add(1, 
attributes={"endpoint": endpoint}) @@ -50,23 +37,3 @@ def record_sse_connection_duration(self, duration_seconds: float, endpoint: str) def update_sse_draining_connections(self, delta: int) -> None: self.sse_draining_connections.add(delta) - - def record_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) - - def update_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) - - def increment_event_bus_subscriptions(self) -> None: - self.event_bus_subscriptions.add(1) - - def decrement_event_bus_subscriptions(self, count: int = 1) -> None: - self.event_bus_subscriptions.add(-count) - - def update_event_bus_subscribers(self, count: int, pattern: str) -> None: - """Update the count of event bus subscribers for a specific pattern.""" - # This tracks the current number of subscribers for a pattern - # We need to track the delta from the previous value - # Since we can't store state in metrics, we record the absolute value - # The metric system will handle the up/down nature - self.event_bus_subscribers.add(count, attributes={"pattern": pattern}) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index be8b988e..18383cdc 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -5,7 +5,6 @@ class DLQMetrics(BaseMetrics): """Metrics for Dead Letter Queue operations.""" def _create_instruments(self) -> None: - # DLQ message metrics self.dlq_messages_received = self._meter.create_counter( name="dlq.messages.received.total", description="Total number of messages received in DLQ", unit="1" ) @@ -18,7 +17,6 @@ def _create_instruments(self) -> None: name="dlq.messages.discarded.total", description="Total number of DLQ messages discarded", unit="1" ) - # DLQ processing metrics self.dlq_processing_duration = self._meter.create_histogram( name="dlq.processing.duration", description="Time spent processing DLQ messages in seconds", unit="s" ) @@ -27,25 +25,14 @@ def _create_instruments(self) -> None: name="dlq.message.age", description="Age of messages in DLQ in seconds", unit="s" ) - # DLQ queue metrics self.dlq_queue_size = self._meter.create_up_down_counter( name="dlq.queue.size", description="Current size of DLQ by topic", unit="1" ) - self.dlq_retry_attempts = self._meter.create_histogram( - name="dlq.retry.attempts", description="Number of retry attempts for DLQ messages", unit="1" - ) - - # DLQ error metrics self.dlq_processing_errors = self._meter.create_counter( name="dlq.processing.errors.total", description="Total number of DLQ processing errors", unit="1" ) - # DLQ throughput metrics - self.dlq_throughput_rate = self._meter.create_histogram( - name="dlq.throughput.rate", description="Messages processed per second from DLQ", unit="msg/s" - ) - def record_dlq_message_received(self, original_topic: str, event_type: str) -> None: self.dlq_messages_received.add(1, attributes={"original_topic": original_topic, "event_type": event_type}) @@ -63,7 +50,6 @@ def record_dlq_processing_duration(self, duration_seconds: float, operation: str self.dlq_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_dlq_queue_size(self, original_topic: str, size: int) -> None: - # Track the delta for gauge-like behavior key = f"_dlq_size_{original_topic}" current_val = getattr(self, key, 0) delta = size - current_val @@ -74,19 
+60,11 @@ def update_dlq_queue_size(self, original_topic: str, size: int) -> None: def record_dlq_message_age(self, age_seconds: float) -> None: self.dlq_message_age.record(age_seconds) - def record_dlq_retry_attempt(self, original_topic: str, event_type: str, attempt_number: int) -> None: - self.dlq_retry_attempts.record( - attempt_number, attributes={"original_topic": original_topic, "event_type": event_type} - ) - def record_dlq_processing_error(self, original_topic: str, event_type: str, error_type: str) -> None: self.dlq_processing_errors.add( 1, attributes={"original_topic": original_topic, "event_type": event_type, "error_type": error_type} ) - def record_dlq_throughput(self, messages_per_second: float, original_topic: str) -> None: - self.dlq_throughput_rate.record(messages_per_second, attributes={"original_topic": original_topic}) - def increment_dlq_queue_size(self, original_topic: str) -> None: self.dlq_queue_size.add(1, attributes={"original_topic": original_topic}) diff --git a/backend/app/core/metrics/events.py b/backend/app/core/metrics/events.py index bd417078..f5dbdf49 100644 --- a/backend/app/core/metrics/events.py +++ b/backend/app/core/metrics/events.py @@ -2,20 +2,7 @@ class EventMetrics(BaseMetrics): - """Metrics for event processing and Kafka. - - This class tracks metrics related to event processing, event buffers, - and Kafka message production/consumption. Metrics are provided via - dependency injection (DI) through the MetricsProvider. - - Usage (via DI): - class MyService: - def __init__(self, event_metrics: EventMetrics): - self.metrics = event_metrics - - def my_method(self): - self.metrics.record_event_published("execution.requested") - """ + """Metrics for event processing and Kafka.""" def _create_instruments(self) -> None: # Core event metrics @@ -36,43 +23,11 @@ def _create_instruments(self) -> None: name="event.bus.queue.size", description="Size of event bus message queue", unit="1" ) - # Pod event metrics - self.pod_event_published = self._meter.create_counter( - name="pod.events.published.total", description="Total number of pod events published", unit="1" - ) - # Event replay metrics self.event_replay_operations = self._meter.create_counter( name="event.replay.operations.total", description="Total number of event replay operations", unit="1" ) - # Event buffer metrics - self.event_buffer_size = self._meter.create_up_down_counter( - name="event.buffer.size", description="Current number of events in buffer", unit="1" - ) - - self.event_buffer_dropped = self._meter.create_counter( - name="event.buffer.dropped.total", description="Total number of events dropped from buffer", unit="1" - ) - - self.event_buffer_processed = self._meter.create_counter( - name="event.buffer.processed.total", description="Total number of events processed from buffer", unit="1" - ) - - self.event_buffer_latency = self._meter.create_histogram( - name="event.buffer.latency", description="Time between event creation and processing in seconds", unit="s" - ) - - self.event_buffer_backpressure = self._meter.create_up_down_counter( - name="event.buffer.backpressure.active", - description="Whether backpressure is currently active (1=active, 0=inactive)", - unit="1", - ) - - self.event_buffer_memory_usage = self._meter.create_histogram( - name="event.buffer.memory.usage", description="Memory usage of event buffer in MB", unit="MB" - ) - # Kafka-specific metrics self.kafka_messages_produced = self._meter.create_counter( name="kafka.messages.produced.total", description="Total number of 
messages produced to Kafka", unit="1" @@ -82,10 +37,6 @@ def _create_instruments(self) -> None: name="kafka.messages.consumed.total", description="Total number of messages consumed from Kafka", unit="1" ) - self.kafka_consumer_lag = self._meter.create_histogram( - name="kafka.consumer.lag", description="Consumer lag in number of messages", unit="1" - ) - self.kafka_production_errors = self._meter.create_counter( name="kafka.production.errors.total", description="Total number of Kafka production errors", unit="1" ) @@ -95,15 +46,7 @@ def _create_instruments(self) -> None: ) def record_event_published(self, event_type: str, event_category: str | None = None) -> None: - """ - Record that an event was published. - - Args: - event_type: Full event type (e.g., "execution.requested") - event_category: Event category (e.g., "execution"). If None, extracted from event_type. - """ if event_category is None: - # Extract category from event type (e.g., "execution" from "execution.requested") event_category = event_type.split(".")[0] if "." in event_type else event_type self.event_published.add(1, attributes={"event_type": event_type, "event_category": event_category}) @@ -111,31 +54,9 @@ def record_event_published(self, event_type: str, event_category: str | None = N def record_event_processing_duration(self, duration_seconds: float, event_type: str) -> None: self.event_processing_duration.record(duration_seconds, attributes={"event_type": event_type}) - def record_pod_event_published(self, event_type: str) -> None: - self.pod_event_published.add(1, attributes={"event_type": event_type}) - def record_event_replay_operation(self, operation: str, status: str) -> None: self.event_replay_operations.add(1, attributes={"operation": operation, "status": status}) - def update_event_buffer_size(self, delta: int) -> None: - self.event_buffer_size.add(delta) - - def record_event_buffer_dropped(self) -> None: - self.event_buffer_dropped.add(1) - - def record_event_buffer_processed(self) -> None: - self.event_buffer_processed.add(1) - - def record_event_buffer_latency(self, latency_seconds: float) -> None: - self.event_buffer_latency.record(latency_seconds) - - def set_event_buffer_backpressure(self, active: bool) -> None: - self.event_buffer_backpressure.add(-1 if not active else 0) - self.event_buffer_backpressure.add(1 if active else 0) - - def record_event_buffer_memory_usage(self, memory_mb: float) -> None: - self.event_buffer_memory_usage.record(memory_mb) - def record_event_stored(self, event_type: str, collection: str) -> None: self.event_published.add(1, attributes={"event_type": event_type, "aggregate_type": collection}) @@ -153,17 +74,14 @@ def record_events_processing_failed( ) def record_event_store_duration(self, duration: float, operation: str, collection: str) -> None: - """Record event store operation duration.""" self.event_processing_duration.record(duration, attributes={"operation": operation, "collection": collection}) def record_event_store_failed(self, event_type: str, error_type: str) -> None: - """Record event store failure.""" self.event_processing_errors.add( 1, attributes={"event_type": event_type, "error_type": error_type, "operation": "store"} ) def record_event_query_duration(self, duration: float, query_type: str, collection: str) -> None: - """Record event query duration.""" self.event_processing_duration.record( duration, attributes={"operation": f"query_{query_type}", "collection": collection} ) @@ -183,11 +101,6 @@ def record_kafka_message_produced(self, topic: str, partition: 
int = -1) -> None def record_kafka_message_consumed(self, topic: str, consumer_group: str) -> None: self.kafka_messages_consumed.add(1, attributes={"topic": topic, "consumer_group": consumer_group}) - def record_kafka_consumer_lag(self, lag: int, topic: str, consumer_group: str, partition: int) -> None: - self.kafka_consumer_lag.record( - lag, attributes={"topic": topic, "consumer_group": consumer_group, "partition": str(partition)} - ) - def record_kafka_production_error(self, topic: str, error_type: str) -> None: self.kafka_production_errors.add(1, attributes={"topic": topic, "error_type": error_type}) diff --git a/backend/app/core/metrics/execution.py b/backend/app/core/metrics/execution.py index f033447b..adb96dab 100644 --- a/backend/app/core/metrics/execution.py +++ b/backend/app/core/metrics/execution.py @@ -10,12 +10,6 @@ def _create_instruments(self) -> None: name="script.executions.total", description="Total number of script executions", unit="1" ) - self.execution_events = self._meter.create_observable_gauge( - name="script.execution.events", - description="Instantaneous execution events (1 when execution starts, 0 otherwise)", - unit="1", - ) - self.execution_duration = self._meter.create_histogram( name="script.execution.duration", description="Time spent executing scripts in seconds", unit="s" ) @@ -28,12 +22,6 @@ def _create_instruments(self) -> None: name="script.memory.usage", description="Memory usage per script execution in MiB", unit="MiB" ) - self.cpu_utilization = self._meter.create_histogram( - name="script.cpu.utilization", - description="CPU utilization in millicores per script execution", - unit="millicores", - ) - self.memory_utilization_percent = self._meter.create_histogram( name="script.memory.utilization.percent", description="Memory utilization as percentage of available memory", @@ -94,15 +82,3 @@ def record_execution_queued(self) -> None: def record_execution_scheduled(self) -> None: self.executions_assigned.add(1) - - def update_cpu_available(self, cores: float) -> None: - self.cpu_utilization.record(cores) - - def update_memory_available(self, memory_mb: float) -> None: - self.memory_usage.record(memory_mb, attributes={"lang_and_version": "resource_manager"}) - - def update_gpu_available(self, count: int) -> None: - self.cpu_utilization.record(count, attributes={"resource": "gpu"}) - - def update_allocations_active(self, count: int) -> None: - self.memory_utilization_percent.record(count, attributes={"metric": "allocations"}) diff --git a/backend/app/core/metrics/health.py b/backend/app/core/metrics/health.py deleted file mode 100644 index eb26af27..00000000 --- a/backend/app/core/metrics/health.py +++ /dev/null @@ -1,109 +0,0 @@ -from app.core.metrics.base import BaseMetrics - - -class HealthMetrics(BaseMetrics): - """Metrics for health checks.""" - - def _create_instruments(self) -> None: - # Core health check metrics - simple histogram to track latest value - self.health_check_status = self._meter.create_histogram( - name="health.check.status", description="Health check status (1=healthy, 0=unhealthy)", unit="1" - ) - - self.health_check_duration = self._meter.create_histogram( - name="health.check.duration", description="Time taken to perform health check in seconds", unit="s" - ) - - self.health_check_failures = self._meter.create_counter( - name="health.check.failures.total", description="Total number of health check failures", unit="1" - ) - - # Service health metrics - self.service_health_status = self._meter.create_histogram( - 
name="service.health.status", description="Service health status by service name", unit="1" - ) - - self.service_health_score = self._meter.create_histogram( - name="service.health.score", description="Overall health score for a service (0-100)", unit="%" - ) - - # Liveness and readiness specific metrics - self.liveness_check_status = self._meter.create_histogram( - name="liveness.check.status", description="Liveness check status (1=alive, 0=dead)", unit="1" - ) - - self.readiness_check_status = self._meter.create_histogram( - name="readiness.check.status", description="Readiness check status (1=ready, 0=not ready)", unit="1" - ) - - # Dependency health metrics - self.dependency_health_status = self._meter.create_histogram( - name="dependency.health.status", description="Health status of external dependencies", unit="1" - ) - - self.dependency_response_time = self._meter.create_histogram( - name="dependency.response.time", description="Response time for dependency health checks", unit="s" - ) - - # Health check execution metrics - self.health_checks_executed = self._meter.create_counter( - name="health.checks.executed.total", description="Total number of health checks executed", unit="1" - ) - - self.health_check_timeouts = self._meter.create_counter( - name="health.check.timeouts.total", description="Total number of health check timeouts", unit="1" - ) - - # Component health metrics - self.component_health_status = self._meter.create_histogram( - name="component.health.status", description="Health status of system components", unit="1" - ) - - def record_health_check_duration(self, duration_seconds: float, check_type: str, check_name: str) -> None: - self.health_check_duration.record( - duration_seconds, attributes={"check_type": check_type, "check_name": check_name} - ) - - # Also increment execution counter - self.health_checks_executed.add(1, attributes={"check_type": check_type, "check_name": check_name}) - - def record_health_check_failure(self, check_type: str, check_name: str, failure_type: str) -> None: - self.health_check_failures.add( - 1, attributes={"check_type": check_type, "check_name": check_name, "failure_type": failure_type} - ) - - def update_health_check_status(self, status_value: int, check_type: str, check_name: str) -> None: - # Just record the current status value - self.health_check_status.record(status_value, attributes={"check_type": check_type, "check_name": check_name}) - - def record_health_status(self, service_name: str, status: str) -> None: - # Map status to numeric value - status_value = 1 if status.lower() in ["healthy", "ok", "up"] else 0 - # Record the current status - self.service_health_status.record(status_value, attributes={"service": service_name}) - - def record_service_health_score(self, service_name: str, score: float) -> None: - self.service_health_score.record(score, attributes={"service": service_name}) - - def update_liveness_status(self, is_alive: bool, component: str = "default") -> None: - status_value = 1 if is_alive else 0 - self.liveness_check_status.record(status_value, attributes={"component": component}) - - def update_readiness_status(self, is_ready: bool, component: str = "default") -> None: - status_value = 1 if is_ready else 0 - self.readiness_check_status.record(status_value, attributes={"component": component}) - - def record_dependency_health(self, dependency_name: str, is_healthy: bool, response_time: float) -> None: - # Update health status - status_value = 1 if is_healthy else 0 - 
self.dependency_health_status.record(status_value, attributes={"dependency": dependency_name}) - - # Record response time - self.dependency_response_time.record(response_time, attributes={"dependency": dependency_name}) - - def record_health_check_timeout(self, check_type: str, check_name: str) -> None: - self.health_check_timeouts.add(1, attributes={"check_type": check_type, "check_name": check_name}) - - def update_component_health(self, component_name: str, is_healthy: bool) -> None: - status_value = 1 if is_healthy else 0 - self.component_health_status.record(status_value, attributes={"component": component_name}) diff --git a/backend/app/core/metrics/kubernetes.py b/backend/app/core/metrics/kubernetes.py index 06d45bec..cb1f2be1 100644 --- a/backend/app/core/metrics/kubernetes.py +++ b/backend/app/core/metrics/kubernetes.py @@ -35,15 +35,11 @@ def _create_instruments(self) -> None: name="pods.by.phase", description="Current number of pods by phase", unit="1" ) - # ConfigMap and NetworkPolicy metrics + # ConfigMap metrics self.config_maps_created = self._meter.create_counter( name="configmaps.created.total", description="Total number of ConfigMaps created", unit="1" ) - self.network_policies_created = self._meter.create_counter( - name="networkpolicies.created.total", description="Total number of NetworkPolicies created", unit="1" - ) - # Pod monitor metrics self.pod_monitor_events = self._meter.create_counter( name="pod.monitor.events.total", description="Total number of pod monitor events", unit="1" @@ -75,20 +71,6 @@ def _create_instruments(self) -> None: name="pods.monitored", description="Number of pods currently being monitored", unit="1" ) - # Resource metrics - self.pod_resource_requests = self._meter.create_histogram( - name="pod.resource.requests", description="Pod resource requests", unit="1" - ) - - self.pod_resource_limits = self._meter.create_histogram( - name="pod.resource.limits", description="Pod resource limits", unit="1" - ) - - # Node metrics - self.pods_per_node = self._meter.create_histogram( - name="pods.per.node", description="Number of pods per node", unit="1" - ) - def record_pod_creation_failure(self, failure_reason: str) -> None: self.pod_creation_failures.add(1, attributes={"failure_reason": failure_reason}) @@ -99,7 +81,6 @@ def record_pod_creation_duration(self, duration_seconds: float, language: str) - self.pod_creation_duration.record(duration_seconds, attributes={"language": language}) def update_active_pod_creations(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_active_pod_creations" current_val = getattr(self, key, 0) delta = count - current_val @@ -125,12 +106,6 @@ def record_k8s_pod_creation_duration(self, duration_seconds: float, language: st def record_k8s_config_map_created(self, status: str) -> None: self.record_config_map_created(status) - def record_k8s_network_policy_created(self, status: str) -> None: - self.network_policies_created.add(1, attributes={"status": status}) - - def update_k8s_active_creations(self, count: int) -> None: - self.update_active_pod_creations(count) - def increment_pod_monitor_watch_reconnects(self) -> None: self.pod_monitor_watch_reconnects.add(1) @@ -147,7 +122,6 @@ def record_pod_monitor_watch_error(self, error_type: str) -> None: self.pod_monitor_watch_errors.add(1, attributes={"error_type": error_type}) def update_pod_monitor_pods_watched(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_pods_monitored" current_val = getattr(self, key, 0) delta = count 
- current_val @@ -164,19 +138,9 @@ def record_pod_lifetime(self, lifetime_seconds: float, final_phase: str, languag self.pod_lifetime.record(lifetime_seconds, attributes={"final_phase": final_phase, "language": language}) def update_pods_by_phase(self, phase: str, count: int) -> None: - # Track the delta for gauge-like behavior key = f"_pods_phase_{phase}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: self.pods_by_phase.add(delta, attributes={"phase": phase}) setattr(self, key, count) - - def record_pod_resource_request(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_requests.record(value, attributes={"resource_type": resource_type, "language": language}) - - def record_pod_resource_limit(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_limits.record(value, attributes={"resource_type": resource_type, "language": language}) - - def record_pods_per_node(self, node_name: str, pod_count: int) -> None: - self.pods_per_node.record(pod_count, attributes={"node_name": node_name}) diff --git a/backend/app/core/metrics/notifications.py b/backend/app/core/metrics/notifications.py index 1610e659..13081829 100644 --- a/backend/app/core/metrics/notifications.py +++ b/backend/app/core/metrics/notifications.py @@ -54,14 +54,6 @@ def _create_instruments(self) -> None: name="notifications.read.total", description="Total notifications read by users", unit="1" ) - self.notifications_clicked = self._meter.create_counter( - name="notifications.clicked.total", description="Total notifications clicked by users", unit="1" - ) - - self.time_to_read = self._meter.create_histogram( - name="notification.time.to.read", description="Time between notification sent and read in seconds", unit="s" - ) - self.unread_count = self._meter.create_up_down_counter( name="notifications.unread.count", description="Current unread notifications per user", unit="1" ) @@ -86,32 +78,6 @@ def _create_instruments(self) -> None: name="notification.retry.success.rate", description="Success rate of retried notifications", unit="%" ) - # Batch processing metrics - self.batch_notifications_processed = self._meter.create_counter( - name="notification.batch.processed.total", description="Total notifications processed in batches", unit="1" - ) - - self.batch_processing_time = self._meter.create_histogram( - name="notification.batch.processing.time", - description="Time to process notification batch in seconds", - unit="s", - ) - - self.batch_size = self._meter.create_histogram( - name="notification.batch.size", description="Size of notification batches", unit="1" - ) - - # Template rendering metrics - self.template_render_time = self._meter.create_histogram( - name="notification.template.render.time", - description="Time to render notification template in seconds", - unit="s", - ) - - self.template_render_errors = self._meter.create_counter( - name="notification.template.render.errors.total", description="Total template rendering errors", unit="1" - ) - # Webhook-specific metrics self.webhook_delivery_time = self._meter.create_histogram( name="notification.webhook.delivery.time", @@ -149,21 +115,17 @@ def record_notification_sent( self, notification_type: str, channel: str = "in_app", severity: str = "medium" ) -> None: self.notifications_sent.add(1, attributes={"category": notification_type}) - self.notifications_by_channel.add(1, attributes={"channel": channel, "category": notification_type}) - self.notifications_by_severity.add(1, 
attributes={"severity": severity, "category": notification_type}) def record_notification_failed(self, notification_type: str, error: str, channel: str = "in_app") -> None: self.notifications_failed.add(1, attributes={"category": notification_type, "error": error}) - self.channel_failures.add(1, attributes={"channel": channel, "error": error}) def record_notification_delivery_time( self, duration_seconds: float, notification_type: str, channel: str = "in_app" ) -> None: self.notification_delivery_time.record(duration_seconds, attributes={"category": notification_type}) - self.channel_delivery_time.record( duration_seconds, attributes={"channel": channel, "category": notification_type} ) @@ -171,7 +133,6 @@ def record_notification_delivery_time( def record_notification_status_change(self, notification_id: str, from_status: str, to_status: str) -> None: self.notification_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) - # Update pending/queued counters if from_status == "pending": self.notifications_pending.add(-1) if to_status == "pending": @@ -182,22 +143,11 @@ def record_notification_status_change(self, notification_id: str, from_status: s if to_status == "queued": self.notifications_queued.add(1) - def record_notification_read(self, notification_type: str, time_to_read_seconds: float) -> None: + def record_notification_read(self, notification_type: str) -> None: self.notifications_read.add(1, attributes={"category": notification_type}) - self.time_to_read.record(time_to_read_seconds, attributes={"category": notification_type}) - - def record_notification_clicked(self, notification_type: str) -> None: - self.notifications_clicked.add(1, attributes={"category": notification_type}) - - def update_unread_count(self, user_id: str, count: int) -> None: - # Track the delta for gauge-like behavior - key = f"_unread_{user_id}" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.unread_count.add(delta, attributes={"user_id": user_id}) - setattr(self, key, count) + def decrement_unread_count(self, user_id: str) -> None: + self.unread_count.add(-1, attributes={"user_id": user_id}) def record_notification_throttled(self, notification_type: str, user_id: str) -> None: self.notifications_throttled.add(1, attributes={"category": notification_type, "user_id": user_id}) @@ -210,31 +160,13 @@ def record_notification_retry(self, notification_type: str, attempt_number: int, 1, attributes={"category": notification_type, "attempt": str(attempt_number), "success": str(success)} ) - if attempt_number > 1: # Only record retry success rate for actual retries + if attempt_number > 1: self.retry_success_rate.record(100.0 if success else 0.0, attributes={"category": notification_type}) - def record_batch_processed( - self, batch_size_count: int, processing_time_seconds: float, notification_type: str = "mixed" - ) -> None: - self.batch_notifications_processed.add(batch_size_count, attributes={"category": notification_type}) - - self.batch_processing_time.record(processing_time_seconds, attributes={"category": notification_type}) - - self.batch_size.record(batch_size_count, attributes={"category": notification_type}) - - def record_template_render(self, duration_seconds: float, template_name: str, success: bool) -> None: - self.template_render_time.record( - duration_seconds, attributes={"template": template_name, "success": str(success)} - ) - - if not success: - self.template_render_errors.add(1, attributes={"template": template_name}) - 
def record_webhook_delivery(self, duration_seconds: float, status_code: int, url_pattern: str) -> None: self.webhook_delivery_time.record( duration_seconds, attributes={"status_code": str(status_code), "url_pattern": url_pattern} ) - self.webhook_response_status.add(1, attributes={"status_code": str(status_code), "url_pattern": url_pattern}) def record_slack_delivery( @@ -246,7 +178,6 @@ def record_slack_delivery( self.slack_api_errors.add(1, attributes={"error_type": error_type, "channel": channel}) def update_active_subscriptions(self, user_id: str, count: int) -> None: - # Track the delta for gauge-like behavior key = f"_subscriptions_{user_id}" current_val = getattr(self, key, 0) delta = count - current_val @@ -260,7 +191,7 @@ def record_subscription_change(self, user_id: str, notification_type: str, actio attributes={ "user_id": user_id, "category": notification_type, - "action": action, # "subscribe" or "unsubscribe" + "action": action, }, ) diff --git a/backend/app/core/metrics/replay.py b/backend/app/core/metrics/replay.py index fc5beae9..24acd4b2 100644 --- a/backend/app/core/metrics/replay.py +++ b/backend/app/core/metrics/replay.py @@ -72,20 +72,7 @@ def _create_instruments(self) -> None: name="replay.delay.applied", description="Delay applied between replay events in seconds", unit="s" ) - # Filter metrics - self.replay_events_filtered = self._meter.create_counter( - name="replay.events.filtered.total", description="Total events filtered during replay", unit="1" - ) - - self.replay_filter_effectiveness = self._meter.create_histogram( - name="replay.filter.effectiveness", description="Percentage of events passing filters", unit="%" - ) - - # Memory and resource metrics - self.replay_memory_usage = self._meter.create_histogram( - name="replay.memory.usage", description="Memory usage during replay in MB", unit="MB" - ) - + # Queue metrics self.replay_queue_size = self._meter.create_up_down_counter( name="replay.queue.size", description="Size of replay event queue", unit="1" ) @@ -94,7 +81,6 @@ def record_session_created(self, replay_type: str, target: str) -> None: self.replay_sessions_created.add(1, attributes={"replay_type": replay_type, "target": target}) def update_active_replays(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_active_replays" current_val = getattr(self, key, 0) delta = count - current_val @@ -122,7 +108,6 @@ def record_event_replayed(self, replay_type: str, event_type: str, status: str) def record_replay_duration(self, duration_seconds: float, replay_type: str, total_events: int = 0) -> None: self.replay_duration.record(duration_seconds, attributes={"replay_type": replay_type}) - # Calculate and record throughput if events were processed if total_events > 0 and duration_seconds > 0: throughput = total_events / duration_seconds self.replay_throughput.record(throughput, attributes={"replay_type": replay_type}) @@ -135,8 +120,6 @@ def record_replay_error(self, error_type: str, replay_type: str = "unknown") -> def record_status_change(self, session_id: str, from_status: str, to_status: str) -> None: self.replay_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) - - # Update sessions by status self.update_sessions_by_status(from_status, -1) self.update_sessions_by_status(to_status, 1) @@ -146,7 +129,6 @@ def update_sessions_by_status(self, status: str, delta: int) -> None: def record_replay_by_target(self, target: str, success: bool) -> None: self.replay_by_target.add(1, attributes={"target": 
target, "success": str(success)}) - if not success: self.replay_target_errors.add(1, attributes={"target": target}) @@ -159,19 +141,7 @@ def record_delay_applied(self, delay_seconds: float) -> None: def record_batch_size(self, size: int, replay_type: str) -> None: self.replay_batch_size.record(size, attributes={"replay_type": replay_type}) - def record_events_filtered(self, filter_type: str, count: int) -> None: - self.replay_events_filtered.add(count, attributes={"filter_type": filter_type}) - - def record_filter_effectiveness(self, passed: int, total: int, filter_type: str) -> None: - if total > 0: - effectiveness = (passed / total) * 100 - self.replay_filter_effectiveness.record(effectiveness, attributes={"filter_type": filter_type}) - - def record_replay_memory_usage(self, memory_mb: float, session_id: str) -> None: - self.replay_memory_usage.record(memory_mb, attributes={"session_id": session_id}) - def update_replay_queue_size(self, session_id: str, size: int) -> None: - # Track the delta for gauge-like behavior key = f"_queue_{session_id}" current_val = getattr(self, key, 0) delta = size - current_val diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index c89229c3..589fb1c8 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -5,19 +5,6 @@ class SecurityMetrics(BaseMetrics): """Metrics for security events.""" def _create_instruments(self) -> None: - # Core security event metrics - self.security_events = self._meter.create_counter( - name="security.events.total", description="Total number of security events by type", unit="1" - ) - - self.security_violations = self._meter.create_counter( - name="security.violations.total", description="Total number of security violations", unit="1" - ) - - self.security_alerts = self._meter.create_counter( - name="security.alerts.total", description="Total number of security alerts raised", unit="1" - ) - # Authentication metrics self.authentication_attempts = self._meter.create_counter( name="authentication.attempts.total", description="Total number of authentication attempts", unit="1" @@ -40,10 +27,6 @@ def _create_instruments(self) -> None: name="tokens.generated.total", description="Total number of tokens generated", unit="1" ) - self.tokens_refreshed = self._meter.create_counter( - name="tokens.refreshed.total", description="Total number of tokens refreshed", unit="1" - ) - self.tokens_revoked = self._meter.create_counter( name="tokens.revoked.total", description="Total number of tokens revoked", unit="1" ) @@ -65,10 +48,6 @@ def _create_instruments(self) -> None: name="authorization.denials.total", description="Total number of authorization denials", unit="1" ) - self.permission_checks = self._meter.create_counter( - name="permission.checks.total", description="Total number of permission checks", unit="1" - ) - # CSRF protection metrics self.csrf_tokens_generated = self._meter.create_counter( name="csrf.tokens.generated.total", description="Total number of CSRF tokens generated", unit="1" @@ -78,60 +57,7 @@ def _create_instruments(self) -> None: name="csrf.validation.failures.total", description="Total number of CSRF validation failures", unit="1" ) - # Network security metrics - self.network_policy_violations = self._meter.create_counter( - name="network.policy.violations.total", description="Total number of network policy violations", unit="1" - ) - - self.network_policy_created = self._meter.create_counter( - name="network.policies.created.total", 
description="Total number of network policies created", unit="1" - ) - - # Privilege escalation metrics - self.privilege_escalation_attempts = self._meter.create_counter( - name="privilege.escalation.attempts.total", - description="Total number of privilege escalation attempts", - unit="1", - ) - - self.privilege_escalation_blocked = self._meter.create_counter( - name="privilege.escalation.blocked.total", - description="Total number of blocked privilege escalation attempts", - unit="1", - ) - - # Rate limiting metrics - self.rate_limit_hits = self._meter.create_counter( - name="rate.limit.hits.total", description="Total number of rate limit hits", unit="1" - ) - - self.rate_limit_violations = self._meter.create_counter( - name="rate.limit.violations.total", description="Total number of rate limit violations", unit="1" - ) - - # API key metrics - self.api_keys_created = self._meter.create_counter( - name="api.keys.created.total", description="Total number of API keys created", unit="1" - ) - - self.api_keys_revoked = self._meter.create_counter( - name="api.keys.revoked.total", description="Total number of API keys revoked", unit="1" - ) - - self.api_key_usage = self._meter.create_counter( - name="api.key.usage.total", description="Total API key usage", unit="1" - ) - - # Audit log metrics - self.audit_events_logged = self._meter.create_counter( - name="audit.events.logged.total", description="Total number of audit events logged", unit="1" - ) - # Password metrics - self.password_changes = self._meter.create_counter( - name="password.changes.total", description="Total number of password changes", unit="1" - ) - self.password_reset_requests = self._meter.create_counter( name="password.reset.requests.total", description="Total number of password reset requests", unit="1" ) @@ -149,24 +75,6 @@ def _create_instruments(self) -> None: name="accounts.locked.total", description="Total number of accounts locked due to security", unit="1" ) - def record_security_event(self, event_type: str, severity: str = "info", source: str = "system") -> None: - self.security_events.add(1, attributes={"event_type": event_type, "severity": severity, "source": source}) - - if severity in ["critical", "high"]: - self.security_alerts.add(1, attributes={"event_type": event_type, "severity": severity}) - - def record_security_violation( - self, violation_type: str, user_id: str | None = None, ip_address: str | None = None - ) -> None: - self.security_violations.add( - 1, - attributes={ - "violation_type": violation_type, - "user_id": user_id or "anonymous", - "ip_address": ip_address or "unknown", - }, - ) - def record_authentication_attempt( self, method: str, success: bool, user_id: str | None = None, duration_seconds: float | None = None ) -> None: @@ -180,15 +88,6 @@ def record_authentication_attempt( if duration_seconds is not None: self.authentication_duration.record(duration_seconds, attributes={"method": method}) - def update_active_sessions(self, count: int) -> None: - # Track the delta for gauge-like behavior - key = "_active_sessions" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.active_sessions.add(delta) - setattr(self, key, count) - def increment_active_sessions(self) -> None: self.active_sessions.add(1) @@ -197,12 +96,8 @@ def decrement_active_sessions(self) -> None: def record_token_generated(self, token_type: str, expiry_seconds: float) -> None: self.tokens_generated.add(1, attributes={"token_type": token_type}) - self.token_expiry_time.record(expiry_seconds, 
attributes={"token_type": token_type}) - def record_token_refreshed(self, token_type: str) -> None: - self.tokens_refreshed.add(1, attributes={"token_type": token_type}) - def record_token_revoked(self, token_type: str, reason: str) -> None: self.tokens_revoked.add(1, attributes={"token_type": token_type, "reason": reason}) @@ -227,70 +122,13 @@ def record_authorization_check( 1, attributes={"resource": resource, "action": action, "user_role": user_role or "unknown"} ) - def record_permission_check(self, permission: str, granted: bool, user_id: str | None = None) -> None: - self.permission_checks.add( - 1, attributes={"permission": permission, "granted": str(granted), "user_id": user_id or "unknown"} - ) - def record_csrf_token_generated(self) -> None: self.csrf_tokens_generated.add(1) def record_csrf_validation_failure(self, reason: str) -> None: self.csrf_validation_failures.add(1, attributes={"reason": reason}) - def record_network_policy_violation( - self, policy_name: str, pod_name: str | None = None, violation_type: str = "ingress" - ) -> None: - self.network_policy_violations.add( - 1, - attributes={ - "policy_name": policy_name, - "pod_name": pod_name or "unknown", - "violation_type": violation_type, - }, - ) - - def record_privilege_escalation_attempt(self, user_id: str, target_privilege: str, blocked: bool) -> None: - self.privilege_escalation_attempts.add( - 1, attributes={"user_id": user_id, "target_privilege": target_privilege, "blocked": str(blocked)} - ) - - if blocked: - self.privilege_escalation_blocked.add( - 1, attributes={"user_id": user_id, "target_privilege": target_privilege} - ) - - def record_rate_limit_hit(self, endpoint: str, user_id: str | None = None) -> None: - self.rate_limit_hits.add(1, attributes={"endpoint": endpoint, "user_id": user_id or "anonymous"}) - - def record_rate_limit_violation(self, endpoint: str, user_id: str | None = None, limit: int | None = None) -> None: - self.rate_limit_violations.add( - 1, - attributes={ - "endpoint": endpoint, - "user_id": user_id or "anonymous", - "limit": str(limit) if limit else "unknown", - }, - ) - - def record_api_key_created(self, key_id: str, scopes: str | None = None) -> None: - self.api_keys_created.add(1, attributes={"key_id": key_id, "scopes": scopes or "default"}) - - def record_api_key_revoked(self, key_id: str, reason: str) -> None: - self.api_keys_revoked.add(1, attributes={"key_id": key_id, "reason": reason}) - - def record_api_key_usage(self, key_id: str, endpoint: str) -> None: - self.api_key_usage.add(1, attributes={"key_id": key_id, "endpoint": endpoint}) - - def record_audit_event(self, event_type: str, user_id: str, resource: str | None = None) -> None: - self.audit_events_logged.add( - 1, attributes={"event_type": event_type, "user_id": user_id, "resource": resource or "system"} - ) - - def record_password_change(self, user_id: str, forced: bool = False) -> None: - self.password_changes.add(1, attributes={"user_id": user_id, "forced": str(forced)}) - - def record_password_reset_request(self, user_id: str, method: str = "email") -> None: + def record_password_reset_request(self, user_id: str, method: str = "admin") -> None: self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) def record_weak_password_attempt(self, user_id: str, weakness_type: str) -> None: diff --git a/backend/app/core/middlewares/csrf.py b/backend/app/core/middlewares/csrf.py index dcde555a..ad070b4d 100644 --- a/backend/app/core/middlewares/csrf.py +++ b/backend/app/core/middlewares/csrf.py @@ 
-1,6 +1,5 @@ -import logging -from typing import TYPE_CHECKING - +import structlog +from dishka import AsyncContainer from starlette.requests import Request from starlette.responses import JSONResponse from starlette.types import ASGIApp, Receive, Scope, Send @@ -8,10 +7,7 @@ from app.core.security import SecurityService from app.domain.user import CSRFValidationError -if TYPE_CHECKING: - from dishka import AsyncContainer - -logger = logging.getLogger(__name__) +logger = structlog.get_logger() class CSRFMiddleware: @@ -36,22 +32,21 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: await self.app(scope, receive, send) return - # Get container from app state (set during lifespan) container: AsyncContainer = scope["app"].state.dishka_container security_service: SecurityService = await container.get(SecurityService) request = Request(scope, receive=receive) try: - # validate_csrf_from_request returns "skip" or the token if valid - # raises CSRFValidationError if invalid security_service.validate_csrf_from_request(request) await self.app(scope, receive, send) except CSRFValidationError as e: logger.warning( "CSRF validation failed", - extra={"path": request.url.path, "method": request.method, "reason": str(e)}, + path=request.url.path, + method=request.method, + reason=str(e), ) response = JSONResponse( status_code=403, diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 8eb85a4c..f9b79ef3 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -18,7 +18,6 @@ DLQMetrics, EventMetrics, ExecutionMetrics, - HealthMetrics, KubernetesMetrics, NotificationMetrics, RateLimitMetrics, @@ -155,8 +154,8 @@ class CoreServicesProvider(Provider): scope = Scope.APP @provide - def get_security_service(self, settings: Settings) -> SecurityService: - return SecurityService(settings) + def get_security_service(self, settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: + return SecurityService(settings, security_metrics) @provide def get_tracer( @@ -315,10 +314,6 @@ def get_execution_metrics(self, settings: Settings) -> ExecutionMetrics: def get_database_metrics(self, settings: Settings) -> DatabaseMetrics: return DatabaseMetrics(settings) - @provide - def get_health_metrics(self, settings: Settings) -> HealthMetrics: - return HealthMetrics(settings) - @provide def get_kubernetes_metrics(self, settings: Settings) -> KubernetesMetrics: return KubernetesMetrics(settings) @@ -441,9 +436,23 @@ def get_auth_service( self, user_repository: UserRepository, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, + lockout_service: LoginLockoutService, + runtime_settings: RuntimeSettingsLoader, + producer: UnifiedProducer, + settings: Settings, ) -> AuthService: - return AuthService(user_repository, security_service, logger) + return AuthService( + user_repo=user_repository, + security_service=security_service, + security_metrics=security_metrics, + logger=logger, + lockout_service=lockout_service, + runtime_settings=runtime_settings, + producer=producer, + settings=settings, + ) class KafkaServicesProvider(Provider): @@ -628,6 +637,7 @@ def get_admin_user_service( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, ) -> AdminUserService: return AdminUserService( @@ -636,6 +646,7 @@ def get_admin_user_service( 
execution_service=execution_service, rate_limit_service=rate_limit_service, security_service=security_service, + security_metrics=security_metrics, logger=logger, ) diff --git a/backend/app/core/security.py b/backend/app/core/security.py index ad25d507..4901c310 100644 --- a/backend/app/core/security.py +++ b/backend/app/core/security.py @@ -8,6 +8,7 @@ from fastapi.security import OAuth2PasswordBearer from passlib.context import CryptContext +from app.core.metrics import SecurityMetrics from app.domain.user import CSRFValidationError, InvalidCredentialsError from app.settings import Settings @@ -15,8 +16,9 @@ class SecurityService: - def __init__(self, settings: Settings) -> None: + def __init__(self, settings: Settings, security_metrics: SecurityMetrics) -> None: self.settings = settings + self._security_metrics = security_metrics # --8<-- [start:password_hashing] self.pwd_context = CryptContext( schemes=["bcrypt"], @@ -123,12 +125,15 @@ def validate_csrf_from_request(self, request: Request) -> str: cookie_token = request.cookies.get("csrf_token", "") if not header_token: + self._security_metrics.record_csrf_validation_failure("missing_header") raise CSRFValidationError("CSRF token missing from X-CSRF-Token header") if not self.validate_csrf_token(header_token, cookie_token): + self._security_metrics.record_csrf_validation_failure("token_mismatch") raise CSRFValidationError("CSRF token invalid or does not match cookie") if not self._verify_csrf_signature(header_token, access_token): + self._security_metrics.record_csrf_validation_failure("invalid_signature") raise CSRFValidationError("CSRF token signature invalid") return header_token diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 6ff600a2..968219b5 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -246,6 +246,7 @@ async def retry_messages_batch(self, event_ids: list[str]) -> DLQBatchRetryResul details.append(DLQRetryResult(event_id=event_id, status="failed", error="Retry failed")) except Exception as e: self.logger.error(f"Error retrying message {event_id}: {e}") + self.metrics.record_dlq_processing_error("batch_retry", event_id, type(e).__name__) failed += 1 details.append(DLQRetryResult(event_id=event_id, status="failed", error=str(e))) diff --git a/backend/app/domain/exceptions.py b/backend/app/domain/exceptions.py index 79b46b43..30c326a8 100644 --- a/backend/app/domain/exceptions.py +++ b/backend/app/domain/exceptions.py @@ -51,6 +51,12 @@ class InvalidStateError(DomainError): pass +class AccountLockedError(DomainError): + """Account temporarily locked (maps to 423).""" + + pass + + class InfrastructureError(DomainError): """Infrastructure failure - DB, Kafka, K8s, etc (maps to 500).""" diff --git a/backend/app/domain/user/__init__.py b/backend/app/domain/user/__init__.py index 7bd51a2b..4f5957f5 100644 --- a/backend/app/domain/user/__init__.py +++ b/backend/app/domain/user/__init__.py @@ -20,6 +20,7 @@ from .user_models import ( DomainUserCreate, DomainUserUpdate, + LoginResult, PasswordReset, User, UserDeleteResult, @@ -43,6 +44,7 @@ "DomainUserSettingsUpdate", "DomainUserUpdate", "InvalidCredentialsError", + "LoginResult", "PasswordReset", "TokenExpiredError", "User", diff --git a/backend/app/domain/user/user_models.py b/backend/app/domain/user/user_models.py index 550ef41e..5665a135 100644 --- a/backend/app/domain/user/user_models.py +++ b/backend/app/domain/user/user_models.py @@ -103,6 +103,17 @@ class DomainUserCreate: is_superuser: bool = False +@dataclass +class 
LoginResult: + """Result of a successful login.""" + + username: str + role: UserRole + access_token: str + csrf_token: str + session_timeout_minutes: int + + @dataclass class DomainUserUpdate: """User update data for repository (with hashed password).""" diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index b30a0376..b9ed09af 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -3,6 +3,7 @@ import structlog +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.db.repositories import UserRepository from app.domain.admin import AdminUserOverviewDomain, DerivedCountsDomain, RateLimitSummaryDomain @@ -31,6 +32,7 @@ def __init__( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, ) -> None: self._users = user_repository @@ -38,6 +40,7 @@ def __init__( self._executions = execution_service self._rate_limits = rate_limit_service self._security = security_service + self._security_metrics = security_metrics self.logger = logger async def get_user_overview(self, user_id: str, hours: int = 24) -> AdminUserOverviewDomain: @@ -202,6 +205,7 @@ async def reset_user_password(self, *, admin_user_id: str, user_id: str, new_pas self.logger.info( "Admin resetting user password", admin_user_id=admin_user_id, target_user_id=user_id ) + self._security_metrics.record_password_reset_request(user_id, method="admin") hashed = self._security.get_password_hash(new_password) pr = PasswordReset(user_id=user_id, new_password=hashed) ok = await self._users.reset_user_password(pr) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index fd05954e..243ccd31 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -1,10 +1,33 @@ +from datetime import timedelta +from typing import NoReturn + import structlog from fastapi import Request +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.db.repositories import UserRepository -from app.domain.enums import UserRole -from app.domain.user import AdminAccessRequiredError, AuthenticationRequiredError, InvalidCredentialsError, User +from app.domain.enums import LoginMethod, UserRole +from app.domain.events import ( + AuthFailedEvent, + EventMetadata, + UserLoggedInEvent, + UserLoggedOutEvent, + UserRegisteredEvent, +) +from app.domain.exceptions import AccountLockedError, ConflictError, ValidationError +from app.domain.user import ( + AdminAccessRequiredError, + AuthenticationRequiredError, + DomainUserCreate, + InvalidCredentialsError, + LoginResult, + User, +) +from app.events.core import UnifiedProducer +from app.services.login_lockout import LoginLockoutService +from app.services.runtime_settings import RuntimeSettingsLoader +from app.settings import Settings class AuthService: @@ -12,11 +35,28 @@ def __init__( self, user_repo: UserRepository, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, + lockout_service: LoginLockoutService, + runtime_settings: RuntimeSettingsLoader, + producer: UnifiedProducer, + settings: Settings, ): self.user_repo = user_repo self.security_service = security_service + self.security_metrics = security_metrics self.logger = logger + self._lockout = lockout_service + 
self._runtime_settings = runtime_settings + self._producer = producer + self._settings = settings + + def _build_metadata(self, user_id: str = "") -> EventMetadata: + return EventMetadata( + service_name=self._settings.SERVICE_NAME, + service_version=self._settings.SERVICE_VERSION, + user_id=user_id, + ) async def get_current_user(self, request: Request) -> User: token = request.cookies.get("access_token") @@ -24,6 +64,7 @@ async def get_current_user(self, request: Request) -> User: raise AuthenticationRequiredError() username = self.security_service.decode_token(token) + user = await self.user_repo.get_user(username) if user is None: raise InvalidCredentialsError() @@ -32,7 +73,162 @@ async def get_current_user(self, request: Request) -> User: async def get_admin(self, request: Request) -> User: user = await self.get_current_user(request) + self.security_metrics.record_authorization_check( + "/admin", request.method, user.role == UserRole.ADMIN, user_role=user.role, + ) if user.role != UserRole.ADMIN: self.logger.warning(f"Admin access denied for user: {user.username} (role: {user.role})") raise AdminAccessRequiredError(user.username) return user + + async def _fail_login( + self, + username: str, + reason: str, + ip_address: str, + user_agent: str | None, + user_id: str = "", + ) -> NoReturn: + self.logger.warning( + f"Login failed - {reason}", + username=username, + client_ip=ip_address, + user_agent=user_agent, + ) + locked = await self._lockout.record_failed_attempt(username) + await self._producer.produce( + event_to_produce=AuthFailedEvent( + username=username, + reason=reason, + ip_address=ip_address, + metadata=self._build_metadata(user_id=user_id), + ), + key=username, + ) + if locked: + self.security_metrics.record_account_locked(username, "brute_force") + raise AccountLockedError("Account locked due to too many failed attempts") + raise InvalidCredentialsError() + + async def login( + self, + username: str, + password: str, + ip_address: str, + user_agent: str | None, + ) -> LoginResult: + if await self._lockout.check_locked(username): + raise AccountLockedError("Account temporarily locked due to too many failed attempts") + + user = await self.user_repo.get_user(username) + + if not user: + await self._fail_login(username, "user_not_found", ip_address, user_agent) + + if not self.security_service.verify_password(password, user.hashed_password): + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=str(user.user_id)) + + await self._lockout.clear_attempts(username) + + effective = await self._runtime_settings.get_effective_settings() + session_timeout = effective.session_timeout_minutes + + self.logger.info( + "Login successful", + username=user.username, + client_ip=ip_address, + user_agent=user_agent, + token_expires_in_minutes=session_timeout, + ) + + access_token_expires = timedelta(minutes=session_timeout) + access_token = self.security_service.create_access_token( + data={"sub": user.username}, expires_delta=access_token_expires, + ) + csrf_token = self.security_service.generate_csrf_token(access_token) + + await self._producer.produce( + event_to_produce=UserLoggedInEvent( + user_id=str(user.user_id), + login_method=LoginMethod.PASSWORD, + ip_address=ip_address, + user_agent=user_agent, + metadata=self._build_metadata(user_id=str(user.user_id)), + ), + key=user.username, + ) + + return LoginResult( + username=user.username, + role=user.role, + access_token=access_token, + csrf_token=csrf_token, + session_timeout_minutes=session_timeout, + 
) + + async def register( + self, + username: str, + email: str, + password: str, + ip_address: str, + user_agent: str | None, + ) -> User: + effective = await self._runtime_settings.get_effective_settings() + min_len = effective.password_min_length + if len(password) < min_len: + self.security_metrics.record_weak_password_attempt(username, "too_short") + raise ValidationError(f"Password must be at least {min_len} characters") + + existing = await self.user_repo.get_user(username) + if existing: + self.logger.warning( + "Registration failed - username taken", + username=username, + client_ip=ip_address, + user_agent=user_agent, + ) + raise ConflictError("Username already registered") + + hashed_password = self.security_service.get_password_hash(password) + create_data = DomainUserCreate( + username=username, + email=email, + hashed_password=hashed_password, + role=UserRole.USER, + is_active=True, + is_superuser=False, + ) + created_user = await self.user_repo.create_user(create_data) + + self.logger.info( + "Registration successful", + username=created_user.username, + client_ip=ip_address, + user_agent=user_agent, + ) + + await self._producer.produce( + event_to_produce=UserRegisteredEvent( + user_id=str(created_user.user_id), + username=created_user.username, + email=created_user.email, + metadata=self._build_metadata(user_id=str(created_user.user_id)), + ), + key=created_user.username, + ) + + return created_user + + async def publish_logout_event(self, token: str | None) -> None: + if not token: + return + username = self.security_service.decode_token(token) + await self._producer.produce( + event_to_produce=UserLoggedOutEvent( + user_id=username, + logout_reason="user_initiated", + metadata=self._build_metadata(user_id=username), + ), + key=username, + ) diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index fecaf40a..5dc65db6 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -51,6 +51,7 @@ async def create_session_from_config(self, config: ReplayConfig) -> ReplayOperat state = ReplaySessionState(session_id=str(uuid4()), config=config) self._sessions[state.session_id] = state await self._repository.save_session(state) + self._metrics.record_session_created(config.replay_type, config.target) return ReplayOperationResult( session_id=state.session_id, status=ReplayStatus.CREATED, @@ -94,6 +95,7 @@ async def start_session(self, session_id: str) -> ReplayOperationResult: session.status = ReplayStatus.RUNNING session.started_at = datetime.now(timezone.utc) self._metrics.increment_active_replays() + self._metrics.record_speed_multiplier(session.config.speed_multiplier, session.config.replay_type) await self._repository.update_session_status(session_id, ReplayStatus.RUNNING) return ReplayOperationResult( session_id=session_id, status=ReplayStatus.RUNNING, message="Replay session started" @@ -214,6 +216,9 @@ async def _dispatch_next(self, session: ReplaySessionState) -> None: time_diff = (next_event.timestamp - session.last_event_at).total_seconds() delay = max(time_diff / session.config.speed_multiplier, 0) + if delay > 0: + self._metrics.record_delay_applied(delay) + scheduler = self._schedulers.get(session.session_id) if scheduler and scheduler.running and session.status == ReplayStatus.RUNNING: scheduler.add_job( @@ -343,8 +348,10 @@ async def _dispatch() -> None: try: await _dispatch() + self._metrics.record_replay_by_target(config.target, 
success=True) return True except Exception: + self._metrics.record_replay_by_target(config.target, success=False) return False async def _write_event_to_file(self, event: DomainEvent, file_path: str) -> None: diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index e5429eab..5e48bbc0 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -109,7 +109,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non async with self._creation_semaphore: execution_id = command.execution_id self._active_creations.add(execution_id) - self.metrics.update_k8s_active_creations(len(self._active_creations)) + self.metrics.update_active_pod_creations(len(self._active_creations)) start_time = time.time() @@ -151,7 +151,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non finally: self._active_creations.discard(execution_id) - self.metrics.update_k8s_active_creations(len(self._active_creations)) + self.metrics.update_active_pod_creations(len(self._active_creations)) async def _get_entrypoint_script(self) -> str: """Get entrypoint script content""" diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 0566f334..aaaa47d8 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -168,6 +168,7 @@ async def create_notification( f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)" ) self.logger.warning(error_msg) + self.metrics.record_notification_throttled("general", user_id) raise NotificationThrottledError( user_id, self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, @@ -544,7 +545,10 @@ async def update_subscription( if not update_data.slack_webhook: raise NotificationValidationError("slack_webhook is required when enabling SLACK") - return await self.repository.upsert_subscription(user_id, channel, update_data) + result = await self.repository.upsert_subscription(user_id, channel, update_data) + action = "enabled" if update_data.enabled else "updated" + self.metrics.record_subscription_change(user_id, channel, action) + return result async def mark_all_as_read(self, user_id: str) -> int: """Mark all notifications as read for a user.""" @@ -693,6 +697,8 @@ async def _attempt() -> None: retry_count=notification.max_retries, ), ) + notification_type = notification.tags[0] if notification.tags else "unknown" + self.metrics.record_notification_failed(notification_type, str(last_error), channel=notification.channel) self.logger.error( f"All delivery attempts exhausted for {notification.notification_id}: {last_error}", exc_info=last_error, diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py index cf9b78af..c698ec0e 100644 --- a/backend/app/services/sse/sse_service.py +++ b/backend/app/services/sse/sse_service.py @@ -58,6 +58,7 @@ def __init__( async def create_execution_stream(self, execution_id: str, user_id: str) -> AsyncGenerator[dict[str, Any], None]: subscription: SSERedisSubscription | None = None + start_time = datetime.now(timezone.utc) self.metrics.increment_sse_connections("executions") try: yield self._format_sse_event( @@ -121,6 +122,8 @@ async def create_execution_stream(self, execution_id: str, user_id: str) -> Asyn finally: if subscription is not None: await asyncio.shield(subscription.close()) + duration = (datetime.now(timezone.utc) - start_time).total_seconds() + 
self.metrics.record_sse_connection_duration(duration, "executions") self.metrics.decrement_sse_connections("executions") self.logger.info("SSE connection closed", execution_id=execution_id) diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json new file mode 100644 index 00000000..4026abe6 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -0,0 +1,275 @@ +{ + "annotations": { + "list": [] + }, + "description": "Coordinator & Execution", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Coordinator", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(coordinator_scheduling_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Scheduling Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(coordinator_queue_wait_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Queue Wait Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "coordinator_executions_active", + "refId": "A" + } + ], + "title": "Active Executions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 4, + "targets": [ + { + "expr": "rate(coordinator_executions_scheduled_total[5m])", + "refId": "A" + } + ], + "title": "Scheduled (5m)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "panels": [], + "title": "Execution Queue", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 5, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(execution_queue_wait_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Queue Wait Time (Execution)", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 6, + "targets": [ + { + "expr": "rate(executions_assigned_total[5m])", + "legendFormat": "Assigned", + "refId": "A" + }, + { + "expr": "rate(executions_queued_total[5m])", + "legendFormat": "Queued", + "refId": "B" + } + ], + "title": "Assigned & Queued", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 102, + "panels": [], + "title": "Script Resources", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + 
"gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 7, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(script_memory_usage_MiB_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(script_memory_usage_MiB_bucket[5m]))", + "legendFormat": "p95", + "refId": "B" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(script_memory_utilization_percent_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(script_memory_utilization_percent_bucket[5m]))", + "legendFormat": "p95", + "refId": "B" + } + ], + "title": "Memory Utilization", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "coordinator", + "execution" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Coordinator & Execution", + "uid": "coordinator-execution", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/dlq-monitoring.json b/backend/grafana/provisioning/dashboards/dlq-monitoring.json index 74e95eab..9a1b7a68 100644 --- a/backend/grafana/provisioning/dashboards/dlq-monitoring.json +++ b/backend/grafana/provisioning/dashboards/dlq-monitoring.json @@ -85,7 +85,9 @@ "options": { "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -221,7 +223,9 @@ "options": { "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -274,7 +278,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -332,15 +338,22 @@ }, "id": 5, "options": { - "displayLabels": ["name", "percent"], + "displayLabels": [ + "name", + "percent" + ], "legend": { "displayMode": "table", "placement": "right", - "values": ["value"] + "values": [ + "value" + ] }, "pieType": "donut", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -417,7 +430,9 @@ "id": 6, "options": { "legend": { - "calcs": ["mean"], + "calcs": [ + "mean" + ], "displayMode": "list", "placement": "bottom" }, @@ -473,15 +488,23 @@ }, "id": 7, "options": { - "displayLabels": ["name", "value"], + "displayLabels": [ + "name", + "value" + ], "legend": { "displayMode": "table", "placement": "bottom", - "values": ["value", "percent"] + "values": [ + "value", + "percent" + ] }, "pieType": "pie", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -580,7 +603,10 @@ "id": 8, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom" }, @@ -667,7 +693,10 @@ "id": 9, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom" }, @@ -705,64 +734,6 @@ "title": "Retry Metrics", "type": "row" }, - { - "datasource": "Victoria Metrics", - "description": "Number of retry attempts for messages", - "fieldConfig": { - "defaults": { - "color": { 
- "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 26 - }, - "id": 10, - "options": { - "displayMode": "lcd", - "orientation": "horizontal", - "reduceOptions": { - "calcs": ["mean"], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "8.3.3", - "targets": [ - { - "expr": "sum by (original_topic) (dlq_retry_attempts_sum) / sum by (original_topic) (dlq_retry_attempts_count)", - "legendFormat": "{{original_topic}}", - "refId": "A" - } - ], - "title": "Average Retry Attempts by Topic", - "type": "bargauge" - }, { "datasource": "Victoria Metrics", "description": "Retry success/failure breakdown", @@ -821,7 +792,9 @@ "id": 11, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom" }, @@ -840,83 +813,6 @@ "title": "Retry Results (1h)", "type": "timeseries" }, - { - "datasource": "Victoria Metrics", - "description": "Messages processed per second from DLQ", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "msg/s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 26 - }, - "id": 12, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "pluginVersion": "8.3.3", - "targets": [ - { - "expr": "sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_sum[5m])) / sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_count[5m]))", - "legendFormat": "{{original_topic}}", - "refId": "A" - } - ], - "title": "DLQ Throughput by Topic", - "type": "timeseries" - }, { "collapsed": false, "datasource": null, @@ -992,7 +888,9 @@ "id": 13, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom" }, @@ -1230,7 +1128,9 @@ "id": 16, "options": { "legend": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "displayMode": "list", "placement": "bottom" }, @@ -1305,7 +1205,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1366,7 +1268,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1425,7 +1329,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1484,7 +1390,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": 
["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1505,7 +1413,10 @@ "refresh": "10s", "schemaVersion": 33, "style": "dark", - "tags": ["kafka", "dlq"], + "tags": [ + "kafka", + "dlq" + ], "templating": { "list": [] }, @@ -1514,11 +1425,22 @@ "to": "now" }, "timepicker": { - "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"] + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] }, "timezone": "", "title": "Dead Letter Queue", "uid": "dlq-monitoring", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/event-replay.json b/backend/grafana/provisioning/dashboards/event-replay.json new file mode 100644 index 00000000..56a9eb47 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/event-replay.json @@ -0,0 +1,387 @@ +{ + "annotations": { + "list": [] + }, + "description": "Event Replay", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Sessions", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(replay_sessions_created_total[5m])", + "legendFormat": "Created", + "refId": "A" + } + ], + "title": "Sessions Created", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "replay_sessions_active", + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 3, + "targets": [ + { + "expr": "replay_sessions_by_status", + "refId": "A" + } + ], + "title": "By Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 101, + "panels": [], + "title": "Events", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(replay_events_processed_total[5m])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(replay_events_failed_total[5m])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(replay_events_skipped_total[5m])", + "legendFormat": "Skipped", + "refId": "C" + } + ], + "title": "Events Processed", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "targets": [ + { + "expr": "rate(replay_status_changes_total[5m])", + "legendFormat": "Changes", + "refId": "A" + } + ], + "title": "Status Changes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 102, + "panels": [], + "title": "Performance", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + 
"gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 6, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(replay_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Replay Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 7, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(replay_event_processing_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Event Processing Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_throughput_event_per_second_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Throughput", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 9, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_batch_size_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Batch Size", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 103, + "panels": [], + "title": "Targets & Speed", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 10, + "targets": [ + { + "expr": "rate(replay_by_target_total[5m])", + "legendFormat": "{{target}}", + "refId": "A" + }, + { + "expr": "rate(replay_target_errors_total[5m])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "title": "By Target", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 11, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_speed_multiplier_x_bucket[5m]))", + "legendFormat": "Multiplier p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, rate(replay_delay_applied_seconds_bucket[5m]))", + "legendFormat": "Delay p50", + "refId": "B" + } + ], + "title": "Speed Control", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 12, + "targets": [ + { + "expr": "replay_queue_size", + "refId": "A" + } + ], + "title": "Queue Size", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "replay" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Event Replay", + "uid": "event-replay", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json index 3f421163..fde9d831 100644 --- a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json +++ b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json @@ -114,7 +114,9 @@ "justifyMode": "auto", 
"orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -179,7 +181,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -203,527 +207,7 @@ }, { "datasource": "Victoria Metrics", - "description": "Total events currently buffered", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1000 - }, - { - "color": "red", - "value": 5000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 8, - "y": 1 - }, - "id": 53, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "event_buffer_size", - "instant": false, - "legendFormat": "Buffered Events", - "range": true, - "refId": "A" - } - ], - "title": "Events in Buffer", - "type": "stat" - }, - { - "datasource": "Victoria Metrics", - "description": "Total event publishing rate", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Events/sec", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 1 - }, - "id": 54, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sum(rate(events_published_total[1m]))", - "instant": false, - "legendFormat": "Event Rate", - "range": true, - "refId": "A" - } - ], - "title": "Event Publishing Rate", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "Event delivery success rate", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "yellow", - "value": 90 - }, - { - "color": "green", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 55, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - 
"showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "100 - (sum(rate(event_buffer_dropped_total[5m])) / (sum(rate(event_buffer_processed_total[5m])) + 0.0001) * 100)", - "instant": false, - "legendFormat": "Delivery Rate", - "range": true, - "refId": "A" - } - ], - "title": "Event Delivery Success", - "type": "gauge" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 7 - }, - "id": 56, - "panels": [], - "title": "SSE Connection Details", - "type": "row" - }, - { - "datasource": "Victoria Metrics", - "description": "SSE connections by endpoint over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Connections", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 8 - }, - "id": 57, - "options": { - "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sse_connections_active", - "instant": false, - "legendFormat": "{{endpoint}}", - "range": true, - "refId": "A" - } - ], - "title": "SSE Connections by Endpoint", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "Distribution of SSE connection durations", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Duration (seconds)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 8 - }, - "id": 58, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - 
"legendFormat": "p99", - "range": true, - "refId": "A" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - "legendFormat": "p95", - "range": true, - "refId": "B" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - "legendFormat": "p50", - "range": true, - "refId": "C" - } - ], - "title": "SSE Connection Duration Percentiles", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "SSE messages sent by event type", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Messages/sec", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 8 - }, - "id": 59, - "options": { - "legend": { - "calcs": ["sum"], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "rate(sse_messages_sent_total[5m])", - "instant": false, - "legendFormat": "{{event_type}}", - "range": true, - "refId": "A" - } - ], - "title": "SSE Messages by Event Type", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 60, - "panels": [], - "title": "Event Buffer Performance", - "type": "row" - }, - { - "datasource": "Victoria Metrics", - "description": "Event buffer size trends", + "description": "Total event publishing rate", "fieldConfig": { "defaults": { "color": { @@ -732,11 +216,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Events", + "axisLabel": "Events/sec", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -767,57 +251,65 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 1000 - }, - { - "color": "red", - "value": 5000 } ] }, - "unit": "short" + "unit": "ops" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 + "h": 6, + "w": 6, + "x": 12, + "y": 1 }, - "id": 61, + "id": 54, "options": { "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi", - "sort": "desc" + "mode": "single", + "sort": "none" } }, "targets": [ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "event_buffer_size", + "expr": "sum(rate(events_published_total[1m]))", "instant": 
false, - "legendFormat": "Buffer Size", + "legendFormat": "Event Rate", "range": true, "refId": "A" } ], - "title": "Event Buffer Size Over Time", + "title": "Event Publishing Rate", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 56, + "panels": [], + "title": "SSE Connection Details", + "type": "row" + }, { "datasource": "Victoria Metrics", - "description": "Event buffer processing and drop rates", + "description": "SSE connections by endpoint over time", "fieldConfig": { "defaults": { "color": { @@ -826,11 +318,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Events/sec", + "axisLabel": "Connections", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, @@ -848,7 +340,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -864,51 +356,24 @@ } ] }, - "unit": "ops" + "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Dropped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Processed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 17 + "w": 8, + "x": 0, + "y": 8 }, - "id": 62, + "id": 57, "options": { "legend": { - "calcs": ["mean", "max", "sum"], + "calcs": [ + "mean", + "max", + "lastNotNull" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -922,28 +387,19 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "rate(event_buffer_processed_total[5m])", + "expr": "sse_connections_active", "instant": false, - "legendFormat": "Processed", + "legendFormat": "{{endpoint}}", "range": true, "refId": "A" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "rate(event_buffer_dropped_total[5m])", - "instant": false, - "legendFormat": "Dropped", - "range": true, - "refId": "B" } ], - "title": "Event Buffer Processing vs Drops", + "title": "SSE Connections by Endpoint", "type": "timeseries" }, { "datasource": "Victoria Metrics", - "description": "Event processing latency percentiles", + "description": "Distribution of SSE connection durations", "fieldConfig": { "defaults": { "color": { @@ -952,7 +408,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Latency (seconds)", + "axisLabel": "Duration (seconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -987,14 +443,6 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "red", - "value": 5 } ] }, @@ -1004,15 +452,18 @@ }, "gridPos": { "h": 8, - "w": 12, - "x": 0, - "y": 25 + "w": 8, + "x": 8, + "y": 8 }, - "id": 63, + "id": 58, "options": { "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", "placement": "bottom", "showLegend": true }, @@ -1025,7 +476,7 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, 
"legendFormat": "p99", "range": true, @@ -1034,7 +485,7 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, "legendFormat": "p95", "range": true, @@ -1043,19 +494,19 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, "legendFormat": "p50", "range": true, "refId": "C" } ], - "title": "Event Buffer Processing Latency", + "title": "SSE Connection Duration Percentiles", "type": "timeseries" }, { "datasource": "Victoria Metrics", - "description": "Memory usage of event buffers", + "description": "SSE messages sent by event type", "fieldConfig": { "defaults": { "color": { @@ -1064,7 +515,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Memory (MB)", + "axisLabel": "Messages/sec", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -1077,7 +528,7 @@ }, "insertNulls": false, "lineInterpolation": "linear", - "lineWidth": 2, + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -1086,7 +537,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -1099,33 +550,27 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 100 } ] }, - "unit": "decmbytes" + "unit": "ops" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 25 + "w": 8, + "x": 16, + "y": 8 }, - "id": 64, + "id": 59, "options": { "legend": { - "calcs": ["mean", "max", "lastNotNull"], + "calcs": [ + "sum" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { @@ -1137,16 +582,29 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "event_buffer_memory_usage_MB_sum / event_buffer_memory_usage_MB_count", + "expr": "rate(sse_messages_sent_total[5m])", "instant": false, - "legendFormat": "Memory Usage", + "legendFormat": "{{event_type}}", "range": true, "refId": "A" } ], - "title": "Event Buffer Memory Usage", + "title": "SSE Messages by Event Type", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 60, + "panels": [], + "title": "Event Buffer Performance", + "type": "row" + }, { "collapsed": false, "gridPos": { @@ -1222,7 +680,10 @@ "id": 66, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1316,7 +777,10 @@ "id": 67, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1431,7 +895,11 @@ "id": 69, "options": { "legend": { - "calcs": ["mean", "max", "sum"], + "calcs": [ + "mean", + "max", + "sum" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1534,7 +1002,10 @@ "id": 70, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1580,92 +1051,6 @@ "title": "SSE Shutdown Monitoring", "type": "row" 
    },
-    {
-      "datasource": "Victoria Metrics",
-      "description": "SSE graceful shutdown phases",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Duration (seconds)",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "bars",
-            "fillOpacity": 50,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "s"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 52
-      },
-      "id": 72,
-      "options": {
-        "legend": {
-          "calcs": ["lastNotNull"],
-          "displayMode": "table",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "targets": [
-        {
-          "datasource": "Victoria Metrics",
-          "editorMode": "code",
-          "expr": "sse_shutdown_duration_seconds_sum / sse_shutdown_duration_seconds_count",
-          "instant": false,
-          "legendFormat": "{{phase}}",
-          "range": true,
-          "refId": "A"
-        }
-      ],
-      "title": "SSE Shutdown Phase Durations",
-      "type": "timeseries"
-    },
    {
      "datasource": "Victoria Metrics",
      "description": "SSE draining connections during shutdown",
@@ -1728,7 +1113,10 @@
      "id": 73,
      "options": {
        "legend": {
-          "calcs": ["max", "lastNotNull"],
+          "calcs": [
+            "max",
+            "lastNotNull"
+          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
@@ -1755,7 +1143,10 @@
  ],
  "refresh": "5s",
  "schemaVersion": 38,
-  "tags": ["event-streaming", "sse"],
+  "tags": [
+    "event-streaming",
+    "sse"
+  ],
  "templating": {
    "list": []
  },
@@ -1769,4 +1160,4 @@
  "uid": "event-stream-monitoring",
  "version": 2,
  "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json
new file mode 100644
index 00000000..908d3a90
--- /dev/null
+++ b/backend/grafana/provisioning/dashboards/http-middleware.json
@@ -0,0 +1,387 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "description": "HTTP & Middleware",
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 100,
+      "panels": [],
+      "title": "HTTP Requests",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "rate(http_requests_total[5m])",
+          "legendFormat": "Requests",
+          "refId": "A"
+        }
+      ],
+      "title": "Request Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 1
+      },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Request Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 7
+      },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "http_requests_active_requests",
+          "refId": "A"
+        }
+      ],
+      "title": "Active Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 16,
+        "x": 8,
+        "y": 7
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(http_request_size_bytes_bucket[5m]))",
+          "legendFormat": "Request p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.50, rate(http_response_size_bytes_bucket[5m]))",
+          "legendFormat": "Response p50",
+          "refId": "B"
+        }
+      ],
+      "title": "Request/Response Size",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 13
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Database & Event Store",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(mongodb_event_operations_total[5m])",
+          "legendFormat": "Operations",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(database_connection_errors_total[5m])",
+          "legendFormat": "Connection Errors",
+          "refId": "B"
+        }
+      ],
+      "title": "MongoDB Operations",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(mongodb_event_query_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "MongoDB Query Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 20
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "database_connections_active",
+          "refId": "A"
+        }
+      ],
+      "title": "Active DB Connections",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 18,
+        "x": 6,
+        "y": 20
+      },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "rate(event_store_operations_total[5m])",
+          "legendFormat": "Store Ops",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, rate(idempotency_processing_duration_seconds_bucket[5m]))",
+          "legendFormat": "Idempotency p95",
+          "refId": "B"
+        }
+      ],
+      "title": "Event Store & Idempotency",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 26
+      },
+      "id": 102,
+      "panels": [],
+      "title": "Kafka Errors & Event Bus",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 27
+      },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "rate(kafka_production_errors_total[5m])",
+          "legendFormat": "Production",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(kafka_consumption_errors_total[5m])",
+          "legendFormat": "Consumption",
+          "refId": "B"
+        }
+      ],
+      "title": "Kafka Errors",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 12,
+        "y": 27
+      },
+      "id": 10,
+      "targets": [
+        {
+          "expr": "event_bus_queue_size",
+          "refId": "A"
+        }
+      ],
+      "title": "Event Bus Queue Size",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 33
+      },
+      "id": 103,
+      "panels": [],
+      "title": "System",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 34
+      },
+      "id": 11,
+      "targets": [
+        {
+          "expr": "system_cpu_percent",
+          "refId": "A"
+        }
+      ],
+      "title": "CPU",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 8,
+        "y": 34
+      },
+      "id": 12,
+      "targets": [
+        {
+          "expr": "process_metrics_mixed",
+          "refId": "A"
+        }
+      ],
+      "title": "Process Metrics",
+      "type": "stat"
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [
+    "http",
+    "middleware",
+    "database"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "HTTP & Middleware",
+  "uid": "http-middleware",
+  "version": 1
+}
diff --git a/backend/grafana/provisioning/dashboards/integr8scode.json b/backend/grafana/provisioning/dashboards/integr8scode.json
index b1092a22..24687020 100644
--- a/backend/grafana/provisioning/dashboards/integr8scode.json
+++ b/backend/grafana/provisioning/dashboards/integr8scode.json
@@ -68,7 +68,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -80,7 +82,7 @@
      "targets": [
        {
          "datasource": "Victoria Metrics",
-          "expr": "sum(rate(script_executions_total[1m]))" ,
+          "expr": "sum(rate(script_executions_total[1m]))",
          "refId": "A"
        }
      ],
@@ -129,7 +131,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -187,7 +191,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -245,7 +251,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -323,7 +331,10 @@
      "id": 5,
      "options": {
        "legend": {
-          "calcs": ["mean", "lastNotNull"],
+          "calcs": [
+            "mean",
+            "lastNotNull"
+          ],
          "displayMode": "table",
          "placement": "bottom"
        },
@@ -336,7 +347,7 @@
        {
          "datasource": "Victoria Metrics",
          "editorMode": "code",
-          "expr": "sum by (lang_and_version) (rate(script_executions_total[1m]))" ,
+          "expr": "sum by (lang_and_version) (rate(script_executions_total[1m]))",
          "legendFormat": "{{lang_and_version}}",
          "refId": "A"
        }
@@ -403,7 +414,10 @@
      "id": 6,
      "options": {
        "legend": {
-          "calcs": ["mean", "max"],
+          "calcs": [
+            "mean",
+            "max"
+          ],
          "displayMode": "table",
          "placement": "bottom"
        },
@@ -502,7 +516,10 @@
      "id": 7,
      "options": {
        "legend": {
-          "calcs": ["mean", "max"],
+          "calcs": [
+            "mean",
+            "max"
+          ],
          "displayMode": "list",
          "placement": "bottom"
        },
@@ -646,7 +663,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -671,83 +690,6 @@
      "title": "Memory Saturation",
      "type": "gauge"
    },
-    {
-      "datasource": "Victoria Metrics",
-      "description": "System health indicators",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
-            {
-              "options": {
-                "0": {
-                  "color": "red",
"index": 0, - "text": "Unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "Healthy" - } - }, - "type": "value" - } - ], - "noValue": "Healthy", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.5 - }, - { - "color": "green", - "value": 0.99 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 22 - }, - "id": 10, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "expr": "(sum(health_check_status_sum{check_type=\"readiness\"}) or vector(1)) / (sum(health_check_status_count{check_type=\"readiness\"}) or vector(1))", - "refId": "A" - } - ], - "title": "System Health", - "type": "stat" - }, { "datasource": "Victoria Metrics", "description": "Number of active SSE connections", @@ -790,7 +732,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -850,7 +794,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -911,7 +857,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -933,7 +881,10 @@ "refresh": "10s", "schemaVersion": 38, "style": "dark", - "tags": ["overview", "main"], + "tags": [ + "overview", + "main" + ], "templating": { "list": [] }, @@ -942,11 +893,22 @@ "to": "now" }, "timepicker": { - "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"] + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] }, "timezone": "", "title": "System Overview", "uid": "integr8scode-main", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json b/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json index 734eeaa4..4c5f93b3 100644 --- a/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json +++ b/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json @@ -114,7 +114,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -136,71 +138,6 @@ "title": "Message Throughput", "type": "stat" }, - { - "datasource": "Victoria Metrics", - "description": "Total consumer lag across all consumer groups", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 100 - }, - { - "color": "red", - "value": 1000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 4, - "y": 1 - }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "textMode": "auto", - 
"wideLayout": true - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sum(kafka_consumer_lag_sum / kafka_consumer_lag_count)", - "instant": false, - "legendFormat": "Total Lag", - "range": true, - "refId": "A" - } - ], - "title": "Total Consumer Lag", - "type": "stat" - }, { "datasource": "Victoria Metrics", "description": "Messages in Dead Letter Queue", @@ -244,7 +181,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -359,7 +298,10 @@ "id": 5, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -467,7 +409,10 @@ "id": 7, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -561,7 +506,10 @@ "id": 8, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -616,100 +564,6 @@ "title": "Consumer Metrics", "type": "row" }, - { - "datasource": "Victoria Metrics", - "description": "Consumer lag by consumer group", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Lag (messages)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 100 - }, - { - "color": "red", - "value": 1000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 10, - "options": { - "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "kafka_consumer_lag_sum / kafka_consumer_lag_count", - "instant": false, - "legendFormat": "{{consumer_group}}", - "range": true, - "refId": "A" - } - ], - "title": "Consumer Lag by Group", - "type": "timeseries" - }, { "datasource": "Victoria Metrics", "description": "Message consumption rate by consumer group", @@ -772,7 +626,10 @@ "id": 11, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -879,7 +736,10 @@ "id": 13, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -965,7 +825,9 @@ "id": 14, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1072,7 +934,10 @@ "id": 16, "options": { "legend": { - "calcs": ["lastNotNull", 
"max"], + "calcs": [ + "lastNotNull", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1204,7 +1069,10 @@ "id": 17, "options": { "legend": { - "calcs": ["mean", "sum"], + "calcs": [ + "mean", + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1289,7 +1157,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["max"], + "calcs": [ + "max" + ], "fields": "", "values": false }, @@ -1386,7 +1256,10 @@ "id": 20, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1472,7 +1345,9 @@ "id": 21, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1571,7 +1446,9 @@ "id": 23, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1703,7 +1580,10 @@ "id": 24, "options": { "legend": { - "calcs": ["mean", "sum"], + "calcs": [ + "mean", + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1788,7 +1668,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1813,7 +1695,10 @@ ], "refresh": "5s", "schemaVersion": 38, - "tags": ["kafka", "events"], + "tags": [ + "kafka", + "events" + ], "templating": { "list": [] }, @@ -1827,4 +1712,4 @@ "uid": "kafka-events-monitoring", "version": 2, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/kubernetes-pods.json b/backend/grafana/provisioning/dashboards/kubernetes-pods.json new file mode 100644 index 00000000..47cf6dd4 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/kubernetes-pods.json @@ -0,0 +1,350 @@ +{ + "annotations": { + "list": [] + }, + "description": "Kubernetes & Pods", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Pod Creation", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(pod_creations_total[5m])", + "legendFormat": "Created", + "refId": "A" + }, + { + "expr": "rate(pod_creation_failures_total[5m])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "title": "Pod Creations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(pod_creation_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Pod Creation Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "pod_creations_active", + "refId": "A" + } + ], + "title": "Active Creations", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "increase(configmaps_created_total[24h])",
+          "refId": "A"
+        }
+      ],
+      "title": "ConfigMaps Created (24h)",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 11
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Pod Lifecycle",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(pod_phase_transitions_total[5m])",
+          "legendFormat": "{{phase}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Phase Transitions",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(pod_lifetime_seconds_bucket[5m]))",
+          "legendFormat": "p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, rate(pod_lifetime_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "B"
+        }
+      ],
+      "title": "Pod Lifetime",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "pods_by_phase",
+          "refId": "A"
+        }
+      ],
+      "title": "Pods by Phase",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 12,
+        "y": 18
+      },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "pods_monitored",
+          "refId": "A"
+        }
+      ],
+      "title": "Pods Monitored",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 22
+      },
+      "id": 102,
+      "panels": [],
+      "title": "Pod Monitor",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 23
+      },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "rate(pod_monitor_events_total[5m])",
+          "legendFormat": "Events",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(pod_monitor_reconciliations_total[5m])",
+          "legendFormat": "Reconciliations",
+          "refId": "B"
+        }
+      ],
+      "title": "Monitor Events",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 23
+      },
+      "id": 10,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(pod_monitor_processing_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Monitor Processing Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 29
+      },
+      "id": 11,
+      "targets": [
+        {
+          "expr": "rate(pod_monitor_watch_errors_total[5m])",
+          "legendFormat": "Errors",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(pod_monitor_watch_reconnects_total[5m])",
+          "legendFormat": "Reconnects",
+          "refId": "B"
+        }
+      ],
+      "title": "Watch Errors",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [
+    "kubernetes",
+    "pods"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "Kubernetes & Pods",
+  "uid": "kubernetes-pods",
+  "version": 1
+}
diff --git a/backend/grafana/provisioning/dashboards/notifications.json b/backend/grafana/provisioning/dashboards/notifications.json
new file mode 100644
index 00000000..97096a25
--- /dev/null
+++ b/backend/grafana/provisioning/dashboards/notifications.json
@@ -0,0 +1,500 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "description": "Notifications",
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 100,
+      "panels": [],
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "rate(notifications_sent_total[5m])",
+          "legendFormat": "Sent",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(notifications_failed_total[5m])",
+          "legendFormat": "Failed",
+          "refId": "B"
+        },
+        {
+          "expr": "rate(notifications_read_total[5m])",
+          "legendFormat": "Read",
+          "refId": "C"
+        }
+      ],
+      "title": "Notification Flow",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 1
+      },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "notifications_pending",
+          "refId": "A"
+        }
+      ],
+      "title": "Pending",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 1
+      },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "notifications_queued",
+          "refId": "A"
+        }
+      ],
+      "title": "Queued",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 1
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "notifications_unread_count",
+          "refId": "A"
+        }
+      ],
+      "title": "Unread",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 7
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Channels",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(notifications_by_channel_total[5m])",
+          "legendFormat": "{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "title": "By Channel",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "rate(notifications_by_severity_total[5m])",
+          "legendFormat": "{{severity}}",
+          "refId": "A"
+        }
+      ],
+      "title": "By Severity",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(notification_channel_delivery_time_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Channel Delivery Time",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "id": 8,
+ "panels": [], + "title": "Subscriptions", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 15, + "targets": [ + { + "expr": "notification_subscriptions_active", + "legendFormat": "Active", + "refId": "A" + }, + { + "expr": "rate(notification_subscription_changes_total[5m])", + "legendFormat": "Changes", + "refId": "B" + } + ], + "title": "Subscriptions", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "notifications" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Notifications", + "uid": "notifications", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json new file mode 100644 index 00000000..191696bf --- /dev/null +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -0,0 +1,355 @@ +{ + "annotations": { + "list": [] + }, + "description": "Security & Authentication", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Authentication", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(authentication_attempts_total[5m])", + "legendFormat": "Attempts", + "refId": "A" + }, + { + "expr": "rate(authentication_failures_total[5m])", + "legendFormat": "Failures", + "refId": "B" + } + ], + "title": "Authentication Attempts", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(authentication_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Authentication Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "authentication_sessions_active", + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 4, + "targets": [ + { + "expr": "increase(accounts_locked_total[24h])", + "refId": "A" + } + ], + "title": "Accounts Locked (24h)", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 7 + }, + "id": 5, + "targets": [ + { + "expr": "increase(brute_force_attempts_total[1h])", + "refId": "A" + } + ], + "title": "Brute Force Attempts (1h)", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 7 + }, + "id": 6, + "targets": [ + { + "expr": "increase(weak_password_attempts_total[24h])", + "refId": "A" + } + ], + 
"title": "Weak Password Attempts", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "panels": [], + "title": "Tokens & CSRF", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "targets": [ + { + "expr": "rate(tokens_generated_total[5m])", + "legendFormat": "Generated", + "refId": "A" + }, + { + "expr": "rate(tokens_revoked_total[5m])", + "legendFormat": "Revoked", + "refId": "B" + }, + { + "expr": "rate(token_validation_failures_total[5m])", + "legendFormat": "Validation Failures", + "refId": "C" + } + ], + "title": "Token Operations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(token_expiry_time_seconds_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Token Expiry Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 9, + "targets": [ + { + "expr": "rate(csrf_tokens_generated_total[5m])", + "legendFormat": "Generated", + "refId": "A" + }, + { + "expr": "rate(csrf_validation_failures_total[5m])", + "legendFormat": "Failures", + "refId": "B" + } + ], + "title": "CSRF", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 102, + "panels": [], + "title": "Authorization", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 10, + "targets": [ + { + "expr": "rate(authorization_checks_total[5m])", + "legendFormat": "Checks", + "refId": "A" + }, + { + "expr": "rate(authorization_denials_total[5m])", + "legendFormat": "Denials", + "refId": "B" + } + ], + "title": "Authorization", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 11, + "targets": [ + { + "expr": "rate(password_reset_requests_total[5m])", + "legendFormat": "Resets", + "refId": "A" + } + ], + "title": "Password Resets", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "security", + "auth" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Security & Authentication", + "uid": "security-auth", + "version": 1 +} diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py index a02357cd..c3ad50d8 100644 --- a/backend/tests/unit/conftest.py +++ b/backend/tests/unit/conftest.py @@ -11,7 +11,6 @@ DLQMetrics, EventMetrics, ExecutionMetrics, - HealthMetrics, KubernetesMetrics, NotificationMetrics, RateLimitMetrics, @@ -219,11 +218,6 @@ def execution_metrics(test_settings: Settings) -> ExecutionMetrics: return ExecutionMetrics(test_settings) -@pytest.fixture -def health_metrics(test_settings: Settings) -> HealthMetrics: - return HealthMetrics(test_settings) - - @pytest.fixture def kubernetes_metrics(test_settings: 
 def kubernetes_metrics(test_settings: Settings) -> KubernetesMetrics:
    return KubernetesMetrics(test_settings)
diff --git a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
index d49c5ddd..a2745838 100644
--- a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
+++ b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
@@ -13,11 +13,6 @@ def test_connection_metrics_methods(test_settings: Settings) -> None:
    m.record_sse_message_sent("/events", "etype")
    m.record_sse_connection_duration(1.2, "/events")
    m.update_sse_draining_connections(1)
-    m.record_sse_shutdown_duration(0.5, "phase1")
-    m.update_sse_shutdown_duration(0.6, "phase2")
-    m.increment_event_bus_subscriptions()
-    m.decrement_event_bus_subscriptions(1)
-    m.update_event_bus_subscribers(3, "*")
 
 
 def test_coordinator_metrics_methods(test_settings: Settings) -> None:
diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
index 08a14b69..03f26f74 100644
--- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
+++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
@@ -36,8 +36,6 @@ def test_dlq_metrics_methods(test_settings: Settings) -> None:
    m.update_dlq_queue_size("topic", 10)
    m.update_dlq_queue_size("topic", 7)
    m.record_dlq_message_age(5.0)
-    m.record_dlq_retry_attempt("topic", "etype", 2)
    m.record_dlq_processing_error("topic", "etype", "err")
-    m.record_dlq_throughput(12.0, "topic")
    m.increment_dlq_queue_size("topic")
    m.decrement_dlq_queue_size("topic")
diff --git a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
index a102a02f..a61b3f5c 100644
--- a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
+++ b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
@@ -21,10 +21,6 @@ def test_execution_metrics_methods(test_settings: Settings) -> None:
    m.record_execution_assigned()
    m.record_execution_queued()
    m.record_execution_scheduled()
-    m.update_cpu_available(100.0)
-    m.update_memory_available(512.0)
-    m.update_gpu_available(1)
-    m.update_allocations_active(2)
 
 
 def test_event_metrics_methods(test_settings: Settings) -> None:
@@ -32,15 +28,7 @@ def test_event_metrics_methods(test_settings: Settings) -> None:
    m = EventMetrics(test_settings)
    m.record_event_published("execution.requested", None)
    m.record_event_processing_duration(0.05, "execution.requested")
-    m.record_pod_event_published("pod.running")
    m.record_event_replay_operation("prepare", "success")
-    m.update_event_buffer_size(3)
-    m.record_event_buffer_dropped()
-    m.record_event_buffer_processed()
-    m.record_event_buffer_latency(0.2)
-    m.set_event_buffer_backpressure(True)
-    m.set_event_buffer_backpressure(False)
-    m.record_event_buffer_memory_usage(12.3)
    m.record_event_stored("execution.requested", "events")
    m.record_events_processing_failed("topic", "etype", "group", "error")
    m.record_event_store_duration(0.1, "insert", "events")
@@ -49,10 +37,8 @@ def test_event_metrics_methods(test_settings: Settings) -> None:
    m.record_processing_duration(0.3, "topic", "etype", "group")
    m.record_kafka_message_produced("t")
    m.record_kafka_message_consumed("t", "g")
-    m.record_kafka_consumer_lag(10, "t", "g", 0)
    m.record_kafka_production_error("t", "e")
    m.record_kafka_consumption_error("t", "g", "e")
    m.update_event_bus_queue_size(1, "default")
    m.set_event_bus_queue_size(5, "default")
    m.set_event_bus_queue_size(2, "default")
-
diff --git a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
index 54d06d27..b1cb7f84 100644
--- a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
+++ b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
@@ -1,24 +1,7 @@
 import pytest
-from app.core.metrics import HealthMetrics
-from app.settings import Settings
 
 pytestmark = pytest.mark.unit
 
 
-def test_health_metrics_methods(test_settings: Settings) -> None:
-    """Test with no-op metrics."""
-    m = HealthMetrics(test_settings)
-    m.record_health_check_duration(0.1, "liveness", "basic")
-    m.record_health_check_failure("readiness", "db", "timeout")
-    m.update_health_check_status(1, "liveness", "basic")
-    m.record_health_status("svc", "healthy")
-    m.record_service_health_score("svc", 95.0)
-    m.update_liveness_status(True, "app")
-    m.update_readiness_status(False, "app")
-    m.record_dependency_health("mongo", True, 0.2)
-    m.record_health_check_timeout("readiness", "db")
-    m.update_component_health("kafka", True)
-
-
 def test_rate_limit_metrics_methods() -> None:
    """Test with no-op metrics."""
diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
index 3a12d8de..d6597c9c 100644
--- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
+++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
@@ -18,8 +18,6 @@ def test_kubernetes_metrics_methods(test_settings: Settings) -> None:
    m.record_k8s_pod_created("success", "python")
    m.record_k8s_pod_creation_duration(0.3, "python")
    m.record_k8s_config_map_created("ok")
-    m.record_k8s_network_policy_created("ok")
-    m.update_k8s_active_creations(1)
    m.increment_pod_monitor_watch_reconnects()
    m.record_pod_monitor_event_processing_duration(0.2, "ADDED")
    m.record_pod_monitor_event_published("PodRunning", "Running")
@@ -29,9 +27,6 @@ def test_kubernetes_metrics_methods(test_settings: Settings) -> None:
    m.record_pod_phase_transition("Pending", "Running", "pod1")
    m.record_pod_lifetime(12.0, "Succeeded", "python")
    m.update_pods_by_phase("Running", 2)
-    m.record_pod_resource_request("cpu", 500.0, "python")
-    m.record_pod_resource_limit("memory", 256.0, "python")
-    m.record_pods_per_node("node1", 7)
 
 
 def test_notification_metrics_methods(test_settings: Settings) -> None:
@@ -41,17 +36,12 @@ def test_notification_metrics_methods(test_settings: Settings) -> None:
    m.record_notification_failed("welcome", "smtp_error", channel="email")
    m.record_notification_delivery_time(0.5, "welcome", channel="email")
    m.record_notification_status_change("n1", "pending", "queued")
-    m.record_notification_read("welcome", 2.0)
-    m.record_notification_clicked("welcome")
-    m.update_unread_count("u1", 5)
-    m.update_unread_count("u1", 2)
+    m.record_notification_read("welcome")
+    m.decrement_unread_count("u1")
    m.record_notification_throttled("welcome", "u1")
    m.record_throttle_window_hit("u1")
    m.record_notification_retry("welcome", 1, False)
    m.record_notification_retry("welcome", 2, True)
-    m.record_batch_processed(10, 1.2, notification_type="welcome")
-    m.record_template_render(0.2, "tmpl", success=True)
-    m.record_template_render(0.1, "tmpl", success=False)
    m.record_webhook_delivery(0.3, 200, "/hooks/*")
    m.record_slack_delivery(0.4, "#general", False, error_type="rate_limited")
    m.update_active_subscriptions("u1", 3)
diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py
index 12fba98f..3e198ef0 100644
--- a/backend/tests/unit/core/metrics/test_metrics_classes.py
+++ b/backend/tests/unit/core/metrics/test_metrics_classes.py
@@ -6,7 +6,6 @@
    DLQMetrics,
    EventMetrics,
    ExecutionMetrics,
-    HealthMetrics,
    KubernetesMetrics,
    NotificationMetrics,
    RateLimitMetrics,
@@ -27,8 +26,6 @@ def test_connection_metrics_smoke(test_settings: Settings) -> None:
    m.record_sse_message_sent("exec", "evt")
    m.record_sse_connection_duration(0.1, "exec")
    m.update_sse_draining_connections(1)
-    m.record_sse_shutdown_duration(0.01, "notify")
-    m.update_event_bus_subscribers(3, "*")
 
 
 def test_event_metrics_smoke(test_settings: Settings) -> None:
@@ -36,14 +33,7 @@ def test_event_metrics_smoke(test_settings: Settings) -> None:
    m = EventMetrics(test_settings)
    m.record_event_published("execution.requested")
    m.record_event_processing_duration(0.01, "execution.requested")
-    m.record_pod_event_published("pod.created")
    m.record_event_replay_operation("replay", "success")
-    m.update_event_buffer_size(1)
-    m.record_event_buffer_dropped()
-    m.record_event_buffer_processed()
-    m.record_event_buffer_latency(0.005)
-    m.set_event_buffer_backpressure(True)
-    m.record_event_buffer_memory_usage(1.2)
    m.record_event_stored("x", "events")
    m.record_events_processing_failed("t", "x", "g", "ValueError")
    m.record_event_store_duration(0.01, "store", "events")
@@ -52,7 +42,6 @@ def test_event_metrics_smoke(test_settings: Settings) -> None:
    m.record_processing_duration(0.03, "t", "x", "g")
    m.record_kafka_message_produced("t")
    m.record_kafka_message_consumed("t", "g")
-    m.record_kafka_consumer_lag(10, "t", "g", 0)
    m.record_kafka_production_error("t", "E")
    m.record_kafka_consumption_error("t", "g", "E")
    m.update_event_bus_queue_size(1)
@@ -65,9 +54,8 @@ def test_other_metrics_classes_smoke(test_settings: Settings) -> None:
    DatabaseMetrics(test_settings).record_mongodb_operation("read", "ok")
    DLQMetrics(test_settings).record_dlq_message_received("topic", "type")
    ExecutionMetrics(test_settings).record_script_execution(ExecutionStatus.QUEUED, "python")
-    HealthMetrics(test_settings).record_health_check_duration(0.001, "liveness", "basic")
    KubernetesMetrics(test_settings).record_k8s_pod_created("success", "python")
    NotificationMetrics(test_settings).record_notification_sent("welcome", channel="email")
    RateLimitMetrics(test_settings).record_request("/api/test", True, "sliding_window")
    ReplayMetrics(test_settings).record_session_created("by_id", "kafka")
-    SecurityMetrics(test_settings).record_security_event("scan", severity="low")
+    SecurityMetrics(test_settings).record_authentication_attempt("password", True)
diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
index c7966e94..7db12533 100644
--- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
+++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
@@ -24,9 +24,6 @@ def test_replay_metrics_methods(test_settings: Settings) -> None:
    m.record_speed_multiplier(2.0, "by_id")
    m.record_delay_applied(0.05)
    m.record_batch_size(10, "by_id")
-    m.record_events_filtered("type", 5)
-    m.record_filter_effectiveness(5, 10, "type")
-    m.record_replay_memory_usage(123.0, "s1")
    m.update_replay_queue_size("s1", 10)
    m.update_replay_queue_size("s1", 4)
-
@@ -34,31 +31,16 @@ def test_replay_metrics_methods(test_settings: Settings) -> None:
 def test_security_metrics_methods(test_settings: Settings) -> None:
    """Test SecurityMetrics methods with no-op metrics."""
    m = SecurityMetrics(test_settings)
-    m.record_security_event("scan_started", severity="high", source="scanner")
-    m.record_security_violation("csrf", user_id="u1", ip_address="127.0.0.1")
    m.record_authentication_attempt("password", False, user_id="u1", duration_seconds=0.2)
-    m.update_active_sessions(2)
    m.increment_active_sessions()
    m.decrement_active_sessions()
    m.record_token_generated("access", 3600)
-    m.record_token_refreshed("access")
    m.record_token_revoked("access", "logout")
    m.record_token_validation_failure("access", "expired")
    m.record_authorization_check("/admin", "GET", False, user_role="user")
-    m.record_permission_check("write", True, user_id="u1")
    m.record_csrf_token_generated()
    m.record_csrf_validation_failure("missing")
-    m.record_network_policy_violation("np1", "pod1", violation_type="egress")
-    m.record_privilege_escalation_attempt("u1", "admin", True)
-    m.record_rate_limit_hit("/api")
-    m.record_rate_limit_violation("/api", limit=100)
-    m.record_api_key_created("kid")
-    m.record_api_key_revoked("kid", "compromised")
-    m.record_api_key_usage("kid", "/api")
-    m.record_audit_event("config_change", "u1", resource="system")
-    m.record_password_change("u1", True)
-    m.record_password_reset_request("u1", method="email")
+    m.record_password_reset_request("u1", method="admin")
    m.record_weak_password_attempt("u1", "common_password")
    m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked")
    m.record_account_locked("u1", "brute_force", duration_seconds=600)
-
diff --git a/backend/tests/unit/core/test_csrf.py b/backend/tests/unit/core/test_csrf.py
index bc20f7d2..3c0838c2 100644
--- a/backend/tests/unit/core/test_csrf.py
+++ b/backend/tests/unit/core/test_csrf.py
@@ -1,4 +1,5 @@
 import pytest
+from app.core.metrics import SecurityMetrics
 from app.core.security import SecurityService
 from app.domain.user import CSRFValidationError
 from app.settings import Settings
@@ -28,9 +29,9 @@ def make_request(
 class TestCSRFTokenGeneration:
    """Tests for CSRF token generation."""
 
-    def test_generates_signed_token_format(self, test_settings: Settings) -> None:
+    def test_generates_signed_token_format(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None:
        """CSRF token has nonce.signature format."""
-        security = SecurityService(test_settings)
+        security = SecurityService(test_settings, security_metrics)
 
        token = security.generate_csrf_token("session-abc")
 
@@ -40,17 +41,19 @@
        assert len(nonce) > 0
        assert len(signature) == 64  # sha256 hexdigest
 
-    def test_generates_unique_tokens(self, test_settings: Settings) -> None:
+    def test_generates_unique_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None:
        """Each CSRF token is unique (different nonce each time)."""
-        security = SecurityService(test_settings)
+        security = SecurityService(test_settings, security_metrics)
 
        tokens = {security.generate_csrf_token("same-session") for _ in range(100)}
 
        assert len(tokens) == 100
 
-    def test_different_sessions_produce_different_tokens(self, test_settings: Settings) -> None:
+    def test_different_sessions_produce_different_tokens(
+        self, test_settings: Settings, security_metrics: SecurityMetrics,
+    ) -> None:
        """Tokens for different sessions differ even with same nonce derivation."""
-
security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token_a = security.generate_csrf_token("session-a") token_b = security.generate_csrf_token("session-b") @@ -61,18 +64,18 @@ def test_different_sessions_produce_different_tokens(self, test_settings: Settin class TestCSRFTokenValidation: """Tests for CSRF token validation (double-submit check).""" - def test_validates_matching_tokens(self, test_settings: Settings) -> None: + def test_validates_matching_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Matching CSRF tokens pass validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-1") result = security.validate_csrf_token(token, token) assert result is True - def test_rejects_mismatched_tokens(self, test_settings: Settings) -> None: + def test_rejects_mismatched_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Mismatched CSRF tokens fail validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token1 = security.generate_csrf_token("session-1") token2 = security.generate_csrf_token("session-2") @@ -91,10 +94,11 @@ def test_rejects_mismatched_tokens(self, test_settings: Settings) -> None: ids=["empty_header", "empty_cookie", "both_empty"], ) def test_rejects_empty_tokens( - self, test_settings: Settings, header_token: str, cookie_token: str + self, test_settings: Settings, security_metrics: SecurityMetrics, + header_token: str, cookie_token: str, ) -> None: """Empty CSRF tokens fail validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) result = security.validate_csrf_token(header_token, cookie_token) @@ -104,29 +108,29 @@ def test_rejects_empty_tokens( class TestCSRFSignatureVerification: """Tests for CSRF HMAC signature verification.""" - def test_valid_signature_passes(self, test_settings: Settings) -> None: + def test_valid_signature_passes(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token verified against the same session_id succeeds.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("my-session") assert security._verify_csrf_signature(token, "my-session") is True - def test_wrong_session_rejected(self, test_settings: Settings) -> None: + def test_wrong_session_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token signed for session A fails verification against session B.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-a") assert security._verify_csrf_signature(token, "session-b") is False - def test_unsigned_token_rejected(self, test_settings: Settings) -> None: + def test_unsigned_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """A plain random string without signature structure is rejected.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert security._verify_csrf_signature("plain-random-token", "session") is False - def test_tampered_signature_rejected(self, test_settings: Settings) -> None: + def test_tampered_signature_rejected(self, test_settings: 
Settings, security_metrics: SecurityMetrics) -> None: """Modifying the signature portion causes rejection.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-x") nonce, sig = token.split(".", 1) @@ -139,23 +143,23 @@ class TestCSRFExemptPaths: """Tests for CSRF exempt path configuration.""" def test_exempt_paths_includes_login_and_register( - self, test_settings: Settings + self, test_settings: Settings, security_metrics: SecurityMetrics, ) -> None: """CSRF exempt paths include login and register.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert "/api/v1/auth/login" in security.CSRF_EXEMPT_PATHS assert "/api/v1/auth/register" in security.CSRF_EXEMPT_PATHS - def test_logout_is_not_exempt(self, test_settings: Settings) -> None: + def test_logout_is_not_exempt(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Logout is NOT exempt from CSRF validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert "/api/v1/auth/logout" not in security.CSRF_EXEMPT_PATHS - def test_exempt_paths_is_frozenset(self, test_settings: Settings) -> None: + def test_exempt_paths_is_frozenset(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """CSRF exempt paths is a frozenset (immutable).""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert isinstance(security.CSRF_EXEMPT_PATHS, frozenset) @@ -163,18 +167,18 @@ def test_exempt_paths_is_frozenset(self, test_settings: Settings) -> None: class TestCSRFRequestValidation: """Tests for CSRF validation from HTTP requests.""" - def test_skips_get_requests(self, test_settings: Settings) -> None: + def test_skips_get_requests(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """GET requests skip CSRF validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request("GET", "/api/v1/anything") assert security.validate_csrf_from_request(req) == "skip" def test_missing_header_raises_when_authenticated( - self, test_settings: Settings + self, test_settings: Settings, security_metrics: SecurityMetrics, ) -> None: """Missing CSRF header raises error for authenticated POST.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request( "POST", "/api/v1/items", @@ -184,9 +188,9 @@ def test_missing_header_raises_when_authenticated( with pytest.raises(CSRFValidationError): security.validate_csrf_from_request(req) - def test_valid_tokens_pass(self, test_settings: Settings) -> None: + def test_valid_tokens_pass(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Valid signed CSRF tokens pass full request validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) access_token = "my-access-token-value" token = security.generate_csrf_token(access_token) req = make_request( @@ -198,9 +202,9 @@ def test_valid_tokens_pass(self, test_settings: Settings) -> None: assert security.validate_csrf_from_request(req) == token - def test_forged_token_rejected(self, test_settings: Settings) -> None: + def test_forged_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: 
"""Unsigned token matching in header+cookie is rejected (signature check).""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) forged = "forged-random-value" req = make_request( "POST", @@ -212,9 +216,9 @@ def test_forged_token_rejected(self, test_settings: Settings) -> None: with pytest.raises(CSRFValidationError, match="signature invalid"): security.validate_csrf_from_request(req) - def test_wrong_session_token_rejected(self, test_settings: Settings) -> None: + def test_wrong_session_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token signed for one session rejected when presented with different access_token.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-A-jwt") req = make_request( "POST", @@ -226,9 +230,9 @@ def test_wrong_session_token_rejected(self, test_settings: Settings) -> None: with pytest.raises(CSRFValidationError, match="signature invalid"): security.validate_csrf_from_request(req) - def test_logout_requires_csrf(self, test_settings: Settings) -> None: + def test_logout_requires_csrf(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """POST to logout with authentication requires CSRF token.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request( "POST", "/api/v1/auth/logout", diff --git a/backend/tests/unit/core/test_security.py b/backend/tests/unit/core/test_security.py index 27bd4386..7aedee5e 100644 --- a/backend/tests/unit/core/test_security.py +++ b/backend/tests/unit/core/test_security.py @@ -4,6 +4,7 @@ import jwt import pytest +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.domain.enums import UserRole from app.domain.user import InvalidCredentialsError @@ -15,9 +16,9 @@ class TestPasswordHashing: """Test password hashing functionality.""" @pytest.fixture - def security_svc(self, test_settings: Settings) -> SecurityService: + def security_svc(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: """Create SecurityService instance.""" - return SecurityService(test_settings) + return SecurityService(test_settings, security_metrics) def test_password_hash_creates_different_hash(self, security_svc: SecurityService) -> None: """Test that password hashing creates unique hashes.""" @@ -72,9 +73,9 @@ class TestSecurityService: """Test SecurityService functionality.""" @pytest.fixture - def security_service(self, test_settings: Settings) -> SecurityService: + def security_service(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: """Create SecurityService instance using test settings.""" - return SecurityService(test_settings) + return SecurityService(test_settings, security_metrics) def test_create_access_token_basic( self, @@ -283,9 +284,9 @@ def test_token_has_only_expected_claims(self, security_service: SecurityService) assert decoded["role"] == UserRole.USER assert "extra_field" in decoded # Claims are carried as provided - def test_password_context_configuration(self, test_settings: Settings) -> None: + def test_password_context_configuration(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Test password context is properly configured.""" - svc = SecurityService(test_settings) + svc = SecurityService(test_settings, 
security_metrics) password = "test_password" hashed = svc.get_password_hash(password) assert svc.verify_password(password, hashed) @@ -311,8 +312,8 @@ class TestDecodeToken: """Test SecurityService.decode_token method.""" @pytest.fixture - def security_service(self, test_settings: Settings) -> SecurityService: - return SecurityService(test_settings) + def security_service(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: + return SecurityService(test_settings, security_metrics) def test_valid_token_returns_username(self, security_service: SecurityService) -> None: token = security_service.create_access_token( From f238b69a3127a6ee7c1f33637a44aa22f64ed225 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:23:03 +0100 Subject: [PATCH 5/9] fix: wrong error message for invalid credentials --- backend/app/domain/user/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/domain/user/exceptions.py b/backend/app/domain/user/exceptions.py index dc1b9acb..8b87c7a3 100644 --- a/backend/app/domain/user/exceptions.py +++ b/backend/app/domain/user/exceptions.py @@ -11,7 +11,7 @@ def __init__(self, message: str = "Not authenticated") -> None: class InvalidCredentialsError(UnauthorizedError): """Raised when credentials are invalid.""" - def __init__(self, message: str = "Could not validate credentials") -> None: + def __init__(self, message: str = "Invalid credentials") -> None: super().__init__(message) From 245e5378733cd3d94f8741d742fc02c01258bfa5 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:55:37 +0100 Subject: [PATCH 6/9] fix: drop per-user metric labels, fix histogram quantile queries, mitigate login timing side channel --- backend/app/core/metrics/dlq.py | 7 +-- backend/app/core/metrics/notifications.py | 35 +++------- backend/app/core/metrics/security.py | 7 ++- backend/app/dlq/manager.py | 2 +- backend/app/services/auth_service.py | 29 +++++++---- backend/app/services/notification_service.py | 5 +- .../dashboards/coordinator-execution.json | 14 +++--- .../provisioning/dashboards/event-replay.json | 12 ++--- .../dashboards/event-stream-monitoring.json | 13 ----- .../dashboards/http-middleware.json | 10 ++-- .../dashboards/kubernetes-pods.json | 8 +-- .../dashboards/notifications.json | 50 ++++--------------- .../dashboards/security-auth.json | 4 +- .../tests/contract/test_grafana_metrics.py | 11 +++- ...st_kubernetes_and_notifications_metrics.py | 11 ++-- .../test_replay_and_security_metrics.py | 4 +- 16 files changed, 88 insertions(+), 134 deletions(-) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index 18383cdc..1a73d477 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -5,6 +5,8 @@ class DLQMetrics(BaseMetrics): """Metrics for Dead Letter Queue operations.""" def _create_instruments(self) -> None: + self._dlq_sizes: dict[str, int] = {} + self.dlq_messages_received = self._meter.create_counter( name="dlq.messages.received.total", description="Total number of messages received in DLQ", unit="1" ) @@ -50,12 +52,11 @@ def record_dlq_processing_duration(self, duration_seconds: float, operation: str self.dlq_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_dlq_queue_size(self, original_topic: str, size: int) -> None: - key = f"_dlq_size_{original_topic}" - current_val = getattr(self, key, 0) + current_val = self._dlq_sizes.get(original_topic, 0) delta = size - current_val if delta != 0: self.dlq_queue_size.add(delta, attributes={"original_topic": original_topic}) - setattr(self, key,
size) + self._dlq_sizes[original_topic] = size def record_dlq_message_age(self, age_seconds: float) -> None: self.dlq_message_age.record(age_seconds) diff --git a/backend/app/core/metrics/notifications.py b/backend/app/core/metrics/notifications.py index 13081829..b13f23bb 100644 --- a/backend/app/core/metrics/notifications.py +++ b/backend/app/core/metrics/notifications.py @@ -54,10 +54,6 @@ def _create_instruments(self) -> None: name="notifications.read.total", description="Total notifications read by users", unit="1" ) - self.unread_count = self._meter.create_up_down_counter( - name="notifications.unread.count", description="Current unread notifications per user", unit="1" - ) - # Throttling metrics self.notifications_throttled = self._meter.create_counter( name="notifications.throttled.total", description="Total notifications throttled", unit="1" @@ -101,12 +97,6 @@ def _create_instruments(self) -> None: ) # Subscription metrics - self.subscriptions_active = self._meter.create_up_down_counter( - name="notification.subscriptions.active", - description="Number of active notification subscriptions", - unit="1", - ) - self.subscription_changes = self._meter.create_counter( name="notification.subscription.changes.total", description="Total subscription changes", unit="1" ) @@ -146,14 +136,11 @@ def record_notification_status_change(self, notification_id: str, from_status: s def record_notification_read(self, notification_type: str) -> None: self.notifications_read.add(1, attributes={"category": notification_type}) - def decrement_unread_count(self, user_id: str) -> None: - self.unread_count.add(-1, attributes={"user_id": user_id}) - - def record_notification_throttled(self, notification_type: str, user_id: str) -> None: - self.notifications_throttled.add(1, attributes={"category": notification_type, "user_id": user_id}) + def record_notification_throttled(self, notification_type: str) -> None: + self.notifications_throttled.add(1, attributes={"category": notification_type}) - def record_throttle_window_hit(self, user_id: str) -> None: - self.throttle_window_hits.add(1, attributes={"user_id": user_id}) + def record_throttle_window_hit(self) -> None: + self.throttle_window_hits.add(1) def record_notification_retry(self, notification_type: str, attempt_number: int, success: bool) -> None: self.notification_retries.add( @@ -177,20 +164,12 @@ def record_slack_delivery( if not success and error_type: self.slack_api_errors.add(1, attributes={"error_type": error_type, "channel": channel}) - def update_active_subscriptions(self, user_id: str, count: int) -> None: - key = f"_subscriptions_{user_id}" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.subscriptions_active.add(delta, attributes={"user_id": user_id}) - setattr(self, key, count) - - def record_subscription_change(self, user_id: str, notification_type: str, action: str) -> None: + def record_subscription_change(self, channel: str, enabled: bool | None) -> None: + action = "enabled" if enabled is True else "disabled" if enabled is False else "updated" self.subscription_changes.add( 1, attributes={ - "user_id": user_id, - "category": notification_type, + "channel": channel, "action": action, }, ) diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index 589fb1c8..15086dfb 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -131,8 +131,8 @@ def record_csrf_validation_failure(self, reason: str) -> None: def 
record_password_reset_request(self, user_id: str, method: str = "admin") -> None: self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) - def record_weak_password_attempt(self, user_id: str, weakness_type: str) -> None: - self.weak_password_attempts.add(1, attributes={"user_id": user_id, "weakness_type": weakness_type}) + def record_weak_password_attempt(self, weakness_type: str) -> None: + self.weak_password_attempts.add(1, attributes={"weakness_type": weakness_type}) def record_brute_force_attempt( self, ip_address: str, target_user: str | None = None, action_taken: str = "logged" @@ -146,11 +146,10 @@ def record_brute_force_attempt( }, ) - def record_account_locked(self, user_id: str, reason: str, duration_seconds: float | None = None) -> None: + def record_account_locked(self, reason: str, duration_seconds: float | None = None) -> None: self.accounts_locked.add( 1, attributes={ - "user_id": user_id, "reason": reason, "duration": str(duration_seconds) if duration_seconds else "permanent", }, diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 968219b5..1e23a70f 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -246,7 +246,7 @@ async def retry_messages_batch(self, event_ids: list[str]) -> DLQBatchRetryResul details.append(DLQRetryResult(event_id=event_id, status="failed", error="Retry failed")) except Exception as e: self.logger.error(f"Error retrying message {event_id}: {e}") - self.metrics.record_dlq_processing_error("batch_retry", event_id, type(e).__name__) + self.metrics.record_dlq_processing_error("batch_retry", "unknown", type(e).__name__) failed += 1 details.append(DLQRetryResult(event_id=event_id, status="failed", error=str(e))) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index 243ccd31..5132a736 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -31,6 +31,13 @@ class AuthService: + # Pre-computed bcrypt hash used as a timing side-channel mitigation. + # When a login attempt targets a non-existent username, we still run + # verify_password against this hash so the response time is comparable + # to the "user exists, wrong password" path. Without this, an attacker + # could measure response times to enumerate valid usernames. 
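+    # (Illustrative, not from the original change: such a value can be generated
+    # offline, e.g. with the bcrypt package via
+    # bcrypt.hashpw(b"any-password", bcrypt.gensalt(rounds=12)).
+    # The literal itself is arbitrary; only the $2b$12$ cost factor matters, so
+    # the dummy verification costs roughly as much as checking a real hash.)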
+ _dummy_hash = "$2b$12$hDE3I.Y1MHugA561T/NQgebE/IVQS.2YliUDGfqADq7v/MTUG6.Bi" + def __init__( self, user_repo: UserRepository, @@ -106,7 +113,7 @@ async def _fail_login( key=username, ) if locked: - self.security_metrics.record_account_locked(username, "brute_force") + self.security_metrics.record_account_locked("brute_force") raise AccountLockedError("Account locked due to too many failed attempts") raise InvalidCredentialsError() @@ -123,10 +130,11 @@ async def login( user = await self.user_repo.get_user(username) if not user: + self.security_service.verify_password(password, self._dummy_hash) await self._fail_login(username, "user_not_found", ip_address, user_agent) if not self.security_service.verify_password(password, user.hashed_password): - await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=str(user.user_id)) + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) await self._lockout.clear_attempts(username) @@ -149,11 +157,11 @@ async def login( await self._producer.produce( event_to_produce=UserLoggedInEvent( - user_id=str(user.user_id), + user_id=user.user_id, login_method=LoginMethod.PASSWORD, ip_address=ip_address, user_agent=user_agent, - metadata=self._build_metadata(user_id=str(user.user_id)), + metadata=self._build_metadata(user_id=user.user_id), ), key=user.username, ) @@ -177,7 +185,7 @@ async def register( effective = await self._runtime_settings.get_effective_settings() min_len = effective.password_min_length if len(password) < min_len: - self.security_metrics.record_weak_password_attempt(username, "too_short") + self.security_metrics.record_weak_password_attempt("too_short") raise ValidationError(f"Password must be at least {min_len} characters") existing = await self.user_repo.get_user(username) @@ -210,10 +218,10 @@ async def register( await self._producer.produce( event_to_produce=UserRegisteredEvent( - user_id=str(created_user.user_id), + user_id=created_user.user_id, username=created_user.username, email=created_user.email, - metadata=self._build_metadata(user_id=str(created_user.user_id)), + metadata=self._build_metadata(user_id=created_user.user_id), ), key=created_user.username, ) @@ -224,11 +232,14 @@ async def publish_logout_event(self, token: str | None) -> None: if not token: return username = self.security_service.decode_token(token) + user = await self.user_repo.get_user(username) + if not user: + return await self._producer.produce( event_to_produce=UserLoggedOutEvent( - user_id=username, + user_id=user.user_id, logout_reason="user_initiated", - metadata=self._build_metadata(user_id=username), + metadata=self._build_metadata(user_id=user.user_id), ), key=username, ) diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index aaaa47d8..bd54b508 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -168,7 +168,7 @@ async def create_notification( f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)" ) self.logger.warning(error_msg) - self.metrics.record_notification_throttled("general", user_id) + self.metrics.record_notification_throttled("general") raise NotificationThrottledError( user_id, self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, @@ -546,8 +546,7 @@ async def update_subscription( raise NotificationValidationError("slack_webhook is required when enabling SLACK") result = await self.repository.upsert_subscription(user_id, channel, update_data) - action = "enabled" 
if update_data.enabled else "updated" - self.metrics.record_subscription_change(user_id, channel, action) + self.metrics.record_subscription_change(channel, update_data.enabled) return result async def mark_all_as_read(self, user_id: str) -> int: diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 4026abe6..6481d9ad 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -37,7 +37,7 @@ "id": 1, "targets": [ { - "expr": "histogram_quantile(0.95, rate(coordinator_scheduling_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(coordinator_scheduling_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -61,7 +61,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(coordinator_queue_wait_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(coordinator_queue_wait_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -144,7 +144,7 @@ "id": 5, "targets": [ { - "expr": "histogram_quantile(0.95, rate(execution_queue_wait_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(execution_queue_wait_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -210,12 +210,12 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.50, rate(script_memory_usage_MiB_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(script_memory_usage_MiB_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(script_memory_usage_MiB_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(script_memory_usage_MiB_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } @@ -239,12 +239,12 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(script_memory_utilization_percent_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(script_memory_utilization_percent_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(script_memory_utilization_percent_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(script_memory_utilization_percent_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/event-replay.json b/backend/grafana/provisioning/dashboards/event-replay.json index 56a9eb47..edac9b2a 100644 --- a/backend/grafana/provisioning/dashboards/event-replay.json +++ b/backend/grafana/provisioning/dashboards/event-replay.json @@ -191,7 +191,7 @@ "id": 6, "targets": [ { - "expr": "histogram_quantile(0.95, rate(replay_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(replay_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -215,7 +215,7 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.95, rate(replay_event_processing_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(replay_event_processing_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -239,7 +239,7 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(replay_throughput_event_per_second_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_throughput_event_per_second_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } @@ -263,7 +263,7 @@ "id": 9, "targets": [ { - "expr": 
"histogram_quantile(0.50, rate(replay_batch_size_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_batch_size_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } @@ -329,12 +329,12 @@ "id": 11, "targets": [ { - "expr": "histogram_quantile(0.50, rate(replay_speed_multiplier_x_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_speed_multiplier_x_bucket[5m])) by (le))", "legendFormat": "Multiplier p50", "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(replay_delay_applied_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_delay_applied_seconds_bucket[5m])) by (le))", "legendFormat": "Delay p50", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json index fde9d831..f085c6d8 100644 --- a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json +++ b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json @@ -600,19 +600,6 @@ "x": 0, "y": 16 }, - "id": 60, - "panels": [], - "title": "Event Buffer Performance", - "type": "row" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, "id": 65, "panels": [], "title": "Event Flow Analysis", diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 908d3a90..37240637 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -61,7 +61,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -108,12 +108,12 @@ "id": 4, "targets": [ { - "expr": "histogram_quantile(0.50, rate(http_request_size_bytes_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(http_request_size_bytes_bucket[5m])) by (le))", "legendFormat": "Request p50", "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(http_response_size_bytes_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(http_response_size_bytes_bucket[5m])) by (le))", "legendFormat": "Response p50", "refId": "B" } @@ -179,7 +179,7 @@ "id": 6, "targets": [ { - "expr": "histogram_quantile(0.95, rate(mongodb_event_query_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(mongodb_event_query_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -231,7 +231,7 @@ "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(idempotency_processing_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(idempotency_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "Idempotency p95", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/kubernetes-pods.json b/backend/grafana/provisioning/dashboards/kubernetes-pods.json index 47cf6dd4..812200ac 100644 --- a/backend/grafana/provisioning/dashboards/kubernetes-pods.json +++ b/backend/grafana/provisioning/dashboards/kubernetes-pods.json @@ -66,7 +66,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(pod_creation_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_creation_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -173,12 +173,12 @@ "id": 6, "targets": [ { - "expr": 
"histogram_quantile(0.50, rate(pod_lifetime_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(pod_lifetime_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(pod_lifetime_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_lifetime_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } @@ -290,7 +290,7 @@ "id": 10, "targets": [ { - "expr": "histogram_quantile(0.95, rate(pod_monitor_processing_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_monitor_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } diff --git a/backend/grafana/provisioning/dashboards/notifications.json b/backend/grafana/provisioning/dashboards/notifications.json index 97096a25..a076c8e9 100644 --- a/backend/grafana/provisioning/dashboards/notifications.json +++ b/backend/grafana/provisioning/dashboards/notifications.json @@ -101,29 +101,6 @@ "title": "Queued", "type": "stat" }, - { - "datasource": "Victoria Metrics", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 1 - }, - "id": 4, - "targets": [ - { - "expr": "notifications_unread_count", - "refId": "A" - } - ], - "title": "Unread", - "type": "stat" - }, { "collapsed": false, "gridPos": { @@ -189,7 +166,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -201,7 +178,7 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_channel_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_channel_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -250,7 +227,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -262,7 +239,7 @@ "id": 9, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -357,7 +334,7 @@ "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(notification_retry_success_rate_percent_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(notification_retry_success_rate_percent_bucket[5m])) by (le))", "legendFormat": "Success Rate p50", "refId": "B" } @@ -382,7 +359,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -394,7 +371,7 @@ "id": 13, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_webhook_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_webhook_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" }, @@ -411,7 +388,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -423,7 +400,7 @@ "id": 14, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_slack_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_slack_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" }, @@ -464,15 +441,10 @@ }, "id": 15, "targets": [ - { - "expr": "notification_subscriptions_active", - "legendFormat": "Active", - "refId": "A" - }, { "expr": 
"rate(notification_subscription_changes_total[5m])", - "legendFormat": "Changes", - "refId": "B" + "legendFormat": "Changes/s", + "refId": "A" } ], "title": "Subscriptions", diff --git a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json index 191696bf..25cf242f 100644 --- a/backend/grafana/provisioning/dashboards/security-auth.json +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -66,7 +66,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(authentication_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(authentication_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -229,7 +229,7 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(token_expiry_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(token_expiry_time_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } diff --git a/backend/tests/contract/test_grafana_metrics.py b/backend/tests/contract/test_grafana_metrics.py index 41dcec5b..7b63bada 100644 --- a/backend/tests/contract/test_grafana_metrics.py +++ b/backend/tests/contract/test_grafana_metrics.py @@ -49,19 +49,26 @@ @pytest.fixture(scope="module") -def prometheus_families() -> dict[str, set[str]]: +def prometheus_families(request: pytest.FixtureRequest) -> dict[str, set[str]]: """Instantiate every metric class through the real OTel -> Prometheus pipeline. Returns: Mapping of family name to set of sample names produced by that family. """ # pytest-env sets OTEL_SDK_DISABLED=true; override so the real SDK is active. - os.environ.pop("OTEL_SDK_DISABLED", None) + old_otel_disabled = os.environ.pop("OTEL_SDK_DISABLED", None) reader = PrometheusMetricReader() provider = MeterProvider(metric_readers=[reader]) otel_api.set_meter_provider(provider) + def _teardown() -> None: + provider.shutdown() + if old_otel_disabled is not None: + os.environ["OTEL_SDK_DISABLED"] = old_otel_disabled + + request.addfinalizer(_teardown) + for _, mod_name, _ in pkgutil.iter_modules(metrics_pkg.__path__): importlib.import_module(f"app.core.metrics.{mod_name}") diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py index d6597c9c..c1ae315a 100644 --- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py +++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py @@ -37,16 +37,15 @@ def test_notification_metrics_methods(test_settings: Settings) -> None: m.record_notification_delivery_time(0.5, "welcome", channel="email") m.record_notification_status_change("n1", "pending", "queued") m.record_notification_read("welcome") - m.decrement_unread_count("u1") - m.record_notification_throttled("welcome", "u1") - m.record_throttle_window_hit("u1") + m.record_notification_throttled("welcome") + m.record_throttle_window_hit() m.record_notification_retry("welcome", 1, False) m.record_notification_retry("welcome", 2, True) m.record_webhook_delivery(0.3, 200, "/hooks/*") m.record_slack_delivery(0.4, "#general", False, error_type="rate_limited") - m.update_active_subscriptions("u1", 3) - m.update_active_subscriptions("u1", 1) - m.record_subscription_change("u1", "welcome", "subscribe") + m.record_subscription_change("email", True) + m.record_subscription_change("webhook", False) + m.record_subscription_change("slack", None) 
m.increment_pending_notifications() m.decrement_pending_notifications() m.increment_queued_notifications() diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py index 7db12533..82c52aaf 100644 --- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py +++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py @@ -41,6 +41,6 @@ def test_security_metrics_methods(test_settings: Settings) -> None: m.record_csrf_token_generated() m.record_csrf_validation_failure("missing") m.record_password_reset_request("u1", method="admin") - m.record_weak_password_attempt("u1", "common_password") + m.record_weak_password_attempt("common_password") m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked") - m.record_account_locked("u1", "brute_force", duration_seconds=600) + m.record_account_locked("brute_force", duration_seconds=600) From 2fab9b30f155e11970a1100f4b61a488a23c17be Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 21:23:08 +0100 Subject: [PATCH 7/9] fix: rework authentication metrics api and correct dashboard units --- backend/app/core/metrics/dlq.py | 5 -- backend/app/core/metrics/security.py | 34 +++++--- .../app/services/admin/admin_user_service.py | 2 +- backend/app/services/auth_service.py | 79 ++++++++++--------- .../dashboards/coordinator-execution.json | 10 +-- .../dashboards/http-middleware.json | 6 +- .../dashboards/security-auth.json | 4 +- .../metrics/test_database_and_dlq_metrics.py | 2 - .../unit/core/metrics/test_metrics_classes.py | 2 +- .../test_replay_and_security_metrics.py | 6 +- 10 files changed, 77 insertions(+), 73 deletions(-) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index 1a73d477..d4262223 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -66,8 +66,3 @@ def record_dlq_processing_error(self, original_topic: str, event_type: str, erro 1, attributes={"original_topic": original_topic, "event_type": event_type, "error_type": error_type} ) - def increment_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add(1, attributes={"original_topic": original_topic}) - - def decrement_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add(-1, attributes={"original_topic": original_topic}) diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index 15086dfb..82f4ca7c 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -1,3 +1,7 @@ +import time +from collections.abc import Generator +from contextlib import contextmanager + from app.core.metrics.base import BaseMetrics @@ -75,18 +79,22 @@ def _create_instruments(self) -> None: name="accounts.locked.total", description="Total number of accounts locked due to security", unit="1" ) - def record_authentication_attempt( - self, method: str, success: bool, user_id: str | None = None, duration_seconds: float | None = None - ) -> None: - self.authentication_attempts.add( - 1, attributes={"method": method, "success": str(success), "user_id": user_id or "unknown"} - ) - + def record_authentication_attempt(self, method: str, success: bool, duration_seconds: float) -> None: + attrs = {"method": method, "success": str(success)} + self.authentication_attempts.add(1, attributes=attrs) if not success: - self.authentication_failures.add(1, attributes={"method": method, "user_id": user_id or "unknown"}) - - if duration_seconds is not None: -
self.authentication_duration.record(duration_seconds, attributes={"method": method}) + self.authentication_failures.add(1, attributes={"method": method}) + self.authentication_duration.record(duration_seconds, attributes={"method": method}) + + @contextmanager + def track_authentication(self, method: str) -> Generator[None, None, None]: + start = time.monotonic() + success = False + try: + yield + success = True + finally: + self.record_authentication_attempt(method, success, time.monotonic() - start) def increment_active_sessions(self) -> None: self.active_sessions.add(1) @@ -128,8 +136,8 @@ def record_csrf_token_generated(self) -> None: def record_csrf_validation_failure(self, reason: str) -> None: self.csrf_validation_failures.add(1, attributes={"reason": reason}) - def record_password_reset_request(self, user_id: str, method: str = "admin") -> None: - self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) + def record_password_reset_request(self, method: str = "admin") -> None: + self.password_reset_requests.add(1, attributes={"method": method}) def record_weak_password_attempt(self, weakness_type: str) -> None: self.weak_password_attempts.add(1, attributes={"weakness_type": weakness_type}) diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index b9ed09af..bdd61a76 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -205,7 +205,7 @@ async def reset_user_password(self, *, admin_user_id: str, user_id: str, new_pas self.logger.info( "Admin resetting user password", admin_user_id=admin_user_id, target_user_id=user_id ) - self._security_metrics.record_password_reset_request(user_id, method="admin") + self._security_metrics.record_password_reset_request(method="admin") hashed = self._security.get_password_hash(new_password) pr = PasswordReset(user_id=user_id, new_password=hashed) ok = await self._users.reset_user_password(pr) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index 5132a736..713c72cd 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -127,52 +127,53 @@ async def login( if await self._lockout.check_locked(username): raise AccountLockedError("Account temporarily locked due to too many failed attempts") - user = await self.user_repo.get_user(username) - - if not user: - self.security_service.verify_password(password, self._dummy_hash) - await self._fail_login(username, "user_not_found", ip_address, user_agent) + with self.security_metrics.track_authentication("login"): + user = await self.user_repo.get_user(username) - if not self.security_service.verify_password(password, user.hashed_password): - await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) + if not user: + self.security_service.verify_password(password, self._dummy_hash) + await self._fail_login(username, "user_not_found", ip_address, user_agent) - await self._lockout.clear_attempts(username) + if not self.security_service.verify_password(password, user.hashed_password): + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) - effective = await self._runtime_settings.get_effective_settings() - session_timeout = effective.session_timeout_minutes - - self.logger.info( - "Login successful", - username=user.username, - client_ip=ip_address, - user_agent=user_agent, - 
token_expires_in_minutes=session_timeout, - ) + await self._lockout.clear_attempts(username) - access_token_expires = timedelta(minutes=session_timeout) - access_token = self.security_service.create_access_token( - data={"sub": user.username}, expires_delta=access_token_expires, - ) - csrf_token = self.security_service.generate_csrf_token(access_token) + effective = await self._runtime_settings.get_effective_settings() + session_timeout = effective.session_timeout_minutes - await self._producer.produce( - event_to_produce=UserLoggedInEvent( - user_id=user.user_id, - login_method=LoginMethod.PASSWORD, - ip_address=ip_address, + self.logger.info( + "Login successful", + username=user.username, + client_ip=ip_address, user_agent=user_agent, - metadata=self._build_metadata(user_id=user.user_id), - ), - key=user.username, - ) + token_expires_in_minutes=session_timeout, + ) - return LoginResult( - username=user.username, - role=user.role, - access_token=access_token, - csrf_token=csrf_token, - session_timeout_minutes=session_timeout, - ) + access_token_expires = timedelta(minutes=session_timeout) + access_token = self.security_service.create_access_token( + data={"sub": user.username}, expires_delta=access_token_expires, + ) + csrf_token = self.security_service.generate_csrf_token(access_token) + + await self._producer.produce( + event_to_produce=UserLoggedInEvent( + user_id=user.user_id, + login_method=LoginMethod.PASSWORD, + ip_address=ip_address, + user_agent=user_agent, + metadata=self._build_metadata(user_id=user.user_id), + ), + key=user.username, + ) + + return LoginResult( + username=user.username, + role=user.role, + access_token=access_token, + csrf_token=csrf_token, + session_timeout_minutes=session_timeout, + ) async def register( self, diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 6481d9ad..0e120ccd 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -25,7 +25,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -49,7 +49,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -132,7 +132,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -198,7 +198,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "decmbytes" } }, "gridPos": { @@ -227,7 +227,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "percent" } }, "gridPos": { diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 37240637..506e3d21 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -49,7 +49,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -96,7 +96,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "bytes" } }, "gridPos": { @@ -167,7 +167,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { diff --git 
a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json index 25cf242f..4c4ca87f 100644 --- a/backend/grafana/provisioning/dashboards/security-auth.json +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -54,7 +54,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -217,7 +217,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py index 03f26f74..7484a573 100644 --- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py +++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py @@ -37,5 +37,3 @@ def test_dlq_metrics_methods(test_settings: Settings) -> None: m.update_dlq_queue_size("topic", 7) m.record_dlq_message_age(5.0) m.record_dlq_processing_error("topic", "etype", "err") - m.increment_dlq_queue_size("topic") - m.decrement_dlq_queue_size("topic") diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py index 3e198ef0..fd08c5ca 100644 --- a/backend/tests/unit/core/metrics/test_metrics_classes.py +++ b/backend/tests/unit/core/metrics/test_metrics_classes.py @@ -58,4 +58,4 @@ def test_other_metrics_classes_smoke(test_settings: Settings) -> None: NotificationMetrics(test_settings).record_notification_sent("welcome", channel="email") RateLimitMetrics(test_settings).record_request("/api/test", True, "sliding_window") ReplayMetrics(test_settings).record_session_created("by_id", "kafka") - SecurityMetrics(test_settings).record_authentication_attempt("password", True) + SecurityMetrics(test_settings).record_authentication_attempt("password", True, 0.1) diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py index 82c52aaf..cddbfe13 100644 --- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py +++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py @@ -31,7 +31,9 @@ def test_replay_metrics_methods(test_settings: Settings) -> None: def test_security_metrics_methods(test_settings: Settings) -> None: """Test SecurityMetrics methods with no-op metrics.""" m = SecurityMetrics(test_settings) - m.record_authentication_attempt("password", False, user_id="u1", duration_seconds=0.2) + m.record_authentication_attempt("password", False, 0.2) + with m.track_authentication("password"): + pass m.increment_active_sessions() m.decrement_active_sessions() m.record_token_generated("access", 3600) @@ -40,7 +42,7 @@ def test_security_metrics_methods(test_settings: Settings) -> None: m.record_authorization_check("/admin", "GET", False, user_role="user") m.record_csrf_token_generated() m.record_csrf_validation_failure("missing") - m.record_password_reset_request("u1", method="admin") + m.record_password_reset_request(method="admin") m.record_weak_password_attempt("common_password") m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked") m.record_account_locked("brute_force", duration_seconds=600) From 3f37c306e3817dab0f128a0d2224323d69c53ea3 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 23:28:17 +0100 Subject: [PATCH 8/9] fix: correct memory and percent units in grafana dashboards ---
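Unit mismatches like these only show up when someone reads the rendered panel. A contract-style guard in the spirit of backend/tests/contract/test_grafana_metrics.py could catch them in CI instead. The sketch below is illustrative only: it assumes it runs from the repo root, scans only top-level panels (nested row panels are ignored for brevity), and treats any panel whose query touches a *_seconds metric as one that must declare unit "s".

    import json
    import re
    from pathlib import Path

    import pytest

    DASHBOARDS = Path("backend/grafana/provisioning/dashboards")  # assumes repo-root cwd

    def _seconds_panels() -> list[tuple[str, str, str | None]]:
        """Collect (dashboard, panel title, declared unit) for panels querying *_seconds metrics."""
        rows = []
        for path in sorted(DASHBOARDS.glob("*.json")):
            for panel in json.loads(path.read_text()).get("panels", []):
                exprs = " ".join(t.get("expr", "") for t in panel.get("targets", []))
                if re.search(r"_seconds(_bucket|_sum|_count)?\b", exprs):
                    unit = panel.get("fieldConfig", {}).get("defaults", {}).get("unit")
                    rows.append((path.name, panel.get("title", "?"), unit))
        return rows

    @pytest.mark.grafana_contract
    def test_seconds_panels_declare_seconds_unit() -> None:
        offenders = [r for r in _seconds_panels() if r[2] != "s"]
        assert not offenders, f"panels querying *_seconds metrics without unit 's': {offenders}"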
.../grafana/provisioning/dashboards/coordinator-execution.json | 2 +- backend/grafana/provisioning/dashboards/http-middleware.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 0e120ccd..282c3c8a 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -198,7 +198,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "decmbytes" + "unit": "mbytes" } }, "gridPos": { diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 506e3d21..f3c7dc1e 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -321,7 +321,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "percent" } }, "gridPos": { From 24c40ca2420cda3f1e77f68658c9ba8f2c8af37d Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 23:33:27 +0100 Subject: [PATCH 9/9] fix: split event store & idempotency plot into separate panels --- .../dashboards/http-middleware.json | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index f3c7dc1e..8f03fbf6 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -219,7 +219,7 @@ }, "gridPos": { "h": 6, - "w": 18, + "w": 9, "x": 6, "y": 20 }, @@ -229,14 +229,33 @@ "expr": "rate(event_store_operations_total[5m])", "legendFormat": "Store Ops", "refId": "A" - }, + } + ], + "title": "Event Store Operations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 20 + }, + "id": 13, + "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(idempotency_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } ], - "title": "Event Store & Idempotency", + "title": "Idempotency Processing Duration", "type": "timeseries" }, {