From 845d8002d53a10041a1ec1842479295f5be081f2 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 13:53:32 +0100 Subject: [PATCH 1/9] fix: update Grafana root URL; pass admin user/password via GitHub secrets instead of hardcoding them in plaintext --- .github/workflows/release-deploy.yml | 7 ++++++- backend/grafana/grafana.ini | 8 -------- docker-compose.yaml | 3 +++ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release-deploy.yml b/.github/workflows/release-deploy.yml index 30c9f168..3bb636db 100644 --- a/.github/workflows/release-deploy.yml +++ b/.github/workflows/release-deploy.yml @@ -122,11 +122,13 @@ jobs: GHCR_TOKEN: ${{ secrets.DEPLOY_GHCR_TOKEN }} GHCR_USER: ${{ github.repository_owner }} IMAGE_TAG: ${{ needs.release.outputs.version }} + GRAFANA_ADMIN_USER: ${{ secrets.GRAFANA_ADMIN_USER }} + GRAFANA_ADMIN_PASSWORD: ${{ secrets.GRAFANA_ADMIN_PASSWORD }} with: host: ${{ secrets.DEPLOY_HOST }} username: ${{ secrets.DEPLOY_USER }} key: ${{ secrets.DEPLOY_SSH_KEY }} - envs: GHCR_TOKEN,GHCR_USER,IMAGE_TAG + envs: GHCR_TOKEN,GHCR_USER,IMAGE_TAG,GRAFANA_ADMIN_USER,GRAFANA_ADMIN_PASSWORD command_timeout: 10m script: | set -e @@ -138,6 +140,9 @@ jobs: export IMAGE_TAG="$IMAGE_TAG" export COMPOSE_PROFILES=observability + export GRAFANA_ROOT_URL="https://grafana.integr8scode.cc/" + export GRAFANA_ADMIN_USER="$GRAFANA_ADMIN_USER" + export GRAFANA_ADMIN_PASSWORD="$GRAFANA_ADMIN_PASSWORD" docker compose pull docker compose up -d --remove-orphans --no-build --wait --wait-timeout 180 diff --git a/backend/grafana/grafana.ini b/backend/grafana/grafana.ini index bf9130c9..b98be700 100644 --- a/backend/grafana/grafana.ini +++ b/backend/grafana/grafana.ini @@ -1,11 +1,3 @@ -[server] -root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana/ -serve_from_sub_path = true - -[security] -admin_user = admin -admin_password = admin123 - [users] allow_sign_up = false allow_org_create = false diff --git a/docker-compose.yaml b/docker-compose.yaml index c456492f..45d0f098 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -173,6 +173,9 @@ services: - app-network environment: - GF_LOG_LEVEL=warn + - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000/} + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123} # Kafka Infrastructure for Event-Driven Design # Certificate generator for Zookeeper/Kafka SSL From b525f70fe24e53bd701eb0a7c2b8169e386cd77b Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 13:53:52 +0100 Subject: [PATCH 2/9] fix: nginx rewrite for grafana path --- frontend/nginx.conf.template | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template index 88df162c..532977d0 100644 --- a/frontend/nginx.conf.template +++ b/frontend/nginx.conf.template @@ -94,6 +94,7 @@ server { location /grafana/ { resolver 127.0.0.11 valid=30s ipv6=off; set $grafana_upstream http://grafana:3000; + rewrite ^/grafana/(.*) /$1 break; proxy_pass $grafana_upstream; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; From fecc94ed4dd87b8469bd0100826fb7888d798d35 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 16:58:54 +0100 Subject: [PATCH 3/9] feat: add contract tests for metric names/labels between the metrics backend and Grafana dashboards; document the approach --- .../dashboards/dlq-monitoring.json | 16 +- backend/pyproject.toml | 1 + backend/tests/contract/__init__.py | 0
.../tests/contract/test_grafana_metrics.py | 178 ++++++++++++++++++ docs/testing/contract-testing.md | 70 +++++++ mkdocs.yml | 1 + 6 files changed, 258 insertions(+), 8 deletions(-) create mode 100644 backend/tests/contract/__init__.py create mode 100644 backend/tests/contract/test_grafana_metrics.py create mode 100644 docs/testing/contract-testing.md diff --git a/backend/grafana/provisioning/dashboards/dlq-monitoring.json b/backend/grafana/provisioning/dashboards/dlq-monitoring.json index 5030ade0..74e95eab 100644 --- a/backend/grafana/provisioning/dashboards/dlq-monitoring.json +++ b/backend/grafana/provisioning/dashboards/dlq-monitoring.json @@ -591,17 +591,17 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.90, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.90, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p90", "refId": "B" }, { - "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "legendFormat": "p99", "refId": "C" } @@ -678,12 +678,12 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(dlq_processing_duration_bucket[5m])) by (le, operation))", + "expr": "histogram_quantile(0.50, sum(rate(dlq_processing_duration_seconds_bucket[5m])) by (le, operation))", "legendFormat": "p50 {{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(0.95, sum(rate(dlq_processing_duration_bucket[5m])) by (le, operation))", + "expr": "histogram_quantile(0.95, sum(rate(dlq_processing_duration_seconds_bucket[5m])) by (le, operation))", "legendFormat": "p95 {{operation}}", "refId": "B" } @@ -755,7 +755,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "avg by (original_topic) (dlq_retry_attempts)", + "expr": "sum by (original_topic) (dlq_retry_attempts_sum) / sum by (original_topic) (dlq_retry_attempts_count)", "legendFormat": "{{original_topic}}", "refId": "A" } @@ -909,7 +909,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "sum by (original_topic) (dlq_throughput_rate)", + "expr": "sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_sum[5m])) / sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_count[5m]))", "legendFormat": "{{original_topic}}", "refId": "A" } @@ -1494,7 +1494,7 @@ "pluginVersion": "8.3.3", "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(dlq_message_age_seconds_bucket[5m])) by (le))", "refId": "A" } ], diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b1a96720..459a718c 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -210,6 +210,7 @@ markers = [ "k8s: marks tests as requiring Kubernetes", "performance: marks tests as performance tests", "admin: marks tests as admin-only functionality tests", + "grafana_contract: marks tests as Grafana dashboard contract tests", ] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "session" diff --git a/backend/tests/contract/__init__.py b/backend/tests/contract/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/tests/contract/test_grafana_metrics.py 
b/backend/tests/contract/test_grafana_metrics.py new file mode 100644 index 00000000..41dcec5b --- /dev/null +++ b/backend/tests/contract/test_grafana_metrics.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +import importlib +import json +import os +import pkgutil +import re +from pathlib import Path +from unittest.mock import MagicMock + +import app.core.metrics as metrics_pkg +import pytest +from app.core.metrics.base import BaseMetrics +from app.core.middlewares.metrics import MetricsMiddleware, create_system_metrics +from opentelemetry import metrics as otel_api +from opentelemetry.exporter.prometheus import PrometheusMetricReader +from opentelemetry.sdk.metrics import MeterProvider + +BACKEND_ROOT = Path(__file__).resolve().parent.parent.parent +DASHBOARDS_DIR = BACKEND_ROOT / "grafana" / "provisioning" / "dashboards" + +PROMQL_BUILTINS = frozenset({ + "sum", "avg", "min", "max", "count", "stddev", "stdvar", "group", + "count_values", "topk", "bottomk", "quantile", + "by", "without", "on", "ignoring", "group_left", "group_right", "bool", + "sum_over_time", "avg_over_time", "min_over_time", "max_over_time", + "count_over_time", "stddev_over_time", "stdvar_over_time", + "last_over_time", "present_over_time", "quantile_over_time", + "rate", "irate", "increase", "delta", "idelta", "deriv", "predict_linear", + "histogram_quantile", "histogram_avg", "histogram_count", "histogram_sum", + "histogram_fraction", "histogram_stddev", "histogram_stdvar", + "holt_winters", + "changes", "resets", + "vector", "scalar", "time", "timestamp", + "absent", "absent_over_time", "sgn", + "sort", "sort_desc", "sort_by_label", "sort_by_label_desc", + "label_replace", "label_join", + "round", "ceil", "floor", "clamp", "clamp_min", "clamp_max", + "abs", "sqrt", "ln", "log2", "log10", "exp", "exp2", + "acos", "asin", "atan", "atan2", "cos", "sin", "tan", + "acosh", "asinh", "atanh", "cosh", "sinh", "tanh", + "deg", "rad", "pi", + "day_of_month", "day_of_week", "day_of_year", "days_in_month", + "hour", "minute", "month", "year", + "or", "and", "unless", + "offset", "inf", "nan", + "le", "result", "status", "type", "format", "table", "instant", +}) + + +@pytest.fixture(scope="module") +def prometheus_families() -> dict[str, set[str]]: + """Instantiate every metric class through the real OTel -> Prometheus pipeline. + + Returns: + Mapping of family name to set of sample names produced by that family. + """ + # pytest-env sets OTEL_SDK_DISABLED=true; override so the real SDK is active. + os.environ.pop("OTEL_SDK_DISABLED", None) + + reader = PrometheusMetricReader() + provider = MeterProvider(metric_readers=[reader]) + otel_api.set_meter_provider(provider) + + for _, mod_name, _ in pkgutil.iter_modules(metrics_pkg.__path__): + importlib.import_module(f"app.core.metrics.{mod_name}") + + for cls in BaseMetrics.__subclasses__(): + cls(MagicMock()) + MetricsMiddleware(MagicMock()) + create_system_metrics() + + # Trigger every synchronous instrument via the SDK meter registry. + # Duck-typed getattr dispatch — no isinstance, works for any instrument type. 
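+    # NOTE: provider._meters and meter._instrument_id_instrument are private
+    # opentelemetry-sdk attributes; revisit this walk when upgrading the SDK.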
+ for meter in provider._meters.values(): + for instrument in meter._instrument_id_instrument.values(): + for method_name in ("add", "record", "set"): + method = getattr(instrument, method_name, None) + if method is not None: + method(1) + break + + families: dict[str, set[str]] = {} + for family in reader._collector.collect(): + sample_names: set[str] = set() + for sample in family.samples: + sample_names.add(sample.name) + if sample_names: + families[family.name] = sample_names + return families + + +def _collect_exprs(obj: object, out: list[str]) -> None: + """Recursively extract ``expr`` field values from a Grafana dashboard JSON.""" + if isinstance(obj, dict): + for key, value in obj.items(): + if key == "expr" and isinstance(value, str): + out.append(value) + else: + _collect_exprs(value, out) + elif isinstance(obj, list): + for item in obj: + _collect_exprs(item, out) + + +def _extract_metric_names(expr: str) -> set[str]: + """Extract Prometheus metric names from a PromQL expression.""" + expr = re.sub(r'"[^"]*"', "", expr) + expr = re.sub(r"\{[^}]*\}", "", expr) + expr = re.sub(r"\[[^\]]*\]", "", expr) + expr = re.sub(r"\b(?:by|without)\s*\([^)]*\)", "", expr, flags=re.IGNORECASE) + tokens = re.findall(r"[a-zA-Z_:][a-zA-Z0-9_:]*", expr) + return {t for t in tokens if t.lower() not in PROMQL_BUILTINS and ("_" in t or len(t) > 3)} + + +def _dashboard_metrics() -> dict[str, set[str]]: + """Return ``{dashboard_filename: {metric_name, ...}}`` for all dashboards.""" + result: dict[str, set[str]] = {} + for path in sorted(DASHBOARDS_DIR.glob("*.json")): + data = json.loads(path.read_text()) + exprs: list[str] = [] + _collect_exprs(data, exprs) + metrics: set[str] = set() + for expr in exprs: + metrics |= _extract_metric_names(expr) + if metrics: + result[path.name] = metrics + return result + + +@pytest.mark.grafana_contract +def test_dashboard_metrics_defined_in_code( + prometheus_families: dict[str, set[str]], +) -> None: + """Every metric in Grafana dashboards must map to a Python OTel definition.""" + prom_names: set[str] = set() + for samples in prometheus_families.values(): + prom_names |= samples + + orphaned: dict[str, set[str]] = {} + for dashboard, metrics in _dashboard_metrics().items(): + missing = metrics - prom_names + if missing: + orphaned[dashboard] = missing + + if orphaned: + lines = ["Grafana dashboards reference metrics not found in code:\n"] + for dashboard, metrics in sorted(orphaned.items()): + lines.append(f" {dashboard}:") + for m in sorted(metrics): + lines.append(f" - {m}") + pytest.fail("\n".join(lines)) + + +@pytest.mark.grafana_contract +def test_code_metrics_used_in_dashboards( + prometheus_families: dict[str, set[str]], +) -> None: + """Every metric defined in Python OTel code must appear in a Grafana dashboard.""" + all_dashboard_metrics: set[str] = set() + for metrics in _dashboard_metrics().values(): + all_dashboard_metrics |= metrics + + auto_generated = {"target"} + unused: dict[str, set[str]] = {} + for family_name, sample_names in sorted(prometheus_families.items()): + if family_name in auto_generated: + continue + if not sample_names & all_dashboard_metrics: + unused[family_name] = sample_names + + if unused: + lines = ["Code defines metrics not used in any Grafana dashboard:\n"] + for family, samples in sorted(unused.items()): + lines.append(f" {family}:") + for s in sorted(samples): + lines.append(f" - {s}") + pytest.fail("\n".join(lines)) diff --git a/docs/testing/contract-testing.md b/docs/testing/contract-testing.md new file mode 100644 index 
00000000..a7e4360e --- /dev/null +++ b/docs/testing/contract-testing.md @@ -0,0 +1,70 @@ +# Contract testing + +Contract tests sit between unit tests and integration tests. They verify that two parts of the system agree on a shared +interface without actually running those parts together. In this project the main contract boundary is between Python +OTel metric definitions and Grafana dashboard JSON files — both reference the same Prometheus metric names, but neither +knows about the other at runtime. + +## Grafana metrics contract + +The test file lives at `backend/tests/contract/test_grafana_metrics.py`. It uses a real OTel-to-Prometheus export +pipeline so that metric name conversion (dots to underscores, unit suffixes, `_total` / `_bucket` / etc.) is handled by +the SDK, not hand-rolled. + +The setup works like this: a `PrometheusMetricReader` is attached to a `MeterProvider`, then every `BaseMetrics` +subclass, the `MetricsMiddleware`, and the system metrics are instantiated so that their instruments get registered in +the SDK. After that, the fixture walks `MeterProvider._meters` and triggers every synchronous instrument through +duck-typed `getattr` — it tries `add`, `record`, `set` in order, calls the first one that exists, and moves on. +Observable instruments (like the system CPU/memory gauges) fire their callbacks automatically during collection, so they +need no explicit trigger. The result is a `dict[str, set[str]]` mapping each Prometheus family name to the set of sample +names it produces. + +There are two tests that share this fixture: + +`test_dashboard_metrics_defined_in_code` is the forward check. It parses every `*.json` dashboard in +`backend/grafana/provisioning/dashboards/`, extracts `expr` fields, tokenizes the PromQL, filters out known builtins +(`rate`, `sum`, `by`, etc.), and checks that every remaining metric name exists in the Prometheus sample set. If a +dashboard references `foo_bar_total` but no Python code defines that metric, the test fails and lists the offending +dashboard and metric names. + +`test_code_metrics_used_in_dashboards` is the reverse check. It flattens every dashboard metric into one set, then +iterates over the Prometheus families from the fixture. For each family it checks whether any of its sample names appear +in that dashboard set. Because a single histogram like `execution_duration` produces `_bucket`, `_count`, `_sum`, and +`_created` samples, the dashboard only needs to reference one of them for the family to pass. The `target` family is +skipped since it's auto-generated by the OTel SDK and not something you'd panel in Grafana. If a metric family has no +dashboard coverage at all, the test fails with a list of unused families and their samples. + +## Why duck-typed instrument triggering + +The earlier version of this test had an `isinstance` chain — check for `Counter`, call `add`; check for `Histogram`, +call `record`; check for `UpDownCounter`, call `add`. This breaks silently when a new instrument type shows up (the +project already uses `ObservableGauge`, which the old code didn't handle). The current approach iterates the SDK's +internal meter registry and calls whichever method the instrument exposes. If OTel adds a new synchronous instrument +type tomorrow that has an `add` or `record` method, the test picks it up with zero changes. 
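+
+For reference, a minimal sketch of that dispatch loop (it leans on the SDK's private `_meters` and
+`_instrument_id_instrument` registries, so treat the attribute names as implementation details that may shift between
+SDK releases):
+
+```python
+for meter in provider._meters.values():
+    for instrument in meter._instrument_id_instrument.values():
+        # Try the synchronous recording methods in order; the first hit wins.
+        for method_name in ("add", "record", "set"):
+            method = getattr(instrument, method_name, None)
+            if method is not None:
+                method(1)  # the value is irrelevant; one sample suffices
+                break
+```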
+ +## Running the tests + +```bash +cd backend +uv run pytest tests/contract/test_grafana_metrics.py -v -o "addopts=" +``` + +The `-o "addopts="` override is needed because `pyproject.toml` sets `-n auto --dist=loadfile` for xdist, which +interferes with the module-scoped fixture (the OTel `MeterProvider` can only be set once per process). Running without +xdist is fine since these tests finish in under a second. + +Both tests use the `@pytest.mark.grafana_contract` marker, so you can also run them via: + +```bash +uv run pytest -m grafana_contract -o "addopts=" +``` + +## Adding a new metric + +When you add a metric to a `BaseMetrics` subclass, the forward test will keep passing (dashboards don't reference it +yet). But the reverse test will fail, telling you exactly which family has no dashboard coverage. At that point either +add a panel to an existing dashboard or create a new one. Conversely, if you add a PromQL expression to a dashboard that +references a metric that doesn't exist in code, the forward test catches it. + +The goal is to keep the two sides in sync so you don't end up with dead panels pointing at metrics that were renamed +three months ago, or metrics that nobody ever looks at because they were never wired into a dashboard. diff --git a/mkdocs.yml b/mkdocs.yml index f9aba928..150558d7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -188,6 +188,7 @@ nav: - Error Handling: frontend/error-handling.md - Testing: + - Contract Testing: testing/contract-testing.md - Load Testing: testing/load-testing.md - Frontend Testing: testing/frontend-testing.md - Kafka Test Stability: testing/kafka-test-stability.md From 22fe234ce9f868d46b5a64b6180379e0832dbd84 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:09:22 +0100 Subject: [PATCH 4/9] feat: updated metrics system --- backend/app/api/routes/auth.py | 152 +--- backend/app/core/exceptions/handlers.py | 3 + backend/app/core/metrics/__init__.py | 2 - backend/app/core/metrics/connections.py | 35 +- backend/app/core/metrics/dlq.py | 22 - backend/app/core/metrics/events.py | 89 +- backend/app/core/metrics/execution.py | 24 - backend/app/core/metrics/health.py | 109 --- backend/app/core/metrics/kubernetes.py | 38 +- backend/app/core/metrics/notifications.py | 79 +- backend/app/core/metrics/replay.py | 32 +- backend/app/core/metrics/security.py | 164 +--- backend/app/core/middlewares/csrf.py | 17 +- backend/app/core/providers.py | 27 +- backend/app/core/security.py | 7 +- backend/app/dlq/manager.py | 1 + backend/app/domain/exceptions.py | 6 + backend/app/domain/user/__init__.py | 2 + backend/app/domain/user/user_models.py | 11 + .../app/services/admin/admin_user_service.py | 4 + backend/app/services/auth_service.py | 200 +++- .../services/event_replay/replay_service.py | 7 + backend/app/services/k8s_worker/worker.py | 4 +- backend/app/services/notification_service.py | 8 +- backend/app/services/sse/sse_service.py | 3 + .../dashboards/coordinator-execution.json | 275 ++++++ .../dashboards/dlq-monitoring.json | 236 ++--- .../provisioning/dashboards/event-replay.json | 387 ++++++++ .../dashboards/event-stream-monitoring.json | 861 +++--------------- .../dashboards/http-middleware.json | 387 ++++++++ .../provisioning/dashboards/integr8scode.json | 148 ++- .../dashboards/kafka-events-monitoring.json | 239 ++--- .../dashboards/kubernetes-pods.json | 350 +++++++ .../dashboards/notifications.json | 500 ++++++++++ .../dashboards/security-auth.json | 355 ++++++++ backend/tests/unit/conftest.py | 6 - 
...est_connections_and_coordinator_metrics.py | 5 - .../metrics/test_database_and_dlq_metrics.py | 2 - .../test_execution_and_events_metrics.py | 14 - .../test_health_and_rate_limit_metrics.py | 17 - ...st_kubernetes_and_notifications_metrics.py | 14 +- .../unit/core/metrics/test_metrics_classes.py | 14 +- .../test_replay_and_security_metrics.py | 20 +- backend/tests/unit/core/test_csrf.py | 80 +- backend/tests/unit/core/test_security.py | 17 +- 45 files changed, 2942 insertions(+), 2031 deletions(-) delete mode 100644 backend/app/core/metrics/health.py create mode 100644 backend/grafana/provisioning/dashboards/coordinator-execution.json create mode 100644 backend/grafana/provisioning/dashboards/event-replay.json create mode 100644 backend/grafana/provisioning/dashboards/http-middleware.json create mode 100644 backend/grafana/provisioning/dashboards/kubernetes-pods.json create mode 100644 backend/grafana/provisioning/dashboards/notifications.json create mode 100644 backend/grafana/provisioning/dashboards/security-auth.json diff --git a/backend/app/api/routes/auth.py b/backend/app/api/routes/auth.py index 679ea939..3aeb9de7 100644 --- a/backend/app/api/routes/auth.py +++ b/backend/app/api/routes/auth.py @@ -1,16 +1,10 @@ -from datetime import timedelta - import structlog from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute -from fastapi import APIRouter, Depends, HTTPException, Request, Response +from fastapi import APIRouter, Depends, Request, Response from fastapi.security import OAuth2PasswordRequestForm -from app.core.security import SecurityService from app.core.utils import get_client_ip -from app.db.repositories import UserRepository -from app.domain.enums import UserRole -from app.domain.user import DomainUserCreate from app.schemas_pydantic.common import ErrorResponse from app.schemas_pydantic.user import ( LoginResponse, @@ -19,8 +13,6 @@ UserResponse, ) from app.services.auth_service import AuthService -from app.services.login_lockout import LoginLockoutService -from app.services.runtime_settings import RuntimeSettingsLoader router = APIRouter(prefix="/auth", tags=["authentication"], route_class=DishkaRoute) @@ -36,10 +28,7 @@ async def login( request: Request, response: Response, - user_repo: FromDishka[UserRepository], - security_service: FromDishka[SecurityService], - runtime_settings: FromDishka[RuntimeSettingsLoader], - lockout_service: FromDishka[LoginLockoutService], + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], form_data: OAuth2PasswordRequestForm = Depends(), ) -> LoginResponse: @@ -52,75 +41,18 @@ async def login( user_agent=request.headers.get("user-agent"), ) - if await lockout_service.check_locked(form_data.username): - raise HTTPException( - status_code=423, - detail="Account temporarily locked due to too many failed attempts", - ) - - user = await user_repo.get_user(form_data.username) - - if not user: - logger.warning( - "Login failed - user not found", - username=form_data.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - locked = await lockout_service.record_failed_attempt(form_data.username) - if locked: - raise HTTPException( - status_code=423, - detail="Account locked due to too many failed attempts", - ) - raise HTTPException( - status_code=401, - detail="Invalid credentials", - headers={"WWW-Authenticate": "Bearer"}, - ) - - if not security_service.verify_password(form_data.password, user.hashed_password): - logger.warning( - "Login 
failed - invalid password", - username=form_data.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - locked = await lockout_service.record_failed_attempt(form_data.username) - if locked: - raise HTTPException( - status_code=423, - detail="Account locked due to too many failed attempts", - ) - raise HTTPException( - status_code=401, - detail="Invalid credentials", - headers={"WWW-Authenticate": "Bearer"}, - ) - - await lockout_service.clear_attempts(form_data.username) - - effective = await runtime_settings.get_effective_settings() - session_timeout = effective.session_timeout_minutes - - logger.info( - "Login successful", - username=user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - token_expires_in_minutes=session_timeout, + result = await auth_service.login( + form_data.username, + form_data.password, + get_client_ip(request), + request.headers.get("user-agent"), ) - access_token_expires = timedelta(minutes=session_timeout) - access_token = security_service.create_access_token(data={"sub": user.username}, expires_delta=access_token_expires) - - csrf_token = security_service.generate_csrf_token(access_token) - # --8<-- [start:login_cookies] response.set_cookie( key="access_token", - value=access_token, - max_age=session_timeout * 60, # Convert to seconds + value=result.access_token, + max_age=result.session_timeout_minutes * 60, httponly=True, secure=True, # HTTPS only samesite="strict", # CSRF protection @@ -129,8 +61,8 @@ async def login( response.set_cookie( key="csrf_token", - value=csrf_token, - max_age=session_timeout * 60, + value=result.csrf_token, + max_age=result.session_timeout_minutes * 60, httponly=False, # JavaScript needs to read this secure=True, samesite="strict", @@ -143,9 +75,9 @@ async def login( return LoginResponse( message="Login successful", - username=user.username, - role=user.role, - csrf_token=csrf_token, + username=result.username, + role=result.role, + csrf_token=result.csrf_token, ) @@ -160,9 +92,7 @@ async def login( async def register( request: Request, user: UserCreate, - user_repo: FromDishka[UserRepository], - security_service: FromDishka[SecurityService], - runtime_settings: FromDishka[RuntimeSettingsLoader], + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], ) -> UserResponse: """Register a new user account.""" @@ -174,37 +104,12 @@ async def register( user_agent=request.headers.get("user-agent"), ) - effective = await runtime_settings.get_effective_settings() - min_len = effective.password_min_length - if len(user.password) < min_len: - raise HTTPException(status_code=400, detail=f"Password must be at least {min_len} characters") - - db_user = await user_repo.get_user(user.username) - if db_user: - logger.warning( - "Registration failed - username taken", - username=user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), - ) - raise HTTPException(status_code=409, detail="Username already registered") - - hashed_password = security_service.get_password_hash(user.password) - create_data = DomainUserCreate( - username=user.username, - email=user.email, - hashed_password=hashed_password, - role=UserRole.USER, - is_active=True, - is_superuser=False, - ) - created_user = await user_repo.create_user(create_data) - - logger.info( - "Registration successful", - username=created_user.username, - client_ip=get_client_ip(request), - user_agent=request.headers.get("user-agent"), + 
created_user = await auth_service.register( + user.username, + user.email, + user.password, + get_client_ip(request), + request.headers.get("user-agent"), ) return UserResponse.model_validate(created_user) @@ -238,6 +143,7 @@ async def get_current_user_profile( async def logout( request: Request, response: Response, + auth_service: FromDishka[AuthService], logger: FromDishka[structlog.stdlib.BoundLogger], ) -> MessageResponse: """Log out and clear session cookies.""" @@ -248,17 +154,11 @@ async def logout( user_agent=request.headers.get("user-agent"), ) - # Clear the httpOnly cookie - response.delete_cookie( - key="access_token", - path="/", - ) + token = request.cookies.get("access_token") + await auth_service.publish_logout_event(token) - # Clear the CSRF cookie - response.delete_cookie( - key="csrf_token", - path="/", - ) + response.delete_cookie(key="access_token", path="/") + response.delete_cookie(key="csrf_token", path="/") logger.info( "Logout successful", diff --git a/backend/app/core/exceptions/handlers.py b/backend/app/core/exceptions/handlers.py index 94cc6437..7453817e 100644 --- a/backend/app/core/exceptions/handlers.py +++ b/backend/app/core/exceptions/handlers.py @@ -2,6 +2,7 @@ from fastapi.responses import JSONResponse from app.domain.exceptions import ( + AccountLockedError, ConflictError, DomainError, ForbiddenError, @@ -40,6 +41,8 @@ def _map_to_status_code(exc: DomainError) -> int: return 403 if isinstance(exc, InvalidStateError): return 400 + if isinstance(exc, AccountLockedError): + return 423 if isinstance(exc, InfrastructureError): return 500 return 500 diff --git a/backend/app/core/metrics/__init__.py b/backend/app/core/metrics/__init__.py index 77d1687d..497229ec 100644 --- a/backend/app/core/metrics/__init__.py +++ b/backend/app/core/metrics/__init__.py @@ -5,7 +5,6 @@ from app.core.metrics.dlq import DLQMetrics from app.core.metrics.events import EventMetrics from app.core.metrics.execution import ExecutionMetrics -from app.core.metrics.health import HealthMetrics from app.core.metrics.kubernetes import KubernetesMetrics from app.core.metrics.notifications import NotificationMetrics from app.core.metrics.rate_limit import RateLimitMetrics @@ -20,7 +19,6 @@ "DLQMetrics", "EventMetrics", "ExecutionMetrics", - "HealthMetrics", "KubernetesMetrics", "NotificationMetrics", "RateLimitMetrics", diff --git a/backend/app/core/metrics/connections.py b/backend/app/core/metrics/connections.py index 3ca3c04f..aa40143a 100644 --- a/backend/app/core/metrics/connections.py +++ b/backend/app/core/metrics/connections.py @@ -2,7 +2,7 @@ class ConnectionMetrics(BaseMetrics): - """Metrics for SSE connections and event bus.""" + """Metrics for SSE connections.""" def _create_instruments(self) -> None: self.sse_active_connections = self._meter.create_up_down_counter( @@ -23,19 +23,6 @@ def _create_instruments(self) -> None: unit="1", ) - self.sse_shutdown_duration = self._meter.create_histogram( - name="sse.shutdown.duration", description="Time taken for SSE shutdown phases in seconds", unit="s" - ) - - # Event bus metrics - self.event_bus_subscribers = self._meter.create_up_down_counter( - name="event.bus.subscribers", description="Number of active event bus subscribers by pattern", unit="1" - ) - - self.event_bus_subscriptions = self._meter.create_up_down_counter( - name="event.bus.subscriptions.total", description="Total number of event bus subscriptions", unit="1" - ) - def increment_sse_connections(self, endpoint: str = "default") -> None: self.sse_active_connections.add(1, 
attributes={"endpoint": endpoint}) @@ -50,23 +37,3 @@ def record_sse_connection_duration(self, duration_seconds: float, endpoint: str) def update_sse_draining_connections(self, delta: int) -> None: self.sse_draining_connections.add(delta) - - def record_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) - - def update_sse_shutdown_duration(self, duration_seconds: float, phase: str) -> None: - self.sse_shutdown_duration.record(duration_seconds, attributes={"phase": phase}) - - def increment_event_bus_subscriptions(self) -> None: - self.event_bus_subscriptions.add(1) - - def decrement_event_bus_subscriptions(self, count: int = 1) -> None: - self.event_bus_subscriptions.add(-count) - - def update_event_bus_subscribers(self, count: int, pattern: str) -> None: - """Update the count of event bus subscribers for a specific pattern.""" - # This tracks the current number of subscribers for a pattern - # We need to track the delta from the previous value - # Since we can't store state in metrics, we record the absolute value - # The metric system will handle the up/down nature - self.event_bus_subscribers.add(count, attributes={"pattern": pattern}) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index be8b988e..18383cdc 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -5,7 +5,6 @@ class DLQMetrics(BaseMetrics): """Metrics for Dead Letter Queue operations.""" def _create_instruments(self) -> None: - # DLQ message metrics self.dlq_messages_received = self._meter.create_counter( name="dlq.messages.received.total", description="Total number of messages received in DLQ", unit="1" ) @@ -18,7 +17,6 @@ def _create_instruments(self) -> None: name="dlq.messages.discarded.total", description="Total number of DLQ messages discarded", unit="1" ) - # DLQ processing metrics self.dlq_processing_duration = self._meter.create_histogram( name="dlq.processing.duration", description="Time spent processing DLQ messages in seconds", unit="s" ) @@ -27,25 +25,14 @@ def _create_instruments(self) -> None: name="dlq.message.age", description="Age of messages in DLQ in seconds", unit="s" ) - # DLQ queue metrics self.dlq_queue_size = self._meter.create_up_down_counter( name="dlq.queue.size", description="Current size of DLQ by topic", unit="1" ) - self.dlq_retry_attempts = self._meter.create_histogram( - name="dlq.retry.attempts", description="Number of retry attempts for DLQ messages", unit="1" - ) - - # DLQ error metrics self.dlq_processing_errors = self._meter.create_counter( name="dlq.processing.errors.total", description="Total number of DLQ processing errors", unit="1" ) - # DLQ throughput metrics - self.dlq_throughput_rate = self._meter.create_histogram( - name="dlq.throughput.rate", description="Messages processed per second from DLQ", unit="msg/s" - ) - def record_dlq_message_received(self, original_topic: str, event_type: str) -> None: self.dlq_messages_received.add(1, attributes={"original_topic": original_topic, "event_type": event_type}) @@ -63,7 +50,6 @@ def record_dlq_processing_duration(self, duration_seconds: float, operation: str self.dlq_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_dlq_queue_size(self, original_topic: str, size: int) -> None: - # Track the delta for gauge-like behavior key = f"_dlq_size_{original_topic}" current_val = getattr(self, key, 0) delta = size - current_val @@ -74,19 
+60,11 @@ def update_dlq_queue_size(self, original_topic: str, size: int) -> None: def record_dlq_message_age(self, age_seconds: float) -> None: self.dlq_message_age.record(age_seconds) - def record_dlq_retry_attempt(self, original_topic: str, event_type: str, attempt_number: int) -> None: - self.dlq_retry_attempts.record( - attempt_number, attributes={"original_topic": original_topic, "event_type": event_type} - ) - def record_dlq_processing_error(self, original_topic: str, event_type: str, error_type: str) -> None: self.dlq_processing_errors.add( 1, attributes={"original_topic": original_topic, "event_type": event_type, "error_type": error_type} ) - def record_dlq_throughput(self, messages_per_second: float, original_topic: str) -> None: - self.dlq_throughput_rate.record(messages_per_second, attributes={"original_topic": original_topic}) - def increment_dlq_queue_size(self, original_topic: str) -> None: self.dlq_queue_size.add(1, attributes={"original_topic": original_topic}) diff --git a/backend/app/core/metrics/events.py b/backend/app/core/metrics/events.py index bd417078..f5dbdf49 100644 --- a/backend/app/core/metrics/events.py +++ b/backend/app/core/metrics/events.py @@ -2,20 +2,7 @@ class EventMetrics(BaseMetrics): - """Metrics for event processing and Kafka. - - This class tracks metrics related to event processing, event buffers, - and Kafka message production/consumption. Metrics are provided via - dependency injection (DI) through the MetricsProvider. - - Usage (via DI): - class MyService: - def __init__(self, event_metrics: EventMetrics): - self.metrics = event_metrics - - def my_method(self): - self.metrics.record_event_published("execution.requested") - """ + """Metrics for event processing and Kafka.""" def _create_instruments(self) -> None: # Core event metrics @@ -36,43 +23,11 @@ def _create_instruments(self) -> None: name="event.bus.queue.size", description="Size of event bus message queue", unit="1" ) - # Pod event metrics - self.pod_event_published = self._meter.create_counter( - name="pod.events.published.total", description="Total number of pod events published", unit="1" - ) - # Event replay metrics self.event_replay_operations = self._meter.create_counter( name="event.replay.operations.total", description="Total number of event replay operations", unit="1" ) - # Event buffer metrics - self.event_buffer_size = self._meter.create_up_down_counter( - name="event.buffer.size", description="Current number of events in buffer", unit="1" - ) - - self.event_buffer_dropped = self._meter.create_counter( - name="event.buffer.dropped.total", description="Total number of events dropped from buffer", unit="1" - ) - - self.event_buffer_processed = self._meter.create_counter( - name="event.buffer.processed.total", description="Total number of events processed from buffer", unit="1" - ) - - self.event_buffer_latency = self._meter.create_histogram( - name="event.buffer.latency", description="Time between event creation and processing in seconds", unit="s" - ) - - self.event_buffer_backpressure = self._meter.create_up_down_counter( - name="event.buffer.backpressure.active", - description="Whether backpressure is currently active (1=active, 0=inactive)", - unit="1", - ) - - self.event_buffer_memory_usage = self._meter.create_histogram( - name="event.buffer.memory.usage", description="Memory usage of event buffer in MB", unit="MB" - ) - # Kafka-specific metrics self.kafka_messages_produced = self._meter.create_counter( name="kafka.messages.produced.total", description="Total number of 
messages produced to Kafka", unit="1" @@ -82,10 +37,6 @@ def _create_instruments(self) -> None: name="kafka.messages.consumed.total", description="Total number of messages consumed from Kafka", unit="1" ) - self.kafka_consumer_lag = self._meter.create_histogram( - name="kafka.consumer.lag", description="Consumer lag in number of messages", unit="1" - ) - self.kafka_production_errors = self._meter.create_counter( name="kafka.production.errors.total", description="Total number of Kafka production errors", unit="1" ) @@ -95,15 +46,7 @@ def _create_instruments(self) -> None: ) def record_event_published(self, event_type: str, event_category: str | None = None) -> None: - """ - Record that an event was published. - - Args: - event_type: Full event type (e.g., "execution.requested") - event_category: Event category (e.g., "execution"). If None, extracted from event_type. - """ if event_category is None: - # Extract category from event type (e.g., "execution" from "execution.requested") event_category = event_type.split(".")[0] if "." in event_type else event_type self.event_published.add(1, attributes={"event_type": event_type, "event_category": event_category}) @@ -111,31 +54,9 @@ def record_event_published(self, event_type: str, event_category: str | None = N def record_event_processing_duration(self, duration_seconds: float, event_type: str) -> None: self.event_processing_duration.record(duration_seconds, attributes={"event_type": event_type}) - def record_pod_event_published(self, event_type: str) -> None: - self.pod_event_published.add(1, attributes={"event_type": event_type}) - def record_event_replay_operation(self, operation: str, status: str) -> None: self.event_replay_operations.add(1, attributes={"operation": operation, "status": status}) - def update_event_buffer_size(self, delta: int) -> None: - self.event_buffer_size.add(delta) - - def record_event_buffer_dropped(self) -> None: - self.event_buffer_dropped.add(1) - - def record_event_buffer_processed(self) -> None: - self.event_buffer_processed.add(1) - - def record_event_buffer_latency(self, latency_seconds: float) -> None: - self.event_buffer_latency.record(latency_seconds) - - def set_event_buffer_backpressure(self, active: bool) -> None: - self.event_buffer_backpressure.add(-1 if not active else 0) - self.event_buffer_backpressure.add(1 if active else 0) - - def record_event_buffer_memory_usage(self, memory_mb: float) -> None: - self.event_buffer_memory_usage.record(memory_mb) - def record_event_stored(self, event_type: str, collection: str) -> None: self.event_published.add(1, attributes={"event_type": event_type, "aggregate_type": collection}) @@ -153,17 +74,14 @@ def record_events_processing_failed( ) def record_event_store_duration(self, duration: float, operation: str, collection: str) -> None: - """Record event store operation duration.""" self.event_processing_duration.record(duration, attributes={"operation": operation, "collection": collection}) def record_event_store_failed(self, event_type: str, error_type: str) -> None: - """Record event store failure.""" self.event_processing_errors.add( 1, attributes={"event_type": event_type, "error_type": error_type, "operation": "store"} ) def record_event_query_duration(self, duration: float, query_type: str, collection: str) -> None: - """Record event query duration.""" self.event_processing_duration.record( duration, attributes={"operation": f"query_{query_type}", "collection": collection} ) @@ -183,11 +101,6 @@ def record_kafka_message_produced(self, topic: str, partition: 
int = -1) -> None def record_kafka_message_consumed(self, topic: str, consumer_group: str) -> None: self.kafka_messages_consumed.add(1, attributes={"topic": topic, "consumer_group": consumer_group}) - def record_kafka_consumer_lag(self, lag: int, topic: str, consumer_group: str, partition: int) -> None: - self.kafka_consumer_lag.record( - lag, attributes={"topic": topic, "consumer_group": consumer_group, "partition": str(partition)} - ) - def record_kafka_production_error(self, topic: str, error_type: str) -> None: self.kafka_production_errors.add(1, attributes={"topic": topic, "error_type": error_type}) diff --git a/backend/app/core/metrics/execution.py b/backend/app/core/metrics/execution.py index f033447b..adb96dab 100644 --- a/backend/app/core/metrics/execution.py +++ b/backend/app/core/metrics/execution.py @@ -10,12 +10,6 @@ def _create_instruments(self) -> None: name="script.executions.total", description="Total number of script executions", unit="1" ) - self.execution_events = self._meter.create_observable_gauge( - name="script.execution.events", - description="Instantaneous execution events (1 when execution starts, 0 otherwise)", - unit="1", - ) - self.execution_duration = self._meter.create_histogram( name="script.execution.duration", description="Time spent executing scripts in seconds", unit="s" ) @@ -28,12 +22,6 @@ def _create_instruments(self) -> None: name="script.memory.usage", description="Memory usage per script execution in MiB", unit="MiB" ) - self.cpu_utilization = self._meter.create_histogram( - name="script.cpu.utilization", - description="CPU utilization in millicores per script execution", - unit="millicores", - ) - self.memory_utilization_percent = self._meter.create_histogram( name="script.memory.utilization.percent", description="Memory utilization as percentage of available memory", @@ -94,15 +82,3 @@ def record_execution_queued(self) -> None: def record_execution_scheduled(self) -> None: self.executions_assigned.add(1) - - def update_cpu_available(self, cores: float) -> None: - self.cpu_utilization.record(cores) - - def update_memory_available(self, memory_mb: float) -> None: - self.memory_usage.record(memory_mb, attributes={"lang_and_version": "resource_manager"}) - - def update_gpu_available(self, count: int) -> None: - self.cpu_utilization.record(count, attributes={"resource": "gpu"}) - - def update_allocations_active(self, count: int) -> None: - self.memory_utilization_percent.record(count, attributes={"metric": "allocations"}) diff --git a/backend/app/core/metrics/health.py b/backend/app/core/metrics/health.py deleted file mode 100644 index eb26af27..00000000 --- a/backend/app/core/metrics/health.py +++ /dev/null @@ -1,109 +0,0 @@ -from app.core.metrics.base import BaseMetrics - - -class HealthMetrics(BaseMetrics): - """Metrics for health checks.""" - - def _create_instruments(self) -> None: - # Core health check metrics - simple histogram to track latest value - self.health_check_status = self._meter.create_histogram( - name="health.check.status", description="Health check status (1=healthy, 0=unhealthy)", unit="1" - ) - - self.health_check_duration = self._meter.create_histogram( - name="health.check.duration", description="Time taken to perform health check in seconds", unit="s" - ) - - self.health_check_failures = self._meter.create_counter( - name="health.check.failures.total", description="Total number of health check failures", unit="1" - ) - - # Service health metrics - self.service_health_status = self._meter.create_histogram( - 
name="service.health.status", description="Service health status by service name", unit="1" - ) - - self.service_health_score = self._meter.create_histogram( - name="service.health.score", description="Overall health score for a service (0-100)", unit="%" - ) - - # Liveness and readiness specific metrics - self.liveness_check_status = self._meter.create_histogram( - name="liveness.check.status", description="Liveness check status (1=alive, 0=dead)", unit="1" - ) - - self.readiness_check_status = self._meter.create_histogram( - name="readiness.check.status", description="Readiness check status (1=ready, 0=not ready)", unit="1" - ) - - # Dependency health metrics - self.dependency_health_status = self._meter.create_histogram( - name="dependency.health.status", description="Health status of external dependencies", unit="1" - ) - - self.dependency_response_time = self._meter.create_histogram( - name="dependency.response.time", description="Response time for dependency health checks", unit="s" - ) - - # Health check execution metrics - self.health_checks_executed = self._meter.create_counter( - name="health.checks.executed.total", description="Total number of health checks executed", unit="1" - ) - - self.health_check_timeouts = self._meter.create_counter( - name="health.check.timeouts.total", description="Total number of health check timeouts", unit="1" - ) - - # Component health metrics - self.component_health_status = self._meter.create_histogram( - name="component.health.status", description="Health status of system components", unit="1" - ) - - def record_health_check_duration(self, duration_seconds: float, check_type: str, check_name: str) -> None: - self.health_check_duration.record( - duration_seconds, attributes={"check_type": check_type, "check_name": check_name} - ) - - # Also increment execution counter - self.health_checks_executed.add(1, attributes={"check_type": check_type, "check_name": check_name}) - - def record_health_check_failure(self, check_type: str, check_name: str, failure_type: str) -> None: - self.health_check_failures.add( - 1, attributes={"check_type": check_type, "check_name": check_name, "failure_type": failure_type} - ) - - def update_health_check_status(self, status_value: int, check_type: str, check_name: str) -> None: - # Just record the current status value - self.health_check_status.record(status_value, attributes={"check_type": check_type, "check_name": check_name}) - - def record_health_status(self, service_name: str, status: str) -> None: - # Map status to numeric value - status_value = 1 if status.lower() in ["healthy", "ok", "up"] else 0 - # Record the current status - self.service_health_status.record(status_value, attributes={"service": service_name}) - - def record_service_health_score(self, service_name: str, score: float) -> None: - self.service_health_score.record(score, attributes={"service": service_name}) - - def update_liveness_status(self, is_alive: bool, component: str = "default") -> None: - status_value = 1 if is_alive else 0 - self.liveness_check_status.record(status_value, attributes={"component": component}) - - def update_readiness_status(self, is_ready: bool, component: str = "default") -> None: - status_value = 1 if is_ready else 0 - self.readiness_check_status.record(status_value, attributes={"component": component}) - - def record_dependency_health(self, dependency_name: str, is_healthy: bool, response_time: float) -> None: - # Update health status - status_value = 1 if is_healthy else 0 - 
self.dependency_health_status.record(status_value, attributes={"dependency": dependency_name}) - - # Record response time - self.dependency_response_time.record(response_time, attributes={"dependency": dependency_name}) - - def record_health_check_timeout(self, check_type: str, check_name: str) -> None: - self.health_check_timeouts.add(1, attributes={"check_type": check_type, "check_name": check_name}) - - def update_component_health(self, component_name: str, is_healthy: bool) -> None: - status_value = 1 if is_healthy else 0 - self.component_health_status.record(status_value, attributes={"component": component_name}) diff --git a/backend/app/core/metrics/kubernetes.py b/backend/app/core/metrics/kubernetes.py index 06d45bec..cb1f2be1 100644 --- a/backend/app/core/metrics/kubernetes.py +++ b/backend/app/core/metrics/kubernetes.py @@ -35,15 +35,11 @@ def _create_instruments(self) -> None: name="pods.by.phase", description="Current number of pods by phase", unit="1" ) - # ConfigMap and NetworkPolicy metrics + # ConfigMap metrics self.config_maps_created = self._meter.create_counter( name="configmaps.created.total", description="Total number of ConfigMaps created", unit="1" ) - self.network_policies_created = self._meter.create_counter( - name="networkpolicies.created.total", description="Total number of NetworkPolicies created", unit="1" - ) - # Pod monitor metrics self.pod_monitor_events = self._meter.create_counter( name="pod.monitor.events.total", description="Total number of pod monitor events", unit="1" @@ -75,20 +71,6 @@ def _create_instruments(self) -> None: name="pods.monitored", description="Number of pods currently being monitored", unit="1" ) - # Resource metrics - self.pod_resource_requests = self._meter.create_histogram( - name="pod.resource.requests", description="Pod resource requests", unit="1" - ) - - self.pod_resource_limits = self._meter.create_histogram( - name="pod.resource.limits", description="Pod resource limits", unit="1" - ) - - # Node metrics - self.pods_per_node = self._meter.create_histogram( - name="pods.per.node", description="Number of pods per node", unit="1" - ) - def record_pod_creation_failure(self, failure_reason: str) -> None: self.pod_creation_failures.add(1, attributes={"failure_reason": failure_reason}) @@ -99,7 +81,6 @@ def record_pod_creation_duration(self, duration_seconds: float, language: str) - self.pod_creation_duration.record(duration_seconds, attributes={"language": language}) def update_active_pod_creations(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_active_pod_creations" current_val = getattr(self, key, 0) delta = count - current_val @@ -125,12 +106,6 @@ def record_k8s_pod_creation_duration(self, duration_seconds: float, language: st def record_k8s_config_map_created(self, status: str) -> None: self.record_config_map_created(status) - def record_k8s_network_policy_created(self, status: str) -> None: - self.network_policies_created.add(1, attributes={"status": status}) - - def update_k8s_active_creations(self, count: int) -> None: - self.update_active_pod_creations(count) - def increment_pod_monitor_watch_reconnects(self) -> None: self.pod_monitor_watch_reconnects.add(1) @@ -147,7 +122,6 @@ def record_pod_monitor_watch_error(self, error_type: str) -> None: self.pod_monitor_watch_errors.add(1, attributes={"error_type": error_type}) def update_pod_monitor_pods_watched(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_pods_monitored" current_val = getattr(self, key, 0) delta = count 
- current_val @@ -164,19 +138,9 @@ def record_pod_lifetime(self, lifetime_seconds: float, final_phase: str, languag self.pod_lifetime.record(lifetime_seconds, attributes={"final_phase": final_phase, "language": language}) def update_pods_by_phase(self, phase: str, count: int) -> None: - # Track the delta for gauge-like behavior key = f"_pods_phase_{phase}" current_val = getattr(self, key, 0) delta = count - current_val if delta != 0: self.pods_by_phase.add(delta, attributes={"phase": phase}) setattr(self, key, count) - - def record_pod_resource_request(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_requests.record(value, attributes={"resource_type": resource_type, "language": language}) - - def record_pod_resource_limit(self, resource_type: str, value: float, language: str) -> None: - self.pod_resource_limits.record(value, attributes={"resource_type": resource_type, "language": language}) - - def record_pods_per_node(self, node_name: str, pod_count: int) -> None: - self.pods_per_node.record(pod_count, attributes={"node_name": node_name}) diff --git a/backend/app/core/metrics/notifications.py b/backend/app/core/metrics/notifications.py index 1610e659..13081829 100644 --- a/backend/app/core/metrics/notifications.py +++ b/backend/app/core/metrics/notifications.py @@ -54,14 +54,6 @@ def _create_instruments(self) -> None: name="notifications.read.total", description="Total notifications read by users", unit="1" ) - self.notifications_clicked = self._meter.create_counter( - name="notifications.clicked.total", description="Total notifications clicked by users", unit="1" - ) - - self.time_to_read = self._meter.create_histogram( - name="notification.time.to.read", description="Time between notification sent and read in seconds", unit="s" - ) - self.unread_count = self._meter.create_up_down_counter( name="notifications.unread.count", description="Current unread notifications per user", unit="1" ) @@ -86,32 +78,6 @@ def _create_instruments(self) -> None: name="notification.retry.success.rate", description="Success rate of retried notifications", unit="%" ) - # Batch processing metrics - self.batch_notifications_processed = self._meter.create_counter( - name="notification.batch.processed.total", description="Total notifications processed in batches", unit="1" - ) - - self.batch_processing_time = self._meter.create_histogram( - name="notification.batch.processing.time", - description="Time to process notification batch in seconds", - unit="s", - ) - - self.batch_size = self._meter.create_histogram( - name="notification.batch.size", description="Size of notification batches", unit="1" - ) - - # Template rendering metrics - self.template_render_time = self._meter.create_histogram( - name="notification.template.render.time", - description="Time to render notification template in seconds", - unit="s", - ) - - self.template_render_errors = self._meter.create_counter( - name="notification.template.render.errors.total", description="Total template rendering errors", unit="1" - ) - # Webhook-specific metrics self.webhook_delivery_time = self._meter.create_histogram( name="notification.webhook.delivery.time", @@ -149,21 +115,17 @@ def record_notification_sent( self, notification_type: str, channel: str = "in_app", severity: str = "medium" ) -> None: self.notifications_sent.add(1, attributes={"category": notification_type}) - self.notifications_by_channel.add(1, attributes={"channel": channel, "category": notification_type}) - self.notifications_by_severity.add(1, 
attributes={"severity": severity, "category": notification_type}) def record_notification_failed(self, notification_type: str, error: str, channel: str = "in_app") -> None: self.notifications_failed.add(1, attributes={"category": notification_type, "error": error}) - self.channel_failures.add(1, attributes={"channel": channel, "error": error}) def record_notification_delivery_time( self, duration_seconds: float, notification_type: str, channel: str = "in_app" ) -> None: self.notification_delivery_time.record(duration_seconds, attributes={"category": notification_type}) - self.channel_delivery_time.record( duration_seconds, attributes={"channel": channel, "category": notification_type} ) @@ -171,7 +133,6 @@ def record_notification_delivery_time( def record_notification_status_change(self, notification_id: str, from_status: str, to_status: str) -> None: self.notification_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) - # Update pending/queued counters if from_status == "pending": self.notifications_pending.add(-1) if to_status == "pending": @@ -182,22 +143,11 @@ def record_notification_status_change(self, notification_id: str, from_status: s if to_status == "queued": self.notifications_queued.add(1) - def record_notification_read(self, notification_type: str, time_to_read_seconds: float) -> None: + def record_notification_read(self, notification_type: str) -> None: self.notifications_read.add(1, attributes={"category": notification_type}) - self.time_to_read.record(time_to_read_seconds, attributes={"category": notification_type}) - - def record_notification_clicked(self, notification_type: str) -> None: - self.notifications_clicked.add(1, attributes={"category": notification_type}) - - def update_unread_count(self, user_id: str, count: int) -> None: - # Track the delta for gauge-like behavior - key = f"_unread_{user_id}" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.unread_count.add(delta, attributes={"user_id": user_id}) - setattr(self, key, count) + def decrement_unread_count(self, user_id: str) -> None: + self.unread_count.add(-1, attributes={"user_id": user_id}) def record_notification_throttled(self, notification_type: str, user_id: str) -> None: self.notifications_throttled.add(1, attributes={"category": notification_type, "user_id": user_id}) @@ -210,31 +160,13 @@ def record_notification_retry(self, notification_type: str, attempt_number: int, 1, attributes={"category": notification_type, "attempt": str(attempt_number), "success": str(success)} ) - if attempt_number > 1: # Only record retry success rate for actual retries + if attempt_number > 1: self.retry_success_rate.record(100.0 if success else 0.0, attributes={"category": notification_type}) - def record_batch_processed( - self, batch_size_count: int, processing_time_seconds: float, notification_type: str = "mixed" - ) -> None: - self.batch_notifications_processed.add(batch_size_count, attributes={"category": notification_type}) - - self.batch_processing_time.record(processing_time_seconds, attributes={"category": notification_type}) - - self.batch_size.record(batch_size_count, attributes={"category": notification_type}) - - def record_template_render(self, duration_seconds: float, template_name: str, success: bool) -> None: - self.template_render_time.record( - duration_seconds, attributes={"template": template_name, "success": str(success)} - ) - - if not success: - self.template_render_errors.add(1, attributes={"template": template_name}) - 
def record_webhook_delivery(self, duration_seconds: float, status_code: int, url_pattern: str) -> None: self.webhook_delivery_time.record( duration_seconds, attributes={"status_code": str(status_code), "url_pattern": url_pattern} ) - self.webhook_response_status.add(1, attributes={"status_code": str(status_code), "url_pattern": url_pattern}) def record_slack_delivery( @@ -246,7 +178,6 @@ def record_slack_delivery( self.slack_api_errors.add(1, attributes={"error_type": error_type, "channel": channel}) def update_active_subscriptions(self, user_id: str, count: int) -> None: - # Track the delta for gauge-like behavior key = f"_subscriptions_{user_id}" current_val = getattr(self, key, 0) delta = count - current_val @@ -260,7 +191,7 @@ def record_subscription_change(self, user_id: str, notification_type: str, actio attributes={ "user_id": user_id, "category": notification_type, - "action": action, # "subscribe" or "unsubscribe" + "action": action, }, ) diff --git a/backend/app/core/metrics/replay.py b/backend/app/core/metrics/replay.py index fc5beae9..24acd4b2 100644 --- a/backend/app/core/metrics/replay.py +++ b/backend/app/core/metrics/replay.py @@ -72,20 +72,7 @@ def _create_instruments(self) -> None: name="replay.delay.applied", description="Delay applied between replay events in seconds", unit="s" ) - # Filter metrics - self.replay_events_filtered = self._meter.create_counter( - name="replay.events.filtered.total", description="Total events filtered during replay", unit="1" - ) - - self.replay_filter_effectiveness = self._meter.create_histogram( - name="replay.filter.effectiveness", description="Percentage of events passing filters", unit="%" - ) - - # Memory and resource metrics - self.replay_memory_usage = self._meter.create_histogram( - name="replay.memory.usage", description="Memory usage during replay in MB", unit="MB" - ) - + # Queue metrics self.replay_queue_size = self._meter.create_up_down_counter( name="replay.queue.size", description="Size of replay event queue", unit="1" ) @@ -94,7 +81,6 @@ def record_session_created(self, replay_type: str, target: str) -> None: self.replay_sessions_created.add(1, attributes={"replay_type": replay_type, "target": target}) def update_active_replays(self, count: int) -> None: - # Track the delta for gauge-like behavior key = "_active_replays" current_val = getattr(self, key, 0) delta = count - current_val @@ -122,7 +108,6 @@ def record_event_replayed(self, replay_type: str, event_type: str, status: str) def record_replay_duration(self, duration_seconds: float, replay_type: str, total_events: int = 0) -> None: self.replay_duration.record(duration_seconds, attributes={"replay_type": replay_type}) - # Calculate and record throughput if events were processed if total_events > 0 and duration_seconds > 0: throughput = total_events / duration_seconds self.replay_throughput.record(throughput, attributes={"replay_type": replay_type}) @@ -135,8 +120,6 @@ def record_replay_error(self, error_type: str, replay_type: str = "unknown") -> def record_status_change(self, session_id: str, from_status: str, to_status: str) -> None: self.replay_status_changes.add(1, attributes={"from_status": from_status, "to_status": to_status}) - - # Update sessions by status self.update_sessions_by_status(from_status, -1) self.update_sessions_by_status(to_status, 1) @@ -146,7 +129,6 @@ def update_sessions_by_status(self, status: str, delta: int) -> None: def record_replay_by_target(self, target: str, success: bool) -> None: self.replay_by_target.add(1, attributes={"target": 
target, "success": str(success)}) - if not success: self.replay_target_errors.add(1, attributes={"target": target}) @@ -159,19 +141,7 @@ def record_delay_applied(self, delay_seconds: float) -> None: def record_batch_size(self, size: int, replay_type: str) -> None: self.replay_batch_size.record(size, attributes={"replay_type": replay_type}) - def record_events_filtered(self, filter_type: str, count: int) -> None: - self.replay_events_filtered.add(count, attributes={"filter_type": filter_type}) - - def record_filter_effectiveness(self, passed: int, total: int, filter_type: str) -> None: - if total > 0: - effectiveness = (passed / total) * 100 - self.replay_filter_effectiveness.record(effectiveness, attributes={"filter_type": filter_type}) - - def record_replay_memory_usage(self, memory_mb: float, session_id: str) -> None: - self.replay_memory_usage.record(memory_mb, attributes={"session_id": session_id}) - def update_replay_queue_size(self, session_id: str, size: int) -> None: - # Track the delta for gauge-like behavior key = f"_queue_{session_id}" current_val = getattr(self, key, 0) delta = size - current_val diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index c89229c3..589fb1c8 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -5,19 +5,6 @@ class SecurityMetrics(BaseMetrics): """Metrics for security events.""" def _create_instruments(self) -> None: - # Core security event metrics - self.security_events = self._meter.create_counter( - name="security.events.total", description="Total number of security events by type", unit="1" - ) - - self.security_violations = self._meter.create_counter( - name="security.violations.total", description="Total number of security violations", unit="1" - ) - - self.security_alerts = self._meter.create_counter( - name="security.alerts.total", description="Total number of security alerts raised", unit="1" - ) - # Authentication metrics self.authentication_attempts = self._meter.create_counter( name="authentication.attempts.total", description="Total number of authentication attempts", unit="1" @@ -40,10 +27,6 @@ def _create_instruments(self) -> None: name="tokens.generated.total", description="Total number of tokens generated", unit="1" ) - self.tokens_refreshed = self._meter.create_counter( - name="tokens.refreshed.total", description="Total number of tokens refreshed", unit="1" - ) - self.tokens_revoked = self._meter.create_counter( name="tokens.revoked.total", description="Total number of tokens revoked", unit="1" ) @@ -65,10 +48,6 @@ def _create_instruments(self) -> None: name="authorization.denials.total", description="Total number of authorization denials", unit="1" ) - self.permission_checks = self._meter.create_counter( - name="permission.checks.total", description="Total number of permission checks", unit="1" - ) - # CSRF protection metrics self.csrf_tokens_generated = self._meter.create_counter( name="csrf.tokens.generated.total", description="Total number of CSRF tokens generated", unit="1" @@ -78,60 +57,7 @@ def _create_instruments(self) -> None: name="csrf.validation.failures.total", description="Total number of CSRF validation failures", unit="1" ) - # Network security metrics - self.network_policy_violations = self._meter.create_counter( - name="network.policy.violations.total", description="Total number of network policy violations", unit="1" - ) - - self.network_policy_created = self._meter.create_counter( - name="network.policies.created.total", 
description="Total number of network policies created", unit="1" - ) - - # Privilege escalation metrics - self.privilege_escalation_attempts = self._meter.create_counter( - name="privilege.escalation.attempts.total", - description="Total number of privilege escalation attempts", - unit="1", - ) - - self.privilege_escalation_blocked = self._meter.create_counter( - name="privilege.escalation.blocked.total", - description="Total number of blocked privilege escalation attempts", - unit="1", - ) - - # Rate limiting metrics - self.rate_limit_hits = self._meter.create_counter( - name="rate.limit.hits.total", description="Total number of rate limit hits", unit="1" - ) - - self.rate_limit_violations = self._meter.create_counter( - name="rate.limit.violations.total", description="Total number of rate limit violations", unit="1" - ) - - # API key metrics - self.api_keys_created = self._meter.create_counter( - name="api.keys.created.total", description="Total number of API keys created", unit="1" - ) - - self.api_keys_revoked = self._meter.create_counter( - name="api.keys.revoked.total", description="Total number of API keys revoked", unit="1" - ) - - self.api_key_usage = self._meter.create_counter( - name="api.key.usage.total", description="Total API key usage", unit="1" - ) - - # Audit log metrics - self.audit_events_logged = self._meter.create_counter( - name="audit.events.logged.total", description="Total number of audit events logged", unit="1" - ) - # Password metrics - self.password_changes = self._meter.create_counter( - name="password.changes.total", description="Total number of password changes", unit="1" - ) - self.password_reset_requests = self._meter.create_counter( name="password.reset.requests.total", description="Total number of password reset requests", unit="1" ) @@ -149,24 +75,6 @@ def _create_instruments(self) -> None: name="accounts.locked.total", description="Total number of accounts locked due to security", unit="1" ) - def record_security_event(self, event_type: str, severity: str = "info", source: str = "system") -> None: - self.security_events.add(1, attributes={"event_type": event_type, "severity": severity, "source": source}) - - if severity in ["critical", "high"]: - self.security_alerts.add(1, attributes={"event_type": event_type, "severity": severity}) - - def record_security_violation( - self, violation_type: str, user_id: str | None = None, ip_address: str | None = None - ) -> None: - self.security_violations.add( - 1, - attributes={ - "violation_type": violation_type, - "user_id": user_id or "anonymous", - "ip_address": ip_address or "unknown", - }, - ) - def record_authentication_attempt( self, method: str, success: bool, user_id: str | None = None, duration_seconds: float | None = None ) -> None: @@ -180,15 +88,6 @@ def record_authentication_attempt( if duration_seconds is not None: self.authentication_duration.record(duration_seconds, attributes={"method": method}) - def update_active_sessions(self, count: int) -> None: - # Track the delta for gauge-like behavior - key = "_active_sessions" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.active_sessions.add(delta) - setattr(self, key, count) - def increment_active_sessions(self) -> None: self.active_sessions.add(1) @@ -197,12 +96,8 @@ def decrement_active_sessions(self) -> None: def record_token_generated(self, token_type: str, expiry_seconds: float) -> None: self.tokens_generated.add(1, attributes={"token_type": token_type}) - self.token_expiry_time.record(expiry_seconds, 
attributes={"token_type": token_type}) - def record_token_refreshed(self, token_type: str) -> None: - self.tokens_refreshed.add(1, attributes={"token_type": token_type}) - def record_token_revoked(self, token_type: str, reason: str) -> None: self.tokens_revoked.add(1, attributes={"token_type": token_type, "reason": reason}) @@ -227,70 +122,13 @@ def record_authorization_check( 1, attributes={"resource": resource, "action": action, "user_role": user_role or "unknown"} ) - def record_permission_check(self, permission: str, granted: bool, user_id: str | None = None) -> None: - self.permission_checks.add( - 1, attributes={"permission": permission, "granted": str(granted), "user_id": user_id or "unknown"} - ) - def record_csrf_token_generated(self) -> None: self.csrf_tokens_generated.add(1) def record_csrf_validation_failure(self, reason: str) -> None: self.csrf_validation_failures.add(1, attributes={"reason": reason}) - def record_network_policy_violation( - self, policy_name: str, pod_name: str | None = None, violation_type: str = "ingress" - ) -> None: - self.network_policy_violations.add( - 1, - attributes={ - "policy_name": policy_name, - "pod_name": pod_name or "unknown", - "violation_type": violation_type, - }, - ) - - def record_privilege_escalation_attempt(self, user_id: str, target_privilege: str, blocked: bool) -> None: - self.privilege_escalation_attempts.add( - 1, attributes={"user_id": user_id, "target_privilege": target_privilege, "blocked": str(blocked)} - ) - - if blocked: - self.privilege_escalation_blocked.add( - 1, attributes={"user_id": user_id, "target_privilege": target_privilege} - ) - - def record_rate_limit_hit(self, endpoint: str, user_id: str | None = None) -> None: - self.rate_limit_hits.add(1, attributes={"endpoint": endpoint, "user_id": user_id or "anonymous"}) - - def record_rate_limit_violation(self, endpoint: str, user_id: str | None = None, limit: int | None = None) -> None: - self.rate_limit_violations.add( - 1, - attributes={ - "endpoint": endpoint, - "user_id": user_id or "anonymous", - "limit": str(limit) if limit else "unknown", - }, - ) - - def record_api_key_created(self, key_id: str, scopes: str | None = None) -> None: - self.api_keys_created.add(1, attributes={"key_id": key_id, "scopes": scopes or "default"}) - - def record_api_key_revoked(self, key_id: str, reason: str) -> None: - self.api_keys_revoked.add(1, attributes={"key_id": key_id, "reason": reason}) - - def record_api_key_usage(self, key_id: str, endpoint: str) -> None: - self.api_key_usage.add(1, attributes={"key_id": key_id, "endpoint": endpoint}) - - def record_audit_event(self, event_type: str, user_id: str, resource: str | None = None) -> None: - self.audit_events_logged.add( - 1, attributes={"event_type": event_type, "user_id": user_id, "resource": resource or "system"} - ) - - def record_password_change(self, user_id: str, forced: bool = False) -> None: - self.password_changes.add(1, attributes={"user_id": user_id, "forced": str(forced)}) - - def record_password_reset_request(self, user_id: str, method: str = "email") -> None: + def record_password_reset_request(self, user_id: str, method: str = "admin") -> None: self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) def record_weak_password_attempt(self, user_id: str, weakness_type: str) -> None: diff --git a/backend/app/core/middlewares/csrf.py b/backend/app/core/middlewares/csrf.py index dcde555a..ad070b4d 100644 --- a/backend/app/core/middlewares/csrf.py +++ b/backend/app/core/middlewares/csrf.py @@ 
-1,6 +1,5 @@ -import logging -from typing import TYPE_CHECKING - +import structlog +from dishka import AsyncContainer from starlette.requests import Request from starlette.responses import JSONResponse from starlette.types import ASGIApp, Receive, Scope, Send @@ -8,10 +7,7 @@ from app.core.security import SecurityService from app.domain.user import CSRFValidationError -if TYPE_CHECKING: - from dishka import AsyncContainer - -logger = logging.getLogger(__name__) +logger = structlog.get_logger() class CSRFMiddleware: @@ -36,22 +32,21 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None: await self.app(scope, receive, send) return - # Get container from app state (set during lifespan) container: AsyncContainer = scope["app"].state.dishka_container security_service: SecurityService = await container.get(SecurityService) request = Request(scope, receive=receive) try: - # validate_csrf_from_request returns "skip" or the token if valid - # raises CSRFValidationError if invalid security_service.validate_csrf_from_request(request) await self.app(scope, receive, send) except CSRFValidationError as e: logger.warning( "CSRF validation failed", - extra={"path": request.url.path, "method": request.method, "reason": str(e)}, + path=request.url.path, + method=request.method, + reason=str(e), ) response = JSONResponse( status_code=403, diff --git a/backend/app/core/providers.py b/backend/app/core/providers.py index 8eb85a4c..f9b79ef3 100644 --- a/backend/app/core/providers.py +++ b/backend/app/core/providers.py @@ -18,7 +18,6 @@ DLQMetrics, EventMetrics, ExecutionMetrics, - HealthMetrics, KubernetesMetrics, NotificationMetrics, RateLimitMetrics, @@ -155,8 +154,8 @@ class CoreServicesProvider(Provider): scope = Scope.APP @provide - def get_security_service(self, settings: Settings) -> SecurityService: - return SecurityService(settings) + def get_security_service(self, settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: + return SecurityService(settings, security_metrics) @provide def get_tracer( @@ -315,10 +314,6 @@ def get_execution_metrics(self, settings: Settings) -> ExecutionMetrics: def get_database_metrics(self, settings: Settings) -> DatabaseMetrics: return DatabaseMetrics(settings) - @provide - def get_health_metrics(self, settings: Settings) -> HealthMetrics: - return HealthMetrics(settings) - @provide def get_kubernetes_metrics(self, settings: Settings) -> KubernetesMetrics: return KubernetesMetrics(settings) @@ -441,9 +436,23 @@ def get_auth_service( self, user_repository: UserRepository, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, + lockout_service: LoginLockoutService, + runtime_settings: RuntimeSettingsLoader, + producer: UnifiedProducer, + settings: Settings, ) -> AuthService: - return AuthService(user_repository, security_service, logger) + return AuthService( + user_repo=user_repository, + security_service=security_service, + security_metrics=security_metrics, + logger=logger, + lockout_service=lockout_service, + runtime_settings=runtime_settings, + producer=producer, + settings=settings, + ) class KafkaServicesProvider(Provider): @@ -628,6 +637,7 @@ def get_admin_user_service( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, ) -> AdminUserService: return AdminUserService( @@ -636,6 +646,7 @@ def get_admin_user_service( 
execution_service=execution_service, rate_limit_service=rate_limit_service, security_service=security_service, + security_metrics=security_metrics, logger=logger, ) diff --git a/backend/app/core/security.py b/backend/app/core/security.py index ad25d507..4901c310 100644 --- a/backend/app/core/security.py +++ b/backend/app/core/security.py @@ -8,6 +8,7 @@ from fastapi.security import OAuth2PasswordBearer from passlib.context import CryptContext +from app.core.metrics import SecurityMetrics from app.domain.user import CSRFValidationError, InvalidCredentialsError from app.settings import Settings @@ -15,8 +16,9 @@ class SecurityService: - def __init__(self, settings: Settings) -> None: + def __init__(self, settings: Settings, security_metrics: SecurityMetrics) -> None: self.settings = settings + self._security_metrics = security_metrics # --8<-- [start:password_hashing] self.pwd_context = CryptContext( schemes=["bcrypt"], @@ -123,12 +125,15 @@ def validate_csrf_from_request(self, request: Request) -> str: cookie_token = request.cookies.get("csrf_token", "") if not header_token: + self._security_metrics.record_csrf_validation_failure("missing_header") raise CSRFValidationError("CSRF token missing from X-CSRF-Token header") if not self.validate_csrf_token(header_token, cookie_token): + self._security_metrics.record_csrf_validation_failure("token_mismatch") raise CSRFValidationError("CSRF token invalid or does not match cookie") if not self._verify_csrf_signature(header_token, access_token): + self._security_metrics.record_csrf_validation_failure("invalid_signature") raise CSRFValidationError("CSRF token signature invalid") return header_token diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 6ff600a2..968219b5 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -246,6 +246,7 @@ async def retry_messages_batch(self, event_ids: list[str]) -> DLQBatchRetryResul details.append(DLQRetryResult(event_id=event_id, status="failed", error="Retry failed")) except Exception as e: self.logger.error(f"Error retrying message {event_id}: {e}") + self.metrics.record_dlq_processing_error("batch_retry", event_id, type(e).__name__) failed += 1 details.append(DLQRetryResult(event_id=event_id, status="failed", error=str(e))) diff --git a/backend/app/domain/exceptions.py b/backend/app/domain/exceptions.py index 79b46b43..30c326a8 100644 --- a/backend/app/domain/exceptions.py +++ b/backend/app/domain/exceptions.py @@ -51,6 +51,12 @@ class InvalidStateError(DomainError): pass +class AccountLockedError(DomainError): + """Account temporarily locked (maps to 423).""" + + pass + + class InfrastructureError(DomainError): """Infrastructure failure - DB, Kafka, K8s, etc (maps to 500).""" diff --git a/backend/app/domain/user/__init__.py b/backend/app/domain/user/__init__.py index 7bd51a2b..4f5957f5 100644 --- a/backend/app/domain/user/__init__.py +++ b/backend/app/domain/user/__init__.py @@ -20,6 +20,7 @@ from .user_models import ( DomainUserCreate, DomainUserUpdate, + LoginResult, PasswordReset, User, UserDeleteResult, @@ -43,6 +44,7 @@ "DomainUserSettingsUpdate", "DomainUserUpdate", "InvalidCredentialsError", + "LoginResult", "PasswordReset", "TokenExpiredError", "User", diff --git a/backend/app/domain/user/user_models.py b/backend/app/domain/user/user_models.py index 550ef41e..5665a135 100644 --- a/backend/app/domain/user/user_models.py +++ b/backend/app/domain/user/user_models.py @@ -103,6 +103,17 @@ class DomainUserCreate: is_superuser: bool = False +@dataclass +class 
LoginResult: + """Result of a successful login.""" + + username: str + role: UserRole + access_token: str + csrf_token: str + session_timeout_minutes: int + + @dataclass class DomainUserUpdate: """User update data for repository (with hashed password).""" diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index b30a0376..b9ed09af 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -3,6 +3,7 @@ import structlog +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.db.repositories import UserRepository from app.domain.admin import AdminUserOverviewDomain, DerivedCountsDomain, RateLimitSummaryDomain @@ -31,6 +32,7 @@ def __init__( execution_service: ExecutionService, rate_limit_service: RateLimitService, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, ) -> None: self._users = user_repository @@ -38,6 +40,7 @@ def __init__( self._executions = execution_service self._rate_limits = rate_limit_service self._security = security_service + self._security_metrics = security_metrics self.logger = logger async def get_user_overview(self, user_id: str, hours: int = 24) -> AdminUserOverviewDomain: @@ -202,6 +205,7 @@ async def reset_user_password(self, *, admin_user_id: str, user_id: str, new_pas self.logger.info( "Admin resetting user password", admin_user_id=admin_user_id, target_user_id=user_id ) + self._security_metrics.record_password_reset_request(user_id, method="admin") hashed = self._security.get_password_hash(new_password) pr = PasswordReset(user_id=user_id, new_password=hashed) ok = await self._users.reset_user_password(pr) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index fd05954e..243ccd31 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -1,10 +1,33 @@ +from datetime import timedelta +from typing import NoReturn + import structlog from fastapi import Request +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.db.repositories import UserRepository -from app.domain.enums import UserRole -from app.domain.user import AdminAccessRequiredError, AuthenticationRequiredError, InvalidCredentialsError, User +from app.domain.enums import LoginMethod, UserRole +from app.domain.events import ( + AuthFailedEvent, + EventMetadata, + UserLoggedInEvent, + UserLoggedOutEvent, + UserRegisteredEvent, +) +from app.domain.exceptions import AccountLockedError, ConflictError, ValidationError +from app.domain.user import ( + AdminAccessRequiredError, + AuthenticationRequiredError, + DomainUserCreate, + InvalidCredentialsError, + LoginResult, + User, +) +from app.events.core import UnifiedProducer +from app.services.login_lockout import LoginLockoutService +from app.services.runtime_settings import RuntimeSettingsLoader +from app.settings import Settings class AuthService: @@ -12,11 +35,28 @@ def __init__( self, user_repo: UserRepository, security_service: SecurityService, + security_metrics: SecurityMetrics, logger: structlog.stdlib.BoundLogger, + lockout_service: LoginLockoutService, + runtime_settings: RuntimeSettingsLoader, + producer: UnifiedProducer, + settings: Settings, ): self.user_repo = user_repo self.security_service = security_service + self.security_metrics = security_metrics self.logger = logger + self._lockout = lockout_service + 
self._runtime_settings = runtime_settings + self._producer = producer + self._settings = settings + + def _build_metadata(self, user_id: str = "") -> EventMetadata: + return EventMetadata( + service_name=self._settings.SERVICE_NAME, + service_version=self._settings.SERVICE_VERSION, + user_id=user_id, + ) async def get_current_user(self, request: Request) -> User: token = request.cookies.get("access_token") @@ -24,6 +64,7 @@ async def get_current_user(self, request: Request) -> User: raise AuthenticationRequiredError() username = self.security_service.decode_token(token) + user = await self.user_repo.get_user(username) if user is None: raise InvalidCredentialsError() @@ -32,7 +73,162 @@ async def get_current_user(self, request: Request) -> User: async def get_admin(self, request: Request) -> User: user = await self.get_current_user(request) + self.security_metrics.record_authorization_check( + "/admin", request.method, user.role == UserRole.ADMIN, user_role=user.role, + ) if user.role != UserRole.ADMIN: self.logger.warning(f"Admin access denied for user: {user.username} (role: {user.role})") raise AdminAccessRequiredError(user.username) return user + + async def _fail_login( + self, + username: str, + reason: str, + ip_address: str, + user_agent: str | None, + user_id: str = "", + ) -> NoReturn: + self.logger.warning( + f"Login failed - {reason}", + username=username, + client_ip=ip_address, + user_agent=user_agent, + ) + locked = await self._lockout.record_failed_attempt(username) + await self._producer.produce( + event_to_produce=AuthFailedEvent( + username=username, + reason=reason, + ip_address=ip_address, + metadata=self._build_metadata(user_id=user_id), + ), + key=username, + ) + if locked: + self.security_metrics.record_account_locked(username, "brute_force") + raise AccountLockedError("Account locked due to too many failed attempts") + raise InvalidCredentialsError() + + async def login( + self, + username: str, + password: str, + ip_address: str, + user_agent: str | None, + ) -> LoginResult: + if await self._lockout.check_locked(username): + raise AccountLockedError("Account temporarily locked due to too many failed attempts") + + user = await self.user_repo.get_user(username) + + if not user: + await self._fail_login(username, "user_not_found", ip_address, user_agent) + + if not self.security_service.verify_password(password, user.hashed_password): + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=str(user.user_id)) + + await self._lockout.clear_attempts(username) + + effective = await self._runtime_settings.get_effective_settings() + session_timeout = effective.session_timeout_minutes + + self.logger.info( + "Login successful", + username=user.username, + client_ip=ip_address, + user_agent=user_agent, + token_expires_in_minutes=session_timeout, + ) + + access_token_expires = timedelta(minutes=session_timeout) + access_token = self.security_service.create_access_token( + data={"sub": user.username}, expires_delta=access_token_expires, + ) + csrf_token = self.security_service.generate_csrf_token(access_token) + + await self._producer.produce( + event_to_produce=UserLoggedInEvent( + user_id=str(user.user_id), + login_method=LoginMethod.PASSWORD, + ip_address=ip_address, + user_agent=user_agent, + metadata=self._build_metadata(user_id=str(user.user_id)), + ), + key=user.username, + ) + + return LoginResult( + username=user.username, + role=user.role, + access_token=access_token, + csrf_token=csrf_token, + session_timeout_minutes=session_timeout, + 
) + + async def register( + self, + username: str, + email: str, + password: str, + ip_address: str, + user_agent: str | None, + ) -> User: + effective = await self._runtime_settings.get_effective_settings() + min_len = effective.password_min_length + if len(password) < min_len: + self.security_metrics.record_weak_password_attempt(username, "too_short") + raise ValidationError(f"Password must be at least {min_len} characters") + + existing = await self.user_repo.get_user(username) + if existing: + self.logger.warning( + "Registration failed - username taken", + username=username, + client_ip=ip_address, + user_agent=user_agent, + ) + raise ConflictError("Username already registered") + + hashed_password = self.security_service.get_password_hash(password) + create_data = DomainUserCreate( + username=username, + email=email, + hashed_password=hashed_password, + role=UserRole.USER, + is_active=True, + is_superuser=False, + ) + created_user = await self.user_repo.create_user(create_data) + + self.logger.info( + "Registration successful", + username=created_user.username, + client_ip=ip_address, + user_agent=user_agent, + ) + + await self._producer.produce( + event_to_produce=UserRegisteredEvent( + user_id=str(created_user.user_id), + username=created_user.username, + email=created_user.email, + metadata=self._build_metadata(user_id=str(created_user.user_id)), + ), + key=created_user.username, + ) + + return created_user + + async def publish_logout_event(self, token: str | None) -> None: + if not token: + return + username = self.security_service.decode_token(token) + await self._producer.produce( + event_to_produce=UserLoggedOutEvent( + user_id=username, + logout_reason="user_initiated", + metadata=self._build_metadata(user_id=username), + ), + key=username, + ) diff --git a/backend/app/services/event_replay/replay_service.py b/backend/app/services/event_replay/replay_service.py index fecaf40a..5dc65db6 100644 --- a/backend/app/services/event_replay/replay_service.py +++ b/backend/app/services/event_replay/replay_service.py @@ -51,6 +51,7 @@ async def create_session_from_config(self, config: ReplayConfig) -> ReplayOperat state = ReplaySessionState(session_id=str(uuid4()), config=config) self._sessions[state.session_id] = state await self._repository.save_session(state) + self._metrics.record_session_created(config.replay_type, config.target) return ReplayOperationResult( session_id=state.session_id, status=ReplayStatus.CREATED, @@ -94,6 +95,7 @@ async def start_session(self, session_id: str) -> ReplayOperationResult: session.status = ReplayStatus.RUNNING session.started_at = datetime.now(timezone.utc) self._metrics.increment_active_replays() + self._metrics.record_speed_multiplier(session.config.speed_multiplier, session.config.replay_type) await self._repository.update_session_status(session_id, ReplayStatus.RUNNING) return ReplayOperationResult( session_id=session_id, status=ReplayStatus.RUNNING, message="Replay session started" @@ -214,6 +216,9 @@ async def _dispatch_next(self, session: ReplaySessionState) -> None: time_diff = (next_event.timestamp - session.last_event_at).total_seconds() delay = max(time_diff / session.config.speed_multiplier, 0) + if delay > 0: + self._metrics.record_delay_applied(delay) + scheduler = self._schedulers.get(session.session_id) if scheduler and scheduler.running and session.status == ReplayStatus.RUNNING: scheduler.add_job( @@ -343,8 +348,10 @@ async def _dispatch() -> None: try: await _dispatch() + self._metrics.record_replay_by_target(config.target, 
success=True) return True except Exception: + self._metrics.record_replay_by_target(config.target, success=False) return False async def _write_event_to_file(self, event: DomainEvent, file_path: str) -> None: diff --git a/backend/app/services/k8s_worker/worker.py b/backend/app/services/k8s_worker/worker.py index e5429eab..5e48bbc0 100644 --- a/backend/app/services/k8s_worker/worker.py +++ b/backend/app/services/k8s_worker/worker.py @@ -109,7 +109,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non async with self._creation_semaphore: execution_id = command.execution_id self._active_creations.add(execution_id) - self.metrics.update_k8s_active_creations(len(self._active_creations)) + self.metrics.update_active_pod_creations(len(self._active_creations)) start_time = time.time() @@ -151,7 +151,7 @@ async def _create_pod_for_execution(self, command: CreatePodCommandEvent) -> Non finally: self._active_creations.discard(execution_id) - self.metrics.update_k8s_active_creations(len(self._active_creations)) + self.metrics.update_active_pod_creations(len(self._active_creations)) async def _get_entrypoint_script(self) -> str: """Get entrypoint script content""" diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index 0566f334..aaaa47d8 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -168,6 +168,7 @@ async def create_notification( f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)" ) self.logger.warning(error_msg) + self.metrics.record_notification_throttled("general", user_id) raise NotificationThrottledError( user_id, self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, @@ -544,7 +545,10 @@ async def update_subscription( if not update_data.slack_webhook: raise NotificationValidationError("slack_webhook is required when enabling SLACK") - return await self.repository.upsert_subscription(user_id, channel, update_data) + result = await self.repository.upsert_subscription(user_id, channel, update_data) + action = "enabled" if update_data.enabled else "updated" + self.metrics.record_subscription_change(user_id, channel, action) + return result async def mark_all_as_read(self, user_id: str) -> int: """Mark all notifications as read for a user.""" @@ -693,6 +697,8 @@ async def _attempt() -> None: retry_count=notification.max_retries, ), ) + notification_type = notification.tags[0] if notification.tags else "unknown" + self.metrics.record_notification_failed(notification_type, str(last_error), channel=notification.channel) self.logger.error( f"All delivery attempts exhausted for {notification.notification_id}: {last_error}", exc_info=last_error, diff --git a/backend/app/services/sse/sse_service.py b/backend/app/services/sse/sse_service.py index cf9b78af..c698ec0e 100644 --- a/backend/app/services/sse/sse_service.py +++ b/backend/app/services/sse/sse_service.py @@ -58,6 +58,7 @@ def __init__( async def create_execution_stream(self, execution_id: str, user_id: str) -> AsyncGenerator[dict[str, Any], None]: subscription: SSERedisSubscription | None = None + start_time = datetime.now(timezone.utc) self.metrics.increment_sse_connections("executions") try: yield self._format_sse_event( @@ -121,6 +122,8 @@ async def create_execution_stream(self, execution_id: str, user_id: str) -> Asyn finally: if subscription is not None: await asyncio.shield(subscription.close()) + duration = (datetime.now(timezone.utc) - start_time).total_seconds() + 
self.metrics.record_sse_connection_duration(duration, "executions") self.metrics.decrement_sse_connections("executions") self.logger.info("SSE connection closed", execution_id=execution_id) diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json new file mode 100644 index 00000000..4026abe6 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -0,0 +1,275 @@ +{ + "annotations": { + "list": [] + }, + "description": "Coordinator & Execution", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Coordinator", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(coordinator_scheduling_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Scheduling Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(coordinator_queue_wait_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Queue Wait Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "coordinator_executions_active", + "refId": "A" + } + ], + "title": "Active Executions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 4, + "targets": [ + { + "expr": "rate(coordinator_executions_scheduled_total[5m])", + "refId": "A" + } + ], + "title": "Scheduled (5m)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "panels": [], + "title": "Execution Queue", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 5, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(execution_queue_wait_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Queue Wait Time (Execution)", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 6, + "targets": [ + { + "expr": "rate(executions_assigned_total[5m])", + "legendFormat": "Assigned", + "refId": "A" + }, + { + "expr": "rate(executions_queued_total[5m])", + "legendFormat": "Queued", + "refId": "B" + } + ], + "title": "Assigned & Queued", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 102, + "panels": [], + "title": "Script Resources", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + 
"gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 7, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(script_memory_usage_MiB_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(script_memory_usage_MiB_bucket[5m]))", + "legendFormat": "p95", + "refId": "B" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(script_memory_utilization_percent_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, rate(script_memory_utilization_percent_bucket[5m]))", + "legendFormat": "p95", + "refId": "B" + } + ], + "title": "Memory Utilization", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "coordinator", + "execution" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Coordinator & Execution", + "uid": "coordinator-execution", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/dlq-monitoring.json b/backend/grafana/provisioning/dashboards/dlq-monitoring.json index 74e95eab..9a1b7a68 100644 --- a/backend/grafana/provisioning/dashboards/dlq-monitoring.json +++ b/backend/grafana/provisioning/dashboards/dlq-monitoring.json @@ -85,7 +85,9 @@ "options": { "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -221,7 +223,9 @@ "options": { "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -274,7 +278,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -332,15 +338,22 @@ }, "id": 5, "options": { - "displayLabels": ["name", "percent"], + "displayLabels": [ + "name", + "percent" + ], "legend": { "displayMode": "table", "placement": "right", - "values": ["value"] + "values": [ + "value" + ] }, "pieType": "donut", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -417,7 +430,9 @@ "id": 6, "options": { "legend": { - "calcs": ["mean"], + "calcs": [ + "mean" + ], "displayMode": "list", "placement": "bottom" }, @@ -473,15 +488,23 @@ }, "id": 7, "options": { - "displayLabels": ["name", "value"], + "displayLabels": [ + "name", + "value" + ], "legend": { "displayMode": "table", "placement": "bottom", - "values": ["value", "percent"] + "values": [ + "value", + "percent" + ] }, "pieType": "pie", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -580,7 +603,10 @@ "id": 8, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom" }, @@ -667,7 +693,10 @@ "id": 9, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom" }, @@ -705,64 +734,6 @@ "title": "Retry Metrics", "type": "row" }, - { - "datasource": "Victoria Metrics", - "description": "Number of retry attempts for messages", - "fieldConfig": { - "defaults": { - "color": { 
- "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 3 - }, - { - "color": "red", - "value": 5 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 26 - }, - "id": 10, - "options": { - "displayMode": "lcd", - "orientation": "horizontal", - "reduceOptions": { - "calcs": ["mean"], - "fields": "", - "values": false - }, - "showUnfilled": true, - "text": {} - }, - "pluginVersion": "8.3.3", - "targets": [ - { - "expr": "sum by (original_topic) (dlq_retry_attempts_sum) / sum by (original_topic) (dlq_retry_attempts_count)", - "legendFormat": "{{original_topic}}", - "refId": "A" - } - ], - "title": "Average Retry Attempts by Topic", - "type": "bargauge" - }, { "datasource": "Victoria Metrics", "description": "Retry success/failure breakdown", @@ -821,7 +792,9 @@ "id": 11, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom" }, @@ -840,83 +813,6 @@ "title": "Retry Results (1h)", "type": "timeseries" }, - { - "datasource": "Victoria Metrics", - "description": "Messages processed per second from DLQ", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "msg/s" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 26 - }, - "id": 12, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "multi" - } - }, - "pluginVersion": "8.3.3", - "targets": [ - { - "expr": "sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_sum[5m])) / sum by (original_topic) (rate(dlq_throughput_rate_msg_per_second_count[5m]))", - "legendFormat": "{{original_topic}}", - "refId": "A" - } - ], - "title": "DLQ Throughput by Topic", - "type": "timeseries" - }, { "collapsed": false, "datasource": null, @@ -992,7 +888,9 @@ "id": 13, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom" }, @@ -1230,7 +1128,9 @@ "id": 16, "options": { "legend": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "displayMode": "list", "placement": "bottom" }, @@ -1305,7 +1205,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1366,7 +1268,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1425,7 +1329,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1484,7 +1390,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": 
["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1505,7 +1413,10 @@ "refresh": "10s", "schemaVersion": 33, "style": "dark", - "tags": ["kafka", "dlq"], + "tags": [ + "kafka", + "dlq" + ], "templating": { "list": [] }, @@ -1514,11 +1425,22 @@ "to": "now" }, "timepicker": { - "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"] + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] }, "timezone": "", "title": "Dead Letter Queue", "uid": "dlq-monitoring", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/event-replay.json b/backend/grafana/provisioning/dashboards/event-replay.json new file mode 100644 index 00000000..56a9eb47 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/event-replay.json @@ -0,0 +1,387 @@ +{ + "annotations": { + "list": [] + }, + "description": "Event Replay", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Sessions", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(replay_sessions_created_total[5m])", + "legendFormat": "Created", + "refId": "A" + } + ], + "title": "Sessions Created", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "replay_sessions_active", + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 3, + "targets": [ + { + "expr": "replay_sessions_by_status", + "refId": "A" + } + ], + "title": "By Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 101, + "panels": [], + "title": "Events", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "targets": [ + { + "expr": "rate(replay_events_processed_total[5m])", + "legendFormat": "Processed", + "refId": "A" + }, + { + "expr": "rate(replay_events_failed_total[5m])", + "legendFormat": "Failed", + "refId": "B" + }, + { + "expr": "rate(replay_events_skipped_total[5m])", + "legendFormat": "Skipped", + "refId": "C" + } + ], + "title": "Events Processed", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "targets": [ + { + "expr": "rate(replay_status_changes_total[5m])", + "legendFormat": "Changes", + "refId": "A" + } + ], + "title": "Status Changes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 102, + "panels": [], + "title": "Performance", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + 
"gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 6, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(replay_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Replay Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 7, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(replay_event_processing_time_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Event Processing Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_throughput_event_per_second_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Throughput", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 9, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_batch_size_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Batch Size", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 103, + "panels": [], + "title": "Targets & Speed", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 10, + "targets": [ + { + "expr": "rate(replay_by_target_total[5m])", + "legendFormat": "{{target}}", + "refId": "A" + }, + { + "expr": "rate(replay_target_errors_total[5m])", + "legendFormat": "Errors", + "refId": "B" + } + ], + "title": "By Target", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 11, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(replay_speed_multiplier_x_bucket[5m]))", + "legendFormat": "Multiplier p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, rate(replay_delay_applied_seconds_bucket[5m]))", + "legendFormat": "Delay p50", + "refId": "B" + } + ], + "title": "Speed Control", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 12, + "targets": [ + { + "expr": "replay_queue_size", + "refId": "A" + } + ], + "title": "Queue Size", + "type": "stat" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "replay" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Event Replay", + "uid": "event-replay", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json index 3f421163..fde9d831 100644 --- a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json +++ b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json @@ -114,7 +114,9 @@ "justifyMode": "auto", 
"orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -179,7 +181,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -203,527 +207,7 @@ }, { "datasource": "Victoria Metrics", - "description": "Total events currently buffered", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1000 - }, - { - "color": "red", - "value": 5000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 8, - "y": 1 - }, - "id": 53, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "event_buffer_size", - "instant": false, - "legendFormat": "Buffered Events", - "range": true, - "refId": "A" - } - ], - "title": "Events in Buffer", - "type": "stat" - }, - { - "datasource": "Victoria Metrics", - "description": "Total event publishing rate", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Events/sec", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 12, - "y": 1 - }, - "id": 54, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sum(rate(events_published_total[1m]))", - "instant": false, - "legendFormat": "Event Rate", - "range": true, - "refId": "A" - } - ], - "title": "Event Publishing Rate", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "Event delivery success rate", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "yellow", - "value": 90 - }, - { - "color": "green", - "value": 95 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 6, - "x": 18, - "y": 1 - }, - "id": 55, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - 
"showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "100 - (sum(rate(event_buffer_dropped_total[5m])) / (sum(rate(event_buffer_processed_total[5m])) + 0.0001) * 100)", - "instant": false, - "legendFormat": "Delivery Rate", - "range": true, - "refId": "A" - } - ], - "title": "Event Delivery Success", - "type": "gauge" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 7 - }, - "id": 56, - "panels": [], - "title": "SSE Connection Details", - "type": "row" - }, - { - "datasource": "Victoria Metrics", - "description": "SSE connections by endpoint over time", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Connections", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 8 - }, - "id": 57, - "options": { - "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sse_connections_active", - "instant": false, - "legendFormat": "{{endpoint}}", - "range": true, - "refId": "A" - } - ], - "title": "SSE Connections by Endpoint", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "Distribution of SSE connection durations", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Duration (seconds)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 8 - }, - "id": 58, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - 
"legendFormat": "p99", - "range": true, - "refId": "A" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - "legendFormat": "p95", - "range": true, - "refId": "B" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", - "instant": false, - "legendFormat": "p50", - "range": true, - "refId": "C" - } - ], - "title": "SSE Connection Duration Percentiles", - "type": "timeseries" - }, - { - "datasource": "Victoria Metrics", - "description": "SSE messages sent by event type", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Messages/sec", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 8 - }, - "id": 59, - "options": { - "legend": { - "calcs": ["sum"], - "displayMode": "table", - "placement": "right", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "rate(sse_messages_sent_total[5m])", - "instant": false, - "legendFormat": "{{event_type}}", - "range": true, - "refId": "A" - } - ], - "title": "SSE Messages by Event Type", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 16 - }, - "id": 60, - "panels": [], - "title": "Event Buffer Performance", - "type": "row" - }, - { - "datasource": "Victoria Metrics", - "description": "Event buffer size trends", + "description": "Total event publishing rate", "fieldConfig": { "defaults": { "color": { @@ -732,11 +216,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Events", + "axisLabel": "Events/sec", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 20, + "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, @@ -767,57 +251,65 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 1000 - }, - { - "color": "red", - "value": 5000 } ] }, - "unit": "short" + "unit": "ops" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 + "h": 6, + "w": 6, + "x": 12, + "y": 1 }, - "id": 61, + "id": 54, "options": { "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi", - "sort": "desc" + "mode": "single", + "sort": "none" } }, "targets": [ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "event_buffer_size", + "expr": "sum(rate(events_published_total[1m]))", "instant": 
false, - "legendFormat": "Buffer Size", + "legendFormat": "Event Rate", "range": true, "refId": "A" } ], - "title": "Event Buffer Size Over Time", + "title": "Event Publishing Rate", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 56, + "panels": [], + "title": "SSE Connection Details", + "type": "row" + }, { "datasource": "Victoria Metrics", - "description": "Event buffer processing and drop rates", + "description": "SSE connections by endpoint over time", "fieldConfig": { "defaults": { "color": { @@ -826,11 +318,11 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Events/sec", + "axisLabel": "Connections", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 10, + "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, @@ -848,7 +340,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -864,51 +356,24 @@ } ] }, - "unit": "ops" + "unit": "short" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Dropped" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Processed" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 17 + "w": 8, + "x": 0, + "y": 8 }, - "id": 62, + "id": 57, "options": { "legend": { - "calcs": ["mean", "max", "sum"], + "calcs": [ + "mean", + "max", + "lastNotNull" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -922,28 +387,19 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "rate(event_buffer_processed_total[5m])", + "expr": "sse_connections_active", "instant": false, - "legendFormat": "Processed", + "legendFormat": "{{endpoint}}", "range": true, "refId": "A" - }, - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "rate(event_buffer_dropped_total[5m])", - "instant": false, - "legendFormat": "Dropped", - "range": true, - "refId": "B" } ], - "title": "Event Buffer Processing vs Drops", + "title": "SSE Connections by Endpoint", "type": "timeseries" }, { "datasource": "Victoria Metrics", - "description": "Event processing latency percentiles", + "description": "Distribution of SSE connection durations", "fieldConfig": { "defaults": { "color": { @@ -952,7 +408,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Latency (seconds)", + "axisLabel": "Duration (seconds)", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -987,14 +443,6 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "red", - "value": 5 } ] }, @@ -1004,15 +452,18 @@ }, "gridPos": { "h": 8, - "w": 12, - "x": 0, - "y": 25 + "w": 8, + "x": 8, + "y": 8 }, - "id": 63, + "id": 58, "options": { "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", "placement": "bottom", "showLegend": true }, @@ -1025,7 +476,7 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, 
"legendFormat": "p99", "range": true, @@ -1034,7 +485,7 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.95, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, "legendFormat": "p95", "range": true, @@ -1043,19 +494,19 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(event_buffer_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(sse_connection_duration_seconds_bucket[5m])) by (le))", "instant": false, "legendFormat": "p50", "range": true, "refId": "C" } ], - "title": "Event Buffer Processing Latency", + "title": "SSE Connection Duration Percentiles", "type": "timeseries" }, { "datasource": "Victoria Metrics", - "description": "Memory usage of event buffers", + "description": "SSE messages sent by event type", "fieldConfig": { "defaults": { "color": { @@ -1064,7 +515,7 @@ "custom": { "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Memory (MB)", + "axisLabel": "Messages/sec", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", @@ -1077,7 +528,7 @@ }, "insertNulls": false, "lineInterpolation": "linear", - "lineWidth": 2, + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -1086,7 +537,7 @@ "spanNulls": false, "stacking": { "group": "A", - "mode": "none" + "mode": "normal" }, "thresholdsStyle": { "mode": "off" @@ -1099,33 +550,27 @@ { "color": "green", "value": null - }, - { - "color": "yellow", - "value": 50 - }, - { - "color": "red", - "value": 100 } ] }, - "unit": "decmbytes" + "unit": "ops" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 12, - "y": 25 + "w": 8, + "x": 16, + "y": 8 }, - "id": 64, + "id": 59, "options": { "legend": { - "calcs": ["mean", "max", "lastNotNull"], + "calcs": [ + "sum" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { @@ -1137,16 +582,29 @@ { "datasource": "Victoria Metrics", "editorMode": "code", - "expr": "event_buffer_memory_usage_MB_sum / event_buffer_memory_usage_MB_count", + "expr": "rate(sse_messages_sent_total[5m])", "instant": false, - "legendFormat": "Memory Usage", + "legendFormat": "{{event_type}}", "range": true, "refId": "A" } ], - "title": "Event Buffer Memory Usage", + "title": "SSE Messages by Event Type", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 60, + "panels": [], + "title": "Event Buffer Performance", + "type": "row" + }, { "collapsed": false, "gridPos": { @@ -1222,7 +680,10 @@ "id": 66, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1316,7 +777,10 @@ "id": 67, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1431,7 +895,11 @@ "id": 69, "options": { "legend": { - "calcs": ["mean", "max", "sum"], + "calcs": [ + "mean", + "max", + "sum" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1534,7 +1002,10 @@ "id": 70, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1580,92 +1051,6 @@ "title": "SSE Shutdown Monitoring", "type": "row" 
    },
-    {
-      "datasource": "Victoria Metrics",
-      "description": "SSE graceful shutdown phases",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "Duration (seconds)",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "bars",
-            "fillOpacity": 50,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "s"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 52
-      },
-      "id": 72,
-      "options": {
-        "legend": {
-          "calcs": ["lastNotNull"],
-          "displayMode": "table",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "targets": [
-        {
-          "datasource": "Victoria Metrics",
-          "editorMode": "code",
-          "expr": "sse_shutdown_duration_seconds_sum / sse_shutdown_duration_seconds_count",
-          "instant": false,
-          "legendFormat": "{{phase}}",
-          "range": true,
-          "refId": "A"
-        }
-      ],
-      "title": "SSE Shutdown Phase Durations",
-      "type": "timeseries"
-    },
    {
      "datasource": "Victoria Metrics",
      "description": "SSE draining connections during shutdown",
@@ -1728,7 +1113,10 @@
      "id": 73,
      "options": {
        "legend": {
-          "calcs": ["max", "lastNotNull"],
+          "calcs": [
+            "max",
+            "lastNotNull"
+          ],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
@@ -1755,7 +1143,10 @@
  ],
  "refresh": "5s",
  "schemaVersion": 38,
-  "tags": ["event-streaming", "sse"],
+  "tags": [
+    "event-streaming",
+    "sse"
+  ],
  "templating": {
    "list": []
  },
@@ -1769,4 +1160,4 @@
  "uid": "event-stream-monitoring",
  "version": 2,
  "weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json
new file mode 100644
index 00000000..908d3a90
--- /dev/null
+++ b/backend/grafana/provisioning/dashboards/http-middleware.json
@@ -0,0 +1,387 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "description": "HTTP & Middleware",
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 100,
+      "panels": [],
+      "title": "HTTP Requests",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "rate(http_requests_total[5m])",
+          "legendFormat": "Requests",
+          "refId": "A"
+        }
+      ],
+      "title": "Request Rate",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 1
+      },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Request Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 7
+      },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "http_requests_active_requests",
+          "refId": "A"
+        }
+      ],
+      "title": "Active Requests",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 16,
+        "x": 8,
+        "y": 7
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(http_request_size_bytes_bucket[5m]))",
+          "legendFormat": "Request p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.50, rate(http_response_size_bytes_bucket[5m]))",
+          "legendFormat": "Response p50",
+          "refId": "B"
+        }
+      ],
+      "title": "Request/Response Size",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 13
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Database & Event Store",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(mongodb_event_operations_total[5m])",
+          "legendFormat": "Operations",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(database_connection_errors_total[5m])",
+          "legendFormat": "Connection Errors",
+          "refId": "B"
+        }
+      ],
+      "title": "MongoDB Operations",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(mongodb_event_query_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "MongoDB Query Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 6,
+        "x": 0,
+        "y": 20
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "database_connections_active",
+          "refId": "A"
+        }
+      ],
+      "title": "Active DB Connections",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 18,
+        "x": 6,
+        "y": 20
+      },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "rate(event_store_operations_total[5m])",
+          "legendFormat": "Store Ops",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, rate(idempotency_processing_duration_seconds_bucket[5m]))",
+          "legendFormat": "Idempotency p95",
+          "refId": "B"
+        }
+      ],
+      "title": "Event Store & Idempotency",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 26
+      },
+      "id": 102,
+      "panels": [],
+      "title": "Kafka Errors & Event Bus",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 27
+      },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "rate(kafka_production_errors_total[5m])",
+          "legendFormat": "Production",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(kafka_consumption_errors_total[5m])",
+          "legendFormat": "Consumption",
+          "refId": "B"
+        }
+      ],
+      "title": "Kafka Errors",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 12,
+        "y": 27
+      },
+      "id": 10,
+      "targets": [
+        {
+          "expr": "event_bus_queue_size",
+          "refId": "A"
+        }
+      ],
+      "title": "Event Bus Queue Size",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 33
+      },
+      "id": 103,
+      "panels": [],
+      "title": "System",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 0,
+        "y": 34
+      },
+      "id": 11,
+      "targets": [
+        {
+          "expr": "system_cpu_percent",
+          "refId": "A"
+        }
+      ],
+      "title": "CPU",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 8,
+        "y": 34
+      },
+      "id": 12,
+      "targets": [
+        {
+          "expr": "process_metrics_mixed",
+          "refId": "A"
+        }
+      ],
+      "title": "Process Metrics",
+      "type": "stat"
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [
+    "http",
+    "middleware",
+    "database"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "HTTP & Middleware",
+  "uid": "http-middleware",
+  "version": 1
+}
diff --git a/backend/grafana/provisioning/dashboards/integr8scode.json b/backend/grafana/provisioning/dashboards/integr8scode.json
index b1092a22..24687020 100644
--- a/backend/grafana/provisioning/dashboards/integr8scode.json
+++ b/backend/grafana/provisioning/dashboards/integr8scode.json
@@ -68,7 +68,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -80,7 +82,7 @@
      "targets": [
        {
          "datasource": "Victoria Metrics",
-          "expr": "sum(rate(script_executions_total[1m]))" ,
+          "expr": "sum(rate(script_executions_total[1m]))",
          "refId": "A"
        }
      ],
@@ -129,7 +131,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -187,7 +191,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -245,7 +251,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -323,7 +331,10 @@
      "id": 5,
      "options": {
        "legend": {
-          "calcs": ["mean", "lastNotNull"],
+          "calcs": [
+            "mean",
+            "lastNotNull"
+          ],
          "displayMode": "table",
          "placement": "bottom"
        },
@@ -336,7 +347,7 @@
        {
          "datasource": "Victoria Metrics",
          "editorMode": "code",
-          "expr": "sum by (lang_and_version) (rate(script_executions_total[1m]))" ,
+          "expr": "sum by (lang_and_version) (rate(script_executions_total[1m]))",
          "legendFormat": "{{lang_and_version}}",
          "refId": "A"
        }
@@ -403,7 +414,10 @@
      "id": 6,
      "options": {
        "legend": {
-          "calcs": ["mean", "max"],
+          "calcs": [
+            "mean",
+            "max"
+          ],
          "displayMode": "table",
          "placement": "bottom"
        },
@@ -502,7 +516,10 @@
      "id": 7,
      "options": {
        "legend": {
-          "calcs": ["mean", "max"],
+          "calcs": [
+            "mean",
+            "max"
+          ],
          "displayMode": "list",
          "placement": "bottom"
        },
@@ -646,7 +663,9 @@
      "options": {
        "orientation": "auto",
        "reduceOptions": {
-          "calcs": ["lastNotNull"],
+          "calcs": [
+            "lastNotNull"
+          ],
          "fields": "",
          "values": false
        },
@@ -671,83 +690,6 @@
      "title": "Memory Saturation",
      "type": "gauge"
    },
-    {
-      "datasource": "Victoria Metrics",
-      "description": "System health indicators",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
-            {
-              "options": {
-                "0": {
-                  "color": "red",
"index": 0, - "text": "Unhealthy" - }, - "1": { - "color": "green", - "index": 1, - "text": "Healthy" - } - }, - "type": "value" - } - ], - "noValue": "Healthy", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.5 - }, - { - "color": "green", - "value": 0.99 - } - ] - }, - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 6, - "x": 0, - "y": 22 - }, - "id": 10, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "expr": "(sum(health_check_status_sum{check_type=\"readiness\"}) or vector(1)) / (sum(health_check_status_count{check_type=\"readiness\"}) or vector(1))", - "refId": "A" - } - ], - "title": "System Health", - "type": "stat" - }, { "datasource": "Victoria Metrics", "description": "Number of active SSE connections", @@ -790,7 +732,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -850,7 +794,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -911,7 +857,9 @@ "justifyMode": "center", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -933,7 +881,10 @@ "refresh": "10s", "schemaVersion": 38, "style": "dark", - "tags": ["overview", "main"], + "tags": [ + "overview", + "main" + ], "templating": { "list": [] }, @@ -942,11 +893,22 @@ "to": "now" }, "timepicker": { - "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"] + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] }, "timezone": "", "title": "System Overview", "uid": "integr8scode-main", "version": 1, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json b/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json index 734eeaa4..4c5f93b3 100644 --- a/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json +++ b/backend/grafana/provisioning/dashboards/kafka-events-monitoring.json @@ -114,7 +114,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -136,71 +138,6 @@ "title": "Message Throughput", "type": "stat" }, - { - "datasource": "Victoria Metrics", - "description": "Total consumer lag across all consumer groups", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 100 - }, - { - "color": "red", - "value": 1000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 6, - "w": 4, - "x": 4, - "y": 1 - }, - "id": 3, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - }, - "textMode": "auto", - 
"wideLayout": true - }, - "pluginVersion": "10.2.0", - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "sum(kafka_consumer_lag_sum / kafka_consumer_lag_count)", - "instant": false, - "legendFormat": "Total Lag", - "range": true, - "refId": "A" - } - ], - "title": "Total Consumer Lag", - "type": "stat" - }, { "datasource": "Victoria Metrics", "description": "Messages in Dead Letter Queue", @@ -244,7 +181,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -359,7 +298,10 @@ "id": 5, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -467,7 +409,10 @@ "id": 7, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -561,7 +506,10 @@ "id": 8, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -616,100 +564,6 @@ "title": "Consumer Metrics", "type": "row" }, - { - "datasource": "Victoria Metrics", - "description": "Consumer lag by consumer group", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Lag (messages)", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 20, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 100 - }, - { - "color": "red", - "value": 1000 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 10, - "options": { - "legend": { - "calcs": ["mean", "max", "lastNotNull"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "targets": [ - { - "datasource": "Victoria Metrics", - "editorMode": "code", - "expr": "kafka_consumer_lag_sum / kafka_consumer_lag_count", - "instant": false, - "legendFormat": "{{consumer_group}}", - "range": true, - "refId": "A" - } - ], - "title": "Consumer Lag by Group", - "type": "timeseries" - }, { "datasource": "Victoria Metrics", "description": "Message consumption rate by consumer group", @@ -772,7 +626,10 @@ "id": 11, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -879,7 +736,10 @@ "id": 13, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -965,7 +825,9 @@ "id": 14, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1072,7 +934,10 @@ "id": 16, "options": { "legend": { - "calcs": ["lastNotNull", 
"max"], + "calcs": [ + "lastNotNull", + "max" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1204,7 +1069,10 @@ "id": 17, "options": { "legend": { - "calcs": ["mean", "sum"], + "calcs": [ + "mean", + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1289,7 +1157,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["max"], + "calcs": [ + "max" + ], "fields": "", "values": false }, @@ -1386,7 +1256,10 @@ "id": 20, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "mean", + "max" + ], "displayMode": "table", "placement": "right", "showLegend": true @@ -1472,7 +1345,9 @@ "id": 21, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -1571,7 +1446,9 @@ "id": 23, "options": { "legend": { - "calcs": ["sum"], + "calcs": [ + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1703,7 +1580,10 @@ "id": 24, "options": { "legend": { - "calcs": ["mean", "sum"], + "calcs": [ + "mean", + "sum" + ], "displayMode": "list", "placement": "bottom", "showLegend": true @@ -1788,7 +1668,9 @@ "justifyMode": "auto", "orientation": "auto", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false }, @@ -1813,7 +1695,10 @@ ], "refresh": "5s", "schemaVersion": 38, - "tags": ["kafka", "events"], + "tags": [ + "kafka", + "events" + ], "templating": { "list": [] }, @@ -1827,4 +1712,4 @@ "uid": "kafka-events-monitoring", "version": 2, "weekStart": "" -} \ No newline at end of file +} diff --git a/backend/grafana/provisioning/dashboards/kubernetes-pods.json b/backend/grafana/provisioning/dashboards/kubernetes-pods.json new file mode 100644 index 00000000..47cf6dd4 --- /dev/null +++ b/backend/grafana/provisioning/dashboards/kubernetes-pods.json @@ -0,0 +1,350 @@ +{ + "annotations": { + "list": [] + }, + "description": "Kubernetes & Pods", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Pod Creation", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(pod_creations_total[5m])", + "legendFormat": "Created", + "refId": "A" + }, + { + "expr": "rate(pod_creation_failures_total[5m])", + "legendFormat": "Failed", + "refId": "B" + } + ], + "title": "Pod Creations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(pod_creation_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Pod Creation Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "pod_creations_active", + "refId": "A" + } + ], + "title": "Active Creations", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "increase(configmaps_created_total[24h])",
+          "refId": "A"
+        }
+      ],
+      "title": "ConfigMaps Created (24h)",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 11
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Pod Lifecycle",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 12
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(pod_phase_transitions_total[5m])",
+          "legendFormat": "{{phase}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Phase Transitions",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, rate(pod_lifetime_seconds_bucket[5m]))",
+          "legendFormat": "p50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, rate(pod_lifetime_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "B"
+        }
+      ],
+      "title": "Pod Lifetime",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "pods_by_phase",
+          "refId": "A"
+        }
+      ],
+      "title": "Pods by Phase",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 12,
+        "x": 12,
+        "y": 18
+      },
+      "id": 8,
+      "targets": [
+        {
+          "expr": "pods_monitored",
+          "refId": "A"
+        }
+      ],
+      "title": "Pods Monitored",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 22
+      },
+      "id": 102,
+      "panels": [],
+      "title": "Pod Monitor",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 23
+      },
+      "id": 9,
+      "targets": [
+        {
+          "expr": "rate(pod_monitor_events_total[5m])",
+          "legendFormat": "Events",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(pod_monitor_reconciliations_total[5m])",
+          "legendFormat": "Reconciliations",
+          "refId": "B"
+        }
+      ],
+      "title": "Monitor Events",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 23
+      },
+      "id": 10,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(pod_monitor_processing_duration_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Monitor Processing Duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 29
+      },
+      "id": 11,
+      "targets": [
+        {
+          "expr": "rate(pod_monitor_watch_errors_total[5m])",
+          "legendFormat": "Errors",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(pod_monitor_watch_reconnects_total[5m])",
+          "legendFormat": "Reconnects",
+          "refId": "B"
+        }
+      ],
+      "title": "Watch Errors",
+      "type": "timeseries"
+    }
+  ],
+  "refresh": "10s",
+  "schemaVersion": 33,
+  "style": "dark",
+  "tags": [
+    "kubernetes",
+    "pods"
+  ],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "Kubernetes & Pods",
+  "uid": "kubernetes-pods",
+  "version": 1
+}
diff --git a/backend/grafana/provisioning/dashboards/notifications.json b/backend/grafana/provisioning/dashboards/notifications.json
new file mode 100644
index 00000000..97096a25
--- /dev/null
+++ b/backend/grafana/provisioning/dashboards/notifications.json
@@ -0,0 +1,500 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "description": "Notifications",
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 1,
+  "id": null,
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 100,
+      "panels": [],
+      "title": "Overview",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 1
+      },
+      "id": 1,
+      "targets": [
+        {
+          "expr": "rate(notifications_sent_total[5m])",
+          "legendFormat": "Sent",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(notifications_failed_total[5m])",
+          "legendFormat": "Failed",
+          "refId": "B"
+        },
+        {
+          "expr": "rate(notifications_read_total[5m])",
+          "legendFormat": "Read",
+          "refId": "C"
+        }
+      ],
+      "title": "Notification Flow",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 1
+      },
+      "id": 2,
+      "targets": [
+        {
+          "expr": "notifications_pending",
+          "refId": "A"
+        }
+      ],
+      "title": "Pending",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 1
+      },
+      "id": 3,
+      "targets": [
+        {
+          "expr": "notifications_queued",
+          "refId": "A"
+        }
+      ],
+      "title": "Queued",
+      "type": "stat"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 1
+      },
+      "id": 4,
+      "targets": [
+        {
+          "expr": "notifications_unread_count",
+          "refId": "A"
+        }
+      ],
+      "title": "Unread",
+      "type": "stat"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 7
+      },
+      "id": 101,
+      "panels": [],
+      "title": "Channels",
+      "type": "row"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 8
+      },
+      "id": 5,
+      "targets": [
+        {
+          "expr": "rate(notifications_by_channel_total[5m])",
+          "legendFormat": "{{channel}}",
+          "refId": "A"
+        }
+      ],
+      "title": "By Channel",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 8
+      },
+      "id": 6,
+      "targets": [
+        {
+          "expr": "rate(notifications_by_severity_total[5m])",
+          "legendFormat": "{{severity}}",
+          "refId": "A"
+        }
+      ],
+      "title": "By Severity",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 0,
+        "y": 14
+      },
+      "id": 7,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, rate(notification_channel_delivery_time_seconds_bucket[5m]))",
+          "legendFormat": "p95",
+          "refId": "A"
+        }
+      ],
+      "title": "Channel Delivery Time",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Victoria Metrics",
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 6,
+        "w": 12,
+        "x": 12,
+        "y": 14
+      },
+      "id": 8,
+ "panels": [], + "title": "Subscriptions", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 15, + "targets": [ + { + "expr": "notification_subscriptions_active", + "legendFormat": "Active", + "refId": "A" + }, + { + "expr": "rate(notification_subscription_changes_total[5m])", + "legendFormat": "Changes", + "refId": "B" + } + ], + "title": "Subscriptions", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "notifications" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Notifications", + "uid": "notifications", + "version": 1 +} diff --git a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json new file mode 100644 index 00000000..191696bf --- /dev/null +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -0,0 +1,355 @@ +{ + "annotations": { + "list": [] + }, + "description": "Security & Authentication", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "Authentication", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 1, + "targets": [ + { + "expr": "rate(authentication_attempts_total[5m])", + "legendFormat": "Attempts", + "refId": "A" + }, + { + "expr": "rate(authentication_failures_total[5m])", + "legendFormat": "Failures", + "refId": "B" + } + ], + "title": "Authentication Attempts", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 2, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(authentication_duration_seconds_bucket[5m]))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Authentication Duration", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 7 + }, + "id": 3, + "targets": [ + { + "expr": "authentication_sessions_active", + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 7 + }, + "id": 4, + "targets": [ + { + "expr": "increase(accounts_locked_total[24h])", + "refId": "A" + } + ], + "title": "Accounts Locked (24h)", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 7 + }, + "id": 5, + "targets": [ + { + "expr": "increase(brute_force_attempts_total[1h])", + "refId": "A" + } + ], + "title": "Brute Force Attempts (1h)", + "type": "stat" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 7 + }, + "id": 6, + "targets": [ + { + "expr": "increase(weak_password_attempts_total[24h])", + "refId": "A" + } + ], + 
"title": "Weak Password Attempts", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 101, + "panels": [], + "title": "Tokens & CSRF", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "targets": [ + { + "expr": "rate(tokens_generated_total[5m])", + "legendFormat": "Generated", + "refId": "A" + }, + { + "expr": "rate(tokens_revoked_total[5m])", + "legendFormat": "Revoked", + "refId": "B" + }, + { + "expr": "rate(token_validation_failures_total[5m])", + "legendFormat": "Validation Failures", + "refId": "C" + } + ], + "title": "Token Operations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 8, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(token_expiry_time_seconds_bucket[5m]))", + "legendFormat": "p50", + "refId": "A" + } + ], + "title": "Token Expiry Time", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 9, + "targets": [ + { + "expr": "rate(csrf_tokens_generated_total[5m])", + "legendFormat": "Generated", + "refId": "A" + }, + { + "expr": "rate(csrf_validation_failures_total[5m])", + "legendFormat": "Failures", + "refId": "B" + } + ], + "title": "CSRF", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 102, + "panels": [], + "title": "Authorization", + "type": "row" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 10, + "targets": [ + { + "expr": "rate(authorization_checks_total[5m])", + "legendFormat": "Checks", + "refId": "A" + }, + { + "expr": "rate(authorization_denials_total[5m])", + "legendFormat": "Denials", + "refId": "B" + } + ], + "title": "Authorization", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 11, + "targets": [ + { + "expr": "rate(password_reset_requests_total[5m])", + "legendFormat": "Resets", + "refId": "A" + } + ], + "title": "Password Resets", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 33, + "style": "dark", + "tags": [ + "security", + "auth" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Security & Authentication", + "uid": "security-auth", + "version": 1 +} diff --git a/backend/tests/unit/conftest.py b/backend/tests/unit/conftest.py index a02357cd..c3ad50d8 100644 --- a/backend/tests/unit/conftest.py +++ b/backend/tests/unit/conftest.py @@ -11,7 +11,6 @@ DLQMetrics, EventMetrics, ExecutionMetrics, - HealthMetrics, KubernetesMetrics, NotificationMetrics, RateLimitMetrics, @@ -219,11 +218,6 @@ def execution_metrics(test_settings: Settings) -> ExecutionMetrics: return ExecutionMetrics(test_settings) -@pytest.fixture -def health_metrics(test_settings: Settings) -> HealthMetrics: - return HealthMetrics(test_settings) - - @pytest.fixture def kubernetes_metrics(test_settings: 
 def kubernetes_metrics(test_settings: Settings) -> KubernetesMetrics:
    return KubernetesMetrics(test_settings)
diff --git a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
index d49c5ddd..a2745838 100644
--- a/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
+++ b/backend/tests/unit/core/metrics/test_connections_and_coordinator_metrics.py
@@ -13,11 +13,6 @@ def test_connection_metrics_methods(test_settings: Settings) -> None:
    m.record_sse_message_sent("/events", "etype")
    m.record_sse_connection_duration(1.2, "/events")
    m.update_sse_draining_connections(1)
-    m.record_sse_shutdown_duration(0.5, "phase1")
-    m.update_sse_shutdown_duration(0.6, "phase2")
-    m.increment_event_bus_subscriptions()
-    m.decrement_event_bus_subscriptions(1)
-    m.update_event_bus_subscribers(3, "*")
 
 
 def test_coordinator_metrics_methods(test_settings: Settings) -> None:
diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
index 08a14b69..03f26f74 100644
--- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
+++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py
@@ -36,8 +36,6 @@ def test_dlq_metrics_methods(test_settings: Settings) -> None:
    m.update_dlq_queue_size("topic", 10)
    m.update_dlq_queue_size("topic", 7)
    m.record_dlq_message_age(5.0)
-    m.record_dlq_retry_attempt("topic", "etype", 2)
    m.record_dlq_processing_error("topic", "etype", "err")
-    m.record_dlq_throughput(12.0, "topic")
    m.increment_dlq_queue_size("topic")
    m.decrement_dlq_queue_size("topic")
diff --git a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
index a102a02f..a61b3f5c 100644
--- a/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
+++ b/backend/tests/unit/core/metrics/test_execution_and_events_metrics.py
@@ -21,10 +21,6 @@ def test_execution_metrics_methods(test_settings: Settings) -> None:
    m.record_execution_assigned()
    m.record_execution_queued()
    m.record_execution_scheduled()
-    m.update_cpu_available(100.0)
-    m.update_memory_available(512.0)
-    m.update_gpu_available(1)
-    m.update_allocations_active(2)
 
 
 def test_event_metrics_methods(test_settings: Settings) -> None:
@@ -32,15 +28,7 @@ def test_event_metrics_methods(test_settings: Settings) -> None:
    m = EventMetrics(test_settings)
    m.record_event_published("execution.requested", None)
    m.record_event_processing_duration(0.05, "execution.requested")
-    m.record_pod_event_published("pod.running")
    m.record_event_replay_operation("prepare", "success")
-    m.update_event_buffer_size(3)
-    m.record_event_buffer_dropped()
-    m.record_event_buffer_processed()
-    m.record_event_buffer_latency(0.2)
-    m.set_event_buffer_backpressure(True)
-    m.set_event_buffer_backpressure(False)
-    m.record_event_buffer_memory_usage(12.3)
    m.record_event_stored("execution.requested", "events")
    m.record_events_processing_failed("topic", "etype", "group", "error")
    m.record_event_store_duration(0.1, "insert", "events")
@@ -49,10 +37,8 @@ def test_event_metrics_methods(test_settings: Settings) -> None:
    m.record_processing_duration(0.3, "topic", "etype", "group")
    m.record_kafka_message_produced("t")
    m.record_kafka_message_consumed("t", "g")
-    m.record_kafka_consumer_lag(10, "t", "g", 0)
    m.record_kafka_production_error("t", "e")
    m.record_kafka_consumption_error("t", "g", "e")
    m.update_event_bus_queue_size(1, "default")
    m.set_event_bus_queue_size(5, "default")
    m.set_event_bus_queue_size(2, "default")
-
diff --git a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
index 54d06d27..b1cb7f84 100644
--- a/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
+++ b/backend/tests/unit/core/metrics/test_health_and_rate_limit_metrics.py
@@ -1,24 +1,7 @@
 import pytest
-from app.core.metrics import HealthMetrics
-from app.settings import Settings
 
 pytestmark = pytest.mark.unit
 
 
-def test_health_metrics_methods(test_settings: Settings) -> None:
-    """Test with no-op metrics."""
-    m = HealthMetrics(test_settings)
-    m.record_health_check_duration(0.1, "liveness", "basic")
-    m.record_health_check_failure("readiness", "db", "timeout")
-    m.update_health_check_status(1, "liveness", "basic")
-    m.record_health_status("svc", "healthy")
-    m.record_service_health_score("svc", 95.0)
-    m.update_liveness_status(True, "app")
-    m.update_readiness_status(False, "app")
-    m.record_dependency_health("mongo", True, 0.2)
-    m.record_health_check_timeout("readiness", "db")
-    m.update_component_health("kafka", True)
-
-
 def test_rate_limit_metrics_methods() -> None:
    """Test with no-op metrics."""
diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
index 3a12d8de..d6597c9c 100644
--- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
+++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py
@@ -18,8 +18,6 @@ def test_kubernetes_metrics_methods(test_settings: Settings) -> None:
    m.record_k8s_pod_created("success", "python")
    m.record_k8s_pod_creation_duration(0.3, "python")
    m.record_k8s_config_map_created("ok")
-    m.record_k8s_network_policy_created("ok")
-    m.update_k8s_active_creations(1)
    m.increment_pod_monitor_watch_reconnects()
    m.record_pod_monitor_event_processing_duration(0.2, "ADDED")
    m.record_pod_monitor_event_published("PodRunning", "Running")
@@ -29,9 +27,6 @@ def test_kubernetes_metrics_methods(test_settings: Settings) -> None:
    m.record_pod_phase_transition("Pending", "Running", "pod1")
    m.record_pod_lifetime(12.0, "Succeeded", "python")
    m.update_pods_by_phase("Running", 2)
-    m.record_pod_resource_request("cpu", 500.0, "python")
-    m.record_pod_resource_limit("memory", 256.0, "python")
-    m.record_pods_per_node("node1", 7)
 
 
 def test_notification_metrics_methods(test_settings: Settings) -> None:
@@ -41,17 +36,12 @@ def test_notification_metrics_methods(test_settings: Settings) -> None:
    m.record_notification_failed("welcome", "smtp_error", channel="email")
    m.record_notification_delivery_time(0.5, "welcome", channel="email")
    m.record_notification_status_change("n1", "pending", "queued")
-    m.record_notification_read("welcome", 2.0)
-    m.record_notification_clicked("welcome")
-    m.update_unread_count("u1", 5)
-    m.update_unread_count("u1", 2)
+    m.record_notification_read("welcome")
+    m.decrement_unread_count("u1")
    m.record_notification_throttled("welcome", "u1")
    m.record_throttle_window_hit("u1")
    m.record_notification_retry("welcome", 1, False)
    m.record_notification_retry("welcome", 2, True)
-    m.record_batch_processed(10, 1.2, notification_type="welcome")
-    m.record_template_render(0.2, "tmpl", success=True)
-    m.record_template_render(0.1, "tmpl", success=False)
    m.record_webhook_delivery(0.3, 200, "/hooks/*")
    m.record_slack_delivery(0.4, "#general", False, error_type="rate_limited")
    m.update_active_subscriptions("u1", 3)
diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py
index 12fba98f..3e198ef0 100644
--- a/backend/tests/unit/core/metrics/test_metrics_classes.py
+++ b/backend/tests/unit/core/metrics/test_metrics_classes.py
@@ -6,7 +6,6 @@
    DLQMetrics,
    EventMetrics,
    ExecutionMetrics,
-    HealthMetrics,
    KubernetesMetrics,
    NotificationMetrics,
    RateLimitMetrics,
@@ -27,8 +26,6 @@ def test_connection_metrics_smoke(test_settings: Settings) -> None:
    m.record_sse_message_sent("exec", "evt")
    m.record_sse_connection_duration(0.1, "exec")
    m.update_sse_draining_connections(1)
-    m.record_sse_shutdown_duration(0.01, "notify")
-    m.update_event_bus_subscribers(3, "*")
 
 
 def test_event_metrics_smoke(test_settings: Settings) -> None:
@@ -36,14 +33,7 @@ def test_event_metrics_smoke(test_settings: Settings) -> None:
    m = EventMetrics(test_settings)
    m.record_event_published("execution.requested")
    m.record_event_processing_duration(0.01, "execution.requested")
-    m.record_pod_event_published("pod.created")
    m.record_event_replay_operation("replay", "success")
-    m.update_event_buffer_size(1)
-    m.record_event_buffer_dropped()
-    m.record_event_buffer_processed()
-    m.record_event_buffer_latency(0.005)
-    m.set_event_buffer_backpressure(True)
-    m.record_event_buffer_memory_usage(1.2)
    m.record_event_stored("x", "events")
    m.record_events_processing_failed("t", "x", "g", "ValueError")
    m.record_event_store_duration(0.01, "store", "events")
@@ -52,7 +42,6 @@ def test_event_metrics_smoke(test_settings: Settings) -> None:
    m.record_processing_duration(0.03, "t", "x", "g")
    m.record_kafka_message_produced("t")
    m.record_kafka_message_consumed("t", "g")
-    m.record_kafka_consumer_lag(10, "t", "g", 0)
    m.record_kafka_production_error("t", "E")
    m.record_kafka_consumption_error("t", "g", "E")
    m.update_event_bus_queue_size(1)
@@ -65,9 +54,8 @@ def test_other_metrics_classes_smoke(test_settings: Settings) -> None:
    DatabaseMetrics(test_settings).record_mongodb_operation("read", "ok")
    DLQMetrics(test_settings).record_dlq_message_received("topic", "type")
    ExecutionMetrics(test_settings).record_script_execution(ExecutionStatus.QUEUED, "python")
-    HealthMetrics(test_settings).record_health_check_duration(0.001, "liveness", "basic")
    KubernetesMetrics(test_settings).record_k8s_pod_created("success", "python")
    NotificationMetrics(test_settings).record_notification_sent("welcome", channel="email")
    RateLimitMetrics(test_settings).record_request("/api/test", True, "sliding_window")
    ReplayMetrics(test_settings).record_session_created("by_id", "kafka")
-    SecurityMetrics(test_settings).record_security_event("scan", severity="low")
+    SecurityMetrics(test_settings).record_authentication_attempt("password", True)
diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
index c7966e94..7db12533 100644
--- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
+++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py
@@ -24,9 +24,6 @@ def test_replay_metrics_methods(test_settings: Settings) -> None:
    m.record_speed_multiplier(2.0, "by_id")
    m.record_delay_applied(0.05)
    m.record_batch_size(10, "by_id")
-    m.record_events_filtered("type", 5)
-    m.record_filter_effectiveness(5, 10, "type")
-    m.record_replay_memory_usage(123.0, "s1")
    m.update_replay_queue_size("s1", 10)
    m.update_replay_queue_size("s1", 4)
-
@@ -34,31 +31,16 @@ def test_replay_metrics_methods(test_settings: Settings) -> None:
 def test_security_metrics_methods(test_settings: Settings) -> None:
    """Test SecurityMetrics methods with no-op metrics."""
    m = SecurityMetrics(test_settings)
-    m.record_security_event("scan_started", severity="high", source="scanner")
-    m.record_security_violation("csrf", user_id="u1", ip_address="127.0.0.1")
    m.record_authentication_attempt("password", False, user_id="u1", duration_seconds=0.2)
-    m.update_active_sessions(2)
    m.increment_active_sessions()
    m.decrement_active_sessions()
    m.record_token_generated("access", 3600)
-    m.record_token_refreshed("access")
    m.record_token_revoked("access", "logout")
    m.record_token_validation_failure("access", "expired")
    m.record_authorization_check("/admin", "GET", False, user_role="user")
-    m.record_permission_check("write", True, user_id="u1")
    m.record_csrf_token_generated()
    m.record_csrf_validation_failure("missing")
-    m.record_network_policy_violation("np1", "pod1", violation_type="egress")
-    m.record_privilege_escalation_attempt("u1", "admin", True)
-    m.record_rate_limit_hit("/api")
-    m.record_rate_limit_violation("/api", limit=100)
-    m.record_api_key_created("kid")
-    m.record_api_key_revoked("kid", "compromised")
-    m.record_api_key_usage("kid", "/api")
-    m.record_audit_event("config_change", "u1", resource="system")
-    m.record_password_change("u1", True)
-    m.record_password_reset_request("u1", method="email")
+    m.record_password_reset_request("u1", method="admin")
    m.record_weak_password_attempt("u1", "common_password")
    m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked")
    m.record_account_locked("u1", "brute_force", duration_seconds=600)
-
diff --git a/backend/tests/unit/core/test_csrf.py b/backend/tests/unit/core/test_csrf.py
index bc20f7d2..3c0838c2 100644
--- a/backend/tests/unit/core/test_csrf.py
+++ b/backend/tests/unit/core/test_csrf.py
@@ -1,4 +1,5 @@
 import pytest
+from app.core.metrics import SecurityMetrics
 from app.core.security import SecurityService
 from app.domain.user import CSRFValidationError
 from app.settings import Settings
@@ -28,9 +29,9 @@ def make_request(
 class TestCSRFTokenGeneration:
    """Tests for CSRF token generation."""
 
-    def test_generates_signed_token_format(self, test_settings: Settings) -> None:
+    def test_generates_signed_token_format(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None:
        """CSRF token has nonce.signature format."""
-        security = SecurityService(test_settings)
+        security = SecurityService(test_settings, security_metrics)
 
        token = security.generate_csrf_token("session-abc")
 
@@ -40,17 +41,19 @@
        assert len(nonce) > 0
        assert len(signature) == 64  # sha256 hexdigest
 
-    def test_generates_unique_tokens(self, test_settings: Settings) -> None:
+    def test_generates_unique_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None:
        """Each CSRF token is unique (different nonce each time)."""
-        security = SecurityService(test_settings)
+        security = SecurityService(test_settings, security_metrics)
 
        tokens = {security.generate_csrf_token("same-session") for _ in range(100)}
 
        assert len(tokens) == 100
 
-    def test_different_sessions_produce_different_tokens(self, test_settings: Settings) -> None:
+    def test_different_sessions_produce_different_tokens(
+        self, test_settings: Settings, security_metrics: SecurityMetrics,
+    ) -> None:
        """Tokens for different sessions differ even with same nonce derivation."""
-
security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token_a = security.generate_csrf_token("session-a") token_b = security.generate_csrf_token("session-b") @@ -61,18 +64,18 @@ def test_different_sessions_produce_different_tokens(self, test_settings: Settin class TestCSRFTokenValidation: """Tests for CSRF token validation (double-submit check).""" - def test_validates_matching_tokens(self, test_settings: Settings) -> None: + def test_validates_matching_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Matching CSRF tokens pass validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-1") result = security.validate_csrf_token(token, token) assert result is True - def test_rejects_mismatched_tokens(self, test_settings: Settings) -> None: + def test_rejects_mismatched_tokens(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Mismatched CSRF tokens fail validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token1 = security.generate_csrf_token("session-1") token2 = security.generate_csrf_token("session-2") @@ -91,10 +94,11 @@ def test_rejects_mismatched_tokens(self, test_settings: Settings) -> None: ids=["empty_header", "empty_cookie", "both_empty"], ) def test_rejects_empty_tokens( - self, test_settings: Settings, header_token: str, cookie_token: str + self, test_settings: Settings, security_metrics: SecurityMetrics, + header_token: str, cookie_token: str, ) -> None: """Empty CSRF tokens fail validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) result = security.validate_csrf_token(header_token, cookie_token) @@ -104,29 +108,29 @@ def test_rejects_empty_tokens( class TestCSRFSignatureVerification: """Tests for CSRF HMAC signature verification.""" - def test_valid_signature_passes(self, test_settings: Settings) -> None: + def test_valid_signature_passes(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token verified against the same session_id succeeds.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("my-session") assert security._verify_csrf_signature(token, "my-session") is True - def test_wrong_session_rejected(self, test_settings: Settings) -> None: + def test_wrong_session_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token signed for session A fails verification against session B.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-a") assert security._verify_csrf_signature(token, "session-b") is False - def test_unsigned_token_rejected(self, test_settings: Settings) -> None: + def test_unsigned_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """A plain random string without signature structure is rejected.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert security._verify_csrf_signature("plain-random-token", "session") is False - def test_tampered_signature_rejected(self, test_settings: Settings) -> None: + def test_tampered_signature_rejected(self, test_settings: 
Settings, security_metrics: SecurityMetrics) -> None: """Modifying the signature portion causes rejection.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-x") nonce, sig = token.split(".", 1) @@ -139,23 +143,23 @@ class TestCSRFExemptPaths: """Tests for CSRF exempt path configuration.""" def test_exempt_paths_includes_login_and_register( - self, test_settings: Settings + self, test_settings: Settings, security_metrics: SecurityMetrics, ) -> None: """CSRF exempt paths include login and register.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert "/api/v1/auth/login" in security.CSRF_EXEMPT_PATHS assert "/api/v1/auth/register" in security.CSRF_EXEMPT_PATHS - def test_logout_is_not_exempt(self, test_settings: Settings) -> None: + def test_logout_is_not_exempt(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Logout is NOT exempt from CSRF validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert "/api/v1/auth/logout" not in security.CSRF_EXEMPT_PATHS - def test_exempt_paths_is_frozenset(self, test_settings: Settings) -> None: + def test_exempt_paths_is_frozenset(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """CSRF exempt paths is a frozenset (immutable).""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) assert isinstance(security.CSRF_EXEMPT_PATHS, frozenset) @@ -163,18 +167,18 @@ def test_exempt_paths_is_frozenset(self, test_settings: Settings) -> None: class TestCSRFRequestValidation: """Tests for CSRF validation from HTTP requests.""" - def test_skips_get_requests(self, test_settings: Settings) -> None: + def test_skips_get_requests(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """GET requests skip CSRF validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request("GET", "/api/v1/anything") assert security.validate_csrf_from_request(req) == "skip" def test_missing_header_raises_when_authenticated( - self, test_settings: Settings + self, test_settings: Settings, security_metrics: SecurityMetrics, ) -> None: """Missing CSRF header raises error for authenticated POST.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request( "POST", "/api/v1/items", @@ -184,9 +188,9 @@ def test_missing_header_raises_when_authenticated( with pytest.raises(CSRFValidationError): security.validate_csrf_from_request(req) - def test_valid_tokens_pass(self, test_settings: Settings) -> None: + def test_valid_tokens_pass(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Valid signed CSRF tokens pass full request validation.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) access_token = "my-access-token-value" token = security.generate_csrf_token(access_token) req = make_request( @@ -198,9 +202,9 @@ def test_valid_tokens_pass(self, test_settings: Settings) -> None: assert security.validate_csrf_from_request(req) == token - def test_forged_token_rejected(self, test_settings: Settings) -> None: + def test_forged_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: 
"""Unsigned token matching in header+cookie is rejected (signature check).""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) forged = "forged-random-value" req = make_request( "POST", @@ -212,9 +216,9 @@ def test_forged_token_rejected(self, test_settings: Settings) -> None: with pytest.raises(CSRFValidationError, match="signature invalid"): security.validate_csrf_from_request(req) - def test_wrong_session_token_rejected(self, test_settings: Settings) -> None: + def test_wrong_session_token_rejected(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Token signed for one session rejected when presented with different access_token.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) token = security.generate_csrf_token("session-A-jwt") req = make_request( "POST", @@ -226,9 +230,9 @@ def test_wrong_session_token_rejected(self, test_settings: Settings) -> None: with pytest.raises(CSRFValidationError, match="signature invalid"): security.validate_csrf_from_request(req) - def test_logout_requires_csrf(self, test_settings: Settings) -> None: + def test_logout_requires_csrf(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """POST to logout with authentication requires CSRF token.""" - security = SecurityService(test_settings) + security = SecurityService(test_settings, security_metrics) req = make_request( "POST", "/api/v1/auth/logout", diff --git a/backend/tests/unit/core/test_security.py b/backend/tests/unit/core/test_security.py index 27bd4386..7aedee5e 100644 --- a/backend/tests/unit/core/test_security.py +++ b/backend/tests/unit/core/test_security.py @@ -4,6 +4,7 @@ import jwt import pytest +from app.core.metrics import SecurityMetrics from app.core.security import SecurityService from app.domain.enums import UserRole from app.domain.user import InvalidCredentialsError @@ -15,9 +16,9 @@ class TestPasswordHashing: """Test password hashing functionality.""" @pytest.fixture - def security_svc(self, test_settings: Settings) -> SecurityService: + def security_svc(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: """Create SecurityService instance.""" - return SecurityService(test_settings) + return SecurityService(test_settings, security_metrics) def test_password_hash_creates_different_hash(self, security_svc: SecurityService) -> None: """Test that password hashing creates unique hashes.""" @@ -72,9 +73,9 @@ class TestSecurityService: """Test SecurityService functionality.""" @pytest.fixture - def security_service(self, test_settings: Settings) -> SecurityService: + def security_service(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: """Create SecurityService instance using test settings.""" - return SecurityService(test_settings) + return SecurityService(test_settings, security_metrics) def test_create_access_token_basic( self, @@ -283,9 +284,9 @@ def test_token_has_only_expected_claims(self, security_service: SecurityService) assert decoded["role"] == UserRole.USER assert "extra_field" in decoded # Claims are carried as provided - def test_password_context_configuration(self, test_settings: Settings) -> None: + def test_password_context_configuration(self, test_settings: Settings, security_metrics: SecurityMetrics) -> None: """Test password context is properly configured.""" - svc = SecurityService(test_settings) + svc = SecurityService(test_settings, 
security_metrics) password = "test_password" hashed = svc.get_password_hash(password) assert svc.verify_password(password, hashed) @@ -311,8 +312,8 @@ class TestDecodeToken: """Test SecurityService.decode_token method.""" @pytest.fixture - def security_service(self, test_settings: Settings) -> SecurityService: - return SecurityService(test_settings) + def security_service(self, test_settings: Settings, security_metrics: SecurityMetrics) -> SecurityService: + return SecurityService(test_settings, security_metrics) def test_valid_token_returns_username(self, security_service: SecurityService) -> None: token = security_service.create_access_token( From f238b69a3127a6ee7c1f33637a44aa22f64ed225 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:23:03 +0100 Subject: [PATCH 5/9] fix: wrong error message for invalid credentials --- backend/app/domain/user/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/domain/user/exceptions.py b/backend/app/domain/user/exceptions.py index dc1b9acb..8b87c7a3 100644 --- a/backend/app/domain/user/exceptions.py +++ b/backend/app/domain/user/exceptions.py @@ -11,7 +11,7 @@ def __init__(self, message: str = "Not authenticated") -> None: class InvalidCredentialsError(UnauthorizedError): """Raised when credentials are invalid.""" - def __init__(self, message: str = "Could not validate credentials") -> None: + def __init__(self, message: str = "Invalid credentials") -> None: super().__init__(message) From 245e5378733cd3d94f8741d742fc02c01258bfa5 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 20:55:37 +0100 Subject: [PATCH 6/9] fix: drop per-user metric labels, fix histogram quantile queries, mitigate login timing side channel --- backend/app/core/metrics/dlq.py | 7 +-- backend/app/core/metrics/notifications.py | 35 +++------- backend/app/core/metrics/security.py | 7 ++- backend/app/dlq/manager.py | 2 +- backend/app/services/auth_service.py | 29 +++++++---- backend/app/services/notification_service.py | 5 +- .../dashboards/coordinator-execution.json | 14 +++--- .../provisioning/dashboards/event-replay.json | 12 ++--- .../dashboards/event-stream-monitoring.json | 13 ----- .../dashboards/http-middleware.json | 10 ++-- .../dashboards/kubernetes-pods.json | 8 +-- .../dashboards/notifications.json | 50 ++++--------------- .../dashboards/security-auth.json | 4 +- .../tests/contract/test_grafana_metrics.py | 11 +++- ...st_kubernetes_and_notifications_metrics.py | 11 ++-- .../test_replay_and_security_metrics.py | 4 +- 16 files changed, 88 insertions(+), 134 deletions(-) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index 18383cdc..1a73d477 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -5,6 +5,8 @@ class DLQMetrics(BaseMetrics): """Metrics for Dead Letter Queue operations.""" def _create_instruments(self) -> None: + self._dlq_sizes: dict[str, int] = {} + self.dlq_messages_received = self._meter.create_counter( name="dlq.messages.received.total", description="Total number of messages received in DLQ", unit="1" ) @@ -50,12 +52,11 @@ def record_dlq_processing_duration(self, duration_seconds: float, operation: str self.dlq_processing_duration.record(duration_seconds, attributes={"operation": operation}) def update_dlq_queue_size(self, original_topic: str, size: int) -> None: - key = f"_dlq_size_{original_topic}" - current_val = getattr(self, key, 0) + current_val = self._dlq_sizes.get(original_topic, 0) delta = size - current_val if delta != 0: self.dlq_queue_size.add(delta, attributes={"original_topic": original_topic}) - setattr(self, key,
size) + self._dlq_sizes[original_topic] = size def record_dlq_message_age(self, age_seconds: float) -> None: self.dlq_message_age.record(age_seconds) diff --git a/backend/app/core/metrics/notifications.py b/backend/app/core/metrics/notifications.py index 13081829..b13f23bb 100644 --- a/backend/app/core/metrics/notifications.py +++ b/backend/app/core/metrics/notifications.py @@ -54,10 +54,6 @@ def _create_instruments(self) -> None: name="notifications.read.total", description="Total notifications read by users", unit="1" ) - self.unread_count = self._meter.create_up_down_counter( - name="notifications.unread.count", description="Current unread notifications per user", unit="1" - ) - # Throttling metrics self.notifications_throttled = self._meter.create_counter( name="notifications.throttled.total", description="Total notifications throttled", unit="1" @@ -101,12 +97,6 @@ def _create_instruments(self) -> None: ) # Subscription metrics - self.subscriptions_active = self._meter.create_up_down_counter( - name="notification.subscriptions.active", - description="Number of active notification subscriptions", - unit="1", - ) - self.subscription_changes = self._meter.create_counter( name="notification.subscription.changes.total", description="Total subscription changes", unit="1" ) @@ -146,14 +136,11 @@ def record_notification_status_change(self, notification_id: str, from_status: s def record_notification_read(self, notification_type: str) -> None: self.notifications_read.add(1, attributes={"category": notification_type}) - def decrement_unread_count(self, user_id: str) -> None: - self.unread_count.add(-1, attributes={"user_id": user_id}) - - def record_notification_throttled(self, notification_type: str, user_id: str) -> None: - self.notifications_throttled.add(1, attributes={"category": notification_type, "user_id": user_id}) + def record_notification_throttled(self, notification_type: str) -> None: + self.notifications_throttled.add(1, attributes={"category": notification_type}) - def record_throttle_window_hit(self, user_id: str) -> None: - self.throttle_window_hits.add(1, attributes={"user_id": user_id}) + def record_throttle_window_hit(self) -> None: + self.throttle_window_hits.add(1) def record_notification_retry(self, notification_type: str, attempt_number: int, success: bool) -> None: self.notification_retries.add( @@ -177,20 +164,12 @@ def record_slack_delivery( if not success and error_type: self.slack_api_errors.add(1, attributes={"error_type": error_type, "channel": channel}) - def update_active_subscriptions(self, user_id: str, count: int) -> None: - key = f"_subscriptions_{user_id}" - current_val = getattr(self, key, 0) - delta = count - current_val - if delta != 0: - self.subscriptions_active.add(delta, attributes={"user_id": user_id}) - setattr(self, key, count) - - def record_subscription_change(self, user_id: str, notification_type: str, action: str) -> None: + def record_subscription_change(self, channel: str, enabled: bool | None) -> None: + action = "enabled" if enabled is True else "disabled" if enabled is False else "updated" self.subscription_changes.add( 1, attributes={ - "user_id": user_id, - "category": notification_type, + "channel": channel, "action": action, }, ) diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index 589fb1c8..15086dfb 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -131,8 +131,8 @@ def record_csrf_validation_failure(self, reason: str) -> None: def 
record_password_reset_request(self, user_id: str, method: str = "admin") -> None: self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) - def record_weak_password_attempt(self, user_id: str, weakness_type: str) -> None: - self.weak_password_attempts.add(1, attributes={"user_id": user_id, "weakness_type": weakness_type}) + def record_weak_password_attempt(self, weakness_type: str) -> None: + self.weak_password_attempts.add(1, attributes={"weakness_type": weakness_type}) def record_brute_force_attempt( self, ip_address: str, target_user: str | None = None, action_taken: str = "logged" @@ -146,11 +146,10 @@ def record_brute_force_attempt( }, ) - def record_account_locked(self, user_id: str, reason: str, duration_seconds: float | None = None) -> None: + def record_account_locked(self, reason: str, duration_seconds: float | None = None) -> None: self.accounts_locked.add( 1, attributes={ - "user_id": user_id, "reason": reason, "duration": str(duration_seconds) if duration_seconds else "permanent", }, diff --git a/backend/app/dlq/manager.py b/backend/app/dlq/manager.py index 968219b5..1e23a70f 100644 --- a/backend/app/dlq/manager.py +++ b/backend/app/dlq/manager.py @@ -246,7 +246,7 @@ async def retry_messages_batch(self, event_ids: list[str]) -> DLQBatchRetryResul details.append(DLQRetryResult(event_id=event_id, status="failed", error="Retry failed")) except Exception as e: self.logger.error(f"Error retrying message {event_id}: {e}") - self.metrics.record_dlq_processing_error("batch_retry", event_id, type(e).__name__) + self.metrics.record_dlq_processing_error("batch_retry", "unknown", type(e).__name__) failed += 1 details.append(DLQRetryResult(event_id=event_id, status="failed", error=str(e))) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index 243ccd31..5132a736 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -31,6 +31,13 @@ class AuthService: + # Pre-computed bcrypt hash used as a timing side-channel mitigation. + # When a login attempt targets a non-existent username, we still run + # verify_password against this hash so the response time is comparable + # to the "user exists, wrong password" path. Without this, an attacker + # could measure response times to enumerate valid usernames. 
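+    # (Illustrative, not from the original change: such a value can be generated
+    # offline, e.g. with the bcrypt package via
+    # bcrypt.hashpw(b"any-password", bcrypt.gensalt(rounds=12)).
+    # The literal itself is arbitrary; only the $2b$12$ cost factor matters, so
+    # the dummy verification costs roughly as much as checking a real hash.)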
+ _dummy_hash = "$2b$12$hDE3I.Y1MHugA561T/NQgebE/IVQS.2YliUDGfqADq7v/MTUG6.Bi" + def __init__( self, user_repo: UserRepository, @@ -106,7 +113,7 @@ async def _fail_login( key=username, ) if locked: - self.security_metrics.record_account_locked(username, "brute_force") + self.security_metrics.record_account_locked("brute_force") raise AccountLockedError("Account locked due to too many failed attempts") raise InvalidCredentialsError() @@ -123,10 +130,11 @@ async def login( user = await self.user_repo.get_user(username) if not user: + self.security_service.verify_password(password, self._dummy_hash) await self._fail_login(username, "user_not_found", ip_address, user_agent) if not self.security_service.verify_password(password, user.hashed_password): - await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=str(user.user_id)) + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) await self._lockout.clear_attempts(username) @@ -149,11 +157,11 @@ async def login( await self._producer.produce( event_to_produce=UserLoggedInEvent( - user_id=str(user.user_id), + user_id=user.user_id, login_method=LoginMethod.PASSWORD, ip_address=ip_address, user_agent=user_agent, - metadata=self._build_metadata(user_id=str(user.user_id)), + metadata=self._build_metadata(user_id=user.user_id), ), key=user.username, ) @@ -177,7 +185,7 @@ async def register( effective = await self._runtime_settings.get_effective_settings() min_len = effective.password_min_length if len(password) < min_len: - self.security_metrics.record_weak_password_attempt(username, "too_short") + self.security_metrics.record_weak_password_attempt("too_short") raise ValidationError(f"Password must be at least {min_len} characters") existing = await self.user_repo.get_user(username) @@ -210,10 +218,10 @@ async def register( await self._producer.produce( event_to_produce=UserRegisteredEvent( - user_id=str(created_user.user_id), + user_id=created_user.user_id, username=created_user.username, email=created_user.email, - metadata=self._build_metadata(user_id=str(created_user.user_id)), + metadata=self._build_metadata(user_id=created_user.user_id), ), key=created_user.username, ) @@ -224,11 +232,14 @@ async def publish_logout_event(self, token: str | None) -> None: if not token: return username = self.security_service.decode_token(token) + user = await self.user_repo.get_user(username) + if not user: + return await self._producer.produce( event_to_produce=UserLoggedOutEvent( - user_id=username, + user_id=user.user_id, logout_reason="user_initiated", - metadata=self._build_metadata(user_id=username), + metadata=self._build_metadata(user_id=user.user_id), ), key=username, ) diff --git a/backend/app/services/notification_service.py b/backend/app/services/notification_service.py index aaaa47d8..bd54b508 100644 --- a/backend/app/services/notification_service.py +++ b/backend/app/services/notification_service.py @@ -168,7 +168,7 @@ async def create_notification( f"per {self.settings.NOTIF_THROTTLE_WINDOW_HOURS} hour(s)" ) self.logger.warning(error_msg) - self.metrics.record_notification_throttled("general", user_id) + self.metrics.record_notification_throttled("general") raise NotificationThrottledError( user_id, self.settings.NOTIF_THROTTLE_MAX_PER_HOUR, @@ -546,8 +546,7 @@ async def update_subscription( raise NotificationValidationError("slack_webhook is required when enabling SLACK") result = await self.repository.upsert_subscription(user_id, channel, update_data) - action = "enabled" 
if update_data.enabled else "updated" - self.metrics.record_subscription_change(user_id, channel, action) + self.metrics.record_subscription_change(channel, update_data.enabled) return result async def mark_all_as_read(self, user_id: str) -> int: diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 4026abe6..6481d9ad 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -37,7 +37,7 @@ "id": 1, "targets": [ { - "expr": "histogram_quantile(0.95, rate(coordinator_scheduling_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(coordinator_scheduling_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -61,7 +61,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(coordinator_queue_wait_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(coordinator_queue_wait_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -144,7 +144,7 @@ "id": 5, "targets": [ { - "expr": "histogram_quantile(0.95, rate(execution_queue_wait_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(execution_queue_wait_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -210,12 +210,12 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.50, rate(script_memory_usage_MiB_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(script_memory_usage_MiB_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(script_memory_usage_MiB_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(script_memory_usage_MiB_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } @@ -239,12 +239,12 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(script_memory_utilization_percent_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(script_memory_utilization_percent_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(script_memory_utilization_percent_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(script_memory_utilization_percent_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/event-replay.json b/backend/grafana/provisioning/dashboards/event-replay.json index 56a9eb47..edac9b2a 100644 --- a/backend/grafana/provisioning/dashboards/event-replay.json +++ b/backend/grafana/provisioning/dashboards/event-replay.json @@ -191,7 +191,7 @@ "id": 6, "targets": [ { - "expr": "histogram_quantile(0.95, rate(replay_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(replay_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -215,7 +215,7 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.95, rate(replay_event_processing_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(replay_event_processing_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -239,7 +239,7 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(replay_throughput_event_per_second_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_throughput_event_per_second_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } @@ -263,7 +263,7 @@ "id": 9, "targets": [ { - "expr": 
"histogram_quantile(0.50, rate(replay_batch_size_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_batch_size_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } @@ -329,12 +329,12 @@ "id": 11, "targets": [ { - "expr": "histogram_quantile(0.50, rate(replay_speed_multiplier_x_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_speed_multiplier_x_bucket[5m])) by (le))", "legendFormat": "Multiplier p50", "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(replay_delay_applied_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(replay_delay_applied_seconds_bucket[5m])) by (le))", "legendFormat": "Delay p50", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json index fde9d831..f085c6d8 100644 --- a/backend/grafana/provisioning/dashboards/event-stream-monitoring.json +++ b/backend/grafana/provisioning/dashboards/event-stream-monitoring.json @@ -600,19 +600,6 @@ "x": 0, "y": 16 }, - "id": 60, - "panels": [], - "title": "Event Buffer Performance", - "type": "row" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, "id": 65, "panels": [], "title": "Event Flow Analysis", diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 908d3a90..37240637 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -61,7 +61,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -108,12 +108,12 @@ "id": 4, "targets": [ { - "expr": "histogram_quantile(0.50, rate(http_request_size_bytes_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(http_request_size_bytes_bucket[5m])) by (le))", "legendFormat": "Request p50", "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(http_response_size_bytes_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(http_response_size_bytes_bucket[5m])) by (le))", "legendFormat": "Response p50", "refId": "B" } @@ -179,7 +179,7 @@ "id": 6, "targets": [ { - "expr": "histogram_quantile(0.95, rate(mongodb_event_query_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(mongodb_event_query_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -231,7 +231,7 @@ "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(idempotency_processing_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(idempotency_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "Idempotency p95", "refId": "B" } diff --git a/backend/grafana/provisioning/dashboards/kubernetes-pods.json b/backend/grafana/provisioning/dashboards/kubernetes-pods.json index 47cf6dd4..812200ac 100644 --- a/backend/grafana/provisioning/dashboards/kubernetes-pods.json +++ b/backend/grafana/provisioning/dashboards/kubernetes-pods.json @@ -66,7 +66,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(pod_creation_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_creation_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -173,12 +173,12 @@ "id": 6, "targets": [ { - "expr": 
"histogram_quantile(0.50, rate(pod_lifetime_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(pod_lifetime_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" }, { - "expr": "histogram_quantile(0.95, rate(pod_lifetime_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_lifetime_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "B" } @@ -290,7 +290,7 @@ "id": 10, "targets": [ { - "expr": "histogram_quantile(0.95, rate(pod_monitor_processing_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(pod_monitor_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } diff --git a/backend/grafana/provisioning/dashboards/notifications.json b/backend/grafana/provisioning/dashboards/notifications.json index 97096a25..a076c8e9 100644 --- a/backend/grafana/provisioning/dashboards/notifications.json +++ b/backend/grafana/provisioning/dashboards/notifications.json @@ -101,29 +101,6 @@ "title": "Queued", "type": "stat" }, - { - "datasource": "Victoria Metrics", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 1 - }, - "id": 4, - "targets": [ - { - "expr": "notifications_unread_count", - "refId": "A" - } - ], - "title": "Unread", - "type": "stat" - }, { "collapsed": false, "gridPos": { @@ -189,7 +166,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -201,7 +178,7 @@ "id": 7, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_channel_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_channel_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -250,7 +227,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -262,7 +239,7 @@ "id": 9, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -357,7 +334,7 @@ "refId": "A" }, { - "expr": "histogram_quantile(0.50, rate(notification_retry_success_rate_percent_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(notification_retry_success_rate_percent_bucket[5m])) by (le))", "legendFormat": "Success Rate p50", "refId": "B" } @@ -382,7 +359,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -394,7 +371,7 @@ "id": 13, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_webhook_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_webhook_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" }, @@ -411,7 +388,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -423,7 +400,7 @@ "id": 14, "targets": [ { - "expr": "histogram_quantile(0.95, rate(notification_slack_delivery_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(notification_slack_delivery_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" }, @@ -464,15 +441,10 @@ }, "id": 15, "targets": [ - { - "expr": "notification_subscriptions_active", - "legendFormat": "Active", - "refId": "A" - }, { "expr": 
"rate(notification_subscription_changes_total[5m])", - "legendFormat": "Changes", - "refId": "B" + "legendFormat": "Changes/s", + "refId": "A" } ], "title": "Subscriptions", diff --git a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json index 191696bf..25cf242f 100644 --- a/backend/grafana/provisioning/dashboards/security-auth.json +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -66,7 +66,7 @@ "id": 2, "targets": [ { - "expr": "histogram_quantile(0.95, rate(authentication_duration_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.95, sum(rate(authentication_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } @@ -229,7 +229,7 @@ "id": 8, "targets": [ { - "expr": "histogram_quantile(0.50, rate(token_expiry_time_seconds_bucket[5m]))", + "expr": "histogram_quantile(0.50, sum(rate(token_expiry_time_seconds_bucket[5m])) by (le))", "legendFormat": "p50", "refId": "A" } diff --git a/backend/tests/contract/test_grafana_metrics.py b/backend/tests/contract/test_grafana_metrics.py index 41dcec5b..7b63bada 100644 --- a/backend/tests/contract/test_grafana_metrics.py +++ b/backend/tests/contract/test_grafana_metrics.py @@ -49,19 +49,26 @@ @pytest.fixture(scope="module") -def prometheus_families() -> dict[str, set[str]]: +def prometheus_families(request: pytest.FixtureRequest) -> dict[str, set[str]]: """Instantiate every metric class through the real OTel -> Prometheus pipeline. Returns: Mapping of family name to set of sample names produced by that family. """ # pytest-env sets OTEL_SDK_DISABLED=true; override so the real SDK is active. - os.environ.pop("OTEL_SDK_DISABLED", None) + old_otel_disabled = os.environ.pop("OTEL_SDK_DISABLED", None) reader = PrometheusMetricReader() provider = MeterProvider(metric_readers=[reader]) otel_api.set_meter_provider(provider) + def _teardown() -> None: + provider.shutdown() + if old_otel_disabled is not None: + os.environ["OTEL_SDK_DISABLED"] = old_otel_disabled + + request.addfinalizer(_teardown) + for _, mod_name, _ in pkgutil.iter_modules(metrics_pkg.__path__): importlib.import_module(f"app.core.metrics.{mod_name}") diff --git a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py index d6597c9c..c1ae315a 100644 --- a/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py +++ b/backend/tests/unit/core/metrics/test_kubernetes_and_notifications_metrics.py @@ -37,16 +37,15 @@ def test_notification_metrics_methods(test_settings: Settings) -> None: m.record_notification_delivery_time(0.5, "welcome", channel="email") m.record_notification_status_change("n1", "pending", "queued") m.record_notification_read("welcome") - m.decrement_unread_count("u1") - m.record_notification_throttled("welcome", "u1") - m.record_throttle_window_hit("u1") + m.record_notification_throttled("welcome") + m.record_throttle_window_hit() m.record_notification_retry("welcome", 1, False) m.record_notification_retry("welcome", 2, True) m.record_webhook_delivery(0.3, 200, "/hooks/*") m.record_slack_delivery(0.4, "#general", False, error_type="rate_limited") - m.update_active_subscriptions("u1", 3) - m.update_active_subscriptions("u1", 1) - m.record_subscription_change("u1", "welcome", "subscribe") + m.record_subscription_change("email", True) + m.record_subscription_change("webhook", False) + m.record_subscription_change("slack", None) 
m.increment_pending_notifications() m.decrement_pending_notifications() m.increment_queued_notifications() diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py index 7db12533..82c52aaf 100644 --- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py +++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py @@ -41,6 +41,6 @@ def test_security_metrics_methods(test_settings: Settings) -> None: m.record_csrf_token_generated() m.record_csrf_validation_failure("missing") m.record_password_reset_request("u1", method="admin") - m.record_weak_password_attempt("u1", "common_password") + m.record_weak_password_attempt("common_password") m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked") - m.record_account_locked("u1", "brute_force", duration_seconds=600) + m.record_account_locked("brute_force", duration_seconds=600) From 2fab9b30f155e11970a1100f4b61a488a23c17be Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 21:23:08 +0100 Subject: [PATCH 7/9] fix: rework authentication metrics api and correct dashboard units --- backend/app/core/metrics/dlq.py | 5 -- backend/app/core/metrics/security.py | 34 +++++--- .../app/services/admin/admin_user_service.py | 2 +- backend/app/services/auth_service.py | 79 ++++++++++--------- .../dashboards/coordinator-execution.json | 10 +-- .../dashboards/http-middleware.json | 6 +- .../dashboards/security-auth.json | 4 +- .../metrics/test_database_and_dlq_metrics.py | 2 - .../unit/core/metrics/test_metrics_classes.py | 2 +- .../test_replay_and_security_metrics.py | 6 +- 10 files changed, 77 insertions(+), 73 deletions(-) diff --git a/backend/app/core/metrics/dlq.py b/backend/app/core/metrics/dlq.py index 1a73d477..d4262223 100644 --- a/backend/app/core/metrics/dlq.py +++ b/backend/app/core/metrics/dlq.py @@ -66,8 +66,3 @@ def record_dlq_processing_error(self, original_topic: str, event_type: str, erro 1, attributes={"original_topic": original_topic, "event_type": event_type, "error_type": error_type} ) - def increment_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add(1, attributes={"original_topic": original_topic}) - - def decrement_dlq_queue_size(self, original_topic: str) -> None: - self.dlq_queue_size.add(-1, attributes={"original_topic": original_topic}) diff --git a/backend/app/core/metrics/security.py b/backend/app/core/metrics/security.py index 15086dfb..82f4ca7c 100644 --- a/backend/app/core/metrics/security.py +++ b/backend/app/core/metrics/security.py @@ -1,3 +1,7 @@ +import time +from collections.abc import Generator +from contextlib import contextmanager + from app.core.metrics.base import BaseMetrics @@ -75,18 +79,22 @@ def _create_instruments(self) -> None: name="accounts.locked.total", description="Total number of accounts locked due to security", unit="1" ) - def record_authentication_attempt( - self, method: str, success: bool, user_id: str | None = None, duration_seconds: float | None = None - ) -> None: - self.authentication_attempts.add( - 1, attributes={"method": method, "success": str(success), "user_id": user_id or "unknown"} - ) - + def record_authentication_attempt(self, method: str, success: bool, duration_seconds: float) -> None: + attrs = {"method": method, "success": str(success)} + self.authentication_attempts.add(1, attributes=attrs) if not success: - self.authentication_failures.add(1, attributes={"method": method, "user_id": user_id or "unknown"}) - - if duration_seconds is not None: -
self.authentication_duration.record(duration_seconds, attributes={"method": method}) + self.authentication_failures.add(1, attributes={"method": method}) + self.authentication_duration.record(duration_seconds, attributes={"method": method}) + + @contextmanager + def track_authentication(self, method: str) -> Generator[None, None, None]: + start = time.monotonic() + success = False + try: + yield + success = True + finally: + self.record_authentication_attempt(method, success, time.monotonic() - start) def increment_active_sessions(self) -> None: self.active_sessions.add(1) @@ -128,8 +136,8 @@ def record_csrf_token_generated(self) -> None: def record_csrf_validation_failure(self, reason: str) -> None: self.csrf_validation_failures.add(1, attributes={"reason": reason}) - def record_password_reset_request(self, user_id: str, method: str = "admin") -> None: - self.password_reset_requests.add(1, attributes={"user_id": user_id, "method": method}) + def record_password_reset_request(self, method: str = "admin") -> None: + self.password_reset_requests.add(1, attributes={"method": method}) def record_weak_password_attempt(self, weakness_type: str) -> None: self.weak_password_attempts.add(1, attributes={"weakness_type": weakness_type}) diff --git a/backend/app/services/admin/admin_user_service.py b/backend/app/services/admin/admin_user_service.py index b9ed09af..bdd61a76 100644 --- a/backend/app/services/admin/admin_user_service.py +++ b/backend/app/services/admin/admin_user_service.py @@ -205,7 +205,7 @@ async def reset_user_password(self, *, admin_user_id: str, user_id: str, new_pas self.logger.info( "Admin resetting user password", admin_user_id=admin_user_id, target_user_id=user_id ) - self._security_metrics.record_password_reset_request(user_id, method="admin") + self._security_metrics.record_password_reset_request(method="admin") hashed = self._security.get_password_hash(new_password) pr = PasswordReset(user_id=user_id, new_password=hashed) ok = await self._users.reset_user_password(pr) diff --git a/backend/app/services/auth_service.py b/backend/app/services/auth_service.py index 5132a736..713c72cd 100644 --- a/backend/app/services/auth_service.py +++ b/backend/app/services/auth_service.py @@ -127,52 +127,53 @@ async def login( if await self._lockout.check_locked(username): raise AccountLockedError("Account temporarily locked due to too many failed attempts") - user = await self.user_repo.get_user(username) - - if not user: - self.security_service.verify_password(password, self._dummy_hash) - await self._fail_login(username, "user_not_found", ip_address, user_agent) + with self.security_metrics.track_authentication("login"): + user = await self.user_repo.get_user(username) - if not self.security_service.verify_password(password, user.hashed_password): - await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) + if not user: + self.security_service.verify_password(password, self._dummy_hash) + await self._fail_login(username, "user_not_found", ip_address, user_agent) - await self._lockout.clear_attempts(username) + if not self.security_service.verify_password(password, user.hashed_password): + await self._fail_login(username, "invalid_password", ip_address, user_agent, user_id=user.user_id) - effective = await self._runtime_settings.get_effective_settings() - session_timeout = effective.session_timeout_minutes - - self.logger.info( - "Login successful", - username=user.username, - client_ip=ip_address, - user_agent=user_agent, - 
token_expires_in_minutes=session_timeout, - ) + await self._lockout.clear_attempts(username) - access_token_expires = timedelta(minutes=session_timeout) - access_token = self.security_service.create_access_token( - data={"sub": user.username}, expires_delta=access_token_expires, - ) - csrf_token = self.security_service.generate_csrf_token(access_token) + effective = await self._runtime_settings.get_effective_settings() + session_timeout = effective.session_timeout_minutes - await self._producer.produce( - event_to_produce=UserLoggedInEvent( - user_id=user.user_id, - login_method=LoginMethod.PASSWORD, - ip_address=ip_address, + self.logger.info( + "Login successful", + username=user.username, + client_ip=ip_address, user_agent=user_agent, - metadata=self._build_metadata(user_id=user.user_id), - ), - key=user.username, - ) + token_expires_in_minutes=session_timeout, + ) - return LoginResult( - username=user.username, - role=user.role, - access_token=access_token, - csrf_token=csrf_token, - session_timeout_minutes=session_timeout, - ) + access_token_expires = timedelta(minutes=session_timeout) + access_token = self.security_service.create_access_token( + data={"sub": user.username}, expires_delta=access_token_expires, + ) + csrf_token = self.security_service.generate_csrf_token(access_token) + + await self._producer.produce( + event_to_produce=UserLoggedInEvent( + user_id=user.user_id, + login_method=LoginMethod.PASSWORD, + ip_address=ip_address, + user_agent=user_agent, + metadata=self._build_metadata(user_id=user.user_id), + ), + key=user.username, + ) + + return LoginResult( + username=user.username, + role=user.role, + access_token=access_token, + csrf_token=csrf_token, + session_timeout_minutes=session_timeout, + ) async def register( self, diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 6481d9ad..0e120ccd 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -25,7 +25,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -49,7 +49,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -132,7 +132,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -198,7 +198,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "decmbytes" } }, "gridPos": { @@ -227,7 +227,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "percent" } }, "gridPos": { diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 37240637..506e3d21 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -49,7 +49,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -96,7 +96,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "bytes" } }, "gridPos": { @@ -167,7 +167,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { diff --git 
a/backend/grafana/provisioning/dashboards/security-auth.json b/backend/grafana/provisioning/dashboards/security-auth.json index 25cf242f..4c4ca87f 100644 --- a/backend/grafana/provisioning/dashboards/security-auth.json +++ b/backend/grafana/provisioning/dashboards/security-auth.json @@ -54,7 +54,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { @@ -217,7 +217,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "s" } }, "gridPos": { diff --git a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py index 03f26f74..7484a573 100644 --- a/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py +++ b/backend/tests/unit/core/metrics/test_database_and_dlq_metrics.py @@ -37,5 +37,3 @@ def test_dlq_metrics_methods(test_settings: Settings) -> None: m.update_dlq_queue_size("topic", 7) m.record_dlq_message_age(5.0) m.record_dlq_processing_error("topic", "etype", "err") - m.increment_dlq_queue_size("topic") - m.decrement_dlq_queue_size("topic") diff --git a/backend/tests/unit/core/metrics/test_metrics_classes.py b/backend/tests/unit/core/metrics/test_metrics_classes.py index 3e198ef0..fd08c5ca 100644 --- a/backend/tests/unit/core/metrics/test_metrics_classes.py +++ b/backend/tests/unit/core/metrics/test_metrics_classes.py @@ -58,4 +58,4 @@ def test_other_metrics_classes_smoke(test_settings: Settings) -> None: NotificationMetrics(test_settings).record_notification_sent("welcome", channel="email") RateLimitMetrics(test_settings).record_request("/api/test", True, "sliding_window") ReplayMetrics(test_settings).record_session_created("by_id", "kafka") - SecurityMetrics(test_settings).record_authentication_attempt("password", True) + SecurityMetrics(test_settings).record_authentication_attempt("password", True, 0.1) diff --git a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py index 82c52aaf..cddbfe13 100644 --- a/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py +++ b/backend/tests/unit/core/metrics/test_replay_and_security_metrics.py @@ -31,7 +31,9 @@ def test_replay_metrics_methods(test_settings: Settings) -> None: def test_security_metrics_methods(test_settings: Settings) -> None: """Test SecurityMetrics methods with no-op metrics.""" m = SecurityMetrics(test_settings) - m.record_authentication_attempt("password", False, user_id="u1", duration_seconds=0.2) + m.record_authentication_attempt("password", False, 0.2) + with m.track_authentication("password"): + pass m.increment_active_sessions() m.decrement_active_sessions() m.record_token_generated("access", 3600) @@ -40,7 +42,7 @@ def test_security_metrics_methods(test_settings: Settings) -> None: m.record_authorization_check("/admin", "GET", False, user_role="user") m.record_csrf_token_generated() m.record_csrf_validation_failure("missing") - m.record_password_reset_request("u1", method="admin") + m.record_password_reset_request(method="admin") m.record_weak_password_attempt("common_password") m.record_brute_force_attempt("1.2.3.4", target_user="u1", action_taken="blocked") m.record_account_locked("brute_force", duration_seconds=600) From 3f37c306e3817dab0f128a0d2224323d69c53ea3 Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 23:28:17 +0100 Subject: [PATCH 8/9] fix: correct memory and percent units in grafana dashboards ---
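Unit mismatches like these only show up when someone reads the rendered panel. A contract-style guard in the spirit of backend/tests/contract/test_grafana_metrics.py could catch them in CI instead. The sketch below is illustrative only: it assumes it runs from the repo root, scans only top-level panels (nested row panels are ignored for brevity), and treats any panel whose query touches a *_seconds metric as one that must declare unit "s".

    import json
    import re
    from pathlib import Path

    import pytest

    DASHBOARDS = Path("backend/grafana/provisioning/dashboards")  # assumes repo-root cwd

    def _seconds_panels() -> list[tuple[str, str, str | None]]:
        """Collect (dashboard, panel title, declared unit) for panels querying *_seconds metrics."""
        rows = []
        for path in sorted(DASHBOARDS.glob("*.json")):
            for panel in json.loads(path.read_text()).get("panels", []):
                exprs = " ".join(t.get("expr", "") for t in panel.get("targets", []))
                if re.search(r"_seconds(_bucket|_sum|_count)?\b", exprs):
                    unit = panel.get("fieldConfig", {}).get("defaults", {}).get("unit")
                    rows.append((path.name, panel.get("title", "?"), unit))
        return rows

    @pytest.mark.grafana_contract
    def test_seconds_panels_declare_seconds_unit() -> None:
        offenders = [r for r in _seconds_panels() if r[2] != "s"]
        assert not offenders, f"panels querying *_seconds metrics without unit 's': {offenders}"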
.../grafana/provisioning/dashboards/coordinator-execution.json | 2 +- backend/grafana/provisioning/dashboards/http-middleware.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/grafana/provisioning/dashboards/coordinator-execution.json b/backend/grafana/provisioning/dashboards/coordinator-execution.json index 0e120ccd..282c3c8a 100644 --- a/backend/grafana/provisioning/dashboards/coordinator-execution.json +++ b/backend/grafana/provisioning/dashboards/coordinator-execution.json @@ -198,7 +198,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "decmbytes" + "unit": "mbytes" } }, "gridPos": { diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index 506e3d21..f3c7dc1e 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -321,7 +321,7 @@ "datasource": "Victoria Metrics", "fieldConfig": { "defaults": { - "unit": "short" + "unit": "percent" } }, "gridPos": { From 24c40ca2420cda3f1e77f68658c9ba8f2c8af37d Mon Sep 17 00:00:00 2001 From: HardMax71 Date: Mon, 16 Feb 2026 23:33:27 +0100 Subject: [PATCH 9/9] fix: split event store & idempotency plot into separate panels --- .../dashboards/http-middleware.json | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/backend/grafana/provisioning/dashboards/http-middleware.json b/backend/grafana/provisioning/dashboards/http-middleware.json index f3c7dc1e..8f03fbf6 100644 --- a/backend/grafana/provisioning/dashboards/http-middleware.json +++ b/backend/grafana/provisioning/dashboards/http-middleware.json @@ -219,7 +219,7 @@ }, "gridPos": { "h": 6, - "w": 18, + "w": 9, "x": 6, "y": 20 }, @@ -229,14 +229,33 @@ "expr": "rate(event_store_operations_total[5m])", "legendFormat": "Store Ops", "refId": "A" - }, + } + ], + "title": "Event Store Operations", + "type": "timeseries" + }, + { + "datasource": "Victoria Metrics", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 20 + }, + "id": 13, + "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(idempotency_processing_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95", "refId": "A" } ], - "title": "Event Store & Idempotency", + "title": "Idempotency Processing Duration", "type": "timeseries" }, {