diff --git a/charts/countly-observability/vm-observability/configs/Caddyfile b/charts/countly-observability/vm-observability/configs/Caddyfile
new file mode 100644
index 0000000..9a004d8
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/Caddyfile
@@ -0,0 +1,32 @@
+obs-newarch.count.ly {
+
+    # NOTE(review): the ingest routes below are proxied without any authentication
+    # in this file — confirm access is restricted elsewhere (firewall / allow-list).
+    # Prometheus remote write (from cluster Alloy agents)
+    handle /api/v1/write {
+        reverse_proxy prometheus:9090
+    }
+
+    # Loki push + query
+    handle /loki/* {
+        reverse_proxy loki:3100
+    }
+
+    # Pyroscope ingest
+    handle /ingest* {
+        reverse_proxy pyroscope:4040
+    }
+    handle /push.v1.* {
+        reverse_proxy pyroscope:4040
+    }
+    handle /querier.v1.* {
+        reverse_proxy pyroscope:4040
+    }
+
+    # Grafana (catch-all)
+    handle {
+        reverse_proxy grafana:3000
+    }
+
+    encode gzip
+}
diff --git a/charts/countly-observability/vm-observability/configs/grafana/provisioning/dashboards/provider.yaml b/charts/countly-observability/vm-observability/configs/grafana/provisioning/dashboards/provider.yaml
new file mode 100644
index 0000000..b52fdcc
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/grafana/provisioning/dashboards/provider.yaml
@@ -0,0 +1,11 @@
+apiVersion: 1
+providers:
+  - name: default
+    orgId: 1
+    folder: Countly
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 60
+    allowUiUpdates: true
+    options:
+      path: /var/lib/grafana/dashboards
diff --git a/charts/countly-observability/vm-observability/configs/grafana/provisioning/datasources/datasources.yaml b/charts/countly-observability/vm-observability/configs/grafana/provisioning/datasources/datasources.yaml
new file mode 100644
index 0000000..34066fa
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/grafana/provisioning/datasources/datasources.yaml
@@ -0,0 +1,37 @@
+apiVersion: 1
+datasources:
+
+  - name: Prometheus
+    uid: prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+
+  - name: Loki
+    uid: loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    jsonData:
+      derivedFields:
+        - name: traceID
+          matcherRegex: 'traceId=([A-Fa-f0-9]{32})'
+          # "$$" escapes "$": Grafana expands $VAR / ${VAR} in provisioning
+          # files, so a single "$" would be consumed before the link is built.
+          url: '$${__value.raw}'
+          datasourceUid: tempo
+
+  - name: Tempo
+    uid: tempo
+    type: tempo
+    access: proxy
+    url: http://tempo:3200
+    jsonData:
+      httpMethod: GET
+
+  - name: Pyroscope
+    uid: pyroscope
+    type: grafana-pyroscope-datasource
+    access: proxy
+    url: http://pyroscope:4040
diff --git a/charts/countly-observability/vm-observability/configs/loki.yaml b/charts/countly-observability/vm-observability/configs/loki.yaml
new file mode 100644
index 0000000..6f65ec5
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/loki.yaml
@@ -0,0 +1,54 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  path_prefix: /data/loki
+  storage:
+    filesystem:
+      chunks_directory: /data/loki/chunks
+      rules_directory: /data/loki/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+ruler:
+  alertmanager_url: http://localhost:9093
+
+limits_config:
+  retention_period: 30d
+  max_streams_per_user: 30000
+  max_line_size: 256000
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
+  ingestion_rate_mb: 64
+  ingestion_burst_size_mb: 128
+
+compactor:
+  working_directory: /data/loki/compactor
+  compaction_interval: 10m
+  retention_enabled: true
+  retention_delete_delay: 2h
+  retention_delete_worker_count: 150
+  delete_request_store: filesystem
diff --git a/charts/countly-observability/vm-observability/configs/prometheus.yml b/charts/countly-observability/vm-observability/configs/prometheus.yml
new file mode 100644
index 0000000..f6e2594
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/prometheus.yml
@@ -0,0 +1,6 @@
+global:
+  scrape_interval: 15s
+  external_labels:
+    cluster: 'obs-central'
+
+# No scrape_configs — all metrics arrive via remote_write from cluster Alloy agents
diff --git a/charts/countly-observability/vm-observability/configs/pyroscope.yaml b/charts/countly-observability/vm-observability/configs/pyroscope.yaml
new file mode 100644
index 0000000..f756d3b
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/pyroscope.yaml
@@ -0,0 +1,5 @@
+pyroscopedb:
+  data_path: /data/pyroscope
+
+server:
+  http_listen_port: 4040
diff --git a/charts/countly-observability/vm-observability/configs/tempo.yaml b/charts/countly-observability/vm-observability/configs/tempo.yaml
new file mode 100644
index 0000000..17cfb4a
--- /dev/null
+++ b/charts/countly-observability/vm-observability/configs/tempo.yaml
@@ -0,0 +1,57 @@
+server:
+  http_listen_port: 3200
+  grpc_listen_port: 9095
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+overrides:
+  ingestion_rate_limit_bytes: 100000000
+  ingestion_burst_size_bytes: 150000000
+  max_traces_per_user: 50000
+  max_bytes_per_trace: 5000000
+
+ingester:
+  max_block_duration: 5m
+  trace_idle_period: 10s
+  max_block_bytes: 1048576
+
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+      cluster: obs-central
+  storage:
+    path: /data/tempo/generator/wal
+    remote_write:
+      - url: http://prometheus:9090/api/v1/write
+        send_exemplars: true
+
+compactor:
+  compaction:
+    block_retention: 12h
+    compacted_block_retention: 1h
+
+storage:
+  trace:
+    backend: local
+    local:
+      path: /data/tempo/traces
+    wal:
+      path: /data/tempo/wal
+    pool:
+      max_workers: 50
+      queue_depth: 5000
+
+usage_report:
+  reporting_enabled: false
+
+querier:
+  frontend_worker:
+    frontend_address: 127.0.0.1:9095
diff --git a/charts/countly-observability/vm-observability/docker-compose.yml b/charts/countly-observability/vm-observability/docker-compose.yml
new file mode 100644
index 0000000..4b2eb8b
--- /dev/null
+++ b/charts/countly-observability/vm-observability/docker-compose.yml
@@ -0,0 +1,108 @@
+services:
+
+  caddy:
+    image: caddy:2.9
+    restart: unless-stopped
+    ports:
+      - "80:80"
+      - "443:443"
+      - "443:443/udp"
+    volumes:
+      - ./configs/Caddyfile:/etc/caddy/Caddyfile:ro
+      - caddy_data:/data
+      - caddy_config:/config
+    depends_on:
+      - grafana
+      - prometheus
+      - loki
+      - pyroscope
+
+  prometheus:
+    image: prom/prometheus:v3.8.1
+    restart: unless-stopped
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention.time=30d'
+      - '--storage.tsdb.retention.size=50GB'
+      - '--web.enable-remote-write-receiver'
+      - '--web.enable-lifecycle'
+    volumes:
+      - ./configs/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - /data/prometheus:/prometheus
+    expose:
+      - "9090"
+
+  loki:
+    image: grafana/loki:3.5.3
+    restart: unless-stopped
+    command: -config.file=/etc/loki/loki.yaml
+    volumes:
+      - ./configs/loki.yaml:/etc/loki/loki.yaml:ro
+      - /data/loki:/data/loki
+    expose:
+      - "3100"
+
+  tempo:
+    image: grafana/tempo:2.8.1
+    restart: unless-stopped
+    command: -config.file=/etc/tempo/tempo.yaml
+    volumes:
+      - ./configs/tempo.yaml:/etc/tempo/tempo.yaml:ro
+      - /data/tempo:/data/tempo
+    ports:
+      - "4317:4317" # OTLP gRPC (plain, no TLS — matches cluster alloy config)
+      - "4318:4318" # OTLP HTTP
+    expose:
+      - "3200"
+
+  pyroscope:
+    image: grafana/pyroscope:1.16.0
+    restart: unless-stopped
+    command:
+      - -config.file=/etc/pyroscope/pyroscope.yaml
+    volumes:
+      - ./configs/pyroscope.yaml:/etc/pyroscope/pyroscope.yaml:ro
+      - /data/pyroscope:/data/pyroscope
+    expose:
+      - "4040"
+
+  grafana:
+    image: grafana/grafana:12.1.0
+    restart: unless-stopped
+    environment:
+      GF_SERVER_DOMAIN: obs-newarch.count.ly
+      GF_SERVER_ROOT_URL: https://obs-newarch.count.ly
+      GF_AUTH_GOOGLE_ENABLED: "true"
+      GF_AUTH_GOOGLE_CLIENT_ID: "${GF_AUTH_GOOGLE_CLIENT_ID}"
+      GF_AUTH_GOOGLE_CLIENT_SECRET: "${GF_AUTH_GOOGLE_CLIENT_SECRET}"
+      GF_AUTH_GOOGLE_SCOPES: "https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email"
+      GF_AUTH_GOOGLE_AUTH_URL: "https://accounts.google.com/o/oauth2/v2/auth"
+      GF_AUTH_GOOGLE_TOKEN_URL: "https://oauth2.googleapis.com/token"
+      GF_AUTH_GOOGLE_API_URL: "https://openidconnect.googleapis.com/v1/userinfo"
+      GF_AUTH_GOOGLE_USE_PKCE: "true"
+      # Optional space-separated list of Google account domains allowed to log
+      # in; left empty it keeps the previous behaviour (any Google account).
+      GF_AUTH_GOOGLE_ALLOWED_DOMAINS: "${GF_AUTH_GOOGLE_ALLOWED_DOMAINS:-}"
+      GF_INSTALL_PLUGINS: "grafana-pyroscope-datasource"
+      GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS: "grafana-pyroscope-app,grafana-lokiexplore-app,grafana-exploretraces-app"
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_USERS_ALLOW_ORG_CREATE: "false"
+      GF_USERS_AUTO_ASSIGN_ORG_ROLE: "Editor"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+      GF_SECURITY_ALLOW_EMBEDDING: "true"
+      GF_FEATURE_TOGGLES_ENABLE: "tempoSearch,tempoBackendSearch,traceqlEditor,exploreTraces"
+    volumes:
+      - /data/grafana:/var/lib/grafana
+      - ./configs/grafana/provisioning:/etc/grafana/provisioning:ro
+    expose:
+      - "3000"
+    depends_on:
+      - prometheus
+      - loki
+      - tempo
+      - pyroscope
+
+volumes:
+  caddy_data:
+  caddy_config:
diff --git a/charts/countly-observability/vm-observability/setup.sh b/charts/countly-observability/vm-observability/setup.sh
new file mode 100644
index 0000000..da804d3
--- /dev/null
+++ b/charts/countly-observability/vm-observability/setup.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Provisions the observability VM: data disk, data directories, Docker, app
+# files, then starts the compose stack. Intended to be re-runnable.
+#
+# Environment:
+#   DATA_DEV  block device to mount at /data (default: /dev/sdb)
+set -euo pipefail
+
+DATA_DEV="${DATA_DEV:-/dev/sdb}"
+
+echo "=== [1/5] Formatting and mounting data disk ==="
+if ! mountpoint -q /data; then
+  # Format only when the device carries no filesystem signature. Formatting
+  # unconditionally would wipe existing data whenever /data is merely unmounted.
+  if ! sudo blkid "$DATA_DEV" >/dev/null 2>&1; then
+    sudo mkfs.ext4 -F "$DATA_DEV"
+  fi
+  sudo mkdir -p /data
+  sudo mount "$DATA_DEV" /data
+  # Add the fstab entry once, keyed by UUID because /dev/sdX names can change
+  # between boots.
+  if ! grep -qE '^[^#]*[[:space:]]/data[[:space:]]' /etc/fstab; then
+    fs_uuid="$(sudo blkid -s UUID -o value "$DATA_DEV")"
+    fs_type="$(sudo blkid -s TYPE -o value "$DATA_DEV")"
+    echo "UUID=${fs_uuid} /data ${fs_type} defaults 0 2" | sudo tee -a /etc/fstab
+  fi
+  echo "Disk mounted at /data"
+else
+  echo "Already mounted"
+fi
+
+echo "=== [2/5] Creating data directories ==="
+sudo mkdir -p \
+  /data/prometheus \
+  /data/loki/{chunks,rules,tsdb-shipper-cache,wal,compactor} \
+  /data/grafana/dashboards \
+  /data/tempo/{traces,wal,generator/wal} \
+  /data/pyroscope
+# Bind-mounted directories must be writable by each image's runtime user.
+sudo chown -R 10001:10001 /data/loki
+# NOTE(review): the Tempo 2.x and Pyroscope images are expected to run as UID
+# 10001 as well — confirm against the pinned image versions.
+sudo chown -R 10001:10001 /data/tempo /data/pyroscope
+sudo chown -R 472:472 /data/grafana
+sudo chown -R 65534:65534 /data/prometheus
+echo "Directories created"
+
+echo "=== [3/5] Installing Docker ==="
+if ! command -v docker &>/dev/null; then
+  curl -fsSL https://get.docker.com | sudo sh
+  sudo systemctl enable --now docker
+  sudo usermod -aG docker "$USER"
+  echo "Docker installed"
+else
+  echo "Docker already installed"
+fi
+
+echo "=== [4/5] Copying app files ==="
+sudo mkdir -p /opt/observability
+# "/." copies hidden files too (e.g. a .env holding the GF_AUTH_GOOGLE_* values
+# that docker compose interpolates); a "*" glob would skip them.
+sudo cp -r /tmp/obs-upload/. /opt/observability/
+sudo chown -R root:root /opt/observability
+
+echo "=== [5/5] Starting stack ==="
+cd /opt/observability
+sudo docker compose pull
+sudo docker compose up -d
+
+echo ""
+echo "=== Done! Services status: ==="
+sudo docker compose ps