diff --git a/.gitignore b/.gitignore
index 2faf43d..756fa35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,4 @@ override.tf.json
# Ignore CLI configuration files
.terraformrc
terraform.rc
+/infrastructure/.terraform.lock.hcl
diff --git a/infrastructure/main.tf b/infrastructure/main.tf
index 4ff9286..b83e952 100644
--- a/infrastructure/main.tf
+++ b/infrastructure/main.tf
@@ -22,7 +22,7 @@ module "cluster-issuer" {
# Complete Observability Stack Deployment
module "observability" {
- source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=main"
+ source = "git::https://github.com/necro-cloud/modules//modules/observability?ref=task/90/cnpg-dashboards"
// Certificates Details
cluster_issuer_name = module.cluster-issuer.cluster-issuer-name
@@ -60,7 +60,7 @@ module "garage" {
# Cloudnative PG Deployment for PostgreSQL Database Solution
module "cnpg" {
- source = "git::https://github.com/necro-cloud/modules//modules/cnpg?ref=main"
+ source = "git::https://github.com/necro-cloud/modules//modules/cnpg?ref=task/90/cnpg-dashboards"
// Garage Cluster Details for configuration of PITR Backups
garage_certificate_authority = module.garage.garage_internal_certificate_secret
@@ -68,6 +68,9 @@ module "cnpg" {
garage_configuration = "walbackups-credentials"
backup_bucket_name = "postgresql"
+ // Observability details
+ observability_namespace = module.observability.observability_namespace
+
// Required client details to allow access and generate credentials and certificates for
clients = [
{
@@ -96,7 +99,7 @@ module "cnpg" {
# FerretDB Deployment for MongoDB Database Solution
module "ferretdb" {
- source = "git::https://github.com/necro-cloud/modules//modules/ferretdb?ref=main"
+ source = "git::https://github.com/necro-cloud/modules//modules/ferretdb?ref=task/90/cnpg-dashboards"
// Garage Cluster Details for configuration of PITR Backups
garage_certificate_authority = module.garage.garage_internal_certificate_secret
@@ -104,6 +107,9 @@ module "ferretdb" {
garage_configuration = "walbackups-credentials"
backup_bucket_name = "ferret"
+ // Observability details
+ observability_namespace = module.observability.observability_namespace
+
// Required client details to allow access and generate credentials and certificates for
clients = [
{
diff --git a/modules/cnpg/README.md b/modules/cnpg/README.md
index ad72e77..a71796d 100644
--- a/modules/cnpg/README.md
+++ b/modules/cnpg/README.md
@@ -82,6 +82,7 @@ Required Modules to deploy Cloudnative PG PostgreSQL Database:
| [kubernetes\_api\_port](#input\_kubernetes\_api\_port) | Port for the Kubernetes API | `number` | n/a | yes |
| [kubernetes\_api\_protocol](#input\_kubernetes\_api\_protocol) | Protocol for the Kubernetes API | `string` | n/a | yes |
| [namespace](#input\_namespace) | Namespace to be used for deploying PostgreSQL Database | `string` | `"postgres"` | no |
+| [observability\_namespace](#input\_observability\_namespace) | Namespace where all components for observability are deployed | `string` | n/a | yes |
| [organization\_name](#input\_organization\_name) | Organization name for deploying PostgreSQL Database | `string` | `"cloud"` | no |
| [proxy\_image](#input\_proxy\_image) | Docker image to be used for deployment of PGAdmin NGINX Proxy for TLS | `string` | `"nginx"` | no |
| [proxy\_repository](#input\_proxy\_repository) | Repository to be used for deployment of PGAdmin NGINX Proxy for TLS | `string` | `"docker.io/library"` | no |
diff --git a/modules/cnpg/cluster.tf b/modules/cnpg/cluster.tf
index 1041ef8..e62dade 100644
--- a/modules/cnpg/cluster.tf
+++ b/modules/cnpg/cluster.tf
@@ -16,6 +16,11 @@ resource "kubernetes_manifest" "cluster" {
"labels" = {
"garage-access" = true
}
+ "annotations" = {
+ "prometheus.io/scrape" = "true"
+ "prometheus.io/port" = "9187"
+ "prometheus.io/path" = "/metrics"
+ }
}
"topologySpreadConstraints" = [
{
diff --git a/modules/cnpg/networkpolicy.tf b/modules/cnpg/networkpolicy.tf
index 0d6055f..36d72ae 100644
--- a/modules/cnpg/networkpolicy.tf
+++ b/modules/cnpg/networkpolicy.tf
@@ -88,6 +88,29 @@ resource "kubernetes_network_policy" "cnpg_network_policy" {
}
}
+
+ # Rule 4: Allow OpenTelemetry Collector to scrape CNPG metrics
+ ingress {
+ from {
+ namespace_selector {
+ match_labels = {
+ "kubernetes.io/metadata.name" = var.observability_namespace
+ }
+ }
+
+ pod_selector {
+ match_labels = {
+ "app.kubernetes.io/instance" = "otel-collector"
+ }
+ }
+ }
+
+ ports {
+ protocol = "TCP"
+ port = 9187
+ }
+ }
+
# -------------- EGRESS RULES -------------- #
# Rule 1: Allow egress to other CNPG pods
egress {
diff --git a/modules/cnpg/variables.tf b/modules/cnpg/variables.tf
index c787f68..7270d76 100644
--- a/modules/cnpg/variables.tf
+++ b/modules/cnpg/variables.tf
@@ -30,6 +30,12 @@ variable "garage_namespace" {
nullable = false
}
+variable "observability_namespace" {
+ description = "Namespace where all components for observability are deployed"
+ type = string
+ nullable = false
+}
+
# --------------- CERTIFICATE VARIABLES --------------- #
variable "garage_certificate_authority" {
description = "Name of the Certificate Authority associated with the Garage Storage Solution"
diff --git a/modules/ferretdb/README.md b/modules/ferretdb/README.md
index d21482d..5d78f2c 100644
--- a/modules/ferretdb/README.md
+++ b/modules/ferretdb/README.md
@@ -83,6 +83,7 @@ Required Modules to deploy FerretDB Database:
| [mongo\_express\_repository](#input\_mongo\_express\_repository) | Repository to be used for deployment of Mongo Express UI | `string` | `"docker.io/library"` | no |
| [mongo\_express\_tag](#input\_mongo\_express\_tag) | Docker tag to be used for deployment of Mongo Express UI | `string` | `"1.0.2-20-alpine3.19"` | no |
| [namespace](#input\_namespace) | Namespace to be used for deploying Ferret Database | `string` | `"ferret"` | no |
+| [observability\_namespace](#input\_observability\_namespace) | Namespace where all components for observability are deployed | `string` | n/a | yes |
| [organization\_name](#input\_organization\_name) | Organization name for deploying Ferret Database | `string` | `"cloud"` | no |
| [repository](#input\_repository) | Repository to be used for deployment of FerretDB | `string` | `"ghcr.io/ferretdb"` | no |
| [server\_certificate\_authority\_name](#input\_server\_certificate\_authority\_name) | Name of the Certificate Authority to be used with Ferret Server | `string` | `"ferretdb-server-certificate-authority"` | no |
diff --git a/modules/ferretdb/cluster.tf b/modules/ferretdb/cluster.tf
index 291a90c..629d22b 100644
--- a/modules/ferretdb/cluster.tf
+++ b/modules/ferretdb/cluster.tf
@@ -16,6 +16,11 @@ resource "kubernetes_manifest" "cluster" {
"labels" = {
"garage-access" = true
}
+ "annotations" = {
+ "prometheus.io/scrape" = "true"
+ "prometheus.io/port" = "9187"
+ "prometheus.io/path" = "/metrics"
+ }
}
"postgresUID" = 999
"postgresGID" = 999
diff --git a/modules/ferretdb/networkpolicy.tf b/modules/ferretdb/networkpolicy.tf
index 100a935..8478f07 100644
--- a/modules/ferretdb/networkpolicy.tf
+++ b/modules/ferretdb/networkpolicy.tf
@@ -88,6 +88,28 @@ resource "kubernetes_network_policy" "cnpg_network_policy" {
}
}
+ # Rule 4: Allow OpenTelemetry Collector to scrape CNPG metrics
+ ingress {
+ from {
+ namespace_selector {
+ match_labels = {
+ "kubernetes.io/metadata.name" = var.observability_namespace
+ }
+ }
+
+ pod_selector {
+ match_labels = {
+ "app.kubernetes.io/instance" = "otel-collector"
+ }
+ }
+ }
+
+ ports {
+ protocol = "TCP"
+ port = 9187
+ }
+ }
+
# -------------- EGRESS RULES -------------- #
# Rule 1: Allow egress to other CNPG pods
egress {
diff --git a/modules/ferretdb/variables.tf b/modules/ferretdb/variables.tf
index 3585df8..6d57da7 100644
--- a/modules/ferretdb/variables.tf
+++ b/modules/ferretdb/variables.tf
@@ -30,6 +30,12 @@ variable "garage_namespace" {
nullable = false
}
+variable "observability_namespace" {
+ description = "Namespace where all components for observability are deployed"
+ type = string
+ nullable = false
+}
+
# --------------- CERTIFICATE VARIABLES --------------- #
variable "garage_certificate_authority" {
description = "Name of the Certificate Authority associated with the Garage Storage Solution"
diff --git a/modules/observability/dashboards/postgresql.json b/modules/observability/dashboards/postgresql.json
new file mode 100644
index 0000000..fe9d544
--- /dev/null
+++ b/modules/observability/dashboards/postgresql.json
@@ -0,0 +1,1024 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "grafana",
+ "uid": "-- Grafana --"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": 2,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 7,
+ "panels": [],
+ "title": "PostgreSQL Health & Performance",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "continuous-GrYlRd"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 0,
+ "y": 1
+ },
+ "id": 8,
+ "options": {
+ "displayMode": "lcd",
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": false
+ },
+ "maxVizHeight": 300,
+ "minVizHeight": 16,
+ "minVizWidth": 8,
+ "namePlacement": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showUnfilled": true,
+ "sizing": "auto",
+ "valueMode": "color"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "max(cnpg_collector_up{namespace=\"$Namespace\", cluster=\"$Cluster\"})",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Cluster Status",
+ "type": "bargauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "continuous-GrYlRd"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 6,
+ "y": 1
+ },
+ "id": 9,
+ "options": {
+ "displayMode": "lcd",
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": false
+ },
+ "maxVizHeight": 300,
+ "minVizHeight": 16,
+ "minVizWidth": 8,
+ "namePlacement": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showUnfilled": true,
+ "sizing": "auto",
+ "valueMode": "color"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum(cnpg_backends_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}) by (usename)",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Active Connections",
+ "type": "bargauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "showValues": false,
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 12,
+ "y": 1
+ },
+ "id": 6,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "right",
+ "showLegend": false
+ },
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "cnpg_pg_replication_lag{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Replication Lag",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "description": "At its core, Cache Hit Ratio is the ultimate measure of how efficiently your database is using its memory (RAM)",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 25,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "showValues": false,
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 18,
+ "y": 1
+ },
+ "id": 3,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "hidden",
+ "placement": "right",
+ "showLegend": false
+ },
+ "tooltip": {
+ "hideZeros": false,
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum(rate(cnpg_pg_stat_database_blks_hit_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[10m]))\n/\n(sum(rate(cnpg_pg_stat_database_blks_hit_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[10m])) + sum(rate(cnpg_pg_stat_database_blks_read_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[10m])))",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Cache Hit Ratio",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "description": "Transaction Commit Rate is the most direct measurement of how much actual work your database is accomplishing at any given moment.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "continuous-GrYlRd"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 4,
+ "options": {
+ "displayMode": "lcd",
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": false
+ },
+ "maxVizHeight": 300,
+ "minVizHeight": 16,
+ "minVizWidth": 8,
+ "namePlacement": "auto",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showUnfilled": true,
+ "sizing": "auto",
+ "valueMode": "color"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum(rate(cnpg_pg_stat_database_xact_commit_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[5m])) by (datname)",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Transaction Commit Rate",
+ "type": "bargauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "fieldMinMax": false,
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 5,
+ "options": {
+ "colorMode": "background",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "max(cnpg_pg_database_size_bytes{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}) by (datname)",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Database Sizes",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 17
+ },
+ "id": 10,
+ "panels": [],
+ "title": "Backup & WAL Archiving",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 5,
+ "x": 0,
+ "y": 18
+ },
+ "id": 11,
+ "options": {
+ "colorMode": "background",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum(rate(cnpg_pg_stat_archiver_archived_count_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[5m]))",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "WAL Archiving Rate Success",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 0
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 5,
+ "x": 5,
+ "y": 18
+ },
+ "id": 12,
+ "options": {
+ "colorMode": "background",
+ "graphMode": "none",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum(rate(cnpg_pg_stat_archiver_failed_count_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}[5m]))",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "WAL Archiving Rate Failure",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 14,
+ "x": 10,
+ "y": 18
+ },
+ "id": 13,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "percentChangeColorMode": "standard",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showPercentChange": false,
+ "textMode": "auto",
+ "wideLayout": true
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "max(cnpg_collector_wal_bytes{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"})",
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "WAL Size on Disk",
+ "type": "stat"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 26
+ },
+ "id": 14,
+ "panels": [],
+ "title": "Hardware & Infrastructure",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "percentage",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "orange",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 85
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 6,
+ "x": 0,
+ "y": 27
+ },
+ "id": 15,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": false,
+ "sizing": "auto"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum by (pod) (\n label_replace(\n avg_over_time(container_cpu_usage{job=~\"^$Namespace/$Cluster-.*\"}[5m]), \n \"pod\", \n \"$1\", \n \"job\", \n \".*/(.*)\"\n )\n) / max(kube_pod_container_resource_limits{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\", resource=\"cpu\"}) by (pod)",
+ "hide": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "editorMode": "code",
+ "expr": "",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "CPU Usage %",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "percentage",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "orange",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 85
+ }
+ ]
+ },
+ "unit": "percentunit"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 6,
+ "x": 6,
+ "y": 27
+ },
+ "id": 16,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": false,
+ "sizing": "auto"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "sum by (pod) (\n label_replace(\n container_memory_usage_bytes{job=~\"^$Namespace/$Cluster-.*\"}, \n \"pod\", \n \"$1\", \n \"job\", \n \".*/(.*)\"\n )\n) \n/ \nmax by (pod) (\n kube_pod_container_resource_limits{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\", resource=\"memory\"}\n)",
+ "hide": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Memory Usage %",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "P4169E866C3094E38"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "mappings": [],
+ "thresholds": {
+ "mode": "percentage",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "orange",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 85
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 6,
+ "x": 12,
+ "y": 27
+ },
+ "id": 17,
+ "options": {
+ "minVizHeight": 75,
+ "minVizWidth": 75,
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": false,
+ "sizing": "auto"
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "editorMode": "code",
+ "expr": "max by (pod) (\n kube_pod_container_status_restarts_total{namespace=\"$Namespace\", pod=~\"^$Cluster-.*\"}\n)",
+ "hide": false,
+ "legendFormat": "__auto",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Pod Restarts",
+ "type": "gauge"
+ },
+ {
+ "collapsed": false,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 40
+ },
+ "id": 18,
+ "panels": [],
+ "title": "Logs",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "victoriametrics-logs-datasource",
+ "uid": "PD775F2863313E6C7"
+ },
+ "fieldConfig": {
+ "defaults": {},
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 16,
+ "w": 24,
+ "x": 0,
+ "y": 41
+ },
+ "id": 19,
+ "options": {
+ "dedupStrategy": "none",
+ "detailsMode": "sidebar",
+ "enableInfiniteScrolling": false,
+ "enableLogDetails": true,
+ "showControls": true,
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "syntaxHighlighting": true,
+ "timestampResolution": "ns",
+ "wrapLogMessage": true
+ },
+ "pluginVersion": "12.3.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "victoriametrics-logs-datasource",
+ "uid": "PD775F2863313E6C7"
+ },
+ "direction": "desc",
+ "editorMode": "code",
+ "expr": "cnpg.io/cluster: \"$Cluster\" AND k8s.namespace.name: \"$Namespace\"",
+ "queryType": "instant",
+ "refId": "A"
+ }
+ ],
+ "title": "Logging",
+ "type": "logs"
+ }
+ ],
+ "preload": false,
+ "refresh": "30s",
+ "schemaVersion": 42,
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "postgres",
+ "value": "postgres"
+ },
+ "definition": "label_values(cnpg_collector_up,namespace)",
+ "description": "Name of the namespace in which the CNPG cluster runs",
+ "name": "Namespace",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query": "label_values(cnpg_collector_up,namespace)",
+ "refId": "PrometheusVariableQueryEditor-VariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ },
+ {
+ "current": {
+ "text": "postgresql-cluster",
+ "value": "postgresql-cluster"
+ },
+ "definition": "label_values(cnpg_collector_up{namespace=\"$Namespace\"},cluster)",
+ "description": "Name of the CNPG Cluster to pull in data from",
+ "name": "Cluster",
+ "options": [],
+ "query": {
+ "qryType": 1,
+ "query": "label_values(cnpg_collector_up{namespace=\"$Namespace\"},cluster)",
+ "refId": "PrometheusVariableQueryEditor-VariableQuery"
+ },
+ "refresh": 1,
+ "regex": "",
+ "type": "query"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "browser",
+ "title": "Cloudnative PostgreSQL Database Monitoring",
+ "uid": "obbmrs2",
+ "version": 1
+}
\ No newline at end of file
diff --git a/modules/observability/grafana.tf b/modules/observability/grafana.tf
index f920453..11a1643 100644
--- a/modules/observability/grafana.tf
+++ b/modules/observability/grafana.tf
@@ -98,7 +98,37 @@ resource "helm_release" "grafana" {
scheme = "HTTPS"
}
}
-
+
+
+ // Deploy dashboards to Grafana
+ dashboardProviders = {
+ "dashboardproviders.yaml" = {
+ apiVersion = 1
+ providers = [
+ {
+ name = "Cloudnative PostgreSQL Database Dashboard"
+ orgId = 1
+ folder = "Database Dashboards"
+ type = "file"
+ disableDeletion = false
+ editable = true
+ options = {
+ path = "/var/lib/grafana/dashboards/default"
+ }
+ }
+ ]
+ }
+ }
+
+ // Injecting the Dashboard JSON file into the Grafana container
+ dashboards = {
+ default = {
+ postgres-dashboard = {
+ json = file("${path.module}/dashboards/postgresql.json")
+ }
+ }
+ }
+
affinity = {
nodeAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = {
diff --git a/modules/observability/kube-state-metrics.tf b/modules/observability/kube-state-metrics.tf
new file mode 100644
index 0000000..08e7444
--- /dev/null
+++ b/modules/observability/kube-state-metrics.tf
@@ -0,0 +1,28 @@
+resource "helm_release" "kube_state_metrics" {
+ name = "kube-state-metrics"
+ repository = "https://prometheus-community.github.io/helm-charts"
+ chart = "kube-state-metrics"
+ version = "7.1.0"
+
+ namespace = kubernetes_namespace.namespace.metadata[0].name
+
+ values = [
+ yamlencode({
+ podAnnotations = {
+ "prometheus.io/scrape" = "true"
+ "prometheus.io/port" = "8080"
+ }
+
+ resources = {
+ requests = {
+ cpu = "50m"
+ memory = "128Mi"
+ }
+ limits = {
+ cpu = "200m"
+ memory = "256Mi"
+ }
+ }
+ })
+ ]
+}
diff --git a/modules/observability/otel-collector.tf b/modules/observability/otel-collector.tf
index 72b3e12..1b2d05a 100644
--- a/modules/observability/otel-collector.tf
+++ b/modules/observability/otel-collector.tf
@@ -84,6 +84,7 @@ resource "helm_release" "otel_collector" {
scrape_configs = [
{
job_name = "kubernetes-pods"
+ honor_labels = true
scrape_interval = "30s"
body_size_limit = "50MB"
kubernetes_sd_configs = [
@@ -120,6 +121,48 @@ resource "helm_release" "otel_collector" {
regex = "([^:]+)(?::\\d+)?;(\\d+)"
replacement = "$1:$2"
target_label = "__address__"
+ },
+ // Add namespace and pod to the metrics data
+ {
+ source_labels = ["__meta_kubernetes_namespace"]
+ action = "replace"
+ target_label = "namespace"
+ },
+ {
+ source_labels = ["__meta_kubernetes_pod_name"]
+ action = "replace"
+ target_label = "pod"
+ }
+ ]
+ },
+ // Scrape Kubernetes cAdvisor Metrics
+ {
+ job_name = "kubelet-cadvisor"
+ scheme = "https"
+ tls_config = {
+ ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
+ insecure_skip_verify = true
+ }
+ bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+
+ kubernetes_sd_configs = [
+ {
+ role = "node"
+ }
+ ]
+
+ relabel_configs = [
+ // 1. Only scrape the local node this DaemonSet pod is running on
+ {
+ source_labels = ["__meta_kubernetes_node_name"]
+ action = "keep"
+ regex = "$${env:K8S_NODE_NAME}"
+ },
+ // 2. Point directly to the internal cAdvisor endpoint
+ {
+ action = "replace"
+ target_label = "__metrics_path__"
+ replacement = "/metrics/cadvisor"
}
]
}
@@ -147,6 +190,18 @@ resource "helm_release" "otel_collector" {
}
]
}
+
+ transform = {
+ metric_statements = [
+ {
+ context = "datapoint"
+ statements = [
+ "set(attributes[\"namespace\"], resource.attributes[\"namespace\"]) where attributes[\"namespace\"] == nil and resource.attributes[\"namespace\"] != nil",
+ "set(attributes[\"pod\"], resource.attributes[\"pod\"]) where attributes[\"pod\"] == nil and resource.attributes[\"pod\"] != nil"
+ ]
+ }
+ ]
+ }
}
// Exporters
@@ -174,7 +229,7 @@ resource "helm_release" "otel_collector" {
metrics = {
// 'hostmetrics' & 'kubeletstats' come from presets. 'prometheus' is our custom one.
receivers = ["otlp", "hostmetrics", "kubeletstats", "prometheus"]
- processors = ["memory_limiter", "k8sattributes", "batch"]
+ processors = ["memory_limiter", "k8sattributes", "transform", "batch"]
exporters = ["prometheusremotewrite"]
}
logs = {
diff --git a/modules/observability/outputs.tf b/modules/observability/outputs.tf
new file mode 100644
index 0000000..797afb4
--- /dev/null
+++ b/modules/observability/outputs.tf
@@ -0,0 +1,5 @@
+output "observability_namespace" {
+ description = "Namespace where all components for observability are deployed"
+ value = kubernetes_namespace.namespace.metadata[0].name
+ depends_on = [helm_release.grafana]
+}
diff --git a/modules/observability/variables.tf b/modules/observability/variables.tf
index fed4156..3fa1f37 100644
--- a/modules/observability/variables.tf
+++ b/modules/observability/variables.tf
@@ -2,7 +2,7 @@
variable "app_name" {
description = "App name for deploying the Observability Stack"
type = string
- default = "ferret"
+ default = "observability"
}
variable "organization_name" {