From 7327627f9573c029920cc259eb985e6f6282210d Mon Sep 17 00:00:00 2001 From: Gavin Elder Date: Wed, 17 Dec 2025 15:53:34 +0000 Subject: [PATCH 1/8] docs: Move memory tuning to an advanced topic --- .../enterprise-sidebar.json | 3 +- .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/_templates/docker/tower.env | 3 - .../enterprise/_templates/k8s/tower-cron.yml | 3 - .../enterprise/_templates/k8s/tower-svc.yml | 3 - .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/configuration/overview.mdx | 44 ------------- .../version-24.1/enterprise/upgrade.md | 11 ---- .../enterprise/_templates/docker/tower.env | 3 - .../enterprise/_templates/k8s/tower-cron.yml | 3 - .../enterprise/_templates/k8s/tower-svc.yml | 3 - .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/configuration/overview.mdx | 44 ------------- .../version-24.2/enterprise/upgrade.md | 11 ---- .../enterprise/_templates/docker/tower.env | 3 - .../enterprise/_templates/k8s/tower-cron.yml | 3 - .../enterprise/_templates/k8s/tower-svc.yml | 3 - .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/configuration/overview.mdx | 44 ------------- .../version-25.1/enterprise/upgrade.md | 11 ---- .../enterprise/_templates/docker/tower.env | 3 - .../enterprise/_templates/k8s/tower-cron.yml | 3 - .../enterprise/_templates/k8s/tower-svc.yml | 3 - .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/configuration/overview.mdx | 44 ------------- .../version-25.2/enterprise/upgrade.md | 12 ---- .../enterprise/_templates/docker/tower.env | 4 -- .../enterprise/_templates/k8s/tower-cron.yml | 3 - .../enterprise/_templates/k8s/tower-svc.yml | 3 - .../advanced-topics/jvm-memory-tuning.md | 65 +++++++++++++++++++ .../enterprise/configuration/overview.mdx | 44 ------------- .../version-25.3/enterprise/upgrade.md | 11 ---- .../version-24.1-sidebars.json | 3 +- .../version-24.2-sidebars.json 
| 3 +- .../version-25.1-sidebars.json | 3 +- .../version-25.2-sidebars.json | 3 +- .../version-25.3-sidebars.json | 3 +- 37 files changed, 402 insertions(+), 328 deletions(-) create mode 100644 platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md create mode 100644 platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md create mode 100644 platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md create mode 100644 platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md create mode 100644 platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md create mode 100644 platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md diff --git a/platform-enterprise_docs/enterprise-sidebar.json b/platform-enterprise_docs/enterprise-sidebar.json index 21509dca0..35f4c920e 100644 --- a/platform-enterprise_docs/enterprise-sidebar.json +++ b/platform-enterprise_docs/enterprise-sidebar.json @@ -58,7 +58,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/content-security-policy" + "enterprise/advanced-topics/content-security-policy", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + 
+:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. 
+ +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/docker/tower.env b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/docker/tower.env index 
7adc905ec..5d201aa09 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/docker/tower.env +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/docker/tower.env @@ -4,9 +4,6 @@ TOWER_JWT_SECRET= TOWER_LICENSE= -# Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/24.1/enterprise/configuration/overview#backend-memory-requirements -JAVA_OPTS="-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" - # Compute environment settings TOWER_ENABLE_PLATFORMS=altair-platform,awsbatch-platform,azbatch-platform,eks-platform,gke-platform,googlebatch-platform,k8s-platform,lsf-platform,moab-platform,slurm-platform,uge-platform diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-cron.yml b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-cron.yml index 981636d90..94754c867 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-cron.yml +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-cron.yml @@ -43,9 +43,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,cron" - # Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/24.1/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" ports: - containerPort: 8080 resources: diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-svc.yml b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-svc.yml index 7aeb36d78..411baa263 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-svc.yml +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/_templates/k8s/tower-svc.yml @@ -36,9 +36,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,ha" - # Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/24.1/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" # TLS certificate for Studios #- name: TOWER_OIDC_PEM_PATH # value: '/data/certs/oidc.pem' diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + +:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. 
To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. 
+ +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/configuration/overview.mdx b/platform-enterprise_versioned_docs/version-24.1/enterprise/configuration/overview.mdx index 
357b24eff..107e75495 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/configuration/overview.mdx +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/configuration/overview.mdx @@ -411,50 +411,6 @@ services: These default memory allocation limits are included in your Kubernetes manifests ([tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml)) and Docker Compose ([docker-compose.yml](../_templates/docker/docker-compose.yml)) configuration templates. ::: -### JVM memory tuning - -For production deployments, configure JVM memory parameters via the `JAVA_OPTS` environment variable. The following baseline configuration is suitable for most deployments: - -```bash -JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -:::note -These default JVM memory settings are included in the configuration templates provided in these docs: -- Kubernetes: [tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml) -- Docker Compose: [tower.env](../_templates/docker/tower.env) -::: - -**Parameter descriptions:** -- **Heap memory** (`-Xms`/`-Xmx`): Memory pool for Java objects. Set initial (`Xms`) and maximum (`Xmx`) heap size. -- **Direct memory** (`MaxDirectMemorySize`): Off-heap memory used for NIO operations, network buffers, and file I/O. Critical for handling concurrent workflow API operations. -- **Netty memory accounting** (`io.netty.maxDirectMemory=0`): Disables Netty's internal tracking; relies on JVM direct memory limits instead. -- **Buffer caching** (`jdk.nio.maxCachedBufferSize`): Limits size of cached NIO buffers to prevent excessive memory retention. 
- -**When to adjust these values:** - -Increase `MaxDirectMemorySize` if you observe: -- `OutOfMemoryError: Direct buffer memory` in logs -- High concurrent workflow launch rates (>100 simultaneous workflows) -- Large configuration payloads or extensive API usage - -Increase heap memory (`-Xmx`) if you observe: -- `OutOfMemoryError: Java heap space` in logs -- Garbage collection pauses affecting performance -- Growing memory usage under sustained load - -**Example: High-concurrency deployment** -For deployments running 200+ concurrent workflows: -```bash -JAVA_OPTS: -Xms1000M -Xmx3000M -XX:MaxDirectMemorySize=1600m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -Ensure container/pod memory limits are set higher than JVM limits to accommodate non-heap memory usage. - -:::warning -These are starting recommendations. Monitor your deployment's actual memory usage and adjust based on your specific workload patterns. Undersized memory allocation can cause OOM failures and service instability. -::: - ## Compute environments Configuration values to enable computing platforms and customize Batch Forge resource naming. diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/upgrade.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/upgrade.md index 223b4fdc9..75a310a12 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/upgrade.md +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/upgrade.md @@ -15,17 +15,6 @@ The database volume is persistent on the local machine by default if you use the 1. Download the latest versions of your deployment templates and update your Seqera container versions: - [docker-compose.yml](./_templates/docker/docker-compose.yml) for Docker Compose deployments - [tower-cron.yml](./_templates/k8s/tower-cron.yml) and [tower-svc.yml](./_templates/k8s/tower-svc.yml) for Kubernetes deployments -1. 
**JVM memory configuration defaults (recommended)**: The following `JAVA_OPTS` environment variable is included in the deployment templates downloaded in the preceding step, to optimize JVM memory settings: - - ```bash - JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 - ``` - - These baseline values are suitable for most deployments running moderate concurrent workflow loads. - - :::tip - These are starting recommendations that may require tuning based on your deployment's workload. See [Backend memory requirements](./configuration/overview.mdx#backend-memory-requirements) for detailed guidance on when and how to adjust these values for your environment. - ::: 1. Restart the application. 1. If you're using a containerized database as part of your implementation: 1. Stop the application. diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/docker/tower.env b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/docker/tower.env index bc54d11f4..5d201aa09 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/docker/tower.env +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/docker/tower.env @@ -4,9 +4,6 @@ TOWER_JWT_SECRET= TOWER_LICENSE= -# Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/24.2/enterprise/configuration/overview#backend-memory-requirements -JAVA_OPTS="-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" - # Compute environment settings TOWER_ENABLE_PLATFORMS=altair-platform,awsbatch-platform,azbatch-platform,eks-platform,gke-platform,googlebatch-platform,k8s-platform,lsf-platform,moab-platform,slurm-platform,uge-platform diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-cron.yml b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-cron.yml index 0f75b3e90..50cf1a80e 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-cron.yml +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-cron.yml @@ -43,9 +43,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,cron" - # Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/24.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" ports: - containerPort: 8080 resources: diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-svc.yml b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-svc.yml index baafe9d20..454b21a8e 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-svc.yml +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/_templates/k8s/tower-svc.yml @@ -36,9 +36,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,ha" - # Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/24.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" # TLS certificate for Studios #- name: TOWER_OIDC_PEM_PATH # value: '/data/certs/oidc.pem' diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + +:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. 
Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. + +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory 
settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/configuration/overview.mdx b/platform-enterprise_versioned_docs/version-24.2/enterprise/configuration/overview.mdx index f6d8f6422..59fce3059 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/configuration/overview.mdx +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/configuration/overview.mdx @@ -416,50 +416,6 @@ services: These default memory allocation limits are included in your Kubernetes manifests ([tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml)) and Docker Compose ([docker-compose.yml](../_templates/docker/docker-compose.yml)) configuration templates. ::: -### JVM memory tuning - -For production deployments, configure JVM memory parameters via the `JAVA_OPTS` environment variable. 
The following baseline configuration is suitable for most deployments: - -```bash -JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -:::note -These default JVM memory settings are included in the configuration templates provided in these docs: -- Kubernetes: [tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml) -- Docker Compose: [tower.env](../_templates/docker/tower.env) -::: - -**Parameter descriptions:** -- **Heap memory** (`-Xms`/`-Xmx`): Memory pool for Java objects. Set initial (`Xms`) and maximum (`Xmx`) heap size. -- **Direct memory** (`MaxDirectMemorySize`): Off-heap memory used for NIO operations, network buffers, and file I/O. Critical for handling concurrent workflow API operations. -- **Netty memory accounting** (`io.netty.maxDirectMemory=0`): Disables Netty's internal tracking; relies on JVM direct memory limits instead. -- **Buffer caching** (`jdk.nio.maxCachedBufferSize`): Limits size of cached NIO buffers to prevent excessive memory retention. - -**When to adjust these values:** - -Increase `MaxDirectMemorySize` if you observe: -- `OutOfMemoryError: Direct buffer memory` in logs -- High concurrent workflow launch rates (>100 simultaneous workflows) -- Large configuration payloads or extensive API usage - -Increase heap memory (`-Xmx`) if you observe: -- `OutOfMemoryError: Java heap space` in logs -- Garbage collection pauses affecting performance -- Growing memory usage under sustained load - -**Example: High-concurrency deployment** -For deployments running 200+ concurrent workflows: -```bash -JAVA_OPTS: -Xms1000M -Xmx3000M -XX:MaxDirectMemorySize=1600m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -Ensure container/pod memory limits are set higher than JVM limits to accommodate non-heap memory usage. - -:::warning -These are starting recommendations. 
Monitor your deployment's actual memory usage and adjust based on your specific workload patterns. Undersized memory allocation can cause OOM failures and service instability. -::: - ## Compute environments Configuration values to enable computing platforms and customize Batch Forge resource naming. diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/upgrade.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/upgrade.md index 52cdf65a3..3f0fcff60 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/upgrade.md +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/upgrade.md @@ -15,17 +15,6 @@ The database volume is persistent on the local machine by default if you use the 1. Download the latest versions of your deployment templates and update your Seqera container versions: - [docker-compose.yml](./_templates/docker/docker-compose.yml) for Docker Compose deployments - [tower-cron.yml](./_templates/k8s/tower-cron.yml) and [tower-svc.yml](./_templates/k8s/tower-svc.yml) for Kubernetes deployments -1. **JVM memory configuration defaults (recommended)**: The following `JAVA_OPTS` environment variable is included in the deployment templates downloaded in the preceding step, to optimize JVM memory settings: - - ```bash - JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 - ``` - - These baseline values are suitable for most deployments running moderate concurrent workflow loads. - - :::tip - These are starting recommendations that may require tuning based on your deployment's workload. See [Backend memory requirements](./configuration/overview.mdx#backend-memory-requirements) for detailed guidance on when and how to adjust these values for your environment. - ::: 1. Restart the application. 1. If you're using a containerized database as part of your implementation: 1. Stop the application. 
diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/docker/tower.env b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/docker/tower.env index e40b2c5c8..b8d34bbd6 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/docker/tower.env +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/docker/tower.env @@ -4,9 +4,6 @@ TOWER_JWT_SECRET= TOWER_LICENSE= -# Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/25.1/enterprise/configuration/overview#backend-memory-requirements -JAVA_OPTS="-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" - # Compute environment settings TOWER_ENABLE_PLATFORMS=altair-platform,awsbatch-platform,azbatch-platform,eks-platform,gke-platform,googlebatch-platform,k8s-platform,lsf-platform,moab-platform,slurm-platform,uge-platform diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-cron.yml b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-cron.yml index 336e69d02..1f145189a 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-cron.yml +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-cron.yml @@ -43,9 +43,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,cron" - # Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/25.1/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" ports: - containerPort: 8080 resources: diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-svc.yml b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-svc.yml index 120222614..1d7d6ac7f 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-svc.yml +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/_templates/k8s/tower-svc.yml @@ -36,9 +36,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,ha" - # Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/25.1/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" # TLS certificate for Studios #- name: TOWER_OIDC_PEM_PATH # value: '/data/certs/oidc.pem' diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + +:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. 
To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. 
+ +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/configuration/overview.mdx b/platform-enterprise_versioned_docs/version-25.1/enterprise/configuration/overview.mdx index 
dfe4cabc5..e591684f7 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/configuration/overview.mdx +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/configuration/overview.mdx @@ -416,50 +416,6 @@ services: These default memory allocation limits are included in your Kubernetes manifests ([tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml)) and Docker Compose ([docker-compose.yml](../_templates/docker/docker-compose.yml)) configuration templates. ::: -### JVM memory tuning - -For production deployments, configure JVM memory parameters via the `JAVA_OPTS` environment variable. The following baseline configuration is suitable for most deployments: - -```bash -JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -:::note -These default JVM memory settings are included in the configuration templates provided in these docs: -- Kubernetes: [tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml) -- Docker Compose: [tower.env](../_templates/docker/tower.env) -::: - -**Parameter descriptions:** -- **Heap memory** (`-Xms`/`-Xmx`): Memory pool for Java objects. Set initial (`Xms`) and maximum (`Xmx`) heap size. -- **Direct memory** (`MaxDirectMemorySize`): Off-heap memory used for NIO operations, network buffers, and file I/O. Critical for handling concurrent workflow API operations. -- **Netty memory accounting** (`io.netty.maxDirectMemory=0`): Disables Netty's internal tracking; relies on JVM direct memory limits instead. -- **Buffer caching** (`jdk.nio.maxCachedBufferSize`): Limits size of cached NIO buffers to prevent excessive memory retention. 
- -**When to adjust these values:** - -Increase `MaxDirectMemorySize` if you observe: -- `OutOfMemoryError: Direct buffer memory` in logs -- High concurrent workflow launch rates (>100 simultaneous workflows) -- Large configuration payloads or extensive API usage - -Increase heap memory (`-Xmx`) if you observe: -- `OutOfMemoryError: Java heap space` in logs -- Garbage collection pauses affecting performance -- Growing memory usage under sustained load - -**Example: High-concurrency deployment** -For deployments running 200+ concurrent workflows: -```bash -JAVA_OPTS: -Xms1000M -Xmx3000M -XX:MaxDirectMemorySize=1600m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -Ensure container/pod memory limits are set higher than JVM limits to accommodate non-heap memory usage. - -:::warning -These are starting recommendations. Monitor your deployment's actual memory usage and adjust based on your specific workload patterns. Undersized memory allocation can cause OOM failures and service instability. -::: - ## Compute environments Configuration values to enable computing platforms and customize Batch Forge resource naming. diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/upgrade.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/upgrade.md index 52cdf65a3..3f0fcff60 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/upgrade.md +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/upgrade.md @@ -15,17 +15,6 @@ The database volume is persistent on the local machine by default if you use the 1. Download the latest versions of your deployment templates and update your Seqera container versions: - [docker-compose.yml](./_templates/docker/docker-compose.yml) for Docker Compose deployments - [tower-cron.yml](./_templates/k8s/tower-cron.yml) and [tower-svc.yml](./_templates/k8s/tower-svc.yml) for Kubernetes deployments -1. 
**JVM memory configuration defaults (recommended)**: The following `JAVA_OPTS` environment variable is included in the deployment templates downloaded in the preceding step, to optimize JVM memory settings: - - ```bash - JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 - ``` - - These baseline values are suitable for most deployments running moderate concurrent workflow loads. - - :::tip - These are starting recommendations that may require tuning based on your deployment's workload. See [Backend memory requirements](./configuration/overview.mdx#backend-memory-requirements) for detailed guidance on when and how to adjust these values for your environment. - ::: 1. Restart the application. 1. If you're using a containerized database as part of your implementation: 1. Stop the application. diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/docker/tower.env b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/docker/tower.env index 1e7900dee..aa01975f3 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/docker/tower.env +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/docker/tower.env @@ -4,9 +4,6 @@ TOWER_JWT_SECRET= TOWER_LICENSE= -# Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/25.2/enterprise/configuration/overview#backend-memory-requirements -JAVA_OPTS="-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" - # Compute environment settings TOWER_ENABLE_PLATFORMS=altair-platform,awsbatch-platform,awscloud-platform,azbatch-platform,eks-platform,gke-platform,googlebatch-platform,googlecloud-platform,k8s-platform,lsf-platform,moab-platform,slurm-platform,uge-platform diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-cron.yml b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-cron.yml index 12ef4d268..7176dcc55 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-cron.yml +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-cron.yml @@ -43,9 +43,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,cron" - # Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/25.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" ports: - containerPort: 8080 resources: diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-svc.yml b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-svc.yml index e13cb31a0..8eaae680c 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-svc.yml +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/_templates/k8s/tower-svc.yml @@ -36,9 +36,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,ha" - # Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/25.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" # TLS certificate for Studios #- name: TOWER_OIDC_PEM_PATH # value: '/data/certs/oidc.pem' diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + +:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. 
Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. + +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory 
settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/configuration/overview.mdx b/platform-enterprise_versioned_docs/version-25.2/enterprise/configuration/overview.mdx index 81fc2614d..35235a808 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/configuration/overview.mdx +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/configuration/overview.mdx @@ -455,50 +455,6 @@ services: These default memory allocation limits are included in your Kubernetes manifests ([tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml)) and Docker Compose ([docker-compose.yml](../_templates/docker/docker-compose.yml)) configuration templates. ::: -### JVM memory tuning - -For production deployments, configure JVM memory parameters via the `JAVA_OPTS` environment variable. 
The following baseline configuration is suitable for most deployments: - -```bash -JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -:::note -These default JVM memory settings are included in the configuration templates provided in these docs: -- Kubernetes: [tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml) -- Docker Compose: [tower.env](../_templates/docker/tower.env) -::: - -**Parameter descriptions:** -- **Heap memory** (`-Xms`/`-Xmx`): Memory pool for Java objects. Set initial (`Xms`) and maximum (`Xmx`) heap size. -- **Direct memory** (`MaxDirectMemorySize`): Off-heap memory used for NIO operations, network buffers, and file I/O. Critical for handling concurrent workflow API operations. -- **Netty memory accounting** (`io.netty.maxDirectMemory=0`): Disables Netty's internal tracking; relies on JVM direct memory limits instead. -- **Buffer caching** (`jdk.nio.maxCachedBufferSize`): Limits size of cached NIO buffers to prevent excessive memory retention. - -**When to adjust these values:** - -Increase `MaxDirectMemorySize` if you observe: -- `OutOfMemoryError: Direct buffer memory` in logs -- High concurrent workflow launch rates (>100 simultaneous workflows) -- Large configuration payloads or extensive API usage - -Increase heap memory (`-Xmx`) if you observe: -- `OutOfMemoryError: Java heap space` in logs -- Garbage collection pauses affecting performance -- Growing memory usage under sustained load - -**Example: High-concurrency deployment** -For deployments running 200+ concurrent workflows: -```bash -JAVA_OPTS: -Xms1000M -Xmx3000M -XX:MaxDirectMemorySize=1600m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -Ensure container/pod memory limits are set higher than JVM limits to accommodate non-heap memory usage. - -:::warning -These are starting recommendations. 
Monitor your deployment's actual memory usage and adjust based on your specific workload patterns. Undersized memory allocation can cause OOM failures and service instability. -::: - ## Compute environments Configuration values to enable computing platforms and customize Batch Forge resource naming. diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/upgrade.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/upgrade.md index 5eff196f7..d3778ee30 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/upgrade.md +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/upgrade.md @@ -70,18 +70,6 @@ The database volume is persistent on the local machine by default if you use the 1. Download the latest versions of your deployment templates and update your Seqera container versions: - [docker-compose.yml](./_templates/docker/docker-compose.yml) for Docker Compose deployments - [tower-cron.yml](./_templates/k8s/tower-cron.yml) and [tower-svc.yml](./_templates/k8s/tower-svc.yml) for Kubernetes deployments -1. **JVM memory configuration defaults (recommended)**: The following `JAVA_OPTS` environment variable is included in the deployment templates downloaded in the preceding step, to optimize JVM memory settings: - - ```bash - JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 - ``` - - These baseline values are suitable for most deployments running moderate concurrent workflow loads. - - :::tip - These are starting recommendations that may require tuning based on your deployment's workload. See [Backend memory requirements](./configuration/overview.mdx#backend-memory-requirements) for detailed guidance on when and how to adjust these values for your environment. - ::: - 1. 
If you're using Studios, download and apply the latest versions of the Kubernetes manifests: - [proxy.yml](./_templates/k8s/data_studios/proxy.yml) - [server.yml](./_templates/k8s/data_studios/server.yml) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/docker/tower.env b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/docker/tower.env index 28491f14b..9874de2d6 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/docker/tower.env +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/docker/tower.env @@ -4,10 +4,6 @@ TOWER_JWT_SECRET= TOWER_LICENSE= - -# Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/enterprise/configuration/overview#backend-memory-requirements -JAVA_OPTS="-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" - # Compute environment settings TOWER_ENABLE_PLATFORMS=awsbatch-platform,azbatch-platform,gls-platform,googlebatch-platform,k8s-platform,uge-platform,slurm-platform diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-cron.yml b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-cron.yml index aa21d30e6..ed6f97da9 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-cron.yml +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-cron.yml @@ -43,9 +43,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,cron" - # Configuration to optimize JVM memory settings. 
See https://docs.seqera.io/platform-enterprise/25.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" ports: - containerPort: 8080 resources: diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-svc.yml b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-svc.yml index 256eb9693..9be690674 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-svc.yml +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/_templates/k8s/tower-svc.yml @@ -36,9 +36,6 @@ spec: env: - name: MICRONAUT_ENVIRONMENTS value: "prod,redis,ha" - # Configuration to optimize JVM memory settings. See https://docs.seqera.io/platform-enterprise/25.2/enterprise/configuration/overview#backend-memory-requirements - - name: JAVA_OPTS - value: "-Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144" # TLS certificate for Studios #- name: TOWER_OIDC_PEM_PATH # value: '/data/certs/oidc.pem' diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md new file mode 100644 index 000000000..5a9f85438 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md @@ -0,0 +1,65 @@ +--- +title: "JVM memory tuning" +description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +tags: [configuration, jvm, memory, tuning] +--- + +# JVM memory tuning + +:::warning +JVM memory tuning is an advanced topic that may cause instability and performance issues. +::: + +Seqera Platform scales memory allocation based on resources allocated to the application. 
To best inform available memory, set memory requests and limits on your deployments. We recommend increasing memory allocation before manually configuring JVM settings. + +If you wish to manually configure JVM memory, use the following baseline recommendations. + +## Memory parameters + +Set JVM memory parameters using the `JAVA_OPTS` environment variable. The following parameters control memory allocation: + +| Parameter | Description | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `-Xms` / `-Xmx` | Set the initial (`Xms`) and maximum (`Xmx`) heap size. The heap stores Java objects and should be 50-70% of total allocated memory. | +| `-XX:MaxDirectMemorySize` | Set the maximum direct (off-heap) memory. Used for NIO operations, network buffers, and file I/O. | +| `-XX:ActiveProcessorCount` | Set the number of CPUs available to the JVM. Should match the number of vCPUs allocated to the container. | + +## Resource allocation guidelines + +- **Heap (`-Xmx`)**: 50-70% of total allocated memory +- **Direct memory**: 10-20% of total allocated memory +- **Overhead** (metaspace, thread stacks, native memory): ~10% of total allocated memory + +Ensure total JVM memory (heap + direct memory + overhead) does not exceed container memory limits. + +## Example configurations + +The following table provides example configurations for common deployment sizes. These are starting points and may need to be tuned based on your specific usage patterns. 
+ +| vCPU | RAM | Heap (`-Xmx`) | Direct Memory | `JAVA_OPTS` | +| :--: | :---: | :-----------: | :-----------: | ------------------------------------------------------------------------------- | +| 1 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=1 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 1 | 4 GB | 2.5 GB | 800 MB | `-XX:ActiveProcessorCount=1 -Xms1000M -Xmx2500M -XX:MaxDirectMemorySize=800m` | +| 2 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=2 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 2 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=2 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 2 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=2 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 2 GB | 1 GB | 512 MB | `-XX:ActiveProcessorCount=3 -Xms500M -Xmx1000M -XX:MaxDirectMemorySize=512m` | +| 3 | 4 GB | 2 GB | 800 MB | `-XX:ActiveProcessorCount=3 -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m` | +| 3 | 8 GB | 5 GB | 1.5 GB | `-XX:ActiveProcessorCount=3 -Xms2000M -Xmx5000M -XX:MaxDirectMemorySize=1500m` | +| 3 | 16 GB | 11 GB | 2.5 GB | `-XX:ActiveProcessorCount=3 -Xms4000M -Xmx11000M -XX:MaxDirectMemorySize=2500m` | + +## When to adjust memory settings + +Adjust your JVM memory settings if you observe the following issues in your deployment: + +**Increase heap memory (`-Xmx`)** if you see: + +- `OutOfMemoryError: Java heap space` errors in logs +- Garbage collection pauses affecting performance +- Steadily growing memory usage under sustained load + +**Increase direct memory (`MaxDirectMemorySize`)** if you see: + +- `OutOfMemoryError: Direct buffer memory` errors in logs +- High concurrent workflow launch rates (more than 100 simultaneous workflows) +- Large configuration payloads or extensive API usage diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/configuration/overview.mdx b/platform-enterprise_versioned_docs/version-25.3/enterprise/configuration/overview.mdx index 
81fc2614d..35235a808 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/configuration/overview.mdx +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/configuration/overview.mdx @@ -455,50 +455,6 @@ services: These default memory allocation limits are included in your Kubernetes manifests ([tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml)) and Docker Compose ([docker-compose.yml](../_templates/docker/docker-compose.yml)) configuration templates. ::: -### JVM memory tuning - -For production deployments, configure JVM memory parameters via the `JAVA_OPTS` environment variable. The following baseline configuration is suitable for most deployments: - -```bash -JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -:::note -These default JVM memory settings are included in the configuration templates provided in these docs: -- Kubernetes: [tower-svc.yml](../_templates/k8s/tower-svc.yml) and [tower-cron.yml](../_templates/k8s/tower-cron.yml) -- Docker Compose: [tower.env](../_templates/docker/tower.env) -::: - -**Parameter descriptions:** -- **Heap memory** (`-Xms`/`-Xmx`): Memory pool for Java objects. Set initial (`Xms`) and maximum (`Xmx`) heap size. -- **Direct memory** (`MaxDirectMemorySize`): Off-heap memory used for NIO operations, network buffers, and file I/O. Critical for handling concurrent workflow API operations. -- **Netty memory accounting** (`io.netty.maxDirectMemory=0`): Disables Netty's internal tracking; relies on JVM direct memory limits instead. -- **Buffer caching** (`jdk.nio.maxCachedBufferSize`): Limits size of cached NIO buffers to prevent excessive memory retention. 
- -**When to adjust these values:** - -Increase `MaxDirectMemorySize` if you observe: -- `OutOfMemoryError: Direct buffer memory` in logs -- High concurrent workflow launch rates (>100 simultaneous workflows) -- Large configuration payloads or extensive API usage - -Increase heap memory (`-Xmx`) if you observe: -- `OutOfMemoryError: Java heap space` in logs -- Garbage collection pauses affecting performance -- Growing memory usage under sustained load - -**Example: High-concurrency deployment** -For deployments running 200+ concurrent workflows: -```bash -JAVA_OPTS: -Xms1000M -Xmx3000M -XX:MaxDirectMemorySize=1600m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 -``` - -Ensure container/pod memory limits are set higher than JVM limits to accommodate non-heap memory usage. - -:::warning -These are starting recommendations. Monitor your deployment's actual memory usage and adjust based on your specific workload patterns. Undersized memory allocation can cause OOM failures and service instability. -::: - ## Compute environments Configuration values to enable computing platforms and customize Batch Forge resource naming. diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/upgrade.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/upgrade.md index 28e436bb0..5a06ec925 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/upgrade.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/upgrade.md @@ -74,17 +74,6 @@ Starting from version 26.1, the frontend image running as root user will be depr 1. Download the latest versions of your deployment templates and update your Seqera container versions: - [docker-compose.yml](./_templates/docker/docker-compose.yml) for Docker Compose deployments - [tower-cron.yml](./_templates/k8s/tower-cron.yml) and [tower-svc.yml](./_templates/k8s/tower-svc.yml) for Kubernetes deployments -1. 
**JVM memory configuration defaults (recommended)**: The following `JAVA_OPTS` environment variable is included in the deployment templates downloaded in the preceding step, to optimize JVM memory settings: - - ```bash - JAVA_OPTS: -Xms1000M -Xmx2000M -XX:MaxDirectMemorySize=800m -Dio.netty.maxDirectMemory=0 -Djdk.nio.maxCachedBufferSize=262144 - ``` - - These baseline values are suitable for most deployments running moderate concurrent workflow loads. - - :::tip - These are starting recommendations that may require tuning based on your deployment's workload. See [Backend memory requirements](./configuration/overview.mdx#backend-memory-requirements) for detailed guidance on when and how to adjust these values for your environment. - ::: 1. If you're using Studios, download and apply the latest versions of the Kubernetes manifests: - [proxy.yml](./_templates/k8s/data_studios/proxy.yml) - [server.yml](./_templates/k8s/data_studios/server.yml) diff --git a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json index 89a418100..c6be05750 100644 --- a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json @@ -57,7 +57,8 @@ "enterprise/advanced-topics/db-docker-to-RDS", "enterprise/advanced-topics/use-iam-role", "enterprise/advanced-topics/custom-launch-container", - "enterprise/advanced-topics/firewall-configuration" + "enterprise/advanced-topics/firewall-configuration", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json index 7fde14c9e..09868c874 100644 --- a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json @@ -58,7 +58,8 @@ 
"enterprise/advanced-topics/use-iam-role", "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", - "enterprise/advanced-topics/seqera-container-images" + "enterprise/advanced-topics/seqera-container-images", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json index a8f00955d..cae86add7 100644 --- a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json @@ -58,7 +58,8 @@ "enterprise/advanced-topics/use-iam-role", "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", - "enterprise/advanced-topics/seqera-container-images" + "enterprise/advanced-topics/seqera-container-images", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json index 1ded0c712..4603f3aec 100644 --- a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json @@ -60,7 +60,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/content-security-policy" + "enterprise/advanced-topics/content-security-policy", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json index 829989aa2..5f6512a95 100644 --- 
a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/content-security-policy" + "enterprise/advanced-topics/content-security-policy", + "enterprise/advanced-topics/jvm-memory-tuning" ] }, "enterprise/general_troubleshooting" From 668ec224311f6538cdc54f0a367ac36e8b77dd70 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:04:06 +0200 Subject: [PATCH 2/8] Update jvm-memory-tuning.md Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_docs/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From f0825a69c6990141dd02d891a8feb441b5cd4708 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:04:28 +0200 Subject: [PATCH 3/8] Update JVM memory tuning documentation metadata Added creation date and tags to the JVM memory tuning documentation. 
Signed-off-by: Justine Geffen --- .../version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From 455954f369f64b25bc1368ccb8407bf291ac58ef Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:04:47 +0200 Subject: [PATCH 4/8] Update JVM memory tuning documentation metadata Add creation date and tags for JVM memory tuning documentation Signed-off-by: Justine Geffen --- .../version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From 707930c5352ff97e9b5075325add2b0fb5366550 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:05:06 +0200 Subject: [PATCH 5/8] 
Update JVM memory tuning doc with metadata Add creation date and tags to JVM memory tuning documentation Signed-off-by: Justine Geffen --- .../version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From 5a8f1f07c43e6777f4028ec4c5d537a0fe77d87c Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:05:37 +0200 Subject: [PATCH 6/8] Update JVM memory tuning doc with date and tags Added creation date and tags to JVM memory tuning documentation. 
Signed-off-by: Justine Geffen --- .../version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From 12d9d53dfa8a3a38d737b25d15831e7129fd2ebb Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 18:06:01 +0200 Subject: [PATCH 7/8] Update JVM memory tuning documentation metadata Added creation date and tags to JVM memory tuning documentation. 
Signed-off-by: Justine Geffen --- .../version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md | 1 + 1 file changed, 1 insertion(+) diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md index 5a9f85438..165880cf2 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/jvm-memory-tuning.md @@ -1,6 +1,7 @@ --- title: "JVM memory tuning" description: Configure JVM memory parameters for Seqera Platform Enterprise deployments +date created: "2025-12-17" tags: [configuration, jvm, memory, tuning] --- From 8e7cca0500e51503b252aca980b9f75b25f96c49 Mon Sep 17 00:00:00 2001 From: Gavin Date: Sun, 25 Jan 2026 11:48:05 +0000 Subject: [PATCH 8/8] feat: add platform monitoring guide (#954) * feat: add platform monitoring guide * feat: Add obs signals * Update monitoring.md Signed-off-by: Justine Geffen * Update monitoring.md Signed-off-by: Justine Geffen * Clean up blank lines in monitoring.md Removed unnecessary blank lines in the monitoring documentation. Signed-off-by: Justine Geffen * Update monitoring.md Signed-off-by: Justine Geffen * Refine Seqera Platform monitoring documentation Updated monitoring documentation for Seqera Platform, including corrections to metric names and descriptions. Signed-off-by: Justine Geffen * Revise monitoring.md for terminology updates Updated monitoring documentation for Seqera Platform to reflect changes in terminology and improve clarity. Signed-off-by: Justine Geffen * Refine Seqera Platform Monitoring documentation Updated monitoring documentation to improve clarity and consistency in terminology. 
Signed-off-by: Justine Geffen * Apply suggestion from @justinegeffen Signed-off-by: Justine Geffen * Apply suggestion from @justinegeffen Signed-off-by: Justine Geffen * Apply suggestion from @justinegeffen Signed-off-by: Justine Geffen * Apply suggestion from @justinegeffen Signed-off-by: Justine Geffen * Apply suggestion from @justinegeffen Signed-off-by: Justine Geffen --------- Signed-off-by: Justine Geffen Co-authored-by: Justine Geffen --- .../enterprise-sidebar.json | 3 +- .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 616 ++++++++++++++++++ .../version-24.1-sidebars.json | 3 +- .../version-24.2-sidebars.json | 3 +- .../version-25.1-sidebars.json | 3 +- .../version-25.2-sidebars.json | 3 +- .../version-25.3-sidebars.json | 3 +- 12 files changed, 3708 insertions(+), 6 deletions(-) create mode 100644 platform-enterprise_docs/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md diff --git a/platform-enterprise_docs/enterprise-sidebar.json b/platform-enterprise_docs/enterprise-sidebar.json index 35f4c920e..daad555fe 100644 --- a/platform-enterprise_docs/enterprise-sidebar.json +++ 
b/platform-enterprise_docs/enterprise-sidebar.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c721b3ce3 --- /dev/null +++ b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Studios metrics + +| Metric | Description | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | + +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. 
+ +**Average startup time by tool** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. + +**Recent error counts** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. 
+ +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c692367ee --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +You can enable Seqera Platform's built-in observability metrics by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
+
+## Key metrics to monitor
+
+### Seqera Platform-specific metrics
+
+#### Studios metrics
+
+| Metric | Description |
+| ------------------------------------------------ | -------------------------------------|
+| `data_studio_startup_time_success_seconds_sum`/`_count` | Time and count of successful Studio startups |
+| `data_studio_startup_time_failure_seconds_sum`/`_count` | Time and count of failed Studio startups |
+
+Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity.
+
+**Average startup time by tool**
+
+```shell
+sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]))
+/
+sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]))
+```
+
+**Failed startup rate**
+
+```shell
+rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval])
+```
+
+#### Error tracking
+
+| Metric | Description |
+| ------------------------------ | ------------------------- |
+| `tower_logs_errors_10secCount` | Errors in last 10 seconds |
+| `tower_logs_errors_1minCount` | Errors in last minute |
+| `tower_logs_errors_5minCount` | Errors in last 5 minutes |
+
+Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems.
+
+**Recent error counts**
+
+```shell
+tower_logs_errors_10secCount{namespace="$namespace"}
+tower_logs_errors_1minCount{namespace="$namespace"}
+tower_logs_errors_5minCount{namespace="$namespace"}
+```
+
+**Log events by severity level**
+
+```shell
+rate(logback_events_total{namespace="$namespace"}[$__rate_interval])
+```
+
+### Infrastructure resources
+
+#### CPU usage
+
+Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation.
+ +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c692367ee --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +You can enable Seqera Platform's built-in observability metrics by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
+
+## Key metrics to monitor
+
+### Seqera Platform-specific metrics
+
+#### Studios metrics
+
+| Metric | Description |
+| ------------------------------------------------ | -------------------------------------|
+| `data_studio_startup_time_success_seconds_sum`/`_count` | Time and count of successful Studio startups |
+| `data_studio_startup_time_failure_seconds_sum`/`_count` | Time and count of failed Studio startups |
+
+Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity.
+
+**Average startup time by tool**
+
+```shell
+sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]))
+/
+sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]))
+```
+
+**Failed startup rate**
+
+```shell
+rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval])
+```
+
+#### Error tracking
+
+| Metric | Description |
+| ------------------------------ | ------------------------- |
+| `tower_logs_errors_10secCount` | Errors in last 10 seconds |
+| `tower_logs_errors_1minCount` | Errors in last minute |
+| `tower_logs_errors_5minCount` | Errors in last 5 minutes |
+
+Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems.
+
+**Recent error counts**
+
+```shell
+tower_logs_errors_10secCount{namespace="$namespace"}
+tower_logs_errors_1minCount{namespace="$namespace"}
+tower_logs_errors_5minCount{namespace="$namespace"}
+```
+
+**Log events by severity level**
+
+```shell
+rate(logback_events_total{namespace="$namespace"}[$__rate_interval])
+```
+
+### Infrastructure resources
+
+#### CPU usage
+
+Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation.
+ +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c692367ee --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +You can enable Seqera Platform's built-in observability metrics by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
+
+## Key metrics to monitor
+
+### Seqera Platform-specific metrics
+
+#### Studios metrics
+
+| Metric                                           | Description                         |
+| ------------------------------------------------ | ----------------------------------- |
+| `data_studio_startup_time_success_seconds_sum`   | Time for successful Studio startups |
+| `data_studio_startup_time_success_seconds_count` | Successful Studio startup count     |
+| `data_studio_startup_time_failure_seconds_sum`   | Time for failed Studio startups     |
+| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count         |
+
+Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity.
+
+**Average startup time by tool**
+
+```shell
+sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]))
+/
+sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]))
+```
+
+**Failed startup rate**
+
+```shell
+rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval])
+```
+
+#### Error tracking
+
+| Metric                         | Description               |
+| ------------------------------ | ------------------------- |
+| `tower_logs_errors_10secCount` | Errors in last 10 seconds |
+| `tower_logs_errors_1minCount`  | Errors in last minute     |
+| `tower_logs_errors_5minCount`  | Errors in last 5 minutes  |
+
+Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems.
+
+**Recent error counts**
+
+```shell
+tower_logs_errors_10secCount{namespace="$namespace"}
+tower_logs_errors_1minCount{namespace="$namespace"}
+tower_logs_errors_5minCount{namespace="$namespace"}
+```
+
+**Log events by severity level**
+
+```shell
+rate(logback_events_total{namespace="$namespace"}[$__rate_interval])
+```
+
+### Infrastructure resources
+
+#### CPU usage
+
+Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. 
+ +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c692367ee --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +You can enable Seqera Platform's built-in observability metrics by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
+
+## Key metrics to monitor
+
+### Seqera Platform-specific metrics
+
+#### Studios metrics
+
+| Metric                                           | Description                          |
+| ------------------------------------------------ | ------------------------------------ |
+| `data_studio_startup_time_success_seconds_sum`   | Time for successful Studio startups  |
+| `data_studio_startup_time_success_seconds_count` | Successful Studio startup count      |
+| `data_studio_startup_time_failure_seconds_sum`   | Time for failed Studio startups      |
+| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count          |
+
+Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity.
+
+**Average startup time by tool**
+
+```shell
+sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]))
+/
+sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]))
+```
+
+**Failed startup rate**
+
+```shell
+rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval])
+```
+
+#### Error tracking
+
+| Metric                         | Description               |
+| ------------------------------ | ------------------------- |
+| `tower_logs_errors_10secCount` | Errors in last 10 seconds |
+| `tower_logs_errors_1minCount`  | Errors in last minute     |
+| `tower_logs_errors_5minCount`  | Errors in last 5 minutes  |
+
+Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems.
+
+**Recent error counts**
+
+```shell
+tower_logs_errors_10secCount{namespace="$namespace"}
+tower_logs_errors_1minCount{namespace="$namespace"}
+tower_logs_errors_5minCount{namespace="$namespace"}
+```
+
+**Log events by severity level**
+
+```shell
+rate(logback_events_total{namespace="$namespace"}[$__rate_interval])
+```
+
+### Infrastructure resources
+
+#### CPU usage
+
+Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation.
+ +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c692367ee --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,616 @@ +--- +title: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date created: "2025-12-17" +tags: [platform, monitoring] +--- + +You can enable Seqera Platform's built-in observability metrics by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
+
+## Key metrics to monitor
+
+### Seqera Platform-specific metrics
+
+#### Studios metrics
+
+| Metric                                           | Description                           |
+| ------------------------------------------------ | ------------------------------------- |
+| `data_studio_startup_time_success_seconds_sum`   | Time for successful Studio startups   |
+| `data_studio_startup_time_success_seconds_count` | Successful Studio startup count       |
+| `data_studio_startup_time_failure_seconds_sum`   | Time for failed Studio startups       |
+| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count           |
+
+Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity.
+
+**Average startup time by tool**
+
+```shell
+sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]))
+/
+sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]))
+```
+
+**Failed startup rate**
+
+```shell
+rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval])
+```
+
+#### Error tracking
+
+| Metric                         | Description               |
+| ------------------------------ | ------------------------- |
+| `tower_logs_errors_10secCount` | Errors in last 10 seconds |
+| `tower_logs_errors_1minCount`  | Errors in last minute     |
+| `tower_logs_errors_5minCount`  | Errors in last 5 minutes  |
+
+Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems.
+
+**Recent error counts**
+
+```shell
+tower_logs_errors_10secCount{namespace="$namespace"}
+tower_logs_errors_1minCount{namespace="$namespace"}
+tower_logs_errors_5minCount{namespace="$namespace"}
+```
+
+**Log events by severity level**
+
+```shell
+rate(logback_events_total{namespace="$namespace"}[$__rate_interval])
+```
+
+### Infrastructure resources
+
+#### CPU usage
+
+Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation.
+ +**Backend CPU usage** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
+ +**Outbound request rate** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. 
+ +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. 
+ +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. 
High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. 
+ +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. 
High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. 
+ +**Pods in unhealthy states** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, 
cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json index c6be05750..60796c618 100644 --- a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json @@ -58,7 +58,8 @@ "enterprise/advanced-topics/use-iam-role", "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json index 09868c874..0cc8d2ae1 100644 --- a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json @@ -59,7 +59,8 @@ 
"enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json index cae86add7..3ded33fb8 100644 --- a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json index 600226882..d9b0a105d 100644 --- a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json @@ -61,7 +61,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json index 90ee6b5d1..29ec0858d 100644 --- a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json +++ 
b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json @@ -60,7 +60,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting"