From 8167ed5782d4cae13052cc098463b3582454ccc1 Mon Sep 17 00:00:00 2001
From: nscuro <nscuro@protonmail.com>
Date: Sat, 9 May 2026 13:59:24 +0200
Subject: [PATCH] Add autoscaling docs

Signed-off-by: nscuro <nscuro@protonmail.com>
---
 .github/workflows/docs.yml            | 18 +++----
 docs/guides/administration/scaling.md | 67 ++++++++++++++++++++++++++-
 docs/includes/abbreviations.md        |  1 +
 3 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 5db87ac..b058c81 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -34,10 +34,11 @@ jobs:
       with:
         persist-credentials: false
     - name: Install imaging dependencies
-      run: >-
-        sudo apt-get install -y --no-install-recommends
-        libcairo2-dev libfreetype6-dev libffi-dev
-        libjpeg-dev libpng-dev libz-dev
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+          libcairo2-dev libfreetype6-dev libffi-dev \
+          libjpeg-dev libpng-dev libz-dev
     - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
       with:
         enable-cache: false
@@ -70,10 +71,11 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     - name: Install imaging dependencies
-      run: >-
-        sudo apt-get install -y --no-install-recommends
-        libcairo2-dev libfreetype6-dev libffi-dev
-        libjpeg-dev libpng-dev libz-dev
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+          libcairo2-dev libfreetype6-dev libffi-dev \
+          libjpeg-dev libpng-dev libz-dev
     - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
       with:
         enable-cache: false
diff --git a/docs/guides/administration/scaling.md b/docs/guides/administration/scaling.md
index 09c3b6c..9e27664 100644
--- a/docs/guides/administration/scaling.md
+++ b/docs/guides/administration/scaling.md
@@ -73,7 +73,7 @@ For the full list of workers and their defaults, see the
 under sustained load), lower the queue's capacity to apply backpressure. Once a queue hits capacity, the
 scheduler pauses task creation, propagating throttling back to BOM upload clients.
 
-Change capacity at runtime from the administrator panel under *Workflows → Task Queues*, or via the REST
+Change capacity at runtime from the administrator panel under *Workflows > Task Queues*, or via the REST
 API. Defaults live in the [task queues reference](../../reference/configuration/dex-engine.md#task-queues).
 
 **Lower-level engine tuning.** When metrics show write-buffer flush latency, run-history cache misses, or
@@ -83,6 +83,71 @@ and
 [`notification.outbox-relay.*`](../../reference/configuration/properties.md#dtnotificationoutbox-relaypoll-interval-ms)
 properties.
 
+## Scale workers horizontally
+
+After tuning the vertical knobs, if an activity backlog keeps growing, scale worker instances
+horizontally based on demand signals from the durable execution engine. Do not scale on CPU or memory
+alone: activity workers are I/O-bound (database, registry calls, vulnerability sources) and spend
+most of their time waiting, so CPU stays low while tasks queue up.
+
+The engine exposes three Prometheus metrics for this. The management server serves them once you
+[turn on Prometheus metrics scraping](configuring-observability.md#enabling-prometheus-metrics-scraping).
+
+| Metric | What it tells you | Use as |
+|---|---|---|
+| `dt_dex_engine_activity_task_queue_backlog{queueName}` | Approximate count of ready-to-schedule activity tasks per queue, capped at 10000. | Primary scale-up trigger. |
+| `dt_dex_engine_activity_task_queue_backlog_age_seconds{queueName}` | Age of the oldest ready-to-schedule activity task per queue. | SLO-aligned secondary trigger. |
+| `dt_dex_engine_task_worker_concurrency_utilization{workerType,name}` | Fraction (0–1) of a worker's concurrency slots currently in use. | Scale-down guard. |
+
+Scale up when the backlog exceeds a target per instance, or when the oldest task has waited longer
+than the SLO. Scale down only when worker slots stay below a low-use threshold across all
+instances.
+
+!!! note "Combine across instances and queues"
+    Every instance publishes the backlog and age gauges. Most deployments run all activity workers
+    together, so the right HPA signal is "any queue needs scale-up." Collapse to a single value
+    with `max(...)` (no `by` clause), for example
+    `max(dt_dex_engine_activity_task_queue_backlog)`. Add `by (queueName)` only if you split
+    worker types across separate Deployments and want per-queue scaling.
+
+!!! note "Backlog count is approximate"
+    The engine caps the count at 10000 per queue to bound query cost. Beyond the cap, the value
+    saturates, so read 10000 as "at least 10000." This is precise enough to drive scaling decisions.
+
+<!-- vale Google.Headings = NO -->
+### KEDA example
+<!-- vale Google.Headings = YES -->
+
+[KEDA](https://keda.sh) can drive a Deployment from these metrics. The `ScaledObject` below
+targets worker nodes (no `web` profile), scaling on the worst-case backlog across all queues,
+with the worst-case oldest-task age as a secondary trigger. Each query wraps the metric in
+`avg_over_time(...[5m:30s])` so a transient spike (a single large BOM upload) does not trigger
+churn.
+
+??? example "`ScaledObject` manifest"
+    ```yaml linenums="1"
+    apiVersion: keda.sh/v1alpha1
+    kind: ScaledObject
+    metadata:
+      name: dependencytrack-worker
+    spec:
+      scaleTargetRef:
+        name: dependencytrack-worker
+      minReplicaCount: 2
+      maxReplicaCount: 5
+      triggers:
+        - type: prometheus
+          metadata:
+            serverAddress: http://prometheus.monitoring:9090
+            query: avg_over_time(max(dt_dex_engine_activity_task_queue_backlog)[5m:30s])
+            threshold: "1000"
+        - type: prometheus
+          metadata:
+            serverAddress: http://prometheus.monitoring:9090
+            query: avg_over_time(max(dt_dex_engine_activity_task_queue_backlog_age_seconds)[5m:30s])
+            threshold: "300"
+    ```
+
 ## Pool connections centrally
 
 - **Up to roughly 5 instances** at the default pool size of 30: the per-instance pool works.
diff --git a/docs/includes/abbreviations.md b/docs/includes/abbreviations.md
index 26d9a06..e499267 100644
--- a/docs/includes/abbreviations.md
+++ b/docs/includes/abbreviations.md
@@ -14,5 +14,6 @@
 *[OSV]: Open Source Vulnerabilities
 *[PURL]: Package URL, a standardized format for identifying software packages
 *[SBOM]: Software Bill of Materials
+*[SLO]: Service Level Objective
 *[VDR]: Vulnerability Disclosure Report
 *[VEX]: Vulnerability Exploitability eXchange