From ee7f281705fc190a75ef550d40b7fc434f9e190f Mon Sep 17 00:00:00 2001 From: Joao Foltran Date: Tue, 27 Jan 2026 17:03:24 -0300 Subject: [PATCH 1/3] Updated both metrics for long running transactions. One returns 4 metrics (one for each threshold, 1min, 5min, 10min, 30min) and another one returns the duration of the longest running transaction. --- collector/pg_long_running_transactions.go | 92 ++++++++++++++++------- 1 file changed, 65 insertions(+), 27 deletions(-) diff --git a/collector/pg_long_running_transactions.go b/collector/pg_long_running_transactions.go index 0a7f8c969..85f8d6e83 100644 --- a/collector/pg_long_running_transactions.go +++ b/collector/pg_long_running_transactions.go @@ -15,6 +15,8 @@ package collector import ( "context" + "database/sql" + "fmt" "log/slog" "github.com/prometheus/client_golang/prometheus" @@ -34,11 +36,13 @@ func NewPGLongRunningTransactionsCollector(config collectorConfig) (Collector, e return &PGLongRunningTransactionsCollector{log: config.logger}, nil } +var longRunningTransactionThresholds = []int{60, 300, 600, 1800} // 1min, 5min, 10min, 30min + var ( longRunningTransactionsCount = prometheus.NewDesc( - "pg_long_running_transactions", - "Current number of long running transactions", - []string{}, + prometheus.BuildFQName(namespace, longRunningTransactionsSubsystem, "count"), + "Number of transactions running longer than threshold", + []string{"threshold"}, prometheus.Labels{}, ) @@ -50,46 +54,80 @@ var ( ) longRunningTransactionsQuery = ` - SELECT - COUNT(*) as transactions, - MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds -FROM pg_catalog.pg_stat_activity -WHERE state IS DISTINCT FROM 'idle' -AND query NOT LIKE 'autovacuum:%' -AND pg_stat_activity.xact_start IS NOT NULL; - ` + SELECT + COUNT(*) as transactions, + MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds + FROM pg_catalog.pg_stat_activity + WHERE state IS DISTINCT FROM 'idle' + AND query NOT LIKE 'autovacuum:%' + AND pg_stat_activity.xact_start IS NOT NULL + AND EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= $1; + ` + + longRunningTransactionsMaxAgeQuery = ` + SELECT + MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds + FROM pg_catalog.pg_stat_activity + WHERE state IS DISTINCT FROM 'idle' + AND query NOT LIKE 'autovacuum:%' + AND pg_stat_activity.xact_start IS NOT NULL; + ` ) func (PGLongRunningTransactionsCollector) Update(ctx context.Context, instance *Instance, ch chan<- prometheus.Metric) error { db := instance.getDB() - rows, err := db.QueryContext(ctx, - longRunningTransactionsQuery) - if err != nil { - return err - } - defer rows.Close() + // Query for each threshold + for _, threshold := range longRunningTransactionThresholds { + rows, err := db.QueryContext(ctx, longRunningTransactionsQuery, threshold) + if err != nil { + return err + } - for rows.Next() { - var transactions, ageInSeconds float64 + var count float64 + var maxAge sql.NullFloat64 - if err := rows.Scan(&transactions, &ageInSeconds); err != nil { - return err + if rows.Next() { + if err := rows.Scan(&count, &maxAge); err != nil { + rows.Close() + return err + } } + rows.Close() + // Emit count metric with threshold label ch <- prometheus.MustNewConstMetric( longRunningTransactionsCount, prometheus.GaugeValue, - transactions, + count, + fmt.Sprintf("%d", threshold), ) + } + + // Query for max age (no threshold filter) + rows, err := db.QueryContext(ctx, longRunningTransactionsMaxAgeQuery) + if err != nil { + return err + } + defer rows.Close() + + if rows.Next() { + var maxAge sql.NullFloat64 + if err := rows.Scan(&maxAge); err != nil { + return err + } + + ageValue := 0.0 + if maxAge.Valid { + ageValue = maxAge.Float64 + } + ch <- prometheus.MustNewConstMetric( longRunningTransactionsAgeInSeconds, prometheus.GaugeValue, - ageInSeconds, + ageValue, ) } - if err := rows.Err(); err != nil { - return err - } - return nil + + return rows.Err() } From 7b38a0baed46c7c30089a373b588536f1ca7d24a Mon Sep 17 00:00:00 2001 From: Joao Foltran Date: Thu, 29 Jan 2026 11:37:04 -0300 Subject: [PATCH 2/3] Optimize long running transactions collector to a single query as per Matt review --- collector/pg_long_running_transactions.go | 125 ++++++++++------------ 1 file changed, 57 insertions(+), 68 deletions(-) diff --git a/collector/pg_long_running_transactions.go b/collector/pg_long_running_transactions.go index 85f8d6e83..82c3e1993 100644 --- a/collector/pg_long_running_transactions.go +++ b/collector/pg_long_running_transactions.go @@ -16,7 +16,6 @@ package collector import ( "context" "database/sql" - "fmt" "log/slog" "github.com/prometheus/client_golang/prometheus" @@ -36,8 +35,6 @@ func NewPGLongRunningTransactionsCollector(config collectorConfig) (Collector, e return &PGLongRunningTransactionsCollector{log: config.logger}, nil } -var longRunningTransactionThresholds = []int{60, 300, 600, 1800} // 1min, 5min, 10min, 30min - var ( longRunningTransactionsCount = prometheus.NewDesc( prometheus.BuildFQName(namespace, longRunningTransactionsSubsystem, "count"), @@ -54,80 +51,72 @@ var ( ) longRunningTransactionsQuery = ` - SELECT - COUNT(*) as transactions, - MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds - FROM pg_catalog.pg_stat_activity - WHERE state IS DISTINCT FROM 'idle' - AND query NOT LIKE 'autovacuum:%' - AND pg_stat_activity.xact_start IS NOT NULL - AND EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= $1; - ` - - longRunningTransactionsMaxAgeQuery = ` - SELECT - MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds - FROM pg_catalog.pg_stat_activity - WHERE state IS DISTINCT FROM 'idle' - AND query NOT LIKE 'autovacuum:%' - AND pg_stat_activity.xact_start IS NOT NULL; - ` + SELECT + COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 60) AS count_60s, + COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 300) AS count_300s, + COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 600) AS count_600s, + COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 1800) AS count_1800s, + MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds + FROM pg_catalog.pg_stat_activity + WHERE state IS DISTINCT FROM 'idle' + AND query NOT LIKE 'autovacuum:%' + AND pg_stat_activity.xact_start IS NOT NULL; + ` ) func (PGLongRunningTransactionsCollector) Update(ctx context.Context, instance *Instance, ch chan<- prometheus.Metric) error { db := instance.getDB() - // Query for each threshold - for _, threshold := range longRunningTransactionThresholds { - rows, err := db.QueryContext(ctx, longRunningTransactionsQuery, threshold) - if err != nil { - return err - } - - var count float64 - var maxAge sql.NullFloat64 - - if rows.Next() { - if err := rows.Scan(&count, &maxAge); err != nil { - rows.Close() - return err - } - } - rows.Close() - - // Emit count metric with threshold label - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsCount, - prometheus.GaugeValue, - count, - fmt.Sprintf("%d", threshold), - ) - } + var count60s, count300s, count600s, count1800s float64 + var maxAge sql.NullFloat64 - // Query for max age (no threshold filter) - rows, err := db.QueryContext(ctx, longRunningTransactionsMaxAgeQuery) + err := db.QueryRowContext(ctx, longRunningTransactionsQuery).Scan( + &count60s, + &count300s, + &count600s, + &count1800s, + &maxAge, + ) if err != nil { return err } - defer rows.Close() - - if rows.Next() { - var maxAge sql.NullFloat64 - if err := rows.Scan(&maxAge); err != nil { - return err - } - - ageValue := 0.0 - if maxAge.Valid { - ageValue = maxAge.Float64 - } - - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsAgeInSeconds, - prometheus.GaugeValue, - ageValue, - ) + + // Emit count metrics with threshold labels + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsCount, + prometheus.GaugeValue, + count60s, + "60", + ) + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsCount, + prometheus.GaugeValue, + count300s, + "300", + ) + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsCount, + prometheus.GaugeValue, + count600s, + "600", + ) + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsCount, + prometheus.GaugeValue, + count1800s, + "1800", + ) + + // Emit max age metric + ageValue := 0.0 + if maxAge.Valid { + ageValue = maxAge.Float64 } + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsAgeInSeconds, + prometheus.GaugeValue, + ageValue, + ) - return rows.Err() + return nil } From 4af8fca6ca9d57561d726706f3efd7bbfee0e60c Mon Sep 17 00:00:00 2001 From: Joao Foltran Date: Wed, 25 Feb 2026 10:38:27 -0300 Subject: [PATCH 3/3] Rename _count metric to follow Prometheus naming conventions _count suffix is reserved for histograms/summaries. Use pg_long_running_transactions (no suffix) since this is a gauge. --- collector/pg_long_running_transactions.go | 66 +++++++++---------- .../pg_long_running_transactions_test.go | 14 ++-- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/collector/pg_long_running_transactions.go b/collector/pg_long_running_transactions.go index 82c3e1993..0fd650df3 100644 --- a/collector/pg_long_running_transactions.go +++ b/collector/pg_long_running_transactions.go @@ -37,7 +37,7 @@ func NewPGLongRunningTransactionsCollector(config collectorConfig) (Collector, e var ( longRunningTransactionsCount = prometheus.NewDesc( - prometheus.BuildFQName(namespace, longRunningTransactionsSubsystem, "count"), + prometheus.BuildFQName(namespace, "", longRunningTransactionsSubsystem), "Number of transactions running longer than threshold", []string{"threshold"}, prometheus.Labels{}, @@ -51,16 +51,20 @@ var ( ) longRunningTransactionsQuery = ` + WITH transaction_ages AS ( + SELECT EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) AS age_seconds + FROM pg_catalog.pg_stat_activity + WHERE state IS DISTINCT FROM 'idle' + AND query NOT LIKE 'autovacuum:%' + AND pg_stat_activity.xact_start IS NOT NULL + ) SELECT - COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 60) AS count_60s, - COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 300) AS count_300s, - COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 600) AS count_600s, - COUNT(*) FILTER (WHERE EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start) >= 1800) AS count_1800s, - MAX(EXTRACT(EPOCH FROM clock_timestamp() - pg_stat_activity.xact_start)) AS oldest_timestamp_seconds - FROM pg_catalog.pg_stat_activity - WHERE state IS DISTINCT FROM 'idle' - AND query NOT LIKE 'autovacuum:%' - AND pg_stat_activity.xact_start IS NOT NULL; + COUNT(*) FILTER (WHERE age_seconds >= 60) AS count_60s, + COUNT(*) FILTER (WHERE age_seconds >= 300) AS count_300s, + COUNT(*) FILTER (WHERE age_seconds >= 600) AS count_600s, + COUNT(*) FILTER (WHERE age_seconds >= 1800) AS count_1800s, + MAX(age_seconds) AS oldest_timestamp_seconds + FROM transaction_ages; ` ) @@ -82,30 +86,24 @@ func (PGLongRunningTransactionsCollector) Update(ctx context.Context, instance * } // Emit count metrics with threshold labels - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsCount, - prometheus.GaugeValue, - count60s, - "60", - ) - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsCount, - prometheus.GaugeValue, - count300s, - "300", - ) - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsCount, - prometheus.GaugeValue, - count600s, - "600", - ) - ch <- prometheus.MustNewConstMetric( - longRunningTransactionsCount, - prometheus.GaugeValue, - count1800s, - "1800", - ) + thresholds := []struct { + threshold string + count float64 + }{ + {"60", count60s}, + {"300", count300s}, + {"600", count600s}, + {"1800", count1800s}, + } + + for _, t := range thresholds { + ch <- prometheus.MustNewConstMetric( + longRunningTransactionsCount, + prometheus.GaugeValue, + t.count, + t.threshold, + ) + } // Emit max age metric ageValue := 0.0 diff --git a/collector/pg_long_running_transactions_test.go b/collector/pg_long_running_transactions_test.go index 181073965..b52b8fddd 100644 --- a/collector/pg_long_running_transactions_test.go +++ b/collector/pg_long_running_transactions_test.go @@ -30,11 +30,14 @@ func TestPGLongRunningTransactionsCollector(t *testing.T) { defer db.Close() inst := &Instance{db: db} columns := []string{ - "transactions", - "age_in_seconds", + "count_60s", + "count_300s", + "count_600s", + "count_1800s", + "oldest_timestamp_seconds", } rows := sqlmock.NewRows(columns). - AddRow(20, 1200) + AddRow(5, 3, 2, 1, 1200) mock.ExpectQuery(sanitizeQuery(longRunningTransactionsQuery)).WillReturnRows(rows) @@ -48,7 +51,10 @@ func TestPGLongRunningTransactionsCollector(t *testing.T) { } }() expected := []MetricResult{ - {labels: labelMap{}, value: 20, metricType: dto.MetricType_GAUGE}, + {labels: labelMap{"threshold": "60"}, value: 5, metricType: dto.MetricType_GAUGE}, + {labels: labelMap{"threshold": "300"}, value: 3, metricType: dto.MetricType_GAUGE}, + {labels: labelMap{"threshold": "600"}, value: 2, metricType: dto.MetricType_GAUGE}, + {labels: labelMap{"threshold": "1800"}, value: 1, metricType: dto.MetricType_GAUGE}, {labels: labelMap{}, value: 1200, metricType: dto.MetricType_GAUGE}, } convey.Convey("Metrics comparison", t, func() {