From 6f1896d496a4116006c4f3c892c37e4eb682412e Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Mon, 18 May 2026 12:58:58 +0200 Subject: [PATCH] feat: Observability --- AGENTS.md | 2 +- Dockerfile | 3 +- Makefile | 9 +- README.md | 1 + cmd/main.go | 104 +- docs/development.md | 18 + docs/observability.md | 93 ++ go.mod | 12 +- go.sum | 2 - grafana/custom-metrics/config.yaml | 81 +- .../custom-metrics-dashboard.json | 995 ++++++++++++++++++ internal/cloudscale/client.go | 9 +- internal/cloudscale/client_test.go | 2 +- internal/cloudscale/flavors.go | 2 +- internal/cloudscale/flavors_test.go | 2 +- internal/cloudscale/regions.go | 2 +- internal/cloudscale/regions_test.go | 2 +- internal/cloudscale/services.go | 2 +- internal/controller/cloudscale_services.go | 2 +- .../controller/cloudscale_services_test.go | 2 +- internal/controller/cloudscale_tags.go | 2 +- .../cloudscalecluster_controller.go | 25 +- .../cloudscalecluster_floatingip.go | 8 +- .../cloudscalecluster_floatingip_test.go | 2 +- .../cloudscalecluster_loadbalancer.go | 8 +- .../cloudscalecluster_loadbalancer_test.go | 2 +- .../controller/cloudscalecluster_network.go | 8 +- .../cloudscalecluster_network_test.go | 2 +- .../cloudscalecluster_reconcile_test.go | 2 +- .../cloudscalecluster_servergroup.go | 2 +- .../cloudscalecluster_servergroup_test.go | 2 +- .../cloudscalemachine_controller.go | 25 +- .../cloudscalemachine_reconcile_test.go | 2 +- .../controller/cloudscalemachine_server.go | 8 +- .../cloudscalemachine_server_test.go | 2 +- .../cloudscalemachine_servergroup.go | 8 +- .../cloudscalemachine_servergroup_test.go | 2 +- internal/observability/composite_logger.go | 81 ++ internal/observability/span.go | 46 + internal/observability/span_logger.go | 93 ++ internal/observability/tracing.go | 78 ++ internal/testutils/fixtures.go | 2 +- internal/testutils/mocks.go | 2 +- test/e2e/cloudscale_helpers.go | 2 +- 44 files changed, 1652 insertions(+), 107 deletions(-) create mode 100644 docs/observability.md 
create mode 100644 grafana/custom-metrics/custom-metrics-dashboard.json create mode 100644 internal/observability/composite_logger.go create mode 100644 internal/observability/span.go create mode 100644 internal/observability/span_logger.go create mode 100644 internal/observability/tracing.go diff --git a/AGENTS.md b/AGENTS.md index 22713a8..43672eb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -112,7 +112,7 @@ For the prose architecture sketch see [`docs/development.md`](docs/development.m ## Cloudscale SDK usage -- Do not `import "github.com/cloudscale-ch/cloudscale-go-sdk/v8"` outside +- Do not `import "github.com/cloudscale-ch/cloudscale-go-sdk/v9"` outside `internal/cloudscale/`. Controllers and webhooks talk to the SDK through the service interfaces on `cloudscale.Client` (`internal/cloudscale/client.go:32`). diff --git a/Dockerfile b/Dockerfile index 5b59f51..029dfdc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM golang:1.26 AS builder ARG TARGETOS ARG TARGETARCH +ARG VERSION=dev WORKDIR /workspace # Copy the Go Modules manifests @@ -19,7 +20,7 @@ COPY . . # was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO # the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore, # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 
-RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o manager cmd/main.go +RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -ldflags "-X main.version=${VERSION}" -a -o manager cmd/main.go # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details diff --git a/Makefile b/Makefile index 46826d5..32d71e9 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ TAG ?= dev IMG ?= quay.io/cloudscalech/capcs-staging:$(TAG) # YEAR defines the year value used for substituting the YEAR placeholder in the boilerplate header. YEAR ?= $(shell date +%Y) +LDFLAGS ?= -X main.version=$(TAG) # E2E image configuration E2E_TAG ?= e2e-$(shell git rev-parse --short HEAD) @@ -287,18 +288,18 @@ test-e2e-conformance-fast: $(GINKGO) generate-e2e-templates generate-e2e-config .PHONY: build build: manifests generate fmt vet ## Build manager binary. - go build -o bin/manager cmd/main.go + go build -ldflags '$(LDFLAGS)' -o bin/manager cmd/main.go .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. - go run ./cmd/main.go + go run -ldflags '$(LDFLAGS)' ./cmd/main.go # If you wish to build the manager image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: docker-build docker-build: ## Build docker image with the manager. - $(CONTAINER_TOOL) build --platform linux/amd64 -t ${IMG} . + $(CONTAINER_TOOL) build --platform linux/amd64 --build-arg VERSION=$(TAG) -t ${IMG} . .PHONY: docker-push docker-push: ## Push docker image with the manager. 
@@ -321,7 +322,7 @@ docker-buildx: ## Build and push docker image for the manager for cross-platform sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross - $(CONTAINER_TOOL) buildx create --name cluster-api-provider-cloudscale-builder $(CONTAINER_TOOL) buildx use cluster-api-provider-cloudscale-builder - - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross . + - $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --build-arg VERSION=$(TAG) --tag ${IMG} -f Dockerfile.cross . - $(CONTAINER_TOOL) buildx rm cluster-api-provider-cloudscale-builder rm Dockerfile.cross diff --git a/README.md b/README.md index 0274c69..dbab340 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ variables and the other template flavors. |-------------------------------------|----------------------------------------------------------------------------------------------------------------| | New to Cluster API, or new to CAPCS | [Getting Started](docs/getting-started.md) | | Looking up a CRD field | `kubectl explain cloudscalecluster.spec` (or the generated CRDs under [`config/crd/bases/`](config/crd/bases)) | +| Setting up monitoring or tracing | [Observability](docs/observability.md) | | Hitting an error | [Troubleshooting](docs/troubleshooting.md) | | Contributing to CAPCS | [Development](docs/development.md), [CONTRIBUTING.md](CONTRIBUTING.md) | | Cutting a release | [Releasing](docs/releasing.md), [Testing releases](docs/testing-releases.md) | diff --git a/cmd/main.go b/cmd/main.go index f03145a..55938f1 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -25,12 +25,13 @@ import ( "os" "time" - "golang.org/x/sync/errgroup" - // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. 
_ "k8s.io/client-go/plugin/pkg/client/auth" + "github.com/cloudscale-ch/cloudscale-go-sdk/v9/instrumentation" + "go.opentelemetry.io/otel" + "golang.org/x/sync/errgroup" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -38,6 +39,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/controller-runtime/pkg/webhook" @@ -45,6 +47,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/controller" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" webhookv1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/webhook/v1beta2" // +kubebuilder:scaffold:imports ) @@ -52,6 +55,7 @@ import ( var ( scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") + version = "dev" ) func init() { @@ -62,8 +66,14 @@ func init() { // +kubebuilder:scaffold:scheme } -// nolint:gocyclo func main() { + if err := run(); err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + os.Exit(1) + } +} + +func run() error { var metricsAddr string var metricsCertPath, metricsCertName, metricsCertKey string var webhookCertPath, webhookCertName, webhookCertKey string @@ -75,6 +85,9 @@ func main() { var machineConcurrency int var watchFilter string var tlsOpts []func(*tls.Config) + var enableTracing bool + var tracingSampleRate float64 + var profilerAddress string flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. 
"+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -99,6 +112,11 @@ func main() { flag.StringVar(&watchFilter, "watch-filter", "", fmt.Sprintf("Label value that the controller watches to reconcile cluster-api objects. Label key is always %s. "+ "If unspecified, the controller watches for all cluster-api objects.", clusterv1.WatchLabel)) + flag.BoolVar(&enableTracing, "enable-tracing", false, "Enable OpenTelemetry tracing") + flag.Float64Var(&tracingSampleRate, "tracing-sample-rate", 0.1, + "Trace sampling rate, between 0.0 and 1.0 (1.0 = always sample)") + flag.StringVar(&profilerAddress, "profiler-address", "", + "Bind address to expose the pprof profiler (e.g. localhost:6060)") opts := zap.Options{ Development: true, } @@ -108,14 +126,10 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) if clusterConcurrency < 1 || clusterConcurrency > 4 { - setupLog.Error( - fmt.Errorf("--cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency), "invalid flag") - os.Exit(1) + return fmt.Errorf("invalid flag: --cluster-concurrency must be between 1 and 4, got %d", clusterConcurrency) } if machineConcurrency < 1 || machineConcurrency > 10 { - setupLog.Error( - fmt.Errorf("--machine-concurrency must be between 1 and 10, got %d", machineConcurrency), "invalid flag") - os.Exit(1) + return fmt.Errorf("invalid flag: --machine-concurrency must be between 1 and 10, got %d", machineConcurrency) } // if the enable-http2 flag is false (the default), http/2 should be disabled @@ -192,24 +206,37 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "cloudscale.infrastructure.cluster.x-k8s.io", + PprofBindAddress: profilerAddress, // LeaderElectionReleaseOnCancel: true, }) if err != nil { - setupLog.Error(err, "Failed to start manager") - os.Exit(1) + 
return fmt.Errorf("failed to start manager: %w", err) } ctx := ctrl.SetupSignalHandler() - // Create a shared HTTP transport for all cloudscale API clients. - // This enables connection pooling and HTTP/2 multiplexing across reconciles. - transport := cloudscale.NewTransport() + if enableTracing { + shutdown, err := observability.InitTracing(ctx, setupLog, "capcs", version, tracingSampleRate) + if err != nil { + return fmt.Errorf("failed to initialize tracing: %w", err) + } + defer shutdown() + } + + // Wrap the transport with SDK instrumentation so all cloudscale API calls + // emit Prometheus metrics and OpenTelemetry spans. + // + // The wrapped transport is shared for all cloudscale API clients to enable connection pooling and HTTP/2 multiplexing + // across reconciles. + instrumentedTransport := instrumentation.InstrumentedTransport(cloudscale.NewTransport(), instrumentation.Options{ + PrometheusRegistry: ctrlmetrics.Registry, + Tracer: otel.Tracer("cloudscale-go-sdk"), + }) // Fetch region information for controllers and webhooks - regionInfo, flavorInfo, err := fetchAPIInfo(transport) + regionInfo, flavorInfo, err := fetchAPIInfo(instrumentedTransport, version) if err != nil { - setupLog.Error(err, "unable to fetch API information") - os.Exit(1) + return fmt.Errorf("failed to fetch API info: %w", err) } setupLog.Info("fetched region information", "regions", regionInfo.GetAllRegions()) setupLog.Info("fetched flavor information", "flavors", len(flavorInfo.GetAllFlavors())) @@ -218,72 +245,65 @@ func main() { Client: mgr.GetClient(), Scheme: mgr.GetScheme(), WatchFilter: watchFilter, - Transport: transport, + Transport: instrumentedTransport, + Version: version, MaxConcurrentReconciles: clusterConcurrency, }).SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleCluster") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleCluster: %w", err) } if err := 
(&controller.CloudscaleMachineReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), WatchFilter: watchFilter, - Transport: transport, + Transport: instrumentedTransport, + Version: version, MaxConcurrentReconciles: machineConcurrency, }).SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachine") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleMachine: %w", err) } if err := (&controller.CloudscaleMachineTemplateReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), FlavorInfo: flavorInfo, }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "Failed to create controller", "controller", "CloudscaleMachineTemplate") - os.Exit(1) + return fmt.Errorf("failed to create controller CloudscaleMachineTemplate: %w", err) } webhooksEnabled := os.Getenv("ENABLE_WEBHOOKS") != "false" if webhooksEnabled { if err := webhookv1beta2.SetupCloudscaleClusterWebhookWithManager(mgr, regionInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleCluster") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleCluster: %w", err) } if err := webhookv1beta2.SetupCloudscaleMachineWebhookWithManager(mgr, flavorInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleMachine") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleMachine: %w", err) } if err := webhookv1beta2.SetupCloudscaleMachineTemplateWebhookWithManager(mgr, flavorInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleMachineTemplate") - os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleMachineTemplate: %w", err) } if err := webhookv1beta2.SetupCloudscaleClusterTemplateWebhookWithManager(mgr, regionInfo); err != nil { - setupLog.Error(err, "Failed to create webhook", "webhook", "CloudscaleClusterTemplate") - 
os.Exit(1) + return fmt.Errorf("failed to setup webhook validation webhook CloudscaleClusterTemplate: %w", err) } } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { - setupLog.Error(err, "Failed to set up health check") - os.Exit(1) + return fmt.Errorf("failed to set up health check: %w", err) } if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { - setupLog.Error(err, "Failed to set up ready check") - os.Exit(1) + return fmt.Errorf("failed to set up ready check: %w", err) } - setupLog.Info("Starting manager") + setupLog.Info("Starting manager", "version", version) if err := mgr.Start(ctx); err != nil { - setupLog.Error(err, "Failed to run manager") - os.Exit(1) + return fmt.Errorf("failed to run manager: %w", err) } + return nil } // fetchAPIInfo fetches region and flavor information from cloudscale.ch API. // Requires CLOUDSCALE_API_TOKEN environment variable. -func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { +func fetchAPIInfo(transport http.RoundTripper, version string) (*cloudscale.RegionInfo, *cloudscale.FlavorInfo, error) { token := os.Getenv("CLOUDSCALE_API_TOKEN") if token == "" { return nil, nil, fmt.Errorf("CLOUDSCALE_API_TOKEN environment variable is required") @@ -292,7 +312,7 @@ func fetchAPIInfo(transport *http.Transport) (*cloudscale.RegionInfo, *cloudscal ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - client := cloudscale.NewClient(token, transport) + client := cloudscale.NewClient(token, version, transport) var regionInfo *cloudscale.RegionInfo var flavorInfo *cloudscale.FlavorInfo diff --git a/docs/development.md b/docs/development.md index cbb8c5f..08c793b 100644 --- a/docs/development.md +++ b/docs/development.md @@ -119,10 +119,28 @@ template_dirs: - ./test/infrastructure/docker/templates cloudscale: - path/to/local/clone/cluster-api-provider-cloudscale/templates +# optional, if 
wanting to deploy the observability stack +#deploy_observability: +# - grafana +# - kube-state-metrics +# - loki +# - metrics-server +# - prometheus +# - alloy +# - parca +# - tempo ``` Then `tilt up` from the cluster-api checkout. +The `deploy_observability` block is processed by the cluster-api Tiltfile and +brings up Prometheus, Grafana, Tempo, and friends in the management cluster; +see [Cluster API's Tilt documentation](https://cluster-api.sigs.k8s.io/developer/core/tilt) +for what each component does and how to reach the resulting UIs. CAPCS's +`ServiceMonitor` is auto-discovered once the prometheus kustomization is +enabled. For production metric/tracing setup, see +[Observability](observability.md). + ## Tests | Layer | Location | What it covers | diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..cdee9d9 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,93 @@ +# Observability + +CAPCS exposes Prometheus metrics, ships Grafana dashboards, and can emit +OpenTelemetry traces. Tracing and the pprof profiler are opt-in; metrics are +always served but require opt-in wiring to be scraped. + +For the developer Tilt loop, see [Development](development.md#tilt) — the +cluster-api core Tiltfile can deploy the full Prometheus / Grafana / Tempo +stack alongside CAPCS. + +## Metrics + +The manager exposes controller-runtime metrics on **HTTPS port 8443** at +`/metrics`, served via the `controller-manager-metrics-service` Service +(port name `https`). Authentication is via Kubernetes ServiceAccount bearer +token. + +Relevant flags (defaults shown): + +``` +--metrics-bind-address=:8443 +--metrics-secure=true +``` + +### Enabling scraping + +The shipped `config/default/kustomization.yaml` leaves the `ServiceMonitor` +and `NetworkPolicy` commented out. To enable scraping with the +[Prometheus Operator](https://prometheus-operator.dev/): + +1. 
Uncomment these two resources in `config/default/kustomization.yaml`: + + ```yaml + - ../prometheus + - ../network-policy + ``` + +2. Label the namespace that runs Prometheus so the `NetworkPolicy` allows + ingress: + + ```bash + kubectl label namespace <prometheus-namespace> metrics=enabled + ``` + +The shipped `ServiceMonitor` (`config/prometheus/monitor.yaml`) uses +`insecureSkipVerify: true` against the manager's self-signed TLS. For +production, enable the cert-manager-backed patch +`config/prometheus/monitor_tls_patch.yaml` (see comments in +`config/default/kustomization.yaml`). + +## Dashboards + +Three Grafana dashboards live under [`grafana/`](../grafana): + +| File | What it shows | +|-----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| +| `controller-runtime-metrics.json` | Standard controller-runtime metrics: reconcile rate, queue depth, latency | +| `controller-resources-metrics.json` | Pod CPU/memory/goroutine metrics for the manager | +| `custom-metrics/custom-metrics-dashboard.json` (and accompanying `config.yaml`) | cloudscale.ch API call rate and error rate, by endpoint | + +The custom dashboard reads `cloudscale_requests_total`, so it works for any +workload that uses cloudscale-go-sdk v9 with the instrumented transport, not +just CAPCS. + +## Tracing (opt-in) + +Tracing is **off by default**. To enable it, set the following on the manager: + +``` +--enable-tracing=true +--tracing-sample-rate=0.1 # 0.0–1.0; default 0.1 +``` + +Spans are exported via OTLP/gRPC (insecure). The endpoint is read from +`OTEL_EXPORTER_OTLP_ENDPOINT` (defaults to `localhost:4317`). Point it at your +collector — Tempo, Alloy, or an OpenTelemetry Collector — for example: + +```yaml +env: + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: http://tempo.observability.svc:4317 +``` + +## Profiler (opt-in) + +pprof is **off by default**. 
Set `--profiler-address` to bind it: + +``` +--profiler-address=localhost:6060 +``` + +Bind to loopback in production and reach it via `kubectl port-forward`. Do +not expose pprof on a routable interface. diff --git a/go.mod b/go.mod index 277e8f4..6de8df8 100644 --- a/go.mod +++ b/go.mod @@ -3,10 +3,14 @@ module github.com/cloudscale-ch/cluster-api-provider-cloudscale go 1.26.0 require ( - github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0 + github.com/cloudscale-ch/cloudscale-go-sdk/v9 v9.0.0 github.com/go-logr/logr v1.4.3 github.com/onsi/ginkgo/v2 v2.29.0 github.com/onsi/gomega v1.41.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 golang.org/x/crypto v0.52.0 golang.org/x/oauth2 v0.36.0 golang.org/x/sync v0.20.0 @@ -122,12 +126,8 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 // indirect - go.opentelemetry.io/otel v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk v1.43.0 // indirect - go.opentelemetry.io/otel/trace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.28.0 // indirect @@ -160,3 +160,5 @@ require ( sigs.k8s.io/structured-merge-diff/v6 v6.4.0 // indirect sigs.k8s.io/yaml v1.6.0 // indirect ) + +replace github.com/cloudscale-ch/cloudscale-go-sdk/v9 => ../cloudscale-go-sdk diff --git a/go.sum b/go.sum index 69e4b7f..adcddd1 100644 --- a/go.sum +++ b/go.sum @@ -28,8 +28,6 @@ github.com/cenkalti/backoff/v5 v5.0.3 
h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1x github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0 h1:XP3thdgotNVpPF27568RYHt9kqosVm8eJznJ+X4PJIk= -github.com/cloudscale-ch/cloudscale-go-sdk/v8 v8.0.0/go.mod h1:H4qxiHJof+IdwvaV26ZcmNR39EyggnKIcDfLYcYnBCI= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= diff --git a/grafana/custom-metrics/config.yaml b/grafana/custom-metrics/config.yaml index 3ee1beb..3f4d9dd 100644 --- a/grafana/custom-metrics/config.yaml +++ b/grafana/custom-metrics/config.yaml @@ -1,15 +1,72 @@ --- -customMetrics: -# - metric: # Raw custom metric (required) -# type: # Metric type: counter/gauge/histogram (required) -# expr: # Prom_ql for the metric (optional) -# unit: # Unit of measurement, examples: s,none,bytes,percent,etc. (optional) +# Custom metrics consumed by the grafana.kubebuilder.io/v1-alpha plugin. +# Regenerate dashboards with: +# kubebuilder edit --plugins=grafana.kubebuilder.io/v1-alpha # +# Schema per entry: +# metric: raw Prometheus metric name (required) +# type: counter | gauge | histogram (required) +# expr: PromQL expression (optional; overrides plugin default) +# unit: unit of measurement (optional) # -# Example: -# --- -# customMetrics: -# - metric: foo_bar -# unit: none -# type: histogram -# expr: histogram_quantile(0.90, sum by(instance, le) (rate(foo_bar{job=\"$job\", namespace=\"$namespace\"}[5m]))) +# NOTE: label matchers use single quotes (e.g. 
{job='$job'}) because the +# kubebuilder grafana plugin substitutes `expr` into JSON without escaping +# inner double quotes, which would break the generated dashboard JSON. +customMetrics: + # ---- cloudscale-go-sdk: cloudscale.ch API ---- + + - metric: cloudscale_requests_total + type: counter + unit: reqps + expr: "sum by (endpoint, method, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace'}[5m]))" + + - metric: cloudscale_requests_total + type: counter + unit: reqps + expr: "sum by (endpoint, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace', status!~'2..|3..'}[5m]))" + + - metric: cloudscale_request_duration_seconds + type: histogram + unit: s + expr: "histogram_quantile(0.95, sum by (endpoint, le) (rate(cloudscale_request_duration_seconds_bucket{job='$job', namespace='$namespace'}[5m])))" + + - metric: cloudscale_in_flight_requests + type: gauge + unit: none + expr: "cloudscale_in_flight_requests{job='$job', namespace='$namespace'}" + + # ---- controller-runtime: reconciler health ---- + + - metric: controller_runtime_reconcile_time_seconds + type: histogram + unit: s + expr: "histogram_quantile(0.99, sum by (controller, le) (rate(controller_runtime_reconcile_time_seconds_bucket{job='$job', namespace='$namespace'}[5m])))" + + - metric: controller_runtime_terminal_reconcile_errors_total + type: counter + unit: none + expr: "sum by (controller) (rate(controller_runtime_terminal_reconcile_errors_total{job='$job', namespace='$namespace'}[5m]))" + + - metric: controller_runtime_reconcile_panics_total + type: counter + unit: none + expr: "sum by (controller) (rate(controller_runtime_reconcile_panics_total{job='$job', namespace='$namespace'}[5m]))" + + # ---- controller-runtime: admission webhooks ---- + + - metric: controller_runtime_webhook_requests_total + type: counter + unit: reqps + expr: "sum by (webhook, code) (rate(controller_runtime_webhook_requests_total{job='$job', namespace='$namespace'}[5m]))" + + - 
metric: controller_runtime_webhook_requests_in_flight + type: gauge + unit: none + expr: "controller_runtime_webhook_requests_in_flight{job='$job', namespace='$namespace'}" + + # ---- client-go: management-cluster apiserver traffic ---- + + - metric: rest_client_requests_total + type: counter + unit: reqps + expr: "sum by (code, method) (rate(rest_client_requests_total{job='$job', namespace='$namespace'}[5m]))" diff --git a/grafana/custom-metrics/custom-metrics-dashboard.json b/grafana/custom-metrics/custom-metrics-dashboard.json new file mode 100644 index 0000000..e5fa398 --- /dev/null +++ b/grafana/custom-metrics/custom-metrics-dashboard.json @@ -0,0 +1,995 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (endpoint, method, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + 
"targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (endpoint, status) (rate(cloudscale_requests_total{job='$job', namespace='$namespace', status!~'2..|3..'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "histogram_quantile(0.95, sum by (endpoint, le) (rate(cloudscale_request_duration_seconds_bucket{job='$job', namespace='$namespace'}[5m])))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_request_duration_seconds (histogram)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "cloudscale_in_flight_requests{job='$job', namespace='$namespace'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "cloudscale_in_flight_requests (gauge)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "histogram_quantile(0.99, sum by (controller, le) (rate(controller_runtime_reconcile_time_seconds_bucket{job='$job', namespace='$namespace'}[5m])))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_reconcile_time_seconds (histogram)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": 
"${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (controller) (rate(controller_runtime_terminal_reconcile_errors_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_terminal_reconcile_errors_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (controller) (rate(controller_runtime_reconcile_panics_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_reconcile_panics_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + 
"custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "sum by (webhook, code) (rate(controller_runtime_webhook_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_webhook_requests_total (counter)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": 
[], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + "expr": "controller_runtime_webhook_requests_in_flight{job='$job', namespace='$namespace'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "controller_runtime_webhook_requests_in_flight (gauge)", + "type": "timeseries" + }, + + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24 + }, + "interval": "1m", + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.4.3", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "exemplar": true, + 
"expr": "sum by (code, method) (rate(rest_client_requests_total{job='$job', namespace='$namespace'}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "refId": "A", + "step": 10 + } + ], + "title": "rest_client_requests_total (counter)", + "type": "timeseries" + } + ], + "refresh": "", + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "observability", + "value": "observability" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total, namespace)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, pod)", + "hide": 2, + "includeAll": true, + "label": "pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": 
"now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Custom-Metrics", + "weekStart": "" +} diff --git a/internal/cloudscale/client.go b/internal/cloudscale/client.go index 2438603..86e0900 100644 --- a/internal/cloudscale/client.go +++ b/internal/cloudscale/client.go @@ -25,7 +25,7 @@ import ( "strings" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "golang.org/x/oauth2" ) @@ -87,10 +87,14 @@ func NewTransport() *http.Transport { // and reused across clients. Each client gets its own oauth2 token injection // but shares the underlying connection pool. // +// version is appended to the SDK's User-Agent header (e.g. +// "cloudscale/v9.0.0 capcs/<version>") so the API server can identify +// the controller making the call. +// // No global HTTP timeout is set on the client. Instead, callers must use // context.WithTimeout with ReadTimeout, WriteTimeout, or DeleteTimeout // for each API call. -func NewClient(token string, transport *http.Transport) *Client { +func NewClient(token, version string, transport http.RoundTripper) *Client { tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token}) httpClient := &http.Client{ @@ -100,6 +104,7 @@ func NewClient(token string, transport *http.Transport) *Client { }, } sdkClient := cloudscalesdk.NewClient(httpClient) + sdkClient.UserAgent = sdkClient.UserAgent + " capcs/" + version return &Client{ Networks: sdkClient.Networks, diff --git a/internal/cloudscale/client_test.go b/internal/cloudscale/client_test.go index 865c0a2..5dc9e16 100644 --- a/internal/cloudscale/client_test.go +++ b/internal/cloudscale/client_test.go @@ -22,7 +22,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" ) diff --git a/internal/cloudscale/flavors.go b/internal/cloudscale/flavors.go index 1120c3d..9776baf 100644 --- a/internal/cloudscale/flavors.go +++ b/internal/cloudscale/flavors.go @@ -3,7 +3,7 @@ package cloudscale import ( "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" ) diff --git a/internal/cloudscale/flavors_test.go b/internal/cloudscale/flavors_test.go index b954017..5397635 100644 --- a/internal/cloudscale/flavors_test.go +++ b/internal/cloudscale/flavors_test.go @@ -3,7 +3,7 @@ package cloudscale import ( "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" diff --git a/internal/cloudscale/regions.go b/internal/cloudscale/regions.go index e70ad0d..e0e891b 100644 --- a/internal/cloudscale/regions.go +++ b/internal/cloudscale/regions.go @@ -17,7 +17,7 @@ limitations under the License. package cloudscale import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) // RegionInfo holds cloudscale.ch region and zone information for validation. diff --git a/internal/cloudscale/regions_test.go b/internal/cloudscale/regions_test.go index d6f634f..bbeeeed 100644 --- a/internal/cloudscale/regions_test.go +++ b/internal/cloudscale/regions_test.go @@ -19,7 +19,7 @@ package cloudscale import ( "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" ) diff --git a/internal/cloudscale/services.go b/internal/cloudscale/services.go index e6290a0..464d0f5 100644 --- a/internal/cloudscale/services.go +++ b/internal/cloudscale/services.go @@ -19,7 +19,7 @@ package cloudscale import ( "context" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) type NetworkService interface { diff --git a/internal/controller/cloudscale_services.go b/internal/controller/cloudscale_services.go index 664b571..e7b6062 100644 --- a/internal/controller/cloudscale_services.go +++ b/internal/controller/cloudscale_services.go @@ -4,7 +4,7 @@ import ( "context" "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" diff --git a/internal/controller/cloudscale_services_test.go b/internal/controller/cloudscale_services_test.go index df601bd..b105f4c 100644 --- a/internal/controller/cloudscale_services_test.go +++ b/internal/controller/cloudscale_services_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "github.com/go-logr/logr" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/internal/controller/cloudscale_tags.go b/internal/controller/cloudscale_tags.go index 2323711..3edffa3 100644 --- a/internal/controller/cloudscale_tags.go +++ b/internal/controller/cloudscale_tags.go @@ -1,7 +1,7 @@ package controller import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" ) diff --git a/internal/controller/cloudscalecluster_controller.go b/internal/controller/cloudscalecluster_controller.go index 92acf1a..5d7da24 100644 --- a/internal/controller/cloudscalecluster_controller.go +++ b/internal/controller/cloudscalecluster_controller.go @@ -23,6 +23,7 @@ import ( "net/http" "time" + "go.opentelemetry.io/otel/attribute" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -38,11 +39,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" - logf "sigs.k8s.io/controller-runtime/pkg/log" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/credentials" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -52,7 +53,8 @@ type CloudscaleClusterReconciler struct { Scheme *runtime.Scheme recorder events.EventRecorder WatchFilter string - Transport *http.Transport + Transport http.RoundTripper + Version string MaxConcurrentReconciles int } @@ -69,7 +71,12 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, 
req ctrl.Re ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() - logger := logf.FromContext(ctx) + ctx, logger, done := observability.StartSpanWithLogger(ctx, + "controllers.CloudscaleClusterReconciler.Reconcile", + attribute.String("namespace", req.Namespace), + attribute.String("name", req.Name), + ) + defer done() cloudscaleCluster := &infrastructurev1beta2.CloudscaleCluster{} if err := r.Get(ctx, req.NamespacedName, cloudscaleCluster); err != nil { @@ -108,7 +115,7 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, r.Transport) + cloudscaleClient := cloudscale.NewClient(token, r.Version, r.Transport) clusterScope, err := scope.NewClusterScope(scope.ClusterScopeParams{ Client: r.Client, @@ -144,7 +151,10 @@ func (r *CloudscaleClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re // reconcileNormal handles normal reconciliation of cloudscale infrastructure. func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { - clusterScope.Info("Reconciling CloudscaleCluster") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileNormal") + defer done() + + logger.Info("Reconciling CloudscaleCluster") // update ready conditions upon returning from this function based on updated clusterScope. 
defer r.setReadyCondition(clusterScope) @@ -187,7 +197,10 @@ func (r *CloudscaleClusterReconciler) reconcileNormal(ctx context.Context, clust // //nolint:unparam // Returns ctrl.Result for consistency with reconcile pattern func (r *CloudscaleClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { - clusterScope.Info("Reconciling CloudscaleCluster deletion") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileDelete") + defer done() + + logger.Info("Reconciling CloudscaleCluster deletion") // Set Deleting condition r.setCondition(clusterScope, infrastructurev1beta2.DeletingCondition, metav1.ConditionTrue, infrastructurev1beta2.DeletingReason, "Deleting infrastructure resources") diff --git a/internal/controller/cloudscalecluster_floatingip.go b/internal/controller/cloudscalecluster_floatingip.go index 0acb95f..12a561e 100644 --- a/internal/controller/cloudscalecluster_floatingip.go +++ b/internal/controller/cloudscalecluster_floatingip.go @@ -21,7 +21,7 @@ import ( "fmt" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -31,6 +31,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -39,6 +40,11 @@ const createFloatingIPTimeoutRequeueAfter = 5 * time.Second // reconcileFloatingIP ensures the floating IP exists and is assigned to the correct target. // When no floating IP is configured, this sets the condition to true and returns. 
func (r *CloudscaleClusterReconciler) reconcileFloatingIP(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileFloatingIP") + defer done() + + logger.Info("Reconciling floating IP") + fipSpec := clusterScope.CloudscaleCluster.Spec.FloatingIP if fipSpec == nil { r.setCondition(clusterScope, infrastructurev1beta2.FloatingIPReadyCondition, metav1.ConditionTrue, infrastructurev1beta2.FloatingIPDisabledReason, "") diff --git a/internal/controller/cloudscalecluster_floatingip_test.go b/internal/controller/cloudscalecluster_floatingip_test.go index 38f1cb6..e497f67 100644 --- a/internal/controller/cloudscalecluster_floatingip_test.go +++ b/internal/controller/cloudscalecluster_floatingip_test.go @@ -23,7 +23,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_loadbalancer.go b/internal/controller/cloudscalecluster_loadbalancer.go index b7f4ab9..c9cfc80 100644 --- a/internal/controller/cloudscalecluster_loadbalancer.go +++ b/internal/controller/cloudscalecluster_loadbalancer.go @@ -23,7 +23,7 @@ import ( "slices" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -33,6 +33,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -48,6 +49,11 @@ const ( // It also sets the control plane endpoint from the load balancer's VIP address. // When the load balancer is disabled (external control plane), this function returns immediately. 
func (r *CloudscaleClusterReconciler) reconcileLoadBalancer(ctx context.Context, clusterScope *scope.ClusterScope) (result ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileLoadBalancer") + defer done() + + logger.Info("Reconciling load balancer") + // LB disabled: set condition and return before defer is registered if !ptr.Deref(clusterScope.CloudscaleCluster.Spec.ControlPlaneLoadBalancer.Enabled, true) { clusterScope.Info("Load balancer is disabled, skipping reconciliation (external control plane)") diff --git a/internal/controller/cloudscalecluster_loadbalancer_test.go b/internal/controller/cloudscalecluster_loadbalancer_test.go index fdd4100..182c0d7 100644 --- a/internal/controller/cloudscalecluster_loadbalancer_test.go +++ b/internal/controller/cloudscalecluster_loadbalancer_test.go @@ -21,7 +21,7 @@ import ( "testing" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_network.go b/internal/controller/cloudscalecluster_network.go index 3f9fb08..d4e4a3f 100644 --- a/internal/controller/cloudscalecluster_network.go +++ b/internal/controller/cloudscalecluster_network.go @@ -23,13 +23,14 @@ import ( "strings" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -38,6 +39,11 @@ const createNetworkTimeoutRequeueAfter = 5 * time.Second // reconcileNetwork orchestrates network and subnet provisioning for all networks // defined in spec.networks. A single NetworkReadyCondition covers all networks. 
func (r *CloudscaleClusterReconciler) reconcileNetwork(ctx context.Context, clusterScope *scope.ClusterScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleClusterReconciler.reconcileNetwork") + defer done() + + logger.Info("Reconciling network") + defer func() { if reterr != nil { r.setCondition(clusterScope, infrastructurev1beta2.NetworkReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.NetworkErrorReason, reterr.Error()) diff --git a/internal/controller/cloudscalecluster_network_test.go b/internal/controller/cloudscalecluster_network_test.go index b99e1b2..513c475 100644 --- a/internal/controller/cloudscalecluster_network_test.go +++ b/internal/controller/cloudscalecluster_network_test.go @@ -23,7 +23,7 @@ import ( "os" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" diff --git a/internal/controller/cloudscalecluster_reconcile_test.go b/internal/controller/cloudscalecluster_reconcile_test.go index ff8dc16..b89a0aa 100644 --- a/internal/controller/cloudscalecluster_reconcile_test.go +++ b/internal/controller/cloudscalecluster_reconcile_test.go @@ -27,7 +27,7 @@ import ( "context" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/controller/cloudscalecluster_servergroup.go b/internal/controller/cloudscalecluster_servergroup.go index 3f480e9..300c60d 100644 --- a/internal/controller/cloudscalecluster_servergroup.go +++ b/internal/controller/cloudscalecluster_servergroup.go @@ -21,7 +21,7 @@ import ( "errors" "fmt" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" "k8s.io/apimachinery/pkg/util/sets" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/controller-runtime/pkg/client" diff --git a/internal/controller/cloudscalecluster_servergroup_test.go b/internal/controller/cloudscalecluster_servergroup_test.go index 0e5e16e..a73e643 100644 --- a/internal/controller/cloudscalecluster_servergroup_test.go +++ b/internal/controller/cloudscalecluster_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . 
"github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/controller/cloudscalemachine_controller.go b/internal/controller/cloudscalemachine_controller.go index ac141e9..f758585 100644 --- a/internal/controller/cloudscalemachine_controller.go +++ b/internal/controller/cloudscalemachine_controller.go @@ -22,6 +22,7 @@ import ( "net/http" "time" + "go.opentelemetry.io/otel/attribute" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -37,11 +38,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/handler" - logf "sigs.k8s.io/controller-runtime/pkg/log" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/credentials" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -56,7 +57,8 @@ type CloudscaleMachineReconciler struct { Scheme *runtime.Scheme recorder events.EventRecorder WatchFilter string - Transport *http.Transport + Transport http.RoundTripper + Version string MaxConcurrentReconciles int } @@ -71,7 +73,12 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) defer cancel() - logger := logf.FromContext(ctx) + ctx, logger, done := observability.StartSpanWithLogger(ctx, + "controllers.CloudscaleMachineReconciler.Reconcile", + attribute.String("namespace", req.Namespace), + attribute.String("name", req.Name), + ) + defer done() cloudscaleMachine := &infrastructurev1beta2.CloudscaleMachine{} if err := r.Get(ctx, req.NamespacedName, 
cloudscaleMachine); err != nil { @@ -134,7 +141,7 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re return ctrl.Result{}, fmt.Errorf("failed to get cloudscale.ch credentials: %w", err) } - cloudscaleClient := cloudscale.NewClient(token, r.Transport) + cloudscaleClient := cloudscale.NewClient(token, r.Version, r.Transport) machineScope, err := scope.NewMachineScope(scope.MachineScopeParams{ Client: r.Client, @@ -172,7 +179,10 @@ func (r *CloudscaleMachineReconciler) Reconcile(ctx context.Context, req ctrl.Re // reconcileNormal handles normal reconciliation of CloudscaleMachine. func (r *CloudscaleMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { - machineScope.Info("Reconciling CloudscaleMachine") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileNormal") + defer done() + + logger.Info("Reconciling CloudscaleMachine") defer r.setReadyCondition(machineScope.CloudscaleMachine) @@ -242,7 +252,10 @@ func (r *CloudscaleMachineReconciler) setReadyCondition(machine *infrastructurev // //nolint:unparam // Returns ctrl.Result for consistency with reconcile pattern func (r *CloudscaleMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { - machineScope.Info("Reconciling CloudscaleMachine deletion") + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileDelete") + defer done() + + logger.Info("Reconciling CloudscaleMachine deletion") // Set Deleting condition r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.DeletingCondition, metav1.ConditionTrue, infrastructurev1beta2.DeletingReason, "Deleting server") diff --git a/internal/controller/cloudscalemachine_reconcile_test.go b/internal/controller/cloudscalemachine_reconcile_test.go index 9865775..cc9bb17 100644 --- 
a/internal/controller/cloudscalemachine_reconcile_test.go +++ b/internal/controller/cloudscalemachine_reconcile_test.go @@ -27,7 +27,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" diff --git a/internal/controller/cloudscalemachine_server.go b/internal/controller/cloudscalemachine_server.go index ce307b6..7c1353c 100644 --- a/internal/controller/cloudscalemachine_server.go +++ b/internal/controller/cloudscalemachine_server.go @@ -22,7 +22,7 @@ import ( "maps" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -31,6 +31,7 @@ import ( infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -69,6 +70,11 @@ const ( ) func (r *CloudscaleMachineReconciler) reconcileServer(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileServer") + defer done() + + logger.Info("Reconciling server") + var server *cloudscalesdk.Server defer func() { if reterr != nil { diff --git a/internal/controller/cloudscalemachine_server_test.go b/internal/controller/cloudscalemachine_server_test.go index 7923112..955c242 100644 --- a/internal/controller/cloudscalemachine_server_test.go +++ b/internal/controller/cloudscalemachine_server_test.go @@ -20,7 +20,7 @@ import ( "context" "testing" - 
cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/controller/cloudscalemachine_servergroup.go b/internal/controller/cloudscalemachine_servergroup.go index b2dcfce..e32a6fa 100644 --- a/internal/controller/cloudscalemachine_servergroup.go +++ b/internal/controller/cloudscalemachine_servergroup.go @@ -22,12 +22,13 @@ import ( "sync" "time" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" infrastructurev1beta2 "github.com/cloudscale-ch/cluster-api-provider-cloudscale/api/v1beta2" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/cloudscale" + "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/observability" "github.com/cloudscale-ch/cluster-api-provider-cloudscale/internal/scope" ) @@ -40,6 +41,11 @@ var serverGroupMu sync.Mutex // reconcileServerGroup ensures the server group exists if specified. // Server groups are zone-scoped and created once per unique name+zone combination. 
func (r *CloudscaleMachineReconciler) reconcileServerGroup(ctx context.Context, machineScope *scope.MachineScope) (_ ctrl.Result, reterr error) { + ctx, logger, done := observability.StartSpanWithLogger(ctx, "controllers.CloudscaleMachineReconciler.reconcileServerGroup") + defer done() + + logger.Info("Reconciling server group") + defer func() { if reterr != nil { r.setCondition(machineScope.CloudscaleMachine, infrastructurev1beta2.ServerGroupReadyCondition, metav1.ConditionFalse, infrastructurev1beta2.ServerGroupErrorReason, reterr.Error()) diff --git a/internal/controller/cloudscalemachine_servergroup_test.go b/internal/controller/cloudscalemachine_servergroup_test.go index 0a3aa62..9e5c0c1 100644 --- a/internal/controller/cloudscalemachine_servergroup_test.go +++ b/internal/controller/cloudscalemachine_servergroup_test.go @@ -21,7 +21,7 @@ import ( "fmt" "testing" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" . "github.com/onsi/gomega" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" diff --git a/internal/observability/composite_logger.go b/internal/observability/composite_logger.go new file mode 100644 index 0000000..74301ee --- /dev/null +++ b/internal/observability/composite_logger.go @@ -0,0 +1,81 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package observability + +import "github.com/go-logr/logr" + +// compositeLogSink is a logr.LogSink that multiplexes calls to multiple +// underlying sinks. +type compositeLogSink struct { + sinks []logr.LogSink +} + +// Init implements logr.LogSink. +func (c *compositeLogSink) Init(info logr.RuntimeInfo) { + for _, s := range c.sinks { + s.Init(info) + } +} + +// Enabled implements logr.LogSink. It returns true if any underlying sink is +// enabled. +func (c *compositeLogSink) Enabled(level int) bool { + for _, s := range c.sinks { + if s.Enabled(level) { + return true + } + } + return false +} + +// Info implements logr.LogSink. +func (c *compositeLogSink) Info(level int, msg string, keysAndValues ...any) { + for _, s := range c.sinks { + s.Info(level, msg, keysAndValues...) + } +} + +// Error implements logr.LogSink. +func (c *compositeLogSink) Error(err error, msg string, keysAndValues ...any) { + for _, s := range c.sinks { + s.Error(err, msg, keysAndValues...) + } +} + +// WithValues implements logr.LogSink. +func (c *compositeLogSink) WithValues(keysAndValues ...any) logr.LogSink { + newSinks := make([]logr.LogSink, len(c.sinks)) + for i, s := range c.sinks { + newSinks[i] = s.WithValues(keysAndValues...) + } + return &compositeLogSink{sinks: newSinks} +} + +// WithName implements logr.LogSink. +func (c *compositeLogSink) WithName(name string) logr.LogSink { + newSinks := make([]logr.LogSink, len(c.sinks)) + for i, s := range c.sinks { + newSinks[i] = s.WithName(name) + } + return &compositeLogSink{sinks: newSinks} +} + +// NewCompositeLogger returns a LogSink that forwards calls to all provided +// sinks. +func NewCompositeLogger(sinks ...logr.LogSink) logr.LogSink { + return &compositeLogSink{sinks: sinks} +} diff --git a/internal/observability/span.go b/internal/observability/span.go new file mode 100644 index 0000000..e9e543b --- /dev/null +++ b/internal/observability/span.go @@ -0,0 +1,46 @@ +/* +Copyright 2026 cloudscale.ch. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "context" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" + logf "sigs.k8s.io/controller-runtime/pkg/log" +) + +// StartSpanWithLogger starts a new OTel span and returns a context, logger, and +// done function. The returned logger is a composite that writes to both the +// standard logger and the span as events. +func StartSpanWithLogger( + ctx context.Context, + spanName string, + attrs ...attribute.KeyValue, +) (context.Context, logr.Logger, func()) { + tracer := otel.Tracer("capcs") + ctx, span := tracer.Start(ctx, spanName, trace.WithAttributes(attrs...)) + + baseLogger := logf.FromContext(ctx) + sink := NewCompositeLogger(baseLogger.GetSink(), NewSpanLogSink(span)) + logger := logr.New(sink).WithName(spanName) + ctx = logr.NewContext(ctx, logger) + + return ctx, logger, func() { span.End() } +} diff --git a/internal/observability/span_logger.go b/internal/observability/span_logger.go new file mode 100644 index 0000000..cb1ebb6 --- /dev/null +++ b/internal/observability/span_logger.go @@ -0,0 +1,93 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "fmt" + "time" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" +) + +// spanLogSink is a logr.LogSink implementation that writes log data to an +// OpenTelemetry span as events. +type spanLogSink struct { + span trace.Span + name string + vals []any +} + +// Init implements logr.LogSink. +func (s *spanLogSink) Init(_ logr.RuntimeInfo) {} + +// Enabled implements logr.LogSink. +func (s *spanLogSink) Enabled(_ int) bool { return true } + +// Info implements logr.LogSink, writing an event to the span. +func (s *spanLogSink) Info(_ int, msg string, keysAndValues ...any) { + attrs := kvsToAttrs(append(s.vals, keysAndValues...)...) + s.span.AddEvent( + fmt.Sprintf("[INFO | %s] %s", s.name, msg), + trace.WithTimestamp(time.Now()), + trace.WithAttributes(attrs...), + ) +} + +// Error implements logr.LogSink, recording the error and writing an event to +// the span. +func (s *spanLogSink) Error(err error, msg string, keysAndValues ...any) { + attrs := kvsToAttrs(append(s.vals, keysAndValues...)...) + s.span.RecordError(err) + s.span.AddEvent( + fmt.Sprintf("[ERROR | %s] %s (%s)", s.name, msg, err), + trace.WithTimestamp(time.Now()), + trace.WithAttributes(attrs...), + ) +} + +// WithValues implements logr.LogSink. +func (s spanLogSink) WithValues(keysAndValues ...any) logr.LogSink { + vals := make([]any, len(s.vals)+len(keysAndValues)) + copy(vals, s.vals) + copy(vals[len(s.vals):], keysAndValues) + s.vals = vals + return &s +} + +// WithName implements logr.LogSink. 
+func (s spanLogSink) WithName(name string) logr.LogSink { + s.name = name + return &s +} + +// NewSpanLogSink returns a LogSink that writes log events to the given span. +func NewSpanLogSink(span trace.Span) logr.LogSink { + return &spanLogSink{span: span} +} + +// kvsToAttrs converts key-value pairs (from a logr call) to OTel attributes. +func kvsToAttrs(kvs ...any) []attribute.KeyValue { + var attrs []attribute.KeyValue + for i := 0; i+1 < len(kvs); i += 2 { + k := fmt.Sprint(kvs[i]) + v := fmt.Sprint(kvs[i+1]) + attrs = append(attrs, attribute.String(k, v)) + } + return attrs +} diff --git a/internal/observability/tracing.go b/internal/observability/tracing.go new file mode 100644 index 0000000..b3e7d50 --- /dev/null +++ b/internal/observability/tracing.go @@ -0,0 +1,78 @@ +/* +Copyright 2026 cloudscale.ch. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package observability + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.4.0" +) + +// InitTracing initializes an OpenTelemetry tracer provider with an OTLP gRPC +// exporter. 
The OTLP endpoint is read from the OTEL_EXPORTER_OTLP_ENDPOINT +// environment variable (defaults to localhost:4317 if unset). +func InitTracing(ctx context.Context, log logr.Logger, serviceName, version string, sampleRate float64) (func(), error) { + if sampleRate < 0.0 || sampleRate > 1.0 { + return nil, fmt.Errorf("tracing-sample-rate must be between 0.0 and 1.0, got %f", sampleRate) + } + + res, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String(serviceName), + attribute.String("version", version), + ), + ) + if err != nil { + return nil, fmt.Errorf("failed to create opentelemetry resource: %w", err) + } + + exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithInsecure()) + if err != nil { + return nil, fmt.Errorf("failed to create OTLP trace exporter: %w", err) + } + + sampler := sdktrace.AlwaysSample() + if sampleRate < 1.0 { + sampler = sdktrace.ParentBased(sdktrace.TraceIDRatioBased(sampleRate)) + } + + tp := sdktrace.NewTracerProvider( + sdktrace.WithSampler(sampler), + sdktrace.WithResource(res), + sdktrace.WithBatcher(exporter), + ) + otel.SetTracerProvider(tp) + otel.SetTextMapPropagator(propagation.TraceContext{}) + + shutdown := func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if err := tp.Shutdown(ctx); err != nil { + log.Error(err, "Failed to shut down tracer provider") + } + } + return shutdown, nil +} diff --git a/internal/testutils/fixtures.go b/internal/testutils/fixtures.go index 0f589ca..4803137 100644 --- a/internal/testutils/fixtures.go +++ b/internal/testutils/fixtures.go @@ -1,7 +1,7 @@ package testutils import ( - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" diff --git a/internal/testutils/mocks.go b/internal/testutils/mocks.go index 0eb80b5..ae2d1c2 100644 --- 
a/internal/testutils/mocks.go +++ b/internal/testutils/mocks.go @@ -4,7 +4,7 @@ import ( "context" "errors" - cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v8" + cloudscalesdk "github.com/cloudscale-ch/cloudscale-go-sdk/v9" ) // --- Network Service Mock --- diff --git a/test/e2e/cloudscale_helpers.go b/test/e2e/cloudscale_helpers.go index f46a973..ac86377 100644 --- a/test/e2e/cloudscale_helpers.go +++ b/test/e2e/cloudscale_helpers.go @@ -28,7 +28,7 @@ import ( // newCloudscaleClient creates a new cloudscale API client from the given token. func newCloudscaleClient(token string) *cloudscale.Client { - return cloudscale.NewClient(token, cloudscale.NewTransport()) + return cloudscale.NewClient(token, "e2e", cloudscale.NewTransport()) } // resourceSnapshot holds a snapshot of cloudscale API resources for leak detection.