From 42ef7c1d496f5f7cc83334acd7d9ba9381df385b Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 20 Oct 2025 11:14:00 +0200 Subject: [PATCH 1/2] add prometheus annotations to metrics service --- rust/operator-binary/src/controller.rs | 190 ++--------------------- rust/operator-binary/src/crd/mod.rs | 8 +- rust/operator-binary/src/discovery.rs | 2 +- rust/operator-binary/src/main.rs | 1 + rust/operator-binary/src/service.rs | 203 +++++++++++++++++++++++++ 5 files changed, 224 insertions(+), 180 deletions(-) create mode 100644 rust/operator-binary/src/service.rs diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 3b721ea1..c092e924 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -39,8 +39,7 @@ use stackable_operator::{ apps::v1::{DaemonSet, DaemonSetSpec}, core::v1::{ ConfigMap, EmptyDirVolumeSource, EnvVar, EnvVarSource, HTTPGetAction, - ObjectFieldSelector, Probe, SecretVolumeSource, Service, ServiceAccount, - ServicePort, ServiceSpec, + ObjectFieldSelector, Probe, SecretVolumeSource, ServiceAccount, }, }, apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString}, @@ -78,15 +77,17 @@ use crate::{ discovery::{self, build_discovery_configmaps}, operations::graceful_shutdown::add_graceful_shutdown_config, product_logging::{BundleBuilderLogLevel, extend_role_group_config_map}, + service::{ + self, APP_PORT, APP_PORT_NAME, build_rolegroup_headless_service, + build_rolegroup_metrics_service, build_server_role_service, + }, }; pub const OPA_CONTROLLER_NAME: &str = "opacluster"; pub const OPA_FULL_CONTROLLER_NAME: &str = concatcp!(OPA_CONTROLLER_NAME, '.', OPERATOR_NAME); pub const CONFIG_FILE: &str = "config.json"; -pub const APP_PORT: u16 = 8081; -pub const APP_PORT_NAME: &str = "http"; -pub const METRICS_PORT_NAME: &str = "metrics"; + pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active"; pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming"; 
pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp"; @@ -172,9 +173,6 @@ pub enum Error { source: stackable_opa_operator::crd::Error, }, - #[snafu(display("failed to calculate role service name"))] - RoleServiceNameNotFound, - #[snafu(display("failed to apply role Service"))] ApplyRoleService { source: stackable_operator::cluster_resources::Error, @@ -186,12 +184,6 @@ pub enum Error { rolegroup: RoleGroupRef, }, - #[snafu(display("failed to apply metrics Service for [{rolegroup}]"))] - ApplyRoleGroupMetricsService { - source: stackable_operator::cluster_resources::Error, - rolegroup: RoleGroupRef, - }, - #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))] BuildRoleGroupConfig { source: stackable_operator::builder::configmap::Error, @@ -334,6 +326,9 @@ pub enum Error { ResolveProductImage { source: product_image_selection::Error, }, + + #[snafu(display("failed to build service"))] + BuildService { source: service::Error }, } type Result = std::result::Result; @@ -474,7 +469,8 @@ pub async fn reconcile_opa( .map(Cow::Borrowed) .unwrap_or_default(); - let server_role_service = build_server_role_service(opa, &resolved_product_image)?; + let server_role_service = + build_server_role_service(opa, &resolved_product_image).context(BuildServiceSnafu)?; // required for discovery config map later let server_role_service = cluster_resources .add(client, server_role_service) @@ -516,10 +512,11 @@ pub async fn reconcile_opa( &rolegroup, &merged_config, )?; - let rg_service = - build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?; + let rg_service = build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup) + .context(BuildServiceSnafu)?; let rg_metrics_service = - build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?; + build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup) + .context(BuildServiceSnafu)?; let rg_daemonset = build_server_rolegroup_daemonset( opa, &resolved_product_image, @@ 
-617,143 +614,6 @@ pub async fn reconcile_opa( Ok(Action::await_change()) } -/// The server-role service is the primary endpoint that should be used by clients that do not perform internal load balancing, -/// including targets outside of the cluster. -pub fn build_server_role_service( - opa: &v1alpha1::OpaCluster, - resolved_product_image: &ResolvedProductImage, -) -> Result { - let role_name = v1alpha1::OpaRole::Server.to_string(); - let role_svc_name = opa - .server_role_service_name() - .context(RoleServiceNameNotFoundSnafu)?; - - let metadata = ObjectMetaBuilder::new() - .name_and_namespace(opa) - .name(&role_svc_name) - .ownerreference_from_resource(opa, None, Some(true)) - .context(ObjectMissingMetadataForOwnerRefSnafu)? - .with_recommended_labels(build_recommended_labels( - opa, - &resolved_product_image.app_version_label_value, - &role_name, - "global", - )) - .context(ObjectMetaSnafu)? - .build(); - - let service_selector_labels = - Labels::role_selector(opa, APP_NAME, &role_name).context(BuildLabelSnafu)?; - - let service_spec = ServiceSpec { - type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()), - ports: Some(data_service_ports()), - selector: Some(service_selector_labels.into()), - internal_traffic_policy: Some("Local".to_string()), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata, - spec: Some(service_spec), - status: None, - }) -} - -/// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup -/// -/// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing. 
-fn build_rolegroup_headless_service( - opa: &v1alpha1::OpaCluster, - resolved_product_image: &ResolvedProductImage, - rolegroup: &RoleGroupRef, -) -> Result { - let metadata = ObjectMetaBuilder::new() - .name_and_namespace(opa) - .name(rolegroup.rolegroup_headless_service_name()) - .ownerreference_from_resource(opa, None, Some(true)) - .context(ObjectMissingMetadataForOwnerRefSnafu)? - .with_recommended_labels(build_recommended_labels( - opa, - &resolved_product_image.app_version_label_value, - &rolegroup.role, - &rolegroup.role_group, - )) - .context(ObjectMetaSnafu)? - .build(); - - let service_spec = ServiceSpec { - // Currently we don't offer listener-exposition of OPA mostly due to security concerns. - // OPA is currently public within the Kubernetes (without authentication). - // Opening it up to outside of Kubernetes might worsen things. - // We are open to implement listener-integration, but this needs to be thought through before - // implementing it. - // Note: We have kind of similar situations for HMS and Zookeeper, as the authentication - // options there are non-existent (mTLS still opens plain port) or suck (Kerberos). - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some(data_service_ports()), - selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), - publish_not_ready_addresses: Some(true), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata, - spec: Some(service_spec), - status: None, - }) -} - -/// The rolegroup metrics [`Service`] is a service that exposes metrics and has the -/// prometheus.io/scrape label. 
-fn build_rolegroup_metrics_service( - opa: &v1alpha1::OpaCluster, - resolved_product_image: &ResolvedProductImage, - rolegroup: &RoleGroupRef, -) -> Result { - let labels = Labels::try_from([("prometheus.io/scrape", "true")]) - .expect("static Prometheus labels must be valid"); - - let metadata = ObjectMetaBuilder::new() - .name_and_namespace(opa) - .name(rolegroup.rolegroup_metrics_service_name()) - .ownerreference_from_resource(opa, None, Some(true)) - .context(ObjectMissingMetadataForOwnerRefSnafu)? - .with_recommended_labels(build_recommended_labels( - opa, - &resolved_product_image.app_version_label_value, - &rolegroup.role, - &rolegroup.role_group, - )) - .context(ObjectMetaSnafu)? - .with_labels(labels) - .build(); - - let service_spec = ServiceSpec { - type_: Some("ClusterIP".to_string()), - cluster_ip: Some("None".to_string()), - ports: Some(vec![metrics_service_port()]), - selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), - ..ServiceSpec::default() - }; - - Ok(Service { - metadata, - spec: Some(service_spec), - status: None, - }) -} - -/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup. 
-fn role_group_selector_labels( - opa: &v1alpha1::OpaCluster, - rolegroup: &RoleGroupRef, -) -> Result { - Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) - .context(BuildLabelSnafu) -} - /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator fn build_server_rolegroup_config_map( opa: &v1alpha1::OpaCluster, @@ -1470,26 +1330,6 @@ fn build_prepare_start_command( prepare_container_args } -fn data_service_ports() -> Vec { - // Currently only HTTP is exposed - vec![ServicePort { - name: Some(APP_PORT_NAME.to_string()), - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }] -} - -fn metrics_service_port() -> ServicePort { - ServicePort { - name: Some(METRICS_PORT_NAME.to_string()), - // The metrics are served on the same port as the HTTP traffic - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - } -} - /// Creates recommended `ObjectLabels` to be used in deployed resources pub fn build_recommended_labels<'a, T>( owner: &'a T, diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 775bfdd1..0cd9951f 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -325,19 +325,19 @@ impl v1alpha1::OpaCluster { } /// The name of the role-level load-balanced Kubernetes `Service` - pub fn server_role_service_name(&self) -> Option { - Some(format!( + pub fn server_role_service_name(&self) -> String { + format!( "{cluster_name}-{role}", cluster_name = self.name_any(), role = v1alpha1::OpaRole::Server - )) + ) } /// The fully-qualified domain name of the role-level load-balanced Kubernetes `Service` pub fn server_role_service_fqdn(&self, cluster_info: &KubernetesClusterInfo) -> Option { Some(format!( "{role_service_name}.{namespace}.svc.{cluster_domain}", - role_service_name = self.server_role_service_name()?, + role_service_name = 
self.server_role_service_name(), namespace = self.metadata.namespace.as_ref()?, cluster_domain = cluster_info.cluster_domain )) diff --git a/rust/operator-binary/src/discovery.rs b/rust/operator-binary/src/discovery.rs index 5247ad5a..f9c1023b 100644 --- a/rust/operator-binary/src/discovery.rs +++ b/rust/operator-binary/src/discovery.rs @@ -8,7 +8,7 @@ use stackable_operator::{ utils::cluster_info::KubernetesClusterInfo, }; -use crate::controller::{APP_PORT, build_recommended_labels}; +use crate::{controller::build_recommended_labels, service::APP_PORT}; #[derive(Snafu, Debug)] pub enum Error { diff --git a/rust/operator-binary/src/main.rs b/rust/operator-binary/src/main.rs index ef6ad257..5bdb7007 100644 --- a/rust/operator-binary/src/main.rs +++ b/rust/operator-binary/src/main.rs @@ -37,6 +37,7 @@ mod controller; mod discovery; mod operations; mod product_logging; +mod service; pub mod built_info { include!(concat!(env!("OUT_DIR"), "/built.rs")); diff --git a/rust/operator-binary/src/service.rs b/rust/operator-binary/src/service.rs new file mode 100644 index 00000000..6fb58f09 --- /dev/null +++ b/rust/operator-binary/src/service.rs @@ -0,0 +1,203 @@ +use snafu::{ResultExt, Snafu}; +use stackable_opa_operator::crd::{APP_NAME, v1alpha1}; +use stackable_operator::{ + builder::meta::ObjectMetaBuilder, + commons::product_image_selection::ResolvedProductImage, + k8s_openapi::api::core::v1::{Service, ServicePort, ServiceSpec}, + kvp::{Annotations, LabelError, Labels}, + role_utils::RoleGroupRef, +}; + +use crate::controller::build_recommended_labels; + +pub const APP_PORT: u16 = 8081; +pub const APP_PORT_NAME: &str = "http"; +pub const METRICS_PORT_NAME: &str = "metrics"; + +#[derive(Snafu, Debug)] +pub enum Error { + #[snafu(display("failed to build label"))] + BuildLabel { source: LabelError }, + + #[snafu(display("failed to build object meta data"))] + ObjectMeta { + source: stackable_operator::builder::meta::Error, + }, + + #[snafu(display("object is missing 
metadata to build owner reference"))] + ObjectMissingMetadataForOwnerRef { + source: stackable_operator::builder::meta::Error, + }, +} + +/// The server-role service is the primary endpoint that should be used by clients that do not perform internal load balancing, +/// including targets outside of the cluster. +pub(crate) fn build_server_role_service( + opa: &v1alpha1::OpaCluster, + resolved_product_image: &ResolvedProductImage, +) -> Result { + let role_name = v1alpha1::OpaRole::Server.to_string(); + + let metadata = ObjectMetaBuilder::new() + .name_and_namespace(opa) + .name(opa.server_role_service_name()) + .ownerreference_from_resource(opa, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels(build_recommended_labels( + opa, + &resolved_product_image.app_version_label_value, + &role_name, + "global", + )) + .context(ObjectMetaSnafu)? + .build(); + + let service_selector_labels = + Labels::role_selector(opa, APP_NAME, &role_name).context(BuildLabelSnafu)?; + + let service_spec = ServiceSpec { + type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()), + ports: Some(data_service_ports()), + selector: Some(service_selector_labels.into()), + internal_traffic_policy: Some("Local".to_string()), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata, + spec: Some(service_spec), + status: None, + }) +} + +/// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup +/// +/// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing. 
+pub(crate) fn build_rolegroup_headless_service( + opa: &v1alpha1::OpaCluster, + resolved_product_image: &ResolvedProductImage, + rolegroup: &RoleGroupRef, +) -> Result { + let metadata = ObjectMetaBuilder::new() + .name_and_namespace(opa) + .name(rolegroup.rolegroup_headless_service_name()) + .ownerreference_from_resource(opa, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels(build_recommended_labels( + opa, + &resolved_product_image.app_version_label_value, + &rolegroup.role, + &rolegroup.role_group, + )) + .context(ObjectMetaSnafu)? + .build(); + + let service_spec = ServiceSpec { + // Currently we don't offer listener-exposition of OPA mostly due to security concerns. + // OPA is currently public within the Kubernetes cluster (without authentication). + // Opening it up to outside of Kubernetes might worsen things. + // We are open to implementing listener-integration, but this needs to be thought through before + // implementing it. + // Note: We have kind of similar situations for HMS and Zookeeper, as the authentication + // options there are non-existent (mTLS still opens plain port) or suck (Kerberos). + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some(data_service_ports()), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), + publish_not_ready_addresses: Some(true), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata, + spec: Some(service_spec), + status: None, + }) +} + +/// The rolegroup metrics [`Service`] is a service that exposes metrics and carries the +/// `prometheus.io/scrape` label as well as the `prometheus.io/path`, `port`, `scheme` and `scrape` annotations. 
+pub(crate) fn build_rolegroup_metrics_service( + opa: &v1alpha1::OpaCluster, + resolved_product_image: &ResolvedProductImage, + rolegroup: &RoleGroupRef, +) -> Result { + let metadata = ObjectMetaBuilder::new() + .name_and_namespace(opa) + .name(rolegroup.rolegroup_metrics_service_name()) + .ownerreference_from_resource(opa, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels(build_recommended_labels( + opa, + &resolved_product_image.app_version_label_value, + &rolegroup.role, + &rolegroup.role_group, + )) + .context(ObjectMetaSnafu)? + .with_labels(prometheus_labels()) + .with_annotations(prometheus_annotations()) + .build(); + + let service_spec = ServiceSpec { + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some(vec![metrics_service_port()]), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata, + spec: Some(service_spec), + status: None, + }) +} + +/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup. 
+fn role_group_selector_labels( + opa: &v1alpha1::OpaCluster, + rolegroup: &RoleGroupRef, +) -> Result { + Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) + .context(BuildLabelSnafu) +} + +fn data_service_ports() -> Vec { + // Currently only HTTP is exposed + vec![ServicePort { + name: Some(APP_PORT_NAME.to_string()), + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }] +} + +fn metrics_service_port() -> ServicePort { + ServicePort { + name: Some(METRICS_PORT_NAME.to_string()), + // The metrics are served on the same port as the HTTP traffic + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + } +} + +/// Common labels for Prometheus +fn prometheus_labels() -> Labels { + Labels::try_from([("prometheus.io/scrape", "true")]).expect("should be a valid label") +} + +/// Common annotations for Prometheus +/// +/// These annotations can be used in a ServiceMonitor. +/// +/// see also +fn prometheus_annotations() -> Annotations { + Annotations::try_from([ + ("prometheus.io/path".to_owned(), "/metrics".to_owned()), + ("prometheus.io/port".to_owned(), APP_PORT.to_string()), + ("prometheus.io/scheme".to_owned(), "http".to_owned()), + ("prometheus.io/scrape".to_owned(), "true".to_owned()), + ]) + .expect("should be valid annotations") +} From 2268d38b1c00a3589fd3e2d88591ae849127b887 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 20 Oct 2025 11:49:39 +0200 Subject: [PATCH 2/2] adapted changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb7091ec..251298f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file. - Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]). - Helm: Allow Pod `priorityClassName` to be configured ([#762]). 
- Add support for OPA `1.8.0` ([#765]). +- Add `prometheus.io/path|port|scheme` annotations to metrics service ([#767]). ### Changed @@ -32,6 +33,7 @@ All notable changes to this project will be documented in this file. [#754]: https://github.com/stackabletech/opa-operator/pull/754 [#762]: https://github.com/stackabletech/opa-operator/pull/762 [#765]: https://github.com/stackabletech/opa-operator/pull/765 +[#767]: https://github.com/stackabletech/opa-operator/pull/767 ## [25.7.0] - 2025-07-23