From 5926129008535730bfcf3714125b268ad9226737 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 10:31:49 +0200 Subject: [PATCH 01/16] feat!: Add dedicated -metrics service --- CHANGELOG.md | 4 + rust/operator-binary/src/controller.rs | 148 +++++++++++++----- tests/templates/kuttl/smoke/20-assert.yaml | 2 +- ...regorule.yaml => 20-install-test-opa.yaml} | 10 +- tests/templates/kuttl/smoke/30-assert.yaml | 2 +- .../kuttl/smoke/30-prepare-test-opa.yaml | 5 + .../kuttl/smoke/30-prepare-test-regorule.yaml | 4 - .../templates/kuttl/smoke/30_test-metrics.py | 12 ++ .../{test-regorule.py => 30_test-regorule.py} | 4 +- tests/templates/kuttl/smoke/31-assert.yaml | 7 + 10 files changed, 151 insertions(+), 47 deletions(-) rename tests/templates/kuttl/smoke/{20-install-test-regorule.yaml => 20-install-test-opa.yaml} (78%) create mode 100644 tests/templates/kuttl/smoke/30-prepare-test-opa.yaml delete mode 100644 tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml create mode 100644 tests/templates/kuttl/smoke/30_test-metrics.py rename tests/templates/kuttl/smoke/{test-regorule.py => 30_test-regorule.py} (92%) create mode 100644 tests/templates/kuttl/smoke/31-assert.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cdf15fe..f26dd2ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ All notable changes to this project will be documented in this file. - Support experimental user-info-fetcher Entra backend to fetch user groups ([#712]). - Add support for OPA `1.4.2` ([#723]). - Add RBAC rule to helm template for automatic cluster domain detection ([#743]). +- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#XXX]). +- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#XXX]). ### Changed @@ -49,6 +51,8 @@ All notable changes to this project will be documented in this file. - The CLI argument `--kubernetes-node-name` or env variable `KUBERNETES_NODE_NAME` needs to be set. The helm-chart takes care of this. - The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`, so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#745]). +- The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there + purpose and to be consistent with other operators ([#XXX]). ### Fixed diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 87d5291e..6dffbe7d 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -50,7 +50,7 @@ use stackable_operator::{ core::{DeserializeGuard, error_boundary}, runtime::{controller::Action, reflector::ObjectRef}, }, - kvp::{Label, LabelError, Labels, ObjectLabels}, + kvp::{LabelError, Labels, ObjectLabels}, logging::controller::ReconcilerError, memory::{BinaryMultiple, MemoryQuantity}, product_config_utils::{transform_all_roles_to_config, validate_all_roles_and_groups_config}, @@ -91,6 +91,7 @@ pub const BUNDLES_ACTIVE_DIR: &str = "/bundles/active"; pub const BUNDLES_INCOMING_DIR: &str = "/bundles/incoming"; pub const BUNDLES_TMP_DIR: &str = "/bundles/tmp"; pub const BUNDLE_BUILDER_PORT: i32 = 3030; +pub const OPA_STACKABLE_SERVICE_NAME: &str = "stackable"; const CONFIG_VOLUME_NAME: &str = "config"; const CONFIG_DIR: &str = "/stackable/config"; @@ -185,6 +186,12 @@ pub enum Error { rolegroup: RoleGroupRef, }, + #[snafu(display("failed to apply metrics Service for [{rolegroup}]"))] + ApplyRoleGroupMetricsService { + source: stackable_operator::cluster_resources::Error, + rolegroup: RoleGroupRef, + }, + #[snafu(display("failed to build ConfigMap for [{rolegroup}]"))] BuildRoleGroupConfig { source: stackable_operator::builder::configmap::Error, @@ -337,19 +344,20 @@ pub struct OpaClusterConfigFile { bundles: OpaClusterBundle, #[serde(skip_serializing_if = "Option::is_none")] decision_logs: Option, + status: Option, } impl OpaClusterConfigFile { pub fn new(decision_logging: Option) -> Self { Self { services: vec![OpaClusterConfigService { - name: String::from("stackable"), - url: String::from("http://localhost:3030/opa/v1"), + name: OPA_STACKABLE_SERVICE_NAME.to_owned(), + url: "http://localhost:3030/opa/v1".to_owned(), }], bundles: OpaClusterBundle { stackable: OpaClusterBundleConfig { - service: String::from("stackable"), - resource: String::from("opa/bundle.tar.gz"), + service: OPA_STACKABLE_SERVICE_NAME.to_owned(), + resource: "opa/bundle.tar.gz".to_owned(), persist: true, polling: OpaClusterBundleConfigPolling { min_delay_seconds: 10, @@ -358,6 +366,12 @@ impl OpaClusterConfigFile { }, }, decision_logs: decision_logging, + // Enable more Prometheus statistics, such as bundle loads + // See https://www.openpolicyagent.org/docs/monitoring#status-metrics + status: Some(OpaClusterConfigStatus { + service: OPA_STACKABLE_SERVICE_NAME.to_owned(), + prometheus: true, + }), } } } @@ -392,6 +406,12 @@ pub struct OpaClusterConfigDecisionLog { console: bool, } +#[derive(Serialize, Deserialize)] +struct OpaClusterConfigStatus { + service: String, + prometheus: bool, +} + pub async fn reconcile_opa( opa: Arc>, ctx: Arc, @@ -489,7 +509,10 @@ pub async fn reconcile_opa( &rolegroup, &merged_config, )?; - let rg_service = build_rolegroup_service(opa, &resolved_product_image, &rolegroup)?; + let rg_service = + build_rolegroup_headless_service(opa, &resolved_product_image, &rolegroup)?; + let rg_metrics_service = + build_rolegroup_metrics_service(opa, &resolved_product_image, &rolegroup)?; let rg_daemonset = build_server_rolegroup_daemonset( opa, &resolved_product_image, @@ -515,6 +538,12 @@ pub async fn reconcile_opa( .with_context(|_| ApplyRoleGroupServiceSnafu { rolegroup: rolegroup.clone(), })?; + cluster_resources + .add(client, rg_metrics_service) + .await + .with_context(|_| ApplyRoleGroupServiceSnafu { + rolegroup: rolegroup.clone(), + })?; ds_cond_builder.add( cluster_resources .add(client, rg_daemonset.clone()) @@ -632,17 +661,14 @@ pub fn build_server_role_service( /// The rolegroup [`Service`] is a headless service that allows direct access to the instances of a certain rolegroup /// /// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing. -fn build_rolegroup_service( +fn build_rolegroup_headless_service( opa: &v1alpha1::OpaCluster, resolved_product_image: &ResolvedProductImage, rolegroup: &RoleGroupRef, ) -> Result { - let prometheus_label = - Label::try_from(("prometheus.io/scrape", "true")).context(BuildLabelSnafu)?; - let metadata = ObjectMetaBuilder::new() .name_and_namespace(opa) - .name(rolegroup.object_name()) + .name(rolegroup.rolegroup_headless_service_name()) .ownerreference_from_resource(opa, None, Some(true)) .context(ObjectMissingMetadataForOwnerRefSnafu)? .with_recommended_labels(build_recommended_labels( @@ -652,19 +678,20 @@ fn build_rolegroup_service( &rolegroup.role_group, )) .context(ObjectMetaSnafu)? - .with_label(prometheus_label) .build(); - let service_selector_labels = - Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) - .context(BuildLabelSnafu)?; - let service_spec = ServiceSpec { - // Internal communication does not need to be exposed + // Currently we don't offer listener-exposition of OPA mostly due to security concerns. + // OPA is currently public within the Kubernetes (without authentication). + // Opening it up to outside of Kubernetes might worsen things. + // We are open to implement listener-integration, but this needs to be though through before + // implementing it. + // Note: We have kind of similar situations for HMS and Zookeeper, as the authentication + // options there are non-existent (mTLS still opens plain port) or suck (Kerberos). type_: Some("ClusterIP".to_string()), cluster_ip: Some("None".to_string()), - ports: Some(service_ports()), - selector: Some(service_selector_labels.into()), + ports: Some(data_service_ports()), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), publish_not_ready_addresses: Some(true), ..ServiceSpec::default() }; @@ -676,6 +703,55 @@ fn build_rolegroup_service( }) } +/// The rolegroup metrics [`Service`] is a service that exposes metrics and has the +/// prometheus.io/scrape label. +fn build_rolegroup_metrics_service( + opa: &v1alpha1::OpaCluster, + resolved_product_image: &ResolvedProductImage, + rolegroup: &RoleGroupRef, +) -> Result { + let labels = Labels::try_from([("prometheus.io/scrape", "true")]) + .expect("static Prometheus labels must be valid"); + + let metadata = ObjectMetaBuilder::new() + .name_and_namespace(opa) + .name(rolegroup.rolegroup_metrics_service_name()) + .ownerreference_from_resource(opa, None, Some(true)) + .context(ObjectMissingMetadataForOwnerRefSnafu)? + .with_recommended_labels(build_recommended_labels( + opa, + &resolved_product_image.app_version_label, + &rolegroup.role, + &rolegroup.role_group, + )) + .context(ObjectMetaSnafu)? + .with_labels(labels) + .build(); + + let service_spec = ServiceSpec { + type_: Some("ClusterIP".to_string()), + cluster_ip: Some("None".to_string()), + ports: Some(vec![metrics_service_port()]), + selector: Some(role_group_selector_labels(opa, rolegroup)?.into()), + ..ServiceSpec::default() + }; + + Ok(Service { + metadata, + spec: Some(service_spec), + status: None, + }) +} + +/// Returns the [`Labels`] that can be used to select all Pods that are part of the roleGroup. +fn role_group_selector_labels( + opa: &v1alpha1::OpaCluster, + rolegroup: &RoleGroupRef, +) -> Result { + Labels::role_group_selector(opa, APP_NAME, &rolegroup.role, &rolegroup.role_group) + .context(BuildLabelSnafu) +} + /// The rolegroup [`ConfigMap`] configures the rolegroup based on the configuration given by the administrator fn build_server_rolegroup_config_map( opa: &v1alpha1::OpaCluster, @@ -1387,22 +1463,24 @@ fn build_prepare_start_command( prepare_container_args } -fn service_ports() -> Vec { - vec![ - ServicePort { - name: Some(APP_PORT_NAME.to_string()), - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }, - ServicePort { - name: Some(METRICS_PORT_NAME.to_string()), - port: 9504, // Arbitrary port number, this is never actually used anywhere - protocol: Some("TCP".to_string()), - target_port: Some(IntOrString::String(APP_PORT_NAME.to_string())), - ..ServicePort::default() - }, - ] +fn data_service_ports() -> Vec { + // Currently only HTTP is exposed + vec![ServicePort { + name: Some(APP_PORT_NAME.to_string()), + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + }] +} + +fn metrics_service_port() -> ServicePort { + ServicePort { + name: Some(METRICS_PORT_NAME.to_string()), + // The metrics are served on the same port as the HTTP traffic + port: APP_PORT.into(), + protocol: Some("TCP".to_string()), + ..ServicePort::default() + } } /// Creates recommended `ObjectLabels` to be used in deployed resources diff --git a/tests/templates/kuttl/smoke/20-assert.yaml b/tests/templates/kuttl/smoke/20-assert.yaml index 6041b967..579dfd86 100644 --- a/tests/templates/kuttl/smoke/20-assert.yaml +++ b/tests/templates/kuttl/smoke/20-assert.yaml @@ -6,7 +6,7 @@ timeout: 300 apiVersion: apps/v1 kind: StatefulSet metadata: - name: test-regorule + name: test-opa status: readyReplicas: 1 replicas: 1 diff --git a/tests/templates/kuttl/smoke/20-install-test-regorule.yaml b/tests/templates/kuttl/smoke/20-install-test-opa.yaml similarity index 78% rename from tests/templates/kuttl/smoke/20-install-test-regorule.yaml rename to tests/templates/kuttl/smoke/20-install-test-opa.yaml index 816f6148..05d33126 100644 --- a/tests/templates/kuttl/smoke/20-install-test-regorule.yaml +++ b/tests/templates/kuttl/smoke/20-install-test-opa.yaml @@ -2,21 +2,21 @@ apiVersion: apps/v1 kind: StatefulSet metadata: - name: test-regorule + name: test-opa labels: - app: test-regorule + app: test-opa spec: replicas: 1 selector: matchLabels: - app: test-regorule + app: test-opa template: metadata: labels: - app: test-regorule + app: test-opa spec: containers: - - name: test-regorule + - name: test-opa image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev stdin: true tty: true diff --git a/tests/templates/kuttl/smoke/30-assert.yaml b/tests/templates/kuttl/smoke/30-assert.yaml index 0d0ef28f..57a25e21 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml +++ b/tests/templates/kuttl/smoke/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-regorule.py -u 'http://test-opa:8081/v1/data/test' diff --git a/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml b/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml new file mode 100644 index 00000000..84162ae6 --- /dev/null +++ b/tests/templates/kuttl/smoke/30-prepare-test-opa.yaml @@ -0,0 +1,5 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: kubectl cp -n $NAMESPACE ./30_test-regorule.py test-opa-0:/tmp + - script: kubectl cp -n $NAMESPACE ./30_test-metrics.py test-opa-0:/tmp diff --git a/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml b/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml deleted file mode 100644 index 1623156a..00000000 --- a/tests/templates/kuttl/smoke/30-prepare-test-regorule.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kuttl.dev/v1beta1 -kind: TestStep -commands: - - script: kubectl cp -n $NAMESPACE ./test-regorule.py test-regorule-0:/tmp diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py new file mode 100644 index 00000000..d09945ec --- /dev/null +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +import requests + +# metrics_url = "http://test-opa-server-default:9504/metrics" +metrics_url = "http://test-opa-server-default-metrics:8081/metrics" +# FIXME: Ideally this would be exposed via a metrics service (as the other operators do) +response = requests.get(metrics_url) + +assert response.status_code == 200, "Metrics endpoint must return a 200 status code" +assert "bundle_loaded_counter" in response.text, f"Metric bundle_loaded_counter should exist in {metrics_url}" + +print("Metrics test successful!") diff --git a/tests/templates/kuttl/smoke/test-regorule.py b/tests/templates/kuttl/smoke/30_test-regorule.py similarity index 92% rename from tests/templates/kuttl/smoke/test-regorule.py rename to tests/templates/kuttl/smoke/30_test-regorule.py index 3e2a78b6..9d378a87 100755 --- a/tests/templates/kuttl/smoke/test-regorule.py +++ b/tests/templates/kuttl/smoke/30_test-regorule.py @@ -34,7 +34,7 @@ and "hello" in response["result"] and response["result"]["hello"] ): - print("Test successful!") + print("Regorule test successful!") exit(0) else: print( @@ -43,3 +43,5 @@ + " - expected: {'result': {'hello': True}}" ) exit(-1) + + metrics = requests.get(f"{url}/metrics") diff --git a/tests/templates/kuttl/smoke/31-assert.yaml b/tests/templates/kuttl/smoke/31-assert.yaml new file mode 100644 index 00000000..0084e371 --- /dev/null +++ b/tests/templates/kuttl/smoke/31-assert.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestAssert +metadata: + name: test-metrics +commands: + - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-metrics.py From 44d098bfaec84fdb7294503a6d25e297d0ed2976 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 10:34:27 +0200 Subject: [PATCH 02/16] changelog --- CHANGELOG.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f26dd2ed..28ac6eae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,8 @@ All notable changes to this project will be documented in this file. - Support experimental user-info-fetcher Entra backend to fetch user groups ([#712]). - Add support for OPA `1.4.2` ([#723]). - Add RBAC rule to helm template for automatic cluster domain detection ([#743]). -- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#XXX]). -- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#XXX]). +- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]). +- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]). ### Changed @@ -52,7 +52,7 @@ All notable changes to this project will be documented in this file. - The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`, so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#745]). - The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there - purpose and to be consistent with other operators ([#XXX]). + purpose and to be consistent with other operators ([#748]). ### Fixed @@ -81,6 +81,7 @@ All notable changes to this project will be documented in this file. [#743]: https://github.com/stackabletech/opa-operator/pull/743 [#744]: https://github.com/stackabletech/opa-operator/pull/744 [#745]: https://github.com/stackabletech/opa-operator/pull/745 +[#748]: https://github.com/stackabletech/opa-operator/pull/748 ## [25.3.0] - 2025-03-21 From d2ab8940680e7779a66b874f86eff2bc535c88cb Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 10:36:13 +0200 Subject: [PATCH 03/16] improve doc comment --- rust/operator-binary/src/controller.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 6dffbe7d..1975810a 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -366,7 +366,7 @@ impl OpaClusterConfigFile { }, }, decision_logs: decision_logging, - // Enable more Prometheus statistics, such as bundle loads + // Enable more Prometheus metrics, such as bundle loads // See https://www.openpolicyagent.org/docs/monitoring#status-metrics status: Some(OpaClusterConfigStatus { service: OPA_STACKABLE_SERVICE_NAME.to_owned(), From 78481b1e048a657a823334611a6a2cd809bae26b Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 10:46:42 +0200 Subject: [PATCH 04/16] linter --- tests/templates/kuttl/smoke/30_test-metrics.py | 3 ++- tests/templates/kuttl/smoke/30_test-regorule.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py index d09945ec..fdb81667 100644 --- a/tests/templates/kuttl/smoke/30_test-metrics.py +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -7,6 +7,7 @@ response = requests.get(metrics_url) assert response.status_code == 200, "Metrics endpoint must return a 200 status code" -assert "bundle_loaded_counter" in response.text, f"Metric bundle_loaded_counter should exist in {metrics_url}" +assert "bundle_loaded_counter" in response.text, + f"Metric bundle_loaded_counter should exist in {metrics_url}" print("Metrics test successful!") diff --git a/tests/templates/kuttl/smoke/30_test-regorule.py b/tests/templates/kuttl/smoke/30_test-regorule.py index 9d378a87..d3caedc2 100755 --- a/tests/templates/kuttl/smoke/30_test-regorule.py +++ b/tests/templates/kuttl/smoke/30_test-regorule.py @@ -43,5 +43,3 @@ + " - expected: {'result': {'hello': True}}" ) exit(-1) - - metrics = requests.get(f"{url}/metrics") From 4b876ec68be8103411e5265107621cb866527b92 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 10:57:30 +0200 Subject: [PATCH 05/16] linter --- tests/templates/kuttl/smoke/30_test-metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py index fdb81667..28cb9490 100644 --- a/tests/templates/kuttl/smoke/30_test-metrics.py +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -7,7 +7,7 @@ response = requests.get(metrics_url) assert response.status_code == 200, "Metrics endpoint must return a 200 status code" -assert "bundle_loaded_counter" in response.text, +assert "bundle_loaded_counter" in response.text, \ f"Metric bundle_loaded_counter should exist in {metrics_url}" print("Metrics test successful!") From 6fb8a5555a66c5609a3bca1e47eb7969791da8b5 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 11:12:15 +0200 Subject: [PATCH 06/16] Add container port --- rust/operator-binary/src/controller.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 1975810a..53061d2e 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -640,12 +640,7 @@ pub fn build_server_role_service( let service_spec = ServiceSpec { type_: Some(opa.spec.cluster_config.listener_class.k8s_service_type()), - ports: Some(vec![ServicePort { - name: Some(APP_PORT_NAME.to_string()), - port: APP_PORT.into(), - protocol: Some("TCP".to_string()), - ..ServicePort::default() - }]), + ports: Some(data_service_ports()), selector: Some(service_selector_labels.into()), internal_traffic_policy: Some("Local".to_string()), ..ServiceSpec::default() @@ -980,6 +975,8 @@ fn build_server_rolegroup_daemonset( format!("{STACKABLE_LOG_DIR}/containerdebug"), ) .add_container_port(APP_PORT_NAME, APP_PORT.into()) + // The metrics are served on the same port as the HTTP traffic + .add_container_port(METRICS_PORT_NAME, APP_PORT.into()) .add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR) .context(AddVolumeMountSnafu)? .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR) From 376aba0a12c22c54d137e928196d0a0b32a9da2b Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 11:16:24 +0200 Subject: [PATCH 07/16] fix port problem --- rust/operator-binary/src/controller.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 53061d2e..7b599b40 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -975,8 +975,11 @@ fn build_server_rolegroup_daemonset( format!("{STACKABLE_LOG_DIR}/containerdebug"), ) .add_container_port(APP_PORT_NAME, APP_PORT.into()) - // The metrics are served on the same port as the HTTP traffic - .add_container_port(METRICS_PORT_NAME, APP_PORT.into()) + // If we also add a container port "metrics" pointing to the same port number, we get a + // + // .spec.template.spec.containers[name="opa"].ports: duplicate entries for key [containerPort=8081,protocol="TCP"] + // + // So we don't do that .add_volume_mount(CONFIG_VOLUME_NAME, CONFIG_DIR) .context(AddVolumeMountSnafu)? .add_volume_mount(LOG_VOLUME_NAME, STACKABLE_LOG_DIR) From c6d1ce38d89bcb9acaf799140327fa84cf7eaab2 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Tue, 22 Jul 2025 12:20:24 +0200 Subject: [PATCH 08/16] fix tests und role service name --- rust/operator-binary/src/crd/mod.rs | 8 ++++++-- tests/templates/kuttl/aas-user-info/30-assert.yaml | 2 +- tests/templates/kuttl/ad-user-info/30-assert.yaml | 2 +- tests/templates/kuttl/keycloak-user-info/30-assert.yaml | 2 +- tests/templates/kuttl/smoke/30_test-metrics.py | 2 -- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 010a0c29..2bcbf0f5 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -17,7 +17,7 @@ use stackable_operator::{ merge::Merge, }, k8s_openapi::apimachinery::pkg::api::resource::Quantity, - kube::CustomResource, + kube::{CustomResource, ResourceExt}, product_config_utils::Configuration, product_logging::{self, spec::Logging}, role_utils::{ @@ -326,7 +326,11 @@ impl v1alpha1::OpaCluster { /// The name of the role-level load-balanced Kubernetes `Service` pub fn server_role_service_name(&self) -> Option { - self.metadata.name.clone() + Some(format!( + "{cluster_name}-{role}", + cluster_name = self.name_any(), + role = v1alpha1::OpaRole::Server + )) } /// The fully-qualified domain name of the role-level load-balanced Kubernetes `Service` diff --git a/tests/templates/kuttl/aas-user-info/30-assert.yaml b/tests/templates/kuttl/aas-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/aas-user-info/30-assert.yaml +++ b/tests/templates/kuttl/aas-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/ad-user-info/30-assert.yaml b/tests/templates/kuttl/ad-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/ad-user-info/30-assert.yaml +++ b/tests/templates/kuttl/ad-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/keycloak-user-info/30-assert.yaml b/tests/templates/kuttl/keycloak-user-info/30-assert.yaml index 0d0ef28f..c9f2337d 100644 --- a/tests/templates/kuttl/keycloak-user-info/30-assert.yaml +++ b/tests/templates/kuttl/keycloak-user-info/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server-default:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-regorule-0 -- python /tmp/test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py index 28cb9490..ffe99c6e 100644 --- a/tests/templates/kuttl/smoke/30_test-metrics.py +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -1,9 +1,7 @@ #!/usr/bin/env python import requests -# metrics_url = "http://test-opa-server-default:9504/metrics" metrics_url = "http://test-opa-server-default-metrics:8081/metrics" -# FIXME: Ideally this would be exposed via a metrics service (as the other operators do) response = requests.get(metrics_url) assert response.status_code == 200, "Metrics endpoint must return a 200 status code" From 4c2c8a470261f49490f0ce72080ebb364691a5c9 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Tue, 22 Jul 2025 12:27:01 +0200 Subject: [PATCH 09/16] fix linter --- tests/templates/kuttl/smoke/30_test-metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/templates/kuttl/smoke/30_test-metrics.py b/tests/templates/kuttl/smoke/30_test-metrics.py index ffe99c6e..54625f03 100644 --- a/tests/templates/kuttl/smoke/30_test-metrics.py +++ b/tests/templates/kuttl/smoke/30_test-metrics.py @@ -5,7 +5,7 @@ response = requests.get(metrics_url) assert response.status_code == 200, "Metrics endpoint must return a 200 status code" -assert "bundle_loaded_counter" in response.text, \ +assert "bundle_loaded_counter" in response.text, ( f"Metric bundle_loaded_counter should exist in {metrics_url}" - +) print("Metrics test successful!") From 3fd592eaa4b5fc9e80fc758dc9ec50132bc02850 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 12:52:23 +0200 Subject: [PATCH 10/16] Update rust/operator-binary/src/controller.rs Co-authored-by: Malte Sander --- rust/operator-binary/src/controller.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/operator-binary/src/controller.rs b/rust/operator-binary/src/controller.rs index 7b599b40..0c6f0384 100644 --- a/rust/operator-binary/src/controller.rs +++ b/rust/operator-binary/src/controller.rs @@ -679,7 +679,7 @@ fn build_rolegroup_headless_service( // Currently we don't offer listener-exposition of OPA mostly due to security concerns. // OPA is currently public within the Kubernetes (without authentication). // Opening it up to outside of Kubernetes might worsen things. - // We are open to implement listener-integration, but this needs to be though through before + // We are open to implement listener-integration, but this needs to be thought through before // implementing it. // Note: We have kind of similar situations for HMS and Zookeeper, as the authentication // options there are non-existent (mTLS still opens plain port) or suck (Kerberos). From bec72eeba174d8ed624c096fea90044bdf4d8d2e Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 22 Jul 2025 12:57:57 +0200 Subject: [PATCH 11/16] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28ac6eae..62e5d72e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ All notable changes to this project will be documented in this file. so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#745]). - The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there purpose and to be consistent with other operators ([#748]). +- The per-role server service is now prefixed with `-server` to be consistent with other operators ([#748]). ### Fixed From b39ea444fce6206d831191f0f39ba8e87dbfec35 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 24 Jul 2025 13:55:44 +0200 Subject: [PATCH 12/16] changelog --- CHANGELOG.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03b42cdc..067584a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +### Added + +- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]). +- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]). + +### Changed + +- BREAKING: The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there + purpose and to be consistent with other operators ([#748]). +- BREAKING: The per-role server service is now prefixed with `-server` to be consistent with other operators ([#748]). + +[#748]: https://github.com/stackabletech/opa-operator/pull/748 + ## [25.7.0] - 2025-07-23 ## [25.7.0-rc1] - 2025-07-18 @@ -18,8 +31,6 @@ All notable changes to this project will be documented in this file. - Support experimental user-info-fetcher Entra backend to fetch user groups ([#712]). - Add support for OPA `1.4.2` ([#723]). - Add RBAC rule to helm template for automatic cluster domain detection ([#743]). -- Add a dedicated per-rolegroup `-metrics` Service, which can be used to get Prometheus metrics ([#748]). -- Expose more Prometheus metrics, such as successful or failed bundle loads and information about the OPA environment ([#748]). ### Changed @@ -55,9 +66,6 @@ All notable changes to this project will be documented in this file. - The CLI argument `--kubernetes-node-name` or env variable `KUBERNETES_NODE_NAME` needs to be set. The helm-chart takes care of this. - The operator helm-chart now grants RBAC `patch` permissions on `events.k8s.io/events`, so events can be aggregated (e.g. "error happened 10 times over the last 5 minutes") ([#745]). -- The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there - purpose and to be consistent with other operators ([#748]). -- The per-role server service is now prefixed with `-server` to be consistent with other operators ([#748]). ### Fixed @@ -86,7 +94,6 @@ All notable changes to this project will be documented in this file. [#743]: https://github.com/stackabletech/opa-operator/pull/743 [#744]: https://github.com/stackabletech/opa-operator/pull/744 [#745]: https://github.com/stackabletech/opa-operator/pull/745 -[#748]: https://github.com/stackabletech/opa-operator/pull/748 ## [25.3.0] - 2025-03-21 From 311784c44e81bf255e8ffe2963e3a71cfde481dc Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 24 Jul 2025 13:56:29 +0200 Subject: [PATCH 13/16] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 067584a6..2b9c6a69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes to this project will be documented in this file. ### Changed -- BREAKING: The per-rolegroup services now only server the HTTP port and have a `-headless` suffix to better indicate there +- BREAKING: The per-rolegroup services now only serves the HTTP port and has a `-headless` suffix to better indicate there purpose and to be consistent with other operators ([#748]). - BREAKING: The per-role server service is now prefixed with `-server` to be consistent with other operators ([#748]). From a43bcb26563cb367b5887565fecf87daf7d9ca01 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 24 Jul 2025 14:02:02 +0200 Subject: [PATCH 14/16] Update CHANGELOG.md Co-authored-by: Malte Sander --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b9c6a69..466f91f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ All notable changes to this project will be documented in this file. ### Changed -- BREAKING: The per-rolegroup services now only serves the HTTP port and has a `-headless` suffix to better indicate there +- BREAKING: The per-rolegroup services now only serves the HTTP port and has a `-headless` suffix to better indicate their purpose and to be consistent with other operators ([#748]). - BREAKING: The per-role server service is now prefixed with `-server` to be consistent with other operators ([#748]). From f6f1bbba339c24ba3e2bcde658488cf2c9c9de52 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 24 Jul 2025 14:19:24 +0200 Subject: [PATCH 15/16] fix tests --- tests/templates/kuttl/logging/test_log_aggregation.py | 2 +- tests/templates/kuttl/smoke/30-assert.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/templates/kuttl/logging/test_log_aggregation.py b/tests/templates/kuttl/logging/test_log_aggregation.py index 3ed838f0..7d526296 100755 --- a/tests/templates/kuttl/logging/test_log_aggregation.py +++ b/tests/templates/kuttl/logging/test_log_aggregation.py @@ -5,7 +5,7 @@ def send_opa_decision_request(): response = requests.post( - 'http://test-opa:8081/v1/data/test/world' + 'http://test-opa-server:8081/v1/data/test/world' ) assert response.status_code == 200, \ diff --git a/tests/templates/kuttl/smoke/30-assert.yaml b/tests/templates/kuttl/smoke/30-assert.yaml index 57a25e21..9c83596e 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml +++ b/tests/templates/kuttl/smoke/30-assert.yaml @@ -4,4 +4,4 @@ kind: TestAssert metadata: name: test-regorule commands: - - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-regorule.py -u 'http://test-opa:8081/v1/data/test' + - script: kubectl exec -n $NAMESPACE test-opa-0 -- python /tmp/30_test-regorule.py -u 'http://test-opa-server:8081/v1/data/test' From be56034469f97b5a527206bc45cafc18a89f8863 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 24 Jul 2025 14:29:51 +0200 Subject: [PATCH 16/16] ruff ruff --- .../kuttl/logging/test_log_aggregation.py | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/tests/templates/kuttl/logging/test_log_aggregation.py b/tests/templates/kuttl/logging/test_log_aggregation.py index 7d526296..8ba52855 100755 --- a/tests/templates/kuttl/logging/test_log_aggregation.py +++ b/tests/templates/kuttl/logging/test_log_aggregation.py @@ -4,18 +4,16 @@ def send_opa_decision_request(): - response = requests.post( - 'http://test-opa-server:8081/v1/data/test/world' - ) + response = requests.post("http://test-opa-server:8081/v1/data/test/world") + + assert response.status_code == 200, "Cannot access the API of the opa cluster." - assert response.status_code == 200, \ - 'Cannot access the API of the opa cluster.' def check_sent_events(): response = requests.post( - 'http://opa-vector-aggregator:8686/graphql', + "http://opa-vector-aggregator:8686/graphql", json={ - 'query': """ + "query": """ { transforms(first:100) { nodes { @@ -29,31 +27,32 @@ def check_sent_events(): } } """ - } + }, ) - assert response.status_code == 200, \ - 'Cannot access the API of the vector aggregator.' + assert response.status_code == 200, ( + "Cannot access the API of the vector aggregator." + ) result = response.json() - transforms = result['data']['transforms']['nodes'] + transforms = result["data"]["transforms"]["nodes"] for transform in transforms: - sentEvents = transform['metrics']['sentEventsTotal'] - componentId = transform['componentId'] + sentEvents = transform["metrics"]["sentEventsTotal"] + componentId = transform["componentId"] - if componentId == 'filteredInvalidEvents': - assert sentEvents is None or \ - sentEvents['sentEventsTotal'] == 0, \ - 'Invalid log events were sent.' + if componentId == "filteredInvalidEvents": + assert sentEvents is None or sentEvents["sentEventsTotal"] == 0, ( + "Invalid log events were sent." + ) else: - assert sentEvents is not None and \ - sentEvents['sentEventsTotal'] > 0, \ - f'No events were sent in "{componentId}".' + assert sentEvents is not None and sentEvents["sentEventsTotal"] > 0, ( + f'No events were sent in "{componentId}".' + ) -if __name__ == '__main__': +if __name__ == "__main__": send_opa_decision_request() time.sleep(10) check_sent_events() - print('Test successful!') + print("Test successful!")