From 9b66d04bcef4e35bdb43b7f3fe84815cd7690cfc Mon Sep 17 00:00:00 2001 From: cbippley Date: Fri, 16 Jan 2026 15:18:22 -0500 Subject: [PATCH] OCPBUGS-11767 unstable cluster due to admission webhooks --- .../builds/triggering-builds-build-hooks.adoc | 16 +++++ ...accessing-monitoring-web-service-apis.adoc | 10 +-- .../preventing-cluster-webhook-failure.adoc | 20 ++++++ .../recover-unstable-cluster-webhooks.adoc | 68 +++++++++++++++++++ 4 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 modules/preventing-cluster-webhook-failure.adoc create mode 100644 modules/recover-unstable-cluster-webhooks.adoc diff --git a/cicd/builds/triggering-builds-build-hooks.adoc b/cicd/builds/triggering-builds-build-hooks.adoc index 123549e06b44..e7fc7a149726 100644 --- a/cicd/builds/triggering-builds-build-hooks.adoc +++ b/cicd/builds/triggering-builds-build-hooks.adoc @@ -12,6 +12,22 @@ include::modules/builds-triggers.adoc[leveloffset=+1] include::modules/builds-webhook-triggers.adoc[leveloffset=+2] +// Preventing cluster failure due to webhooks +include::modules/preventing-cluster-webhook-failure.adoc[leveloffset=+3] + +// Recovering an unstable cluster due to admission webhooks +include::modules/recover-unstable-cluster-webhooks.adoc[leveloffset=+3] + +[role="_additional-resources"] +.Additional resources + +* xref:../../authentication/using-rbac.adoc#unauthenticated-users-cluster-role-bindings-concept_using-rbac[Cluster role bindings for unauthenticated groups] + +ifndef::openshift-rosa-hcp[] +* xref:../../architecture/admission-plug-ins.adoc#admission-webhooks-about_admission-plug-ins[Webhook admission plugins] + +endif::openshift-rosa-hcp[] + include::modules/builds-using-github-webhooks.adoc[leveloffset=+3] include::modules/builds-using-gitlab-webhooks.adoc[leveloffset=+3] diff --git a/modules/monitoring-about-accessing-monitoring-web-service-apis.adoc b/modules/monitoring-about-accessing-monitoring-web-service-apis.adoc index 16d6aaaba09e..0599013ddaf6 100644 
--- a/modules/monitoring-about-accessing-monitoring-web-service-apis.adoc +++ b/modules/monitoring-about-accessing-monitoring-web-service-apis.adoc @@ -1,17 +1,13 @@ // Module included in the following assemblies: // -// * observability/monitoring/accessing-third-party-monitoring-apis.adoc +// * observability/monitoring/accessing-metrics/accessing-monitoring-apis-by-using-the-cli.adoc :_mod-docs-content-type: CONCEPT [id="about-accessing-monitoring-web-service-apis_{context}"] = About accessing monitoring web service APIs -You can directly access web service API endpoints from the command line for the following monitoring stack components: - -* Prometheus -* Alertmanager -* Thanos Ruler -* Thanos Querier +[role="_abstract"] +To interact with the monitoring stack by using the command line, you can access web service API endpoints for Prometheus, Alertmanager, Thanos Ruler, and Thanos Querier. Direct API access requires bearer token authentication and the correct namespace permissions. [IMPORTANT] ==== diff --git a/modules/preventing-cluster-webhook-failure.adoc b/modules/preventing-cluster-webhook-failure.adoc new file mode 100644 index 000000000000..2d1d488c98d7 --- /dev/null +++ b/modules/preventing-cluster-webhook-failure.adoc @@ -0,0 +1,20 @@ +// Module included in the following assemblies: +// +// * cicd/builds/triggering-builds-build-hooks.adoc + +:_mod-docs-content-type: CONCEPT +[id="third-party-cluster-webhook-failure_{context}"] += Prevent cluster failure due to webhooks + +[role="_abstract"] +To prevent potential cluster failure and ensure that pods can always start, you must configure third-party admission webhooks to exclude infrastructure namespaces. Implementing specific selectors and adopting a `ValidatingAdmissionPolicy` resource provide a more stable environment for cluster recovery and management. + +When possible, use a `ValidatingAdmissionPolicy` resource instead of an admission webhook. 
It does not require an external service, has no timeout limitations, and cannot cause cluster-wide failures. + +If you use admission webhooks, take the following precautions: + +* Configure the webhook to exclude {product-title} and Kubernetes infrastructure namespaces. + +* Configure webhook timeouts to 10 seconds or less to provide a safety buffer for the system-enforced 13-second limit. + +* Set the `failurePolicy` value to `Ignore` for non-critical webhooks so that requests can proceed if the webhook is unavailable. \ No newline at end of file diff --git a/modules/recover-unstable-cluster-webhooks.adoc b/modules/recover-unstable-cluster-webhooks.adoc new file mode 100644 index 000000000000..d69a0a75066e --- /dev/null +++ b/modules/recover-unstable-cluster-webhooks.adoc @@ -0,0 +1,68 @@ +// Module included in the following assemblies: +// +// * cicd/builds/triggering-builds-build-hooks.adoc + +:_mod-docs-content-type: PROCEDURE +[id="third-party-cluster-webhook-failures_{context}"] += Recovering an unstable cluster due to admission webhooks + +[role="_abstract"] +If a misconfigured admission webhook causes your cluster to fail, you must delete the webhook configuration to restore functionality. + +.Procedure + +. Back up the webhook configuration. Choose either `ValidatingWebhookConfiguration` or `MutatingWebhookConfiguration` for the `<webhook_type>` value, and replace `<webhook_name>` with the name of the webhook configuration. ++ +[source,terminal] +---- +$ oc get <webhook_type> <webhook_name> -o yaml > webhook-backup.yaml +---- + +. Delete the webhook. ++ +[source,terminal] +---- +$ oc delete <webhook_type> <webhook_name> +---- + +. Fix the webhook configuration to exclude infrastructure namespaces when you reapply it. 
++ +.Example +[source,yaml] +---- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: machine-api +webhooks: + - name: default.machine.machine.openshift.io + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE", "UPDATE"] + resources: ["pods"] + scope: "*" + clientConfig: + service: + namespace: openshift-machine-api + name: machine-api-operator-webhook + path: "/validate" + admissionReviewVersions: ["v1"] + sideEffects: None + timeoutSeconds: 5 + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: + - openshift + - openshift-apiserver + - openshift-authentication + - openshift-monitoring + - kube-system + - kube-public + - kube-node-lease + - default +---- ++ +Where `kind` is the type of webhook configuration you are using. Valid values are `ValidatingWebhookConfiguration` or `MutatingWebhookConfiguration`. \ No newline at end of file