From 0bbcd92692545263f13588b572234719dae9fe36 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 14:31:40 +0100 Subject: [PATCH 1/6] add warning to format-namenode script --- rust/operator-binary/src/container.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 5503422b..88347834 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -709,6 +709,16 @@ impl ContainerConfig { if [ ! -f "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" ] then + # Sanity check for data corruption: VERSION file exists but no fsimage_xxx files were created. + FSIMAGE_COUNT=$(find "{NAMENODE_ROOT_DATA_DIR}/current" -maxdepth 1 -regextype posix-egrep -regex ".*/fsimage_[0-9]+" | wc -l) + + if [ "${{FSIMAGE_COUNT}}" -eq 0 ] + then + echo "WARNING: {NAMENODE_ROOT_DATA_DIR}/current/VERSION file exists but no fsimage file(s) found." + echo "This indicates an incomplete and corrupted namenode formatting. Please check the troubleshooting guide. + exit 1 + fi + if [ -z ${{ACTIVE_NAMENODE+x}} ] then echo "No active namenode found. Formatting $POD_NAME as active." 
From 81628d24978c2c69a380d6b9cdacb94f72d51aa0 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 15:22:06 +0100 Subject: [PATCH 2/6] wip --- .../hdfs/pages/reference/troubleshooting.adoc | 27 +++++++++++++++++++ docs/modules/hdfs/partials/nav.adoc | 1 + rust/operator-binary/src/container.rs | 20 +++++++------- rust/operator-binary/src/hdfs_controller.rs | 8 +----- tests/templates/kuttl/smoke/30-assert.yaml.j2 | 9 ------- 5 files changed, 39 insertions(+), 26 deletions(-) create mode 100644 docs/modules/hdfs/pages/reference/troubleshooting.adoc diff --git a/docs/modules/hdfs/pages/reference/troubleshooting.adoc b/docs/modules/hdfs/pages/reference/troubleshooting.adoc new file mode 100644 index 00000000..70a5e5b6 --- /dev/null +++ b/docs/modules/hdfs/pages/reference/troubleshooting.adoc @@ -0,0 +1,27 @@ += Troubleshooting + +[#init-container-format-namenode-fails] +== Init container format-namenodes fails + +When creating fresh HDFS clusters, unexpected pod restarts might corrupt the initial namenode formatting. +This leaves the namenode data PVC in a dangling state, where e.g. the `../current/VERSION` file is created, but `fsimage_xxx` files are missing. + +After a restart corrupted the namenode formatting, reformatting again fails due to directories and files existing. +We do not want to force (override) the formatting process to avoid data loss and other implications. + +[source] +---- +Running in non-interactive mode, and data appears to exist in Storage Directory root= /stackable/data/namenode; location= null. Not formatting. +---- + +Another error message indicating a corrupt formatting state appears in the namenode main container during startup. + +[source] +---- +java.io.FileNotFoundException: No valid image files found +---- + +WARNING: The following fix should only be applied to fresh clusters. For existing clusters please consider support. + +1. Remove the PVC called `data--namenode--0` for a failed namenode 0. +2. 
Restart the namenode afterwards. diff --git a/docs/modules/hdfs/partials/nav.adoc b/docs/modules/hdfs/partials/nav.adoc index 2e67c8e8..b3631067 100644 --- a/docs/modules/hdfs/partials/nav.adoc +++ b/docs/modules/hdfs/partials/nav.adoc @@ -23,3 +23,4 @@ ** xref:hdfs:reference/discovery.adoc[] ** xref:hdfs:reference/commandline-parameters.adoc[] ** xref:hdfs:reference/environment-variables.adoc[] +* xref:hdfs:reference/troubleshooting.adoc[] diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 88347834..84f7e850 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -709,16 +709,6 @@ impl ContainerConfig { if [ ! -f "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" ] then - # Sanity check for data corruption: VERSION file exists but no fsimage_xxx files were created. - FSIMAGE_COUNT=$(find "{NAMENODE_ROOT_DATA_DIR}/current" -maxdepth 1 -regextype posix-egrep -regex ".*/fsimage_[0-9]+" | wc -l) - - if [ "${{FSIMAGE_COUNT}}" -eq 0 ] - then - echo "WARNING: {NAMENODE_ROOT_DATA_DIR}/current/VERSION file exists but no fsimage file(s) found." - echo "This indicates an incomplete and corrupted namenode formatting. Please check the troubleshooting guide. - exit 1 - fi - if [ -z ${{ACTIVE_NAMENODE+x}} ] then echo "No active namenode found. Formatting $POD_NAME as active." @@ -728,6 +718,16 @@ impl ContainerConfig { exclude_from_capture {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive fi else + # Sanity check for initial format data corruption: VERSION file exists but no fsimage files were created. + FSIMAGE_COUNT=$(find "{NAMENODE_ROOT_DATA_DIR}/current" -maxdepth 1 -regextype posix-egrep -regex ".*/fsimage_[0-9]+" | wc -l) + + if [ "${{FSIMAGE_COUNT}}" -eq 0 ] + then + echo "WARNING: {NAMENODE_ROOT_DATA_DIR}/current/VERSION file exists but no fsimage files were found." + echo "This indicates an incomplete and corrupted namenode formatting. 
Please check the troubleshooting guide." + exit 1 + fi + cat "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" echo "Pod $POD_NAME already formatted. Skipping..." fi diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index ea884d36..34d4f68a 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -22,7 +22,6 @@ use stackable_operator::{ product_image_selection::{self, ResolvedProductImage}, rbac::build_rbac_resources, }, - constants::RESTART_CONTROLLER_ENABLED_LABEL, iter::reverse_if, k8s_openapi::{ DeepMerge, @@ -901,13 +900,8 @@ fn rolegroup_statefulset( ..StatefulSetSpec::default() }; - let sts_metadata = metadata - .clone() - .with_label(RESTART_CONTROLLER_ENABLED_LABEL.to_owned()) - .build(); - Ok(StatefulSet { - metadata: sts_metadata, + metadata: metadata.build(), spec: Some(statefulset_spec), status: None, }) diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2 index 2c9cb0e3..2f17591a 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2 @@ -7,9 +7,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-namenode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: @@ -35,9 +32,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-journalnode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: @@ -62,9 +56,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-datanode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: From 65660c28b38acc8c63cb53d6456936ba0e4326dd Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 15:25:45 +0100 Subject: [PATCH 
3/6] adapted changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e26a17a5..679ae8bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,11 @@ All notable changes to this project will be documented in this file. - Support objectOverrides using `.spec.objectOverrides`. See [objectOverrides concepts page](https://docs.stackable.tech/home/nightly/concepts/overrides/#object-overrides) for details ([#741]). -- Enable the [restart-controller](https://docs.stackable.tech/home/nightly/commons-operator/restarter/), so that the Pods are automatically restarted on config changes ([#743]). ### Changed - Gracefully shutdown all concurrent tasks by forwarding the SIGTERM signal ([#747]). +- Added warning and exit condition to format-namenodes container script to check for corrupted data after formatting ([#751]). ### Fixed @@ -22,6 +22,7 @@ All notable changes to this project will be documented in this file. [#743]: https://github.com/stackabletech/hdfs-operator/pull/743 [#746]: https://github.com/stackabletech/hdfs-operator/pull/746 [#747]: https://github.com/stackabletech/hdfs-operator/pull/747 +[#751]: https://github.com/stackabletech/hdfs-operator/pull/751 ## [25.11.0] - 2025-11-07 From b63c5ea7f51676910eb393b11e38482d352218a2 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 15:26:32 +0100 Subject: [PATCH 4/6] remove pr ref for restart enable --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 679ae8bc..1725a56e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,6 @@ All notable changes to this project will be documented in this file. - Previously, some shell output of init-containers was not logged properly and therefore not aggregated, which is fixed now ([#746]). 
[#741]: https://github.com/stackabletech/hdfs-operator/pull/741 -[#743]: https://github.com/stackabletech/hdfs-operator/pull/743 [#746]: https://github.com/stackabletech/hdfs-operator/pull/746 [#747]: https://github.com/stackabletech/hdfs-operator/pull/747 [#751]: https://github.com/stackabletech/hdfs-operator/pull/751 From 69788a9308ffc2b00206cb7729722ed7bba0c5e5 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 16:36:50 +0100 Subject: [PATCH 5/6] Update docs/modules/hdfs/pages/reference/troubleshooting.adoc Co-authored-by: Sebastian Bernauer --- docs/modules/hdfs/pages/reference/troubleshooting.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/hdfs/pages/reference/troubleshooting.adoc b/docs/modules/hdfs/pages/reference/troubleshooting.adoc index 70a5e5b6..bb2d80f8 100644 --- a/docs/modules/hdfs/pages/reference/troubleshooting.adoc +++ b/docs/modules/hdfs/pages/reference/troubleshooting.adoc @@ -4,7 +4,7 @@ == Init container format-namenodes fails When creating fresh HDFS clusters, unexpected pod restarts might corrupt the initial namenode formatting. -This leaves the namenode data PVC in a dangling state, where e.g. the `../current/VERSION` file is created, but `fsimage_xxx` files are missing. +This leaves the namenode data PVC in a dangling state, where e.g. the `../current/VERSION` file is created, but `../current/fsimage_xxx` files are missing. After a restart corrupted the namenode formatting, reformatting again fails due to directories and files existing. We do not want to force (override) the formatting process to avoid data loss and other implications. 
From 63a2072f148e5ec1a8e5b68581a74abb3ab22dda Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 11 Feb 2026 16:49:33 +0100 Subject: [PATCH 6/6] precommit --- docs/modules/hdfs/pages/reference/troubleshooting.adoc | 2 +- rust/operator-binary/src/container.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/modules/hdfs/pages/reference/troubleshooting.adoc b/docs/modules/hdfs/pages/reference/troubleshooting.adoc index bb2d80f8..38a783a3 100644 --- a/docs/modules/hdfs/pages/reference/troubleshooting.adoc +++ b/docs/modules/hdfs/pages/reference/troubleshooting.adoc @@ -23,5 +23,5 @@ java.io.FileNotFoundException: No valid image files found WARNING: The following fix should only be applied to fresh clusters. For existing clusters please consider support. -1. Remove the PVC called `data--namenode--0` for a failed namenode 0. +1. Remove the PVC called `data--namenode--0` for a failed namenode 0. 2. Restart the namenode afterwards. diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 84f7e850..e0677f47 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -721,7 +721,7 @@ impl ContainerConfig { # Sanity check for initial format data corruption: VERSION file exists but no fsimage files were created. FSIMAGE_COUNT=$(find "{NAMENODE_ROOT_DATA_DIR}/current" -maxdepth 1 -regextype posix-egrep -regex ".*/fsimage_[0-9]+" | wc -l) - if [ "${{FSIMAGE_COUNT}}" -eq 0 ] + if [ "${{FSIMAGE_COUNT}}" -eq 0 ] then echo "WARNING: {NAMENODE_ROOT_DATA_DIR}/current/VERSION file exists but no fsimage files were found." echo "This indicates an incomplete and corrupted namenode formatting. Please check the troubleshooting guide."