diff --git a/CHANGELOG.md b/CHANGELOG.md index e26a17a5..1725a56e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,20 +8,20 @@ All notable changes to this project will be documented in this file. - Support objectOverrides using `.spec.objectOverrides`. See [objectOverrides concepts page](https://docs.stackable.tech/home/nightly/concepts/overrides/#object-overrides) for details ([#741]). -- Enable the [restart-controller](https://docs.stackable.tech/home/nightly/commons-operator/restarter/), so that the Pods are automatically restarted on config changes ([#743]). ### Changed - Gracefully shutdown all concurrent tasks by forwarding the SIGTERM signal ([#747]). +- Added warning and exit condition to format-namenodes container script to check for corrupted data after formatting ([#751]). ### Fixed - Previously, some shell output of init-containers was not logged properly and therefore not aggregated, which is fixed now ([#746]). [#741]: https://github.com/stackabletech/hdfs-operator/pull/741 -[#743]: https://github.com/stackabletech/hdfs-operator/pull/743 [#746]: https://github.com/stackabletech/hdfs-operator/pull/746 [#747]: https://github.com/stackabletech/hdfs-operator/pull/747 +[#751]: https://github.com/stackabletech/hdfs-operator/pull/751 ## [25.11.0] - 2025-11-07 diff --git a/docs/modules/hdfs/pages/reference/troubleshooting.adoc b/docs/modules/hdfs/pages/reference/troubleshooting.adoc new file mode 100644 index 00000000..38a783a3 --- /dev/null +++ b/docs/modules/hdfs/pages/reference/troubleshooting.adoc @@ -0,0 +1,27 @@ += Troubleshooting + +[#init-container-format-namenode-fails] +== Init container format-namenodes fails + +When creating fresh HDFS clusters, unexpected pod restarts might corrupt the initial namenode formatting. +This leaves the namenode data PVC in a dangling state, where e.g. the `../current/VERSION` file is created, but `../current/fsimage_xxx` files are missing. 
+
+After a restart corrupted the namenode formatting, reformatting again fails due to directories and files existing.
+We do not want to force (override) the formatting process to avoid data loss and other implications.
+
+[source]
+----
+Running in non-interactive mode, and data appears to exist in Storage Directory root= /stackable/data/namenode; location= null. Not formatting.
+----
+
+Another error message indicating a corrupt formatting state appears in the namenode main container during startup.
+
+[source]
+----
+java.io.FileNotFoundException: No valid image files found
+----
+
+WARNING: The following fix should only be applied to fresh clusters. For existing clusters, please contact support.
+
+1. Remove the PVC called `data-<cluster-name>-namenode-<role-group>-0` for a failed namenode 0.
+2. Restart the namenode afterwards. diff --git a/docs/modules/hdfs/partials/nav.adoc b/docs/modules/hdfs/partials/nav.adoc index 2e67c8e8..b3631067 100644 --- a/docs/modules/hdfs/partials/nav.adoc +++ b/docs/modules/hdfs/partials/nav.adoc @@ -23,3 +23,4 @@ ** xref:hdfs:reference/discovery.adoc[] ** xref:hdfs:reference/commandline-parameters.adoc[] ** xref:hdfs:reference/environment-variables.adoc[] +** xref:hdfs:reference/troubleshooting.adoc[] diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index 5503422b..e0677f47 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -718,6 +718,16 @@ impl ContainerConfig { exclude_from_capture {hadoop_home}/bin/hdfs namenode -bootstrapStandby -nonInteractive fi else + # Sanity check for initial format data corruption: VERSION file exists but no fsimage files were created. + FSIMAGE_COUNT=$(find "{NAMENODE_ROOT_DATA_DIR}/current" -maxdepth 1 -regextype posix-egrep -regex ".*/fsimage_[0-9]+" | wc -l) + + if [ "${{FSIMAGE_COUNT}}" -eq 0 ] + then + echo "WARNING: {NAMENODE_ROOT_DATA_DIR}/current/VERSION file exists but no fsimage files were found."
+ echo "This indicates an incomplete and corrupted namenode formatting. Please check the troubleshooting guide." + exit 1 + fi + cat "{NAMENODE_ROOT_DATA_DIR}/current/VERSION" echo "Pod $POD_NAME already formatted. Skipping..." fi diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index ea884d36..34d4f68a 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -22,7 +22,6 @@ use stackable_operator::{ product_image_selection::{self, ResolvedProductImage}, rbac::build_rbac_resources, }, - constants::RESTART_CONTROLLER_ENABLED_LABEL, iter::reverse_if, k8s_openapi::{ DeepMerge, @@ -901,13 +900,8 @@ fn rolegroup_statefulset( ..StatefulSetSpec::default() }; - let sts_metadata = metadata - .clone() - .with_label(RESTART_CONTROLLER_ENABLED_LABEL.to_owned()) - .build(); - Ok(StatefulSet { - metadata: sts_metadata, + metadata: metadata.build(), spec: Some(statefulset_spec), status: None, }) diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2 index 2c9cb0e3..2f17591a 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2 @@ -7,9 +7,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-namenode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: @@ -35,9 +32,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-journalnode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: @@ -62,9 +56,6 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: hdfs-datanode-default - generation: 1 # There should be no unneeded Pod restarts - labels: - restarter.stackable.tech/enabled: "true" spec: template: spec: