From 6785032338690d81b389e4b41c1be1cf04cc2d43 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 8 Jun 2026 21:45:41 +0100 Subject: [PATCH] DAOS-19008 control: Erase formatting after failed format --replace Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/storage_test.go | 19 +++++++++++ src/control/server/instance.go | 20 +++++++++--- src/control/server/instance_storage.go | 45 ++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/control/cmd/dmg/storage_test.go b/src/control/cmd/dmg/storage_test.go index 124b46e984a..2f0fdb50588 100644 --- a/src/control/cmd/dmg/storage_test.go +++ b/src/control/cmd/dmg/storage_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -154,6 +155,24 @@ func TestStorageCommands(t *testing.T) { printRequest(t, nvmeAddDeviceReq().WithStorageTierIndex(0)), nil, }, + { + "Format with replace; no hosts in hostlist", + "storage format --replace", + "", + errors.New("expects a single host"), + }, + { + "Format with replace; multiple hosts in hostlist", + "storage format --replace -l foo[1,2].com", + "", + errors.New("expects a single host"), + }, + { + "Format with replace and force", + "storage format --replace --force", + "", + errors.New("may not be mixed with --force"), + }, { "Nonexistent subcommand", "storage quack", diff --git a/src/control/server/instance.go b/src/control/server/instance.go index c763ae1b7a9..432985fc33a 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -216,9 +216,24 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify Replace: ei.replaceRank.Load(), } + // Reset replaceRank state for instance after joinSystem() has been attempted. + defer ei.replaceRank.SetFalse() + resp, err := ei.joinSystem(ctx, joinReq) if err != nil { ei.log.Errorf("join failed: %s", err) + + // If this is a replace operation and join failed, clean up the formatted storage to + // prevent leaving the rank in a formatted state. This prevents the engine + // inadvertently being joined later with a new rank. + if ei.replaceRank.Load() { + ei.log.Infof("cleaning up after join failure during replace") + if cleanupErr := ei.cleanupAfterFailedJoin(ctx); cleanupErr != nil { + ei.log.Errorf("failed to cleanup after join failure: %v", cleanupErr) + // Don't override the original join error + } + } + return ranklist.NilRank, false, 0, err } switch resp.State { @@ -237,9 +252,6 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify } r = ranklist.Rank(resp.Rank) - // Reset replaceRank state for instance after joinSystem() has returned. - ei.replaceRank.SetFalse() - if !superblock.ValidRank || ready.Uri != superblock.URI { ei.log.Noticef("updating rank %d URI to %s", resp.Rank, ready.Uri) superblock.Rank = new(ranklist.Rank) diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index ab8f8adcb98..6b34779341e 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -19,6 +19,7 @@ import ( "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/fault" "github.com/daos-stack/daos/src/control/fault/code" + "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/server/storage" ) @@ -76,6 +77,50 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) { }() } +// cleanupAfterFailedJoin cleans up storage after a join failure during replace operation. +// This is called when format succeeded but the join to the system failed, leaving +// the storage in a partially initialized state. +func (ei *EngineInstance) cleanupAfterFailedJoin(ctx context.Context) error { + idx := ei.Index() + ei.log.Infof("instance %d: cleaning up after join failure during replace", idx) + + storageProv := ei.GetStorage() + + // Get SCM config to access mount point and class + scmCfg, err := storageProv.GetScmConfig() + if err != nil { + return errors.Wrap(err, "failed to get SCM config") + } + + if scmCfg == nil { + ei.log.Debugf("instance %d: no SCM config, nothing to clean", idx) + return nil + } + + if ei.IsStarted() { + return FaultInstancesNotStopped("cleanup after failed join", ranklist.NilRank) + } + + // For RAM-based SCM (tmpfs), unmount to reset state + if scmCfg.Class == storage.ClassRam { + ei.log.Debugf("instance %d: unmounting tmpfs at %s", idx, scmCfg.Scm.MountPoint) + if err := storageProv.UnmountTmpfs(); err != nil { + ei.log.Errorf("instance %d: unmount failed: %v", idx, err) + // Continue anyway - log the error but don't fail the cleanup + } else { + ei.log.Debugf("instance %d: tmpfs unmounted successfully", idx) + } + } + + // Removing superblock prevents subsequent join without reformat. + if err := ei.RemoveSuperblock(); err != nil { + return err + } + + ei.log.Infof("instance %d: cleanup after join failure complete", idx) + return nil +} + func (ei *EngineInstance) checkScmNeedFormat() (bool, error) { msgIdx := fmt.Sprintf("instance %d", ei.Index())