diff --git a/lib/hypervisor/firecracker/config.go b/lib/hypervisor/firecracker/config.go index 9576f2ca..e4ccefd8 100644 --- a/lib/hypervisor/firecracker/config.go +++ b/lib/hypervisor/firecracker/config.go @@ -101,8 +101,9 @@ type instanceInfo struct { } type restoreMetadata struct { - NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` - SnapshotSourceDataDir string `json:"snapshot_source_data_dir,omitempty"` + NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` + SnapshotSourceDataDir string `json:"snapshot_source_data_dir,omitempty"` + RetainSnapshotSourceDataDirAlias bool `json:"retain_snapshot_source_data_dir_alias,omitempty"` } func toBootSource(cfg hypervisor.VMConfig) bootSource { diff --git a/lib/hypervisor/firecracker/fork.go b/lib/hypervisor/firecracker/fork.go index 81936929..67a70293 100644 --- a/lib/hypervisor/firecracker/fork.go +++ b/lib/hypervisor/firecracker/fork.go @@ -2,6 +2,8 @@ package firecracker import ( "context" + "fmt" + "os" "path/filepath" "github.com/kernel/hypeman/lib/hypervisor" @@ -48,9 +50,26 @@ func (s *Starter) PrepareFork(ctx context.Context, req hypervisor.ForkPrepareReq } } if req.SourceDataDir != "" && req.TargetDataDir != "" && req.SourceDataDir != req.TargetDataDir { - if meta.SnapshotSourceDataDir != req.SourceDataDir { - meta.SnapshotSourceDataDir = req.SourceDataDir - changed = true + if meta.RetainSnapshotSourceDataDirAlias && meta.SnapshotSourceDataDir != "" { + // Keep the upstream source path for snapshot-derived forks. The retained + // Firecracker base can still reference that path after later diff snapshots. + } else { + retainAlias := false + if _, err := os.Stat(req.SourceDataDir); err != nil { + if os.IsNotExist(err) { + retainAlias = true + } else { + return hypervisor.ForkPrepareResult{}, fmt.Errorf("stat snapshot source data dir %q: %w", req.SourceDataDir, err) + } + } + if meta.SnapshotSourceDataDir != req.SourceDataDir { + meta.SnapshotSourceDataDir = req.SourceDataDir + changed = true + } + if meta.RetainSnapshotSourceDataDirAlias != retainAlias { + meta.RetainSnapshotSourceDataDirAlias = retainAlias + changed = true + } } } diff --git a/lib/hypervisor/firecracker/fork_test.go b/lib/hypervisor/firecracker/fork_test.go index 52826d92..109ae7e7 100644 --- a/lib/hypervisor/firecracker/fork_test.go +++ b/lib/hypervisor/firecracker/fork_test.go @@ -40,4 +40,100 @@ func TestPrepareFork_SnapshotRewritePersistsRestoreMetadata(t *testing.T) { require.Len(t, meta.NetworkOverrides, 1) assert.Equal(t, "tap-new", meta.NetworkOverrides[0].HostDevName) assert.Equal(t, filepath.Join(tmp, "source"), meta.SnapshotSourceDataDir) + assert.True(t, meta.RetainSnapshotSourceDataDirAlias) +} + +func TestPrepareFork_DoesNotRetainExistingSourceAlias(t *testing.T) { + starter := NewStarter() + tmp := t.TempDir() + sourceDir := filepath.Join(tmp, "source") + targetDir := filepath.Join(tmp, "target") + require.NoError(t, os.MkdirAll(sourceDir, 0755)) + require.NoError(t, os.MkdirAll(targetDir, 0755)) + require.NoError(t, saveRestoreMetadata(targetDir, nil)) + + _, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{ + SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"), + SourceDataDir: sourceDir, + TargetDataDir: targetDir, + }) + require.NoError(t, err) + + meta, err := loadRestoreMetadata(targetDir) + require.NoError(t, err) + assert.Equal(t, sourceDir, meta.SnapshotSourceDataDir) + assert.False(t, meta.RetainSnapshotSourceDataDirAlias) +} + +func TestPrepareFork_ReturnsSourceStatErrors(t *testing.T) { + starter := NewStarter() + tmp := t.TempDir() + targetDir := filepath.Join(tmp, "target") + require.NoError(t, os.MkdirAll(targetDir, 0755)) + require.NoError(t, saveRestoreMetadata(targetDir, nil)) + + _, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{ + SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"), + SourceDataDir: filepath.Join(tmp, "source") + "\x00", + TargetDataDir: targetDir, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "stat snapshot source data dir") +} + +func TestPrepareFork_PreservesRetainedUpstreamAlias(t *testing.T) { + starter := NewStarter() + tmp := t.TempDir() + upstreamDir := filepath.Join(tmp, "upstream") + sourceDir := filepath.Join(tmp, "source") + targetDir := filepath.Join(tmp, "target") + require.NoError(t, os.MkdirAll(sourceDir, 0755)) + require.NoError(t, os.MkdirAll(targetDir, 0755)) + require.NoError(t, saveRestoreMetadataState(targetDir, &restoreMetadata{ + SnapshotSourceDataDir: upstreamDir, + RetainSnapshotSourceDataDirAlias: true, + })) + + _, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{ + SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"), + SourceDataDir: sourceDir, + TargetDataDir: targetDir, + }) + require.NoError(t, err) + + meta, err := loadRestoreMetadata(targetDir) + require.NoError(t, err) + assert.Equal(t, upstreamDir, meta.SnapshotSourceDataDir) + assert.True(t, meta.RetainSnapshotSourceDataDirAlias) +} + +func TestPrepareFork_NetworkRewritePreservesRetainedAlias(t *testing.T) { + starter := NewStarter() + tmp := t.TempDir() + upstreamDir := filepath.Join(tmp, "upstream") + targetDir := filepath.Join(tmp, "target") + require.NoError(t, os.MkdirAll(targetDir, 0755)) + require.NoError(t, saveRestoreMetadataState(targetDir, &restoreMetadata{ + SnapshotSourceDataDir: upstreamDir, + RetainSnapshotSourceDataDirAlias: true, + NetworkOverrides: []networkOverride{{ + IfaceID: "eth0", + HostDevName: "tap-old", + }}, + })) + + _, err := starter.PrepareFork(context.Background(), hypervisor.ForkPrepareRequest{ + SnapshotConfigPath: filepath.Join(targetDir, "snapshots", "snapshot-latest", "config.json"), + Network: &hypervisor.ForkNetworkConfig{ + TAPDevice: "tap-new", + }, + }) + require.NoError(t, err) + + meta, err := loadRestoreMetadata(targetDir) + require.NoError(t, err) + require.Len(t, meta.NetworkOverrides, 1) + assert.Equal(t, "tap-new", meta.NetworkOverrides[0].HostDevName) + assert.Equal(t, upstreamDir, meta.SnapshotSourceDataDir) + assert.True(t, meta.RetainSnapshotSourceDataDirAlias) } diff --git a/lib/hypervisor/firecracker/process.go b/lib/hypervisor/firecracker/process.go index 371e1f2e..b29539e9 100644 --- a/lib/hypervisor/firecracker/process.go +++ b/lib/hypervisor/firecracker/process.go @@ -125,7 +125,7 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, if err != nil { return 0, nil, fmt.Errorf("load firecracker snapshot: %w", err) } - if meta.SnapshotSourceDataDir != "" { + if meta.SnapshotSourceDataDir != "" && !meta.RetainSnapshotSourceDataDirAlias { meta.SnapshotSourceDataDir = "" if err := saveRestoreMetadataState(filepath.Dir(socketPath), meta); err != nil { return 0, nil, fmt.Errorf("clear firecracker snapshot source alias metadata: %w", err) diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index 6ee99b36..f13fb222 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -466,6 +466,10 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, } log.DebugContext(ctx, "VM ready", "duration_ms", time.Since(migrationWaitStart).Milliseconds()) + if err := saveVMConfig(filepath.Dir(socketPath), config); err != nil { + return 0, nil, fmt.Errorf("save restored vm config: %w", err) + } + cu.Release() log.DebugContext(ctx, "QEMU restore complete", "pid", pid, "total_duration_ms", time.Since(startTime).Milliseconds()) return pid, hv, nil diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index ba67343d..67e4b0f0 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -25,6 +25,7 @@ import ( "github.com/kernel/hypeman/lib/network" "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/resources" + snapshottest "github.com/kernel/hypeman/lib/snapshot/testsupport" "github.com/kernel/hypeman/lib/system" "github.com/kernel/hypeman/lib/volumes" "github.com/stretchr/testify/assert" @@ -557,6 +558,95 @@ func TestFirecrackerSnapshotFeature(t *testing.T) { }) } +func TestFirecrackerWarmForkChain(t *testing.T) { + t.Parallel() + requireFirecrackerIntegrationPrereqs(t) + + mgr, tmpDir := setupTestManagerForFirecrackerNoNetwork(t) + ctx := context.Background() + p := paths.New(tmpDir) + + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + imageName := integrationTestImageRef(t, "docker.io/library/alpine:latest") + snapshottest.EnsureImageReady(t, ctx, p, imageManager, imageName) + + systemManager := system.NewManager(p) + require.NoError(t, systemManager.EnsureSystemFiles(ctx)) + + source, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: "fc-warm-chain-src", + Image: imageName, + Size: 1024 * 1024 * 1024, + OverlaySize: 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: hypervisor.TypeFirecracker, + Cmd: []string{"sleep", "infinity"}, + }) + require.NoError(t, err) + sourceID := source.Id + sourceDeleted := false + t.Cleanup(func() { + if !sourceDeleted { + _ = mgr.DeleteInstance(context.Background(), sourceID) + } + }) + + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, integrationTestTimeout(20*time.Second)) + require.NoError(t, err) + require.NoError(t, waitForExecAgent(ctx, mgr, sourceID, 30*time.Second)) + + snapshot, err := mgr.CreateSnapshot(ctx, sourceID, CreateSnapshotRequest{ + Kind: SnapshotKindStandby, + Name: "fc-warm-chain-snap", + }) + require.NoError(t, err) + require.Equal(t, SnapshotKindStandby, snapshot.Kind) + + require.NoError(t, mgr.DeleteInstance(ctx, sourceID)) + sourceDeleted = true + + warm, err := mgr.ForkSnapshot(ctx, snapshot.Id, ForkSnapshotRequest{ + Name: "fc-warm-chain-warm", + TargetState: StateRunning, + }) + require.NoError(t, err) + warmID := warm.Id + warmDeleted := false + t.Cleanup(func() { + if !warmDeleted { + _ = mgr.DeleteInstance(context.Background(), warmID) + } + }) + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(20*time.Second)) + require.NoError(t, err) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 30*time.Second)) + + child, err := mgr.ForkInstance(ctx, warmID, ForkInstanceRequest{ + Name: "fc-warm-chain-child", + FromRunning: true, + TargetState: StateStopped, + }) + require.NoError(t, err) + require.Equal(t, StateStopped, child.State) + childID := child.Id + t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), childID) }) + + warm, err = mgr.GetInstance(ctx, warmID) + require.NoError(t, err) + if warm.State != StateRunning { + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(20*time.Second)) + require.NoError(t, err) + } + require.Equal(t, StateRunning, warm.State) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 30*time.Second)) + + require.NoError(t, mgr.DeleteInstance(ctx, warmID)) + warmDeleted = true + require.NoError(t, mgr.DeleteSnapshot(ctx, snapshot.Id)) +} + // TestFirecrackerForkIsolation verifies CoW isolation between a firecracker // source's standby snapshot and a fork derived from it. A fork must end up // with its own mem-file inode (reflink-cloned, not hardlinked) so that diff --git a/lib/instances/fork.go b/lib/instances/fork.go index c29d7189..202660e2 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -68,7 +68,7 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR } forked, forkErr := m.forkInstanceFromStoppedOrStandby(ctx, id, req, true) - if forkErr == nil { + if forkErr == nil && targetState != StateStopped { if err := m.rotateSourceVsockForRestore(ctx, id, forked.Id); err != nil { forkErr = fmt.Errorf("prepare source snapshot for restore: %w", err) if cleanupErr := m.cleanupForkInstanceOnError(ctx, forked.Id); cleanupErr != nil { @@ -437,6 +437,14 @@ func (m *manager) applyForkTargetState(ctx context.Context, forkID string, targe if err := os.RemoveAll(m.paths.InstanceSnapshotLatest(forkID)); err != nil { return nil, fmt.Errorf("remove fork snapshot: %w", err) } + meta, err := m.loadMetadata(forkID) + if err != nil { + return nil, fmt.Errorf("load stopped fork metadata: %w", err) + } + meta.StoredMetadata.VsockCID = generateVsockCID(forkID) + if err := m.saveMetadata(meta); err != nil { + return nil, fmt.Errorf("save stopped fork metadata: %w", err) + } return returnWithReadiness(m.getInstance(ctx, forkID)) } case StateRunning: diff --git a/lib/instances/fork_test.go b/lib/instances/fork_test.go index 3141f658..eafcd77a 100644 --- a/lib/instances/fork_test.go +++ b/lib/instances/fork_test.go @@ -22,6 +22,7 @@ import ( "github.com/kernel/hypeman/lib/instances/phasetracking" "github.com/kernel/hypeman/lib/paths" snapshotstore "github.com/kernel/hypeman/lib/snapshot" + snapshottest "github.com/kernel/hypeman/lib/snapshot/testsupport" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -280,6 +281,44 @@ func TestForkInstanceFromStandbyCancelsCompressionJobAndCopiesRawMemory(t *testi assert.False(t, ok, "forked standby guest directory should not retain compressed memory artifacts from the source instance") } +func TestApplyForkTargetStateStoppedRefreshesSnapshotForkCID(t *testing.T) { + t.Parallel() + + manager, _ := setupTestManager(t) + ctx := context.Background() + + forkID := "fork-target-stopped" + require.NoError(t, manager.ensureDirectories(forkID)) + snapshotDir := manager.paths.InstanceSnapshotLatest(forkID) + require.NoError(t, os.MkdirAll(snapshotDir, 0755)) + require.NoError(t, os.WriteFile(filepath.Join(snapshotDir, "memory"), []byte("snapshot"), 0644)) + + const sourceCID = int64(100) + require.NotEqual(t, sourceCID, generateVsockCID(forkID)) + meta := &metadata{StoredMetadata: StoredMetadata{ + Id: forkID, + Name: forkID, + CreatedAt: time.Now(), + HypervisorType: hypervisor.TypeFirecracker, + SocketPath: manager.paths.InstanceSocket(forkID, "firecracker.sock"), + DataDir: manager.paths.InstanceDir(forkID), + VsockCID: sourceCID, + VsockSocket: manager.paths.InstanceVsockSocket(forkID), + }} + meta.StoredMetadata.Phases.Record(phasetracking.PhaseStandby, time.Now()) + require.NoError(t, manager.saveMetadata(meta)) + + inst, err := manager.applyForkTargetState(ctx, forkID, StateStopped) + require.NoError(t, err) + require.Equal(t, StateStopped, inst.State) + require.Equal(t, generateVsockCID(forkID), inst.VsockCID) + require.NoDirExists(t, snapshotDir) + + updated, err := manager.loadMetadata(forkID) + require.NoError(t, err) + assert.Equal(t, generateVsockCID(forkID), updated.StoredMetadata.VsockCID) +} + func TestCloneStoredMetadataForFork_DeepCopiesReferenceFields(t *testing.T) { t.Parallel() startedAt := time.Now().Add(-2 * time.Minute) @@ -569,6 +608,117 @@ func TestForkCloudHypervisorFromRunningNetwork(t *testing.T) { assert.Greater(t, sourceAfterFork.Phases.Cumulative[phasetracking.PhaseRunning], int64(0), "source's pre-fork running stint should be cumulated") } +func TestCloudHypervisorWarmForkChain(t *testing.T) { + t.Parallel() + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { + t.Skip("/dev/kvm not available, skipping on this platform") + } + + manager, tmpDir := setupTestManager(t) + runWarmForkChain(t, manager, tmpDir, warmForkChainConfig{ + hypervisor: hypervisor.TypeCloudHypervisor, + namePrefix: "ch", + }) +} + +type warmForkChainConfig struct { + hypervisor hypervisor.Type + namePrefix string +} + +func runWarmForkChain(t *testing.T, mgr *manager, tmpDir string, cfg warmForkChainConfig) { + t.Helper() + + ctx := context.Background() + p := paths.New(tmpDir) + + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + imageName := integrationTestImageRef(t, "docker.io/library/alpine:latest") + snapshottest.EnsureImageReady(t, ctx, p, imageManager, imageName) + + require.NoError(t, mgr.systemManager.EnsureSystemFiles(ctx)) + + source, err := mgr.CreateInstance(ctx, CreateInstanceRequest{ + Name: cfg.namePrefix + "-warm-chain-src", + Image: imageName, + Size: 1024 * 1024 * 1024, + OverlaySize: 1024 * 1024 * 1024, + Vcpus: 1, + NetworkEnabled: false, + Hypervisor: cfg.hypervisor, + Cmd: []string{"sleep", "infinity"}, + }) + require.NoError(t, err) + sourceID := source.Id + sourceDeleted := false + t.Cleanup(func() { + if !sourceDeleted { + _ = mgr.DeleteInstance(context.Background(), sourceID) + } + }) + + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, integrationTestTimeout(45*time.Second)) + require.NoError(t, err) + require.NoError(t, waitForExecAgent(ctx, mgr, sourceID, 45*time.Second)) + + snapshot, err := mgr.CreateSnapshot(ctx, sourceID, CreateSnapshotRequest{ + Kind: SnapshotKindStandby, + Name: cfg.namePrefix + "-warm-chain-snap", + }) + require.NoError(t, err) + require.Equal(t, SnapshotKindStandby, snapshot.Kind) + snapshotDeleted := false + t.Cleanup(func() { + if !snapshotDeleted { + _ = mgr.DeleteSnapshot(context.Background(), snapshot.Id) + } + }) + + require.NoError(t, mgr.DeleteInstance(ctx, sourceID)) + sourceDeleted = true + + warm, err := mgr.ForkSnapshot(ctx, snapshot.Id, ForkSnapshotRequest{ + Name: cfg.namePrefix + "-warm-chain-warm", + TargetState: StateRunning, + }) + require.NoError(t, err) + warmID := warm.Id + warmDeleted := false + t.Cleanup(func() { + if !warmDeleted { + _ = mgr.DeleteInstance(context.Background(), warmID) + } + }) + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(45*time.Second)) + require.NoError(t, err) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 45*time.Second)) + + child, err := mgr.ForkInstance(ctx, warmID, ForkInstanceRequest{ + Name: cfg.namePrefix + "-warm-chain-child", + FromRunning: true, + TargetState: StateStopped, + }) + require.NoError(t, err) + require.Equal(t, StateStopped, child.State) + childID := child.Id + t.Cleanup(func() { _ = mgr.DeleteInstance(context.Background(), childID) }) + + warm, err = mgr.GetInstance(ctx, warmID) + require.NoError(t, err) + if warm.State != StateRunning { + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(45*time.Second)) + require.NoError(t, err) + } + require.Equal(t, StateRunning, warm.State) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 45*time.Second)) + + require.NoError(t, mgr.DeleteInstance(ctx, warmID)) + warmDeleted = true + require.NoError(t, mgr.DeleteSnapshot(ctx, snapshot.Id)) + snapshotDeleted = true +} + func assertHostCanReachNginx(t *testing.T, ip string, port int, timeout time.Duration) { t.Helper() diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 015f8078..30f2669a 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -998,3 +998,14 @@ func TestQEMUSnapshotFeature(t *testing.T) { forkName: "qemu-snapshot-fork", }) } + +func TestQEMUWarmForkChain(t *testing.T) { + t.Parallel() + requireQEMUUsable(t) + + mgr, tmpDir := setupTestManagerForQEMU(t) + runWarmForkChain(t, mgr, tmpDir, warmForkChainConfig{ + hypervisor: hypervisor.TypeQEMU, + namePrefix: "qemu", + }) +}