From bef992dd5a68551a5e7322ccbf507d5a98a8bafa Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 13 Nov 2025 13:25:58 -0500 Subject: [PATCH] ephemeral: Fix `run-ssh` error handling If we fail to launch, because we used `--rm` the container would exit and we couldn't get its logs. Change to removing the container image via a drop handler, so if the system fails to launch (e.g. it's missing a kernel or bwrap) we can get the logs from that. Signed-off-by: Colin Walters --- .../fixtures/Dockerfile.no-kernel | 10 ++ .../src/tests/run_ephemeral_ssh.rs | 140 +++++++++++++-- crates/kit/src/run_ephemeral.rs | 5 +- crates/kit/src/run_ephemeral_ssh.rs | 161 ++++++++++++++++-- 4 files changed, 285 insertions(+), 31 deletions(-) create mode 100644 crates/integration-tests/fixtures/Dockerfile.no-kernel diff --git a/crates/integration-tests/fixtures/Dockerfile.no-kernel b/crates/integration-tests/fixtures/Dockerfile.no-kernel new file mode 100644 index 0000000..b1a1b92 --- /dev/null +++ b/crates/integration-tests/fixtures/Dockerfile.no-kernel @@ -0,0 +1,10 @@ +# Test fixture: Bootc image with kernel removed +# This simulates a broken/malformed bootc image that will fail during ephemeral VM startup +# Used to test error handling and cleanup in ephemeral run-ssh command + +ARG BASE_IMAGE=quay.io/centos-bootc/centos-bootc:stream10 + +FROM ${BASE_IMAGE} + +# Remove kernel and modules to simulate a broken bootc image +RUN rm -rf /usr/lib/modules diff --git a/crates/integration-tests/src/tests/run_ephemeral_ssh.rs b/crates/integration-tests/src/tests/run_ephemeral_ssh.rs index 4a4d915..a1f6067 100644 --- a/crates/integration-tests/src/tests/run_ephemeral_ssh.rs +++ b/crates/integration-tests/src/tests/run_ephemeral_ssh.rs @@ -18,11 +18,71 @@ use color_eyre::Result; use integration_tests::{integration_test, parameterized_integration_test}; use std::process::Command; -use std::thread; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::{get_test_image, run_bcvk, INTEGRATION_TEST_LABEL}; +/// Poll until a container is removed or timeout is reached +/// +/// Returns Ok(()) if container is removed within timeout, Err otherwise. +/// Timeout is set to 60 seconds to account for slow CI runners. +fn wait_for_container_removal(container_name: &str) -> Result<()> { + let timeout = Duration::from_secs(60); + let start = Instant::now(); + let poll_interval = Duration::from_millis(100); + + loop { + let output = Command::new("podman") + .args(["ps", "-a", "--format", "{{.Names}}"]) + .output() + .expect("Failed to list containers"); + + let containers = String::from_utf8_lossy(&output.stdout); + if !containers.lines().any(|line| line == container_name) { + return Ok(()); + } + + if start.elapsed() >= timeout { + return Err(color_eyre::eyre::eyre!( + "Timeout waiting for container {} to be removed. Active containers: {}", + container_name, + containers + )); + } + + std::thread::sleep(poll_interval); + } +} + +/// Build a test fixture image with the kernel removed +fn build_broken_image() -> Result { + let fixture_path = concat!(env!("CARGO_MANIFEST_DIR"), "/fixtures/Dockerfile.no-kernel"); + let image_name = format!("localhost/bcvk-test-no-kernel:{}", std::process::id()); + + let output = Command::new("podman") + .args([ + "build", + "-f", + fixture_path, + "-t", + &image_name, + "--build-arg", + &format!("BASE_IMAGE={}", get_test_image()), + ".", + ]) + .output()?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(color_eyre::eyre::eyre!( + "Failed to build broken test image: {}", + stderr + )); + } + + Ok(image_name) +} + /// Test running a non-interactive command via SSH fn test_run_ephemeral_ssh_command() -> Result<()> { let output = run_bcvk(&[ @@ -66,20 +126,9 @@ fn test_run_ephemeral_ssh_cleanup() -> Result<()> { output.assert_success("ephemeral run-ssh"); - thread::sleep(Duration::from_secs(1)); - - let check_output = Command::new("podman") - .args(["ps", "-a", "--format", "{{.Names}}"]) - .output() - .expect("Failed to list containers"); + // Poll for container removal with timeout + wait_for_container_removal(&container_name)?; - let containers = String::from_utf8_lossy(&check_output.stdout); - assert!( - !containers.contains(&container_name), - "Container {} was not cleaned up after SSH exit. Active containers: {}", - container_name, - containers - ); Ok(()) } integration_test!(test_run_ephemeral_ssh_cleanup); @@ -248,3 +297,64 @@ echo "All checks passed!" Ok(()) } integration_test!(test_run_tmpfs); + +/// Test that containers are properly cleaned up even when the image is broken +/// +/// This test verifies that the drop handler for ContainerCleanup works correctly +/// when ephemeral run-ssh fails early due to a broken image (missing kernel). +/// Previously this would fail with "setns `mnt`: Bad file descriptor" when using +/// podman's --rm flag. Now it should fail cleanly and remove the container. +fn test_run_ephemeral_ssh_broken_image_cleanup() -> Result<()> { + // Build a broken test image (bootc image with kernel removed) + eprintln!("Building broken test image..."); + let broken_image = build_broken_image()?; + eprintln!("Built broken image: {}", broken_image); + + let container_name = format!("test-broken-cleanup-{}", std::process::id()); + + // Try to run ephemeral SSH with the broken image - this should fail + let output = run_bcvk(&[ + "ephemeral", + "run-ssh", + "--name", + &container_name, + "--label", + INTEGRATION_TEST_LABEL, + &broken_image, + "--", + "echo", + "should not reach here", + ])?; + + // The command should fail (no kernel found) + assert!( + !output.success(), + "Expected ephemeral run-ssh to fail with broken image, but it succeeded" + ); + + // Verify the error message indicates the problem + assert!( + output + .stderr + .contains("Failed to read kernel modules directory") + || output + .stderr + .contains("Container exited before SSH became available") + || output + .stderr + .contains("Monitor process exited unexpectedly"), + "Expected error about missing kernel or container failure, got: {}", + output.stderr + ); + + // Poll for container removal with timeout + wait_for_container_removal(&container_name)?; + + // Clean up the test image + let _ = Command::new("podman") + .args(["rmi", "-f", &broken_image]) + .output(); + + Ok(()) +} +integration_test!(test_run_ephemeral_ssh_broken_image_cleanup); diff --git a/crates/kit/src/run_ephemeral.rs b/crates/kit/src/run_ephemeral.rs index 28576c5..e28111a 100644 --- a/crates/kit/src/run_ephemeral.rs +++ b/crates/kit/src/run_ephemeral.rs @@ -777,7 +777,10 @@ pub(crate) async fn run_impl(opts: RunEphemeralOpts) -> Result<()> { let mut vmlinuz_path: Option = None; let mut initramfs_path: Option = None; - for entry in fs::read_dir(modules_dir)? { + let entries = fs::read_dir(modules_dir) + .with_context(|| format!("Failed to read kernel modules directory at {}. This container image may not be a valid bootc image.", modules_dir))?; + + for entry in entries { let entry = entry?; let path = Utf8PathBuf::from_path_buf(entry.path()) .map_err(|p| eyre!("Path is not valid UTF-8: {}", p.display()))?; diff --git a/crates/kit/src/run_ephemeral_ssh.rs b/crates/kit/src/run_ephemeral_ssh.rs index 6260a31..21c8e34 100644 --- a/crates/kit/src/run_ephemeral_ssh.rs +++ b/crates/kit/src/run_ephemeral_ssh.rs @@ -10,6 +10,117 @@ use crate::run_ephemeral::{run_detached, RunEphemeralOpts}; use crate::ssh; use crate::supervisor_status::{SupervisorState, SupervisorStatus}; +/// Container state from podman inspect +#[derive(Debug, serde::Deserialize)] +struct ContainerInspect { + #[serde(rename = "State")] + state: ContainerState, +} + +#[derive(Debug, serde::Deserialize)] +struct ContainerState { + #[serde(rename = "Status")] + status: String, + #[serde(rename = "ExitCode")] + exit_code: i32, + #[serde(rename = "Error")] + error: Option, +} + +/// Fetch and display container logs to help diagnose startup failures +fn show_container_logs(container_name: &str) { + debug!("Fetching container logs for {}", container_name); + + // Get container state in a single inspect call + let state = Command::new("podman") + .args(["inspect", container_name]) + .output() + .ok() + .and_then(|output| { + serde_json::from_slice::>(&output.stdout) + .ok() + .and_then(|mut inspects| inspects.pop()) + .map(|inspect| inspect.state) + }); + + if let Some(ref s) = state { + eprint!( + "\nContainer state: {} (exit code: {})", + s.status, s.exit_code + ); + if let Some(ref err) = s.error { + if !err.is_empty() { + eprint!(" - Error: {}", err); + } + } + eprintln!(); + + // Provide helpful hints for common exit codes + match s.exit_code { + 127 => { + eprintln!("\nNote: Exit code 127 typically means 'command not found'."); + eprintln!("This container image may not be a valid bootc image."); + eprintln!("Bootc images must have systemd and kernel modules in /usr/lib/modules."); + } + 126 => { + eprintln!("\nNote: Exit code 126 typically means 'permission denied' or file not executable."); + } + _ => {} + } + } + + let output = match Command::new("podman") + .args(["logs", container_name]) + .stderr(Stdio::inherit()) + .output() + { + Ok(output) => output, + Err(e) => { + eprintln!("Failed to fetch container logs: {}", e); + return; + } + }; + + let logs = String::from_utf8_lossy(&output.stdout); + if !logs.trim().is_empty() { + eprintln!("\nContainer logs:"); + eprintln!("----------------------------------------"); + for line in logs.lines() { + eprintln!("{}", line); + } + eprintln!("----------------------------------------\n"); + } else { + eprintln!("(Container produced no output)"); + } +} + +/// RAII guard for ephemeral container cleanup +/// Ensures container is removed when dropped, even on error paths +struct ContainerCleanup { + container_id: String, +} + +impl ContainerCleanup { + fn new(container_id: String) -> Self { + Self { container_id } + } +} + +impl Drop for ContainerCleanup { + fn drop(&mut self) { + debug!("Cleaning up ephemeral container {}", self.container_id); + let result = Command::new("podman") + .args(["rm", "-f", &self.container_id]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status(); + + if let Err(e) = result { + tracing::warn!("Failed to remove container {}: {}", self.container_id, e); + } + } +} + /// Timeout waiting for connection pub(crate) const SSH_TIMEOUT: std::time::Duration = const { Duration::from_secs(240) }; @@ -23,6 +134,17 @@ pub struct RunEphemeralSshOpts { pub ssh_args: Vec, } +/// Check if container is running +fn is_container_running(container_name: &str) -> Result { + let output = Command::new("podman") + .args(["inspect", container_name, "--format", "{{.State.Status}}"]) + .output() + .context("Failed to inspect container state")?; + + let state = String::from_utf8_lossy(&output.stdout); + Ok(state.trim() == "running") +} + /// Wait for VM SSH availability using the supervisor status file /// /// Monitors /run/supervisor-status.json inside the container for SSH. @@ -40,6 +162,13 @@ pub fn wait_for_vm_ssh( timeout.as_secs() ); + // Check if container is still running before attempting exec + if !is_container_running(container_name)? { + progress.finish_and_clear(); + show_container_logs(container_name); + return Err(eyre!("Container exited before SSH became available")); + } + // Use the new monitor-status subcommand for efficient inotify-based monitoring let mut cmd = Command::new("podman"); cmd.args([ @@ -56,10 +185,14 @@ pub fn wait_for_vm_ssh( .map_err(Into::into) }); } - let mut child = cmd - .stdout(Stdio::piped()) - .spawn() - .context("Failed to start status monitor")?; + let mut child = match cmd.stdout(Stdio::piped()).stderr(Stdio::inherit()).spawn() { + Ok(child) => child, + Err(e) => { + progress.finish_and_clear(); + show_container_logs(container_name); + return Err(e).context("Failed to start status monitor"); + } + }; let stdout = child.stdout.take().unwrap(); let reader = std::io::BufReader::new(stdout); @@ -100,6 +233,9 @@ pub fn wait_for_vm_ssh( } let status = child.wait()?; + + progress.finish_and_clear(); + show_container_logs(container_name); Err(eyre!("Monitor process exited unexpectedly: {status:?}")) } @@ -147,7 +283,6 @@ pub fn wait_for_ssh_ready( pub fn run_ephemeral_ssh(opts: RunEphemeralSshOpts) -> Result<()> { // Start the ephemeral pod in detached mode with SSH enabled let mut ephemeral_opts = opts.run_opts.clone(); - ephemeral_opts.podman.rm = true; ephemeral_opts.podman.detach = true; ephemeral_opts.common.ssh_keygen = true; // Enable SSH key generation and access @@ -155,6 +290,9 @@ pub fn run_ephemeral_ssh(opts: RunEphemeralSshOpts) -> Result<()> { let container_id = run_detached(ephemeral_opts)?; debug!("Ephemeral VM started with container ID: {}", container_id); + // Create cleanup guard to ensure container removal on any exit path + let _cleanup = ContainerCleanup::new(container_id.clone()); + // Use the container ID for SSH and cleanup let container_name = container_id; debug!("Using container ID: {}", container_name); @@ -176,16 +314,9 @@ pub fn run_ephemeral_ssh(opts: RunEphemeralSshOpts) -> Result<()> { let exit_code = status.code().unwrap_or(1); debug!("SSH exit code: {}", exit_code); - // SSH completed, proceed with cleanup - - // Cleanup: stop and remove the container immediately - debug!("SSH session ended, cleaning up ephemeral pod..."); - - let _ = Command::new("podman") - .args(["rm", "-f", &container_name]) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .status(); + // Explicitly drop the cleanup guard before exit to ensure container removal + // (std::process::exit doesn't run drop handlers) + drop(_cleanup); // Exit with SSH client's exit code std::process::exit(exit_code);