From b458068eeb2590710128066ed487ef14a61182f0 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:00:20 -0700 Subject: [PATCH 1/6] Add another static Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- src/hyperlight_guest_bin/src/guest_function/call.rs | 10 ++++++++++ src/hyperlight_guest_bin/src/lib.rs | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/src/hyperlight_guest_bin/src/guest_function/call.rs b/src/hyperlight_guest_bin/src/guest_function/call.rs index ebadda540..1dc39ee4c 100644 --- a/src/hyperlight_guest_bin/src/guest_function/call.rs +++ b/src/hyperlight_guest_bin/src/guest_function/call.rs @@ -86,6 +86,16 @@ pub(crate) fn internal_dispatch_function() { }; let handle = unsafe { GUEST_HANDLE }; + let canary = unsafe { crate::GUEST_HANDLE_CANARY }; + + if handle.peb().is_none() || canary != 0xDEAD_BEEF { + panic!( + "GUEST_HANDLE is None! canary={:#x} (expected 0xDEADBEEF), peb={:#x}. \ + If canary is also 0, snapshot memory was not restored for this page.", + canary, + handle.peb().map_or(0, |p| p as u64), + ); + } let function_call = handle .try_pop_shared_input_data_into::() diff --git a/src/hyperlight_guest_bin/src/lib.rs b/src/hyperlight_guest_bin/src/lib.rs index 204cf92f0..63d140abe 100644 --- a/src/hyperlight_guest_bin/src/lib.rs +++ b/src/hyperlight_guest_bin/src/lib.rs @@ -116,6 +116,10 @@ pub(crate) static HEAP_ALLOCATOR: ProfiledLockedHeap<32> = ProfiledLockedHeap(LockedHeap::<32>::empty()); pub static mut GUEST_HANDLE: GuestHandle = GuestHandle::new(); +/// Canary value set to 0xDEAD_BEEF during init. If this reads as 0 +/// after snapshot restore while GUEST_HANDLE is None, the snapshot +/// memory for this region was not properly restored. +pub static mut GUEST_HANDLE_CANARY: u64 = 0; pub(crate) static mut REGISTERED_GUEST_FUNCTIONS: GuestFunctionRegister = GuestFunctionRegister::new(); @@ -205,6 +209,7 @@ pub(crate) extern "C" fn generic_init( ) -> u64 { unsafe { GUEST_HANDLE = GuestHandle::init(peb_address as *mut HyperlightPEB); + GUEST_HANDLE_CANARY = 0xDEAD_BEEF; #[allow(static_mut_refs)] let peb_ptr = GUEST_HANDLE.peb().unwrap(); From 9aebbcc87592ca1d9c2fe92699692bead8af1062 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Wed, 25 Mar 2026 09:38:34 -0700 Subject: [PATCH 2/6] Use WHvResetPartition on windows Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- .github/workflows/ValidatePullRequest.yml | 4 +- .github/workflows/dep_build_test.yml | 2 +- Justfile | 7 +- .../src/hypervisor/hyperlight_vm/aarch64.rs | 8 +- .../src/hypervisor/hyperlight_vm/x86_64.rs | 161 +++++++++++++----- .../src/hypervisor/virtual_machine/mod.rs | 19 ++- .../src/hypervisor/virtual_machine/whp.rs | 146 +++++++--------- .../src/sandbox/initialized_multi_use.rs | 12 +- src/hyperlight_host/tests/integration_test.rs | 2 +- 9 files changed, 225 insertions(+), 136 deletions(-) diff --git a/.github/workflows/ValidatePullRequest.yml b/.github/workflows/ValidatePullRequest.yml index e690cefcf..ac06bcd82 100644 --- a/.github/workflows/ValidatePullRequest.yml +++ b/.github/workflows/ValidatePullRequest.yml @@ -89,7 +89,7 @@ jobs: # See: https://github.com/actions/runner/issues/2205 if: ${{ !cancelled() && !failure() }} strategy: - fail-fast: true + fail-fast: false matrix: hypervisor: ['hyperv-ws2025', mshv3, kvm] cpu: [amd, intel] @@ -112,7 +112,7 @@ jobs: # See: https://github.com/actions/runner/issues/2205 if: ${{ !cancelled() && !failure() }} strategy: - fail-fast: true + fail-fast: false matrix: hypervisor: ['hyperv-ws2025', mshv3, kvm] cpu: [amd, intel] diff --git a/.github/workflows/dep_build_test.yml b/.github/workflows/dep_build_test.yml index 7d4e253c7..e141f260f 100644 --- a/.github/workflows/dep_build_test.yml +++ b/.github/workflows/dep_build_test.yml @@ -37,7 +37,7 @@ defaults: jobs: build-and-test: if: ${{ inputs.docs_only == 'false' }} - timeout-minutes: 45 + timeout-minutes: 60 runs-on: ${{ fromJson( format('["self-hosted", "{0}", "X64", "1ES.Pool=hld-{1}-{2}"]', inputs.hypervisor == 'hyperv-ws2025' && 'Windows' || 'Linux', diff --git a/Justfile b/Justfile index d71630d47..a039e1626 100644 --- a/Justfile +++ b/Justfile @@ -235,7 +235,7 @@ test-integration target=default-target features="": {{ cargo-cmd }} test {{ if features =="" {"--features executable_heap"} else if features=="no-default-features" {"--no-default-features --features executable_heap"} else {"--no-default-features -F executable_heap," + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test integration_test execute_on_heap @# run the rest of the integration tests - {{ cargo-cmd }} test -p hyperlight-host {{ if features =="" {''} else if features=="no-default-features" {"--no-default-features" } else {"--no-default-features -F " + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test '*' + {{ cargo-cmd }} test -p hyperlight-host {{ if features =="" {''} else if features=="no-default-features" {"--no-default-features" } else {"--no-default-features -F " + features } }} --profile={{ if target == "debug" { "dev" } else { target } }} {{ target-triple-flag }} --test '*' -- --nocapture # tests compilation with no default features on different platforms test-compilation-no-default-features target=default-target: @@ -323,6 +323,11 @@ clippy target=default-target: (witguest-wit) clippyw target=default-target: (witguest-wit) {{ cargo-cmd }} clippy --all-targets --all-features --target x86_64-pc-windows-gnu --profile={{ if target == "debug" { "dev" } else { target } }} -- -D warnings +# Cross-check for linux from a windows host using clippy (no linking needed). +# Only checks lib targets to avoid dev-dependencies (criterion->alloca) that need a C cross-compiler. +clippyl target=default-target: + {{ cargo-cmd }} clippy --lib --all-features --target x86_64-unknown-linux-gnu --profile={{ if target == "debug" { "dev" } else { target } }} -- -D warnings + clippy-guests target=default-target: (witguest-wit) (ensure-cargo-hyperlight) cd src/tests/rust_guests/simpleguest && cargo hyperlight clippy --profile={{ if target == "debug" { "dev" } else { target } }} -- -D warnings cd src/tests/rust_guests/witguest && cargo hyperlight clippy --profile={{ if target == "debug" { "dev" } else { target } }} -- -D warnings diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs index c4288ae60..b1a2150f5 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs @@ -89,11 +89,15 @@ impl HyperlightVm { unimplemented!("get_snapshot_sregs") } - pub(crate) fn reset_vcpu( + pub(crate) fn reset_vm_state(&mut self) -> std::result::Result<(), RegisterError> { + unimplemented!("reset_vm_state") + } + + pub(crate) fn restore_sregs( &mut self, _cr3: u64, _sregs: &CommonSpecialRegisters, ) -> std::result::Result<(), RegisterError> { - unimplemented!("reset_vcpu") + unimplemented!("restore_sregs") } } diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index 698ab49e5..e51a23b48 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -40,9 +40,9 @@ use crate::hypervisor::gdb::{ }; #[cfg(gdb)] use crate::hypervisor::gdb::{DebugError, DebugMemoryAccessError}; -use crate::hypervisor::regs::{ - CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, -}; +#[cfg(not(target_os = "windows"))] +use crate::hypervisor::regs::CommonDebugRegs; +use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters}; #[cfg(not(gdb))] use crate::hypervisor::virtual_machine::VirtualMachine; #[cfg(kvm)] @@ -335,27 +335,39 @@ impl HyperlightVm { } /// Resets the following vCPU state: - /// - General purpose registers - /// - Debug registers - /// - XSAVE (includes FPU/SSE state with proper FCW and MXCSR defaults) - /// - Special registers (restored from snapshot, with CR3 updated to new page table location) + /// - On Windows: calls WHvResetPartition (resets all per-VP state including + /// GP registers, debug registers, XSAVE, MSRs, APIC, etc.) + /// - On Linux: explicitly resets GP registers, debug registers, and XSAVE + /// + /// This does NOT restore special registers — call `restore_sregs` separately + /// after memory mappings are established. // TODO: check if other state needs to be reset - pub(crate) fn reset_vcpu( + pub(crate) fn reset_vm_state(&mut self) -> std::result::Result<(), RegisterError> { + #[cfg(target_os = "windows")] + self.vm.reset_partition()?; + + #[cfg(not(target_os = "windows"))] + { + self.vm.set_regs(&CommonRegisters { + rflags: 1 << 1, // Reserved bit always set + ..Default::default() + })?; + self.vm.set_debug_regs(&CommonDebugRegs::default())?; + self.vm.reset_xsave()?; + } + + Ok(()) + } + + /// Restores special registers from snapshot with CR3 updated to the + /// new page table location. + pub(crate) fn restore_sregs( &mut self, cr3: u64, sregs: &CommonSpecialRegisters, ) -> std::result::Result<(), RegisterError> { - self.vm.set_regs(&CommonRegisters { - rflags: 1 << 1, // Reserved bit always set - ..Default::default() - })?; - self.vm.set_debug_regs(&CommonDebugRegs::default())?; - self.vm.reset_xsave()?; - #[cfg(not(feature = "nanvix-unstable"))] { - // Restore the full special registers from snapshot, but update CR3 - // to point to the new (relocated) page tables let mut sregs = *sregs; sregs.cr3 = cr3; self.pending_tlb_flush = true; @@ -885,7 +897,9 @@ mod tests { use super::*; #[cfg(kvm)] use crate::hypervisor::regs::FP_CONTROL_WORD_DEFAULT; - use crate::hypervisor::regs::{CommonSegmentRegister, CommonTableRegister, MXCSR_DEFAULT}; + use crate::hypervisor::regs::{ + CommonDebugRegs, CommonSegmentRegister, CommonTableRegister, MXCSR_DEFAULT, + }; use crate::hypervisor::virtual_machine::VirtualMachine; use crate::mem::layout::SandboxMemoryLayout; use crate::mem::memory_region::{GuestMemoryRegion, MemoryRegionFlags}; @@ -912,7 +926,7 @@ mod tests { // Dirty State Builders - Create non-default vCPU state for testing reset // ========================================================================== - /// Build dirty general purpose registers for testing reset_vcpu. + /// Build dirty general purpose registers for testing reset. fn dirty_regs() -> CommonRegisters { CommonRegisters { rax: 0x1111111111111111, @@ -936,7 +950,7 @@ mod tests { } } - /// Build dirty FPU state for testing reset_vcpu. + /// Build dirty FPU state for testing reset. fn dirty_fpu() -> CommonFpu { CommonFpu { fpr: [[0xAB; 16]; 8], @@ -951,7 +965,7 @@ mod tests { } } - /// Build dirty special registers for testing reset_vcpu. + /// Build dirty special registers for testing reset. /// Must be consistent for 64-bit long mode (CR0/CR4/EFER). fn dirty_sregs(_pml4_addr: u64) -> CommonSpecialRegisters { let segment = CommonSegmentRegister { @@ -1020,7 +1034,7 @@ mod tests { } } - /// Build dirty debug registers for testing reset_vcpu. + /// Build dirty debug registers for testing reset. /// /// DR6 bit layout (Intel SDM / AMD APM): /// Bits 0-3 (B0-B3): Breakpoint condition detected - software writable/clearable @@ -1064,8 +1078,8 @@ mod tests { } } - /// Returns default test values for reset_vcpu parameters. - /// Uses standard 64-bit defaults since reset_vcpu now restores full sregs from snapshot. + /// Returns default test values for restore_sregs parameters. + /// Uses standard 64-bit defaults since restore_sregs restores full sregs from snapshot. fn default_sregs() -> CommonSpecialRegisters { CommonSpecialRegisters::standard_64bit_defaults(0) } @@ -1189,9 +1203,18 @@ mod tests { // Assertion Helpers - Verify vCPU state after reset // ========================================================================== - /// Assert that debug registers are in reset state. - /// Reserved bits in DR6/DR7 are read-only (set by CPU), so we only check - /// that writable bits are cleared to 0 and DR0-DR3 are zeroed. + /// Assert that debug registers are in architectural reset state. + /// + /// On Linux (KVM/MSHV): reset_vm_state explicitly zeroes debug registers. + /// + /// On Windows: WHvResetPartition resets to power-on defaults per + /// Intel SDM Vol. 3, Table 10-1: + /// DR0-DR3 = 0 (breakpoint addresses cleared) + /// DR6 = 0xFFFF0FF0 (reserved bits set, writable bits cleared) + /// DR7 = 0x00000400 (reserved bit 10 set, all enables cleared) + /// + /// Reserved bits in DR6/DR7 are read-only and CPU-dependent, so we only + /// verify that writable bits are cleared to 0 and DR0-DR3 are zeroed. fn assert_debug_regs_reset(vm: &dyn VirtualMachine) { let debug_regs = vm.debug_regs().unwrap(); let expected = CommonDebugRegs { @@ -1206,19 +1229,58 @@ mod tests { } /// Assert that general-purpose registers are in reset state. - /// After reset, all registers should be zeroed except rflags which has - /// reserved bit 1 always set. + /// + /// On Linux (KVM/MSHV): reset_vm_state explicitly zeroes all GP regs and sets + /// rflags = 0x2, so we verify all-zeros. + /// + /// On Windows: WHvResetPartition sets architectural power-on defaults + /// per Intel SDM Vol. 3, Table 10-1: + /// RIP = 0xFFF0 (reset vector) + /// RDX = CPUID signature (CPU-dependent stepping/model/family) + /// RFLAGS = 0x2 (only reserved bit 1 set) + /// All other GP regs = 0 + /// These are overwritten by dispatch_call_from_host before guest execution, + /// but we still verify the power-on state is correct. fn assert_regs_reset(vm: &dyn VirtualMachine) { + let regs = vm.regs().unwrap(); + #[cfg(not(target_os = "windows"))] assert_eq!( - vm.regs().unwrap(), + regs, CommonRegisters { - rflags: 1 << 1, // Reserved bit 1 is always set + rflags: 1 << 1, ..Default::default() } ); + #[cfg(target_os = "windows")] + { + // WHvResetPartition sets x86 power-on reset values + // (Intel SDM Vol. 3, Table 10-1) + let expected = CommonRegisters { + rip: 0xFFF0, // Reset vector + rdx: regs.rdx, // CPUID signature (CPU-dependent) + rflags: 0x2, // Reserved bit 1 + ..Default::default() + }; + assert_ne!( + regs.rdx, 0x4444444444444444, + "RDX should not retain dirty value" + ); + assert_eq!(regs, expected); + } } /// Assert that FPU state is in reset state. + /// + /// On Linux (KVM/MSHV): reset_vm_state calls reset_xsave which zeroes FPU state + /// and sets FCW/MXCSR to defaults. + /// + /// On Windows: WHvResetPartition resets to power-on defaults per + /// Intel SDM Vol. 3, Table 10-1 (FINIT-equivalent state): + /// FCW = 0x037F (all exceptions masked, precision=64-bit, round=nearest) + /// FSW = 0, FTW = 0 (all empty), FOP = 0, FIP = 0, FDP = 0 + /// MXCSR = 0x1F80 (all SIMD exceptions masked, round=nearest) + /// ST0-ST7 = 0, XMM0-XMM15 = 0 + /// /// Handles hypervisor-specific quirks (KVM MXCSR, empty FPU registers). fn assert_fpu_reset(vm: &dyn VirtualMachine) { let fpu = vm.fpu().unwrap(); @@ -1228,8 +1290,14 @@ mod tests { assert_eq!(fpu, expected_fpu); } - /// Assert that special registers are in reset state. - /// Handles hypervisor-specific differences in hidden descriptor cache fields. + /// Assert that special registers match the expected snapshot state. + /// + /// After reset, sregs are explicitly restored from the snapshot + /// (with CR3 updated). This verifies they match the expected 64-bit + /// long mode configuration from CommonSpecialRegisters::standard_64bit_defaults. + /// + /// Handles hypervisor-specific differences in hidden descriptor cache fields + /// (unusable, granularity, type_ for unused segments). fn assert_sregs_reset(vm: &dyn VirtualMachine, pml4_addr: u64) { let defaults = CommonSpecialRegisters::standard_64bit_defaults(pml4_addr); let sregs = vm.sregs().unwrap(); @@ -1339,7 +1407,7 @@ mod tests { dirtied_mask } - /// Dirty the legacy XSAVE region (bytes 0-511) for testing reset_vcpu. + /// Dirty the legacy XSAVE region (bytes 0-511) for testing reset. /// This includes FPU/x87 state, SSE state, and reserved areas. /// /// Layout (from Intel SDM Table 13-1): @@ -1629,7 +1697,8 @@ mod tests { assert_eq!(got_sregs, expected_sregs); // Reset the vCPU - hyperlight_vm.reset_vcpu(0, &default_sregs()).unwrap(); + hyperlight_vm.reset_vm_state().unwrap(); + hyperlight_vm.restore_sregs(0, &default_sregs()).unwrap(); // Verify registers are reset to defaults assert_regs_reset(hyperlight_vm.vm.as_ref()); @@ -1694,7 +1763,7 @@ mod tests { "xsave should be zeroed except for hypervisor-specific fields" ); - // Verify sregs are reset to defaults (CR3 is 0 as passed to reset_vcpu) + // Verify sregs are reset to defaults assert_sregs_reset(hyperlight_vm.vm.as_ref(), 0); } @@ -1758,7 +1827,8 @@ mod tests { assert_eq!(regs, expected_dirty); // Reset vcpu - hyperlight_vm.reset_vcpu(0, &default_sregs()).unwrap(); + hyperlight_vm.reset_vm_state().unwrap(); + hyperlight_vm.restore_sregs(0, &default_sregs()).unwrap(); // Check registers are reset to defaults assert_regs_reset(hyperlight_vm.vm.as_ref()); @@ -1882,7 +1952,8 @@ mod tests { } // Reset vcpu - hyperlight_vm.reset_vcpu(0, &default_sregs()).unwrap(); + hyperlight_vm.reset_vm_state().unwrap(); + hyperlight_vm.restore_sregs(0, &default_sregs()).unwrap(); // Check FPU is reset to defaults assert_fpu_reset(hyperlight_vm.vm.as_ref()); @@ -1933,7 +2004,8 @@ mod tests { assert_eq!(debug_regs, expected_dirty); // Reset vcpu - hyperlight_vm.reset_vcpu(0, &default_sregs()).unwrap(); + hyperlight_vm.reset_vm_state().unwrap(); + hyperlight_vm.restore_sregs(0, &default_sregs()).unwrap(); // Check debug registers are reset to default values assert_debug_regs_reset(hyperlight_vm.vm.as_ref()); @@ -1982,9 +2054,10 @@ mod tests { assert_eq!(sregs, expected_dirty); // Reset vcpu - hyperlight_vm.reset_vcpu(0, &default_sregs()).unwrap(); + hyperlight_vm.reset_vm_state().unwrap(); + hyperlight_vm.restore_sregs(0, &default_sregs()).unwrap(); - // Check registers are reset to defaults (CR3 is 0 as passed to reset_vcpu) + // Check registers are reset to defaults let sregs = hyperlight_vm.vm.sregs().unwrap(); let mut expected_reset = CommonSpecialRegisters::standard_64bit_defaults(0); normalize_sregs_for_run_tests(&mut expected_reset, &sregs); @@ -2020,7 +2093,11 @@ mod tests { let root_pt_addr = ctx.ctx.vm.get_root_pt().unwrap(); let segment_state = ctx.ctx.vm.get_snapshot_sregs().unwrap(); - ctx.ctx.vm.reset_vcpu(root_pt_addr, &segment_state).unwrap(); + ctx.ctx.vm.reset_vm_state().unwrap(); + ctx.ctx + .vm + .restore_sregs(root_pt_addr, &segment_state) + .unwrap(); // Re-run from entrypoint (flag=1 means guest skips dirty phase, just does FXSAVE) // Use stack_top - 8 to match initialise()'s behavior (simulates call pushing return addr) diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index 9edaa1f87..e919505b1 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -107,7 +107,7 @@ pub(crate) enum HypervisorType { /// Minimum XSAVE buffer size: 512 bytes legacy region + 64 bytes header. /// Only used by MSHV and WHP which use compacted XSAVE format and need to /// validate buffer size before accessing XCOMP_BV. -#[cfg(any(mshv3, target_os = "windows"))] +#[cfg(mshv3)] pub(crate) const XSAVE_MIN_SIZE: usize = 576; /// Standard XSAVE buffer size (4KB) used by KVM and MSHV. @@ -244,6 +244,9 @@ pub enum RegisterError { #[cfg(target_os = "windows")] #[error("Failed to convert WHP registers: {0}")] ConversionFailed(String), + #[cfg(target_os = "windows")] + #[error("Failed to reset partition: {0}")] + ResetPartition(HypervisorError), } /// Map memory error @@ -341,18 +344,32 @@ pub(crate) trait VirtualMachine: Debug + Send { #[allow(dead_code)] fn debug_regs(&self) -> std::result::Result; /// Set the debug registers of the vCPU + #[allow(dead_code)] // Called on Linux (reset_vcpu_state) and via gdb, but dead on Windows release builds without gdb fn set_debug_regs(&self, drs: &CommonDebugRegs) -> std::result::Result<(), RegisterError>; /// Get xsave #[allow(dead_code)] fn xsave(&self) -> std::result::Result, RegisterError>; /// Reset xsave to default state + #[cfg(any(kvm, mshv3))] fn reset_xsave(&self) -> std::result::Result<(), RegisterError>; /// Set xsave - only used for tests #[cfg(test)] #[cfg(not(feature = "nanvix-unstable"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError>; + /// Reset the partition using WHvResetPartition. + /// + /// Resets all per-VP state to architectural defaults: GP registers, segment + /// registers, control registers, EFER, MSRs, debug registers, full XSAVE + /// state (x87/SSE/AVX/AVX-512/AMX), XCR0, APIC/LAPIC, pending interrupts, + /// and VMCS/VMCB internals. + /// + /// Does NOT reset: GPA memory mappings or contents, partition configuration, + /// notification ports, or VPCI device state. + #[cfg(target_os = "windows")] + fn reset_partition(&mut self) -> std::result::Result<(), RegisterError>; + /// Get partition handle #[cfg(target_os = "windows")] fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE; diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index 3c6ae5a9d..a778085ee 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -32,9 +32,8 @@ use windows_result::HRESULT; use crate::hypervisor::gdb::{DebugError, DebuggableVm}; use crate::hypervisor::regs::{ Align16, CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, - FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT, WHP_DEBUG_REGS_NAMES, WHP_DEBUG_REGS_NAMES_LEN, - WHP_FPU_NAMES, WHP_FPU_NAMES_LEN, WHP_REGS_NAMES, WHP_REGS_NAMES_LEN, WHP_SREGS_NAMES, - WHP_SREGS_NAMES_LEN, + WHP_DEBUG_REGS_NAMES, WHP_DEBUG_REGS_NAMES_LEN, WHP_FPU_NAMES, WHP_FPU_NAMES_LEN, + WHP_REGS_NAMES, WHP_REGS_NAMES_LEN, WHP_SREGS_NAMES, WHP_SREGS_NAMES_LEN, }; use crate::hypervisor::surrogate_process::SurrogateProcess; use crate::hypervisor::surrogate_process_manager::get_surrogate_process_manager; @@ -42,7 +41,7 @@ use crate::hypervisor::surrogate_process_manager::get_surrogate_process_manager; use crate::hypervisor::virtual_machine::x86_64::hw_interrupts::TimerThread; use crate::hypervisor::virtual_machine::{ CreateVmError, HypervisorError, MapMemoryError, RegisterError, RunVcpuError, UnmapMemoryError, - VirtualMachine, VmExit, XSAVE_MIN_SIZE, + VirtualMachine, VmExit, }; use crate::hypervisor::wrappers::HandleWrapper; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags, MemoryRegionType}; @@ -556,6 +555,12 @@ impl VirtualMachine for WhpVm { let whp_regs: [(WHV_REGISTER_NAME, Align16); WHP_SREGS_NAMES_LEN] = sregs.into(); + // TODO: Remove this filter and instead fix the callers to pass + // the correct APIC_BASE value (e.g. read it from the VP before + // the first set_sregs, or use 0xFEE00900 in standard_64bit_defaults). + // Filtering here causes AMD/SVM bugs because SVM_CLEAN_FIELD_AVIC + // is never dirtied after WHvResetPartition (see reset_partition). + // // When LAPIC emulation is active (always with hw-interrupts), // skip writing APIC_BASE. The generic CommonSpecialRegisters // defaults APIC_BASE to 0 which would globally disable the LAPIC. @@ -607,6 +612,7 @@ impl VirtualMachine for WhpVm { }) } + #[allow(dead_code)] fn set_debug_regs(&self, drs: &CommonDebugRegs) -> std::result::Result<(), RegisterError> { let whp_regs: [(WHV_REGISTER_NAME, Align16); WHP_DEBUG_REGS_NAMES_LEN] = drs.into(); @@ -666,85 +672,6 @@ impl VirtualMachine for WhpVm { Ok(xsave_buffer) } - fn reset_xsave(&self) -> std::result::Result<(), RegisterError> { - // WHP uses compacted XSAVE format (bit 63 of XCOMP_BV set). - // We cannot just zero out the xsave area, we need to preserve the XCOMP_BV. - - // Get the required buffer size by calling with NULL buffer. - let mut buffer_size_needed: u32 = 0; - - let result = unsafe { - WHvGetVirtualProcessorXsaveState( - self.partition, - 0, - std::ptr::null_mut(), - 0, - &mut buffer_size_needed, - ) - }; - - // Expect insufficient buffer error; any other error is unexpected - if let Err(e) = result - && e.code() != windows::Win32::Foundation::WHV_E_INSUFFICIENT_BUFFER - { - return Err(RegisterError::GetXsaveSize(e.into())); - } - - if buffer_size_needed < XSAVE_MIN_SIZE as u32 { - return Err(RegisterError::XsaveSizeMismatch { - expected: XSAVE_MIN_SIZE as u32, - actual: buffer_size_needed, - }); - } - - // Create a buffer to hold the current state (to get the correct XCOMP_BV) - let mut current_state = vec![0u8; buffer_size_needed as usize]; - let mut written_bytes = 0; - unsafe { - WHvGetVirtualProcessorXsaveState( - self.partition, - 0, - current_state.as_mut_ptr() as *mut std::ffi::c_void, - buffer_size_needed, - &mut written_bytes, - ) - .map_err(|e| RegisterError::GetXsave(e.into()))?; - }; - - // Zero out most of the buffer, preserving only XCOMP_BV (520-528). - // Extended components with XSTATE_BV bit=0 will use their init values. - // - // - Legacy region (0-512): x87 FPU + SSE state - // - XSTATE_BV (512-520): Feature bitmap - // - XCOMP_BV (520-528): Compaction bitmap + format bit (KEEP) - // - Reserved (528-576): Header padding - // - Extended (576+): AVX, AVX-512, MPX, PKRU, AMX, etc. - current_state[0..520].fill(0); - current_state[528..].fill(0); - - // XSAVE area layout from Intel SDM Vol. 1 Section 13.4.1: - // - Bytes 0-1: FCW (x87 FPU Control Word) - // - Bytes 24-27: MXCSR - // - Bytes 512-519: XSTATE_BV (bitmap of valid state components) - current_state[0..2].copy_from_slice(&FP_CONTROL_WORD_DEFAULT.to_le_bytes()); - current_state[24..28].copy_from_slice(&MXCSR_DEFAULT.to_le_bytes()); - // XSTATE_BV = 0x3: bits 0,1 = x87 + SSE valid. Explicitly tell hypervisor - // to apply the legacy region from this buffer for consistent behavior. - current_state[512..520].copy_from_slice(&0x3u64.to_le_bytes()); - - unsafe { - WHvSetVirtualProcessorXsaveState( - self.partition, - 0, - current_state.as_ptr() as *const std::ffi::c_void, - buffer_size_needed, - ) - .map_err(|e| RegisterError::SetXsave(e.into()))?; - } - - Ok(()) - } - #[cfg(test)] #[cfg(not(feature = "nanvix-unstable"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> { @@ -791,6 +718,59 @@ impl VirtualMachine for WhpVm { Ok(()) } + fn reset_partition(&mut self) -> std::result::Result<(), RegisterError> { + // Note: we intentionally do NOT stop the timer here. The timer + // thread's inject closure uses `let _ =` so any WHvRequestInterrupt + // that lands during the brief LAPIC-unconfigured window is silently + // ignored. Destroying the timer would be permanent — the guest + // configured it during init and won't reconfigure after restore. + + unsafe { + WHvResetPartition(self.partition) + .map_err(|e| RegisterError::ResetPartition(e.into()))?; + + // WHvResetPartition resets the VMCB but may not update the + // hypervisor's internal register-change tracking used for + // TLB flush decisions on AMD SVM. Explicitly re-announce + // the post-reset CR0 (PG=0) via the WHP API so that the + // subsequent restore_sregs() write of CR0 with PG=1 is + // detected as a paging-mode change, triggering + // SVM_TLB_FLUSH_CURRENT_ASID on the next VMRUN. + let post_reset_sregs = self.sregs()?; + self.set_registers(&[( + WHvX64RegisterCr0, + Align16(WHV_REGISTER_VALUE { + Reg64: post_reset_sregs.cr0, + }), + )]) + .map_err(|e| RegisterError::ResetPartition(e.into()))?; + + // WHvResetPartition resets LAPIC to power-on defaults. + // Re-initialize it when LAPIC emulation is active. + #[cfg(feature = "hw-interrupts")] + Self::init_lapic_bulk(self.partition) + .map_err(|e| RegisterError::ResetPartition(e.into()))?; + + // With hw-interrupts, set_sregs filters out APIC_BASE to + // avoid disabling the LAPIC (the snapshot sregs default it + // to 0). On AMD/SVM this means SVM_CLEAN_FIELD_AVIC is + // never dirtied after the VMCB rebuild, causing the CPU to + // use stale cached AVIC state on VMRUN which corrupts guest + // memory reads. Re-writing APIC_BASE to its post-reset + // value forces the dirty bit via ValSetEmulatedApicBase. + #[cfg(feature = "hw-interrupts")] + { + let apic_base = self.sregs()?.apic_base; + self.set_registers(&[( + WHvX64RegisterApicBase, + Align16(WHV_REGISTER_VALUE { Reg64: apic_base }), + )]) + .map_err(|e| RegisterError::ResetPartition(e.into()))?; + } + } + Ok(()) + } + /// Get the partition handle for this VM fn partition_handle(&self) -> WHV_PARTITION_HANDLE { self.partition diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 72de96035..7e72d63fe 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -287,6 +287,12 @@ impl MultiUseSandbox { return Err(SnapshotSandboxMismatch); } + // Reset VM/vCPU state before memory restore. + self.vm.reset_vm_state().map_err(|e| { + self.poisoned = true; + HyperlightVmError::Restore(e) + })?; + let (gsnapshot, gscratch) = self.mem_mgr.restore_snapshot(&snapshot)?; if let Some(gsnapshot) = gsnapshot { self.vm @@ -305,7 +311,7 @@ impl MultiUseSandbox { // TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function // and determine if they should also poison the sandbox. self.vm - .reset_vcpu(snapshot.root_pt_gpa(), sregs) + .restore_sregs(snapshot.root_pt_gpa(), sregs) .map_err(|e| { self.poisoned = true; HyperlightVmError::Restore(e) @@ -1436,7 +1442,7 @@ mod tests { } /// Test that snapshot restore properly resets vCPU debug registers. This test verifies - /// that restore() calls reset_vcpu(). + /// that restore() calls reset_vm_state(). #[test] fn snapshot_restore_resets_debug_registers() { let mut sandbox: MultiUseSandbox = { @@ -1466,7 +1472,7 @@ mod tests { let dr0_after_restore: u64 = sandbox.call("GetDr0", ()).unwrap(); assert_eq!( dr0_after_restore, 0, - "DR0 should be 0 after restore (reset_vcpu should have been called)" + "DR0 should be 0 after restore (reset_vm_state should have been called)" ); } diff --git a/src/hyperlight_host/tests/integration_test.rs b/src/hyperlight_host/tests/integration_test.rs index 1f823dc49..151d55446 100644 --- a/src/hyperlight_host/tests/integration_test.rs +++ b/src/hyperlight_host/tests/integration_test.rs @@ -195,7 +195,7 @@ fn interrupt_same_thread() { // Only allow successful calls or interrupted. // The call can be successful in case the call is finished before kill() is called. Ok(_) | Err(HyperlightError::ExecutionCanceledByHost()) => {} - _ => panic!("Unexpected return"), + other => panic!("Unexpected return: {:?}", other), }; if sbox2.poisoned() { sbox2.restore(snapshot2.clone()).unwrap(); From cd50b1b055b44be692711d5eae1812eb10ff4dc1 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:51:51 -0700 Subject: [PATCH 3/6] Be more careful about poisoning Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- .../src/sandbox/initialized_multi_use.rs | 61 +++++++++++++++---- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 7e72d63fe..024fb9371 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -287,35 +287,64 @@ impl MultiUseSandbox { return Err(SnapshotSandboxMismatch); } - // Reset VM/vCPU state before memory restore. - self.vm.reset_vm_state().map_err(|e| { - self.poisoned = true; - HyperlightVmError::Restore(e) - })?; + // A prior failed restore may have swapped mem_mgr.shared_mem + // without updating the hypervisor's GPA mapping. Track this + // so we can force a remap below if needed. + let was_poisoned = self.poisoned; + + // Assume poisoned: any failure below leaves the sandbox + // inconsistent. Only cleared at the end on success. + self.poisoned = true; + + // Failure here may partially reset vCPU state. The sandbox is + // poisoned, and the next restore() retry will call + // reset_vm_state() again which is idempotent. + self.vm + .reset_vm_state() + .map_err(HyperlightVmError::Restore)?; + // Failure here may leave mem_mgr with partially swapped + // shared_mem/scratch_mem. The sandbox is poisoned, and if + // shared_mem was swapped without updating the hypervisor, the + // was_poisoned path below handles it on the retry. let (gsnapshot, gscratch) = self.mem_mgr.restore_snapshot(&snapshot)?; if let Some(gsnapshot) = gsnapshot { + // Failure here leaves the hypervisor pointing at the old + // (possibly freed) snapshot. The sandbox is poisoned, and + // the next restore() will remap via the was_poisoned path. + self.vm + .update_snapshot_mapping(gsnapshot) + .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; + } else if was_poisoned { + // restore_snapshot() skipped the swap because shared_mem + // already matched, but the hypervisor may still point at + // the old backing from before the failed restore. Force + // a remap. Failure leaves us still poisoned for the same + // reason as above. + let snap_mem = snapshot.memory().to_mgr_snapshot_mem()?; + let (_, gsnapshot) = snap_mem.build(); self.vm .update_snapshot_mapping(gsnapshot) .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; } if let Some(gscratch) = gscratch { + // Failure here leaves the hypervisor pointing at old + // scratch. The sandbox is poisoned; scratch size is fixed + // per sandbox so retry will zero the same allocation and + // remap here again. self.vm .update_scratch_mapping(gscratch) .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; } + // Failure here means sregs (CR3, etc.) are not set. The + // sandbox is poisoned; retry will call restore_sregs() again. let sregs = snapshot.sregs().ok_or_else(|| { HyperlightError::Error("snapshot from running sandbox should have sregs".to_string()) })?; - // TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function - // and determine if they should also poison the sandbox. self.vm .restore_sregs(snapshot.root_pt_gpa(), sregs) - .map_err(|e| { - self.poisoned = true; - HyperlightVmError::Restore(e) - })?; + .map_err(HyperlightVmError::Restore)?; self.vm.set_stack_top(snapshot.stack_top_gva()); self.vm.set_entrypoint(snapshot.entrypoint()); @@ -327,12 +356,19 @@ impl MultiUseSandbox { let regions_to_map = snapshot_regions.difference(¤t_regions); for region in regions_to_unmap { + // Failure here leaves extra regions mapped. The sandbox is + // poisoned; retry will attempt the unmap again since the + // region will still be in the current set. self.vm .unmap_region(region) .map_err(HyperlightVmError::UnmapRegion)?; } for region in regions_to_map { + // Failure here leaves regions missing. The sandbox is + // poisoned; retry will attempt the map again since the + // region will still be in the snapshot set. + // // Safety: The region has been mapped before, and at that point the caller promised that the memory region is valid // in their call to `MultiUseSandbox::map_region` unsafe { self.vm.map_region(region) }.map_err(HyperlightVmError::MapRegion)?; @@ -341,9 +377,8 @@ impl MultiUseSandbox { // The restored snapshot is now our most current snapshot self.snapshot = Some(snapshot.clone()); - // Clear poison state when successfully restoring from snapshot. + // Clear poison state: restore completed successfully. // - // # Safety: // This is safe because: // 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation) // 2. Restoration completely replaces all memory state, eliminating: From 4e923d4c2c5106636910642537890cf8e01fd5ee Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Thu, 26 Mar 2026 15:37:03 -0700 Subject: [PATCH 4/6] Revert "Be more careful about poisoning" This reverts commit cd50b1b055b44be692711d5eae1812eb10ff4dc1. Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- .../src/sandbox/initialized_multi_use.rs | 61 ++++--------------- 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 024fb9371..7e72d63fe 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -287,64 +287,35 @@ impl MultiUseSandbox { return Err(SnapshotSandboxMismatch); } - // A prior failed restore may have swapped mem_mgr.shared_mem - // without updating the hypervisor's GPA mapping. Track this - // so we can force a remap below if needed. - let was_poisoned = self.poisoned; - - // Assume poisoned: any failure below leaves the sandbox - // inconsistent. Only cleared at the end on success. - self.poisoned = true; - - // Failure here may partially reset vCPU state. The sandbox is - // poisoned, and the next restore() retry will call - // reset_vm_state() again which is idempotent. - self.vm - .reset_vm_state() - .map_err(HyperlightVmError::Restore)?; + // Reset VM/vCPU state before memory restore. + self.vm.reset_vm_state().map_err(|e| { + self.poisoned = true; + HyperlightVmError::Restore(e) + })?; - // Failure here may leave mem_mgr with partially swapped - // shared_mem/scratch_mem. The sandbox is poisoned, and if - // shared_mem was swapped without updating the hypervisor, the - // was_poisoned path below handles it on the retry. let (gsnapshot, gscratch) = self.mem_mgr.restore_snapshot(&snapshot)?; if let Some(gsnapshot) = gsnapshot { - // Failure here leaves the hypervisor pointing at the old - // (possibly freed) snapshot. The sandbox is poisoned, and - // the next restore() will remap via the was_poisoned path. - self.vm - .update_snapshot_mapping(gsnapshot) - .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; - } else if was_poisoned { - // restore_snapshot() skipped the swap because shared_mem - // already matched, but the hypervisor may still point at - // the old backing from before the failed restore. Force - // a remap. Failure leaves us still poisoned for the same - // reason as above. - let snap_mem = snapshot.memory().to_mgr_snapshot_mem()?; - let (_, gsnapshot) = snap_mem.build(); self.vm .update_snapshot_mapping(gsnapshot) .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; } if let Some(gscratch) = gscratch { - // Failure here leaves the hypervisor pointing at old - // scratch. The sandbox is poisoned; scratch size is fixed - // per sandbox so retry will zero the same allocation and - // remap here again. self.vm .update_scratch_mapping(gscratch) .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; } - // Failure here means sregs (CR3, etc.) are not set. The - // sandbox is poisoned; retry will call restore_sregs() again. let sregs = snapshot.sregs().ok_or_else(|| { HyperlightError::Error("snapshot from running sandbox should have sregs".to_string()) })?; + // TODO (ludfjig): Go through the rest of possible errors in this `MultiUseSandbox::restore` function + // and determine if they should also poison the sandbox. self.vm .restore_sregs(snapshot.root_pt_gpa(), sregs) - .map_err(HyperlightVmError::Restore)?; + .map_err(|e| { + self.poisoned = true; + HyperlightVmError::Restore(e) + })?; self.vm.set_stack_top(snapshot.stack_top_gva()); self.vm.set_entrypoint(snapshot.entrypoint()); @@ -356,19 +327,12 @@ impl MultiUseSandbox { let regions_to_map = snapshot_regions.difference(¤t_regions); for region in regions_to_unmap { - // Failure here leaves extra regions mapped. The sandbox is - // poisoned; retry will attempt the unmap again since the - // region will still be in the current set. self.vm .unmap_region(region) .map_err(HyperlightVmError::UnmapRegion)?; } for region in regions_to_map { - // Failure here leaves regions missing. The sandbox is - // poisoned; retry will attempt the map again since the - // region will still be in the snapshot set. - // // Safety: The region has been mapped before, and at that point the caller promised that the memory region is valid // in their call to `MultiUseSandbox::map_region` unsafe { self.vm.map_region(region) }.map_err(HyperlightVmError::MapRegion)?; @@ -377,8 +341,9 @@ impl MultiUseSandbox { // The restored snapshot is now our most current snapshot self.snapshot = Some(snapshot.clone()); - // Clear poison state: restore completed successfully. + // Clear poison state when successfully restoring from snapshot. // + // # Safety: // This is safe because: // 1. Snapshots can only be taken from non-poisoned sandboxes (verified at snapshot creation) // 2. Restoration completely replaces all memory state, eliminating: From 0ee9778f2f66ace8b1e52652436222ce4315e801 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:22:20 -0700 Subject: [PATCH 5/6] Address stale NPT bug Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- .../src/hypervisor/virtual_machine/whp.rs | 188 +++++++++++++++--- src/hyperlight_host/src/mem/layout.rs | 2 +- 2 files changed, 160 insertions(+), 30 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index a778085ee..b908d4392 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -16,16 +16,19 @@ limitations under the License. use std::os::raw::c_void; +use hyperlight_common::mem::PAGE_SIZE_USIZE; use hyperlight_common::outb::VmAction; #[cfg(feature = "trace_guest")] use tracing::Span; #[cfg(feature = "trace_guest")] use tracing_opentelemetry::OpenTelemetrySpanExt; -use windows::Win32::Foundation::{CloseHandle, FreeLibrary, HANDLE}; +use windows::Win32::Foundation::{CloseHandle, FreeLibrary, HANDLE, INVALID_HANDLE_VALUE}; use windows::Win32::System::Hypervisor::*; use windows::Win32::System::LibraryLoader::*; -use windows::Win32::System::Memory::{MEMORY_MAPPED_VIEW_ADDRESS, UnmapViewOfFile}; -use windows::core::s; +use windows::Win32::System::Memory::{ + CreateFileMappingA, MEMORY_MAPPED_VIEW_ADDRESS, PAGE_READWRITE, UnmapViewOfFile, +}; +use windows::core::{PCSTR, s}; use windows_result::HRESULT; #[cfg(gdb)] @@ -44,7 +47,10 @@ use crate::hypervisor::virtual_machine::{ VirtualMachine, VmExit, }; use crate::hypervisor::wrappers::HandleWrapper; -use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags, MemoryRegionType}; +use crate::mem::layout::SandboxMemoryLayout; +use crate::mem::memory_region::{ + MemoryRegion, MemoryRegionFlags, MemoryRegionType, SurrogateMapping, +}; #[cfg(feature = "trace_guest")] use crate::sandbox::trace::TraceContext as SandboxTraceContext; @@ -86,6 +92,134 @@ fn release_file_mapping(view_base: *mut c_void, mapping_handle: HandleWrapper) { } } +/// GPA for the NPT flush dummy page. Placed just past the maximum +/// snapshot region so it can never collide with guest memory. +const NPT_FLUSH_GPA: u64 = + (SandboxMemoryLayout::BASE_ADDRESS + SandboxMemoryLayout::MAX_MEMORY_SIZE) as u64; + +/// A single dummy page mapped at a fixed GPA whose permissions are +/// toggled on each [`reset_partition`](VirtualMachine::reset_partition) +/// call. On AMD SVM, `WHvResetPartition` does not flush the NPT +/// (GPA→HPA) TLB. Changing a GPA mapping's permissions via +/// `WHvMapGpaRange2` is the only WHP API path that triggers +/// `ValFlushNestedTb` (an NPT TbGeneration bump), which causes +/// `SVM_TLB_FLUSH_CURRENT_ASID` on the next VMRUN. +#[derive(Debug)] +struct NptFlushPage { + /// Backing file-mapping handle. + handle: HANDLE, + /// Address of the page in the surrogate process. + surrogate_addr: *mut c_void, + /// Cached function pointer for `WHvMapGpaRange2`. + map_gpa_range2: WHvMapGpaRange2Func, + /// Toggled on each flush to alternate the GPA permission between + /// READ and READ|WRITE. + toggle: bool, +} + +impl NptFlushPage { + const GPA_FLAGS_READ: WHV_MAP_GPA_RANGE_FLAGS = WHvMapGpaRangeFlagRead; + const GPA_FLAGS_READWRITE: WHV_MAP_GPA_RANGE_FLAGS = + WHV_MAP_GPA_RANGE_FLAGS(WHvMapGpaRangeFlagRead.0 | WHvMapGpaRangeFlagWrite.0); + + /// Allocate a dummy page, map it into the surrogate process, and + /// create the initial GPA mapping at [`NPT_FLUSH_GPA`]. + fn new( + partition: WHV_PARTITION_HANDLE, + surrogate_process: &mut SurrogateProcess, + ) -> Result { + let handle = unsafe { + CreateFileMappingA( + INVALID_HANDLE_VALUE, + None, + PAGE_READWRITE, + 0, + PAGE_SIZE_USIZE as u32, + PCSTR::null(), + ) + .map_err(|e| CreateVmError::InitializeVm(e.into()))? + }; + + let surrogate_addr = surrogate_process + .map( + handle.into(), + 0, // sentinel key; MapViewOfFile never returns 0 + PAGE_SIZE_USIZE, + &SurrogateMapping::SandboxMemory, + ) + .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; + + let map_gpa_range2 = unsafe { + try_load_whv_map_gpa_range2().map_err(|e| CreateVmError::InitializeVm(e.into()))? + }; + + let res = unsafe { + map_gpa_range2( + partition, + surrogate_process.process_handle.into(), + surrogate_addr, + NPT_FLUSH_GPA, + PAGE_SIZE_USIZE as u64, + Self::GPA_FLAGS_READ, + ) + }; + if res.is_err() { + return Err(CreateVmError::InitializeVm( + windows_result::Error::from_hresult(res).into(), + )); + } + + Ok(Self { + handle, + surrogate_addr, + map_gpa_range2, + toggle: false, + }) + } + + /// Toggle the dummy page's GPA permission to force an NPT TLB + /// flush. VID skips the hypercall when permissions are unchanged, + /// so we alternate between READ and READ|WRITE. + fn flush( + &mut self, + partition: WHV_PARTITION_HANDLE, + surrogate_process: &SurrogateProcess, + ) -> std::result::Result<(), RegisterError> { + self.toggle = !self.toggle; + let flags = if self.toggle { + Self::GPA_FLAGS_READWRITE + } else { + Self::GPA_FLAGS_READ + }; + let res = unsafe { + (self.map_gpa_range2)( + partition, + surrogate_process.process_handle.into(), + self.surrogate_addr, + NPT_FLUSH_GPA, + PAGE_SIZE_USIZE as u64, + flags, + ) + }; + if res.is_err() { + return Err(RegisterError::ResetPartition( + windows_result::Error::from_hresult(res).into(), + )); + } + Ok(()) + } +} + +impl Drop for NptFlushPage { + fn drop(&mut self) { + // The surrogate mapping is freed when SurrogateProcess drops; + // we only need to close the file-mapping handle. + if let Err(e) = unsafe { CloseHandle(self.handle) } { + tracing::error!("Failed to close NPT flush page handle: {:?}", e); + } + } +} + /// A Windows Hypervisor Platform implementation of a single-vcpu VM #[derive(Debug)] pub(crate) struct WhpVm { @@ -98,6 +232,9 @@ pub(crate) struct WhpVm { /// Handle to the background timer (if started). #[cfg(feature = "hw-interrupts")] timer: Option, + /// Dummy page whose GPA permissions are toggled to force NPT TLB + /// flushes after `WHvResetPartition` on AMD SVM hosts. + npt_flush: NptFlushPage, } // Safety: `WhpVm` is !Send because it holds `SurrogateProcess` which contains a raw pointer @@ -105,8 +242,8 @@ pub(crate) struct WhpVm { // in the surrogate process. It is never dereferenced, only used for address arithmetic and // resource management (unmapping). This is a system resource that is not bound to the creating // thread and can be safely transferred between threads. -// `file_mappings` contains raw pointers that are also kernel resource handles, -// safe to use from any thread. +// `file_mappings` and `NptFlushPage` contain raw pointers that are also kernel +// resource handles, safe to use from any thread. unsafe impl Send for WhpVm {} impl WhpVm { @@ -144,16 +281,19 @@ impl WhpVm { let mgr = get_surrogate_process_manager() .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; - let surrogate_process = mgr + let mut surrogate_process = mgr .get_surrogate_process() .map_err(|e| CreateVmError::SurrogateProcess(e.to_string()))?; + let npt_flush = NptFlushPage::new(partition, &mut surrogate_process)?; + Ok(WhpVm { partition, surrogate_process, file_mappings: Vec::new(), #[cfg(feature = "hw-interrupts")] timer: None, + npt_flush, }) } @@ -729,21 +869,12 @@ impl VirtualMachine for WhpVm { WHvResetPartition(self.partition) .map_err(|e| RegisterError::ResetPartition(e.into()))?; - // WHvResetPartition resets the VMCB but may not update the - // hypervisor's internal register-change tracking used for - // TLB flush decisions on AMD SVM. Explicitly re-announce - // the post-reset CR0 (PG=0) via the WHP API so that the - // subsequent restore_sregs() write of CR0 with PG=1 is - // detected as a paging-mode change, triggering - // SVM_TLB_FLUSH_CURRENT_ASID on the next VMRUN. - let post_reset_sregs = self.sregs()?; - self.set_registers(&[( - WHvX64RegisterCr0, - Align16(WHV_REGISTER_VALUE { - Reg64: post_reset_sregs.cr0, - }), - )]) - .map_err(|e| RegisterError::ResetPartition(e.into()))?; + // WHvResetPartition does not flush the NPT (GPA→HPA) TLB + // on AMD SVM. Toggle the dummy page's GPA permission to + // trigger a TbGeneration bump, forcing the next VMRUN to + // issue SVM_TLB_FLUSH_CURRENT_ASID. + self.npt_flush + .flush(self.partition, &self.surrogate_process)?; // WHvResetPartition resets LAPIC to power-on defaults. // Re-initialize it when LAPIC emulation is active. @@ -751,13 +882,12 @@ impl VirtualMachine for WhpVm { Self::init_lapic_bulk(self.partition) .map_err(|e| RegisterError::ResetPartition(e.into()))?; - // With hw-interrupts, set_sregs filters out APIC_BASE to - // avoid disabling the LAPIC (the snapshot sregs default it - // to 0). On AMD/SVM this means SVM_CLEAN_FIELD_AVIC is - // never dirtied after the VMCB rebuild, causing the CPU to - // use stale cached AVIC state on VMRUN which corrupts guest - // memory reads. Re-writing APIC_BASE to its post-reset - // value forces the dirty bit via ValSetEmulatedApicBase. + // set_sregs filters out APIC_BASE (the snapshot sregs + // default it to 0, which would disable the LAPIC). This + // means APIC_BASE is never written after the VMCB rebuild, + // so the AVIC clean bit stays set and the CPU may use stale + // cached AVIC state. Re-writing the post-reset value forces + // the hypervisor to dirty SVM_CLEAN_FIELD_AVIC. #[cfg(feature = "hw-interrupts")] { let apic_base = self.sregs()?.apic_base; diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index b55189969..e7afb51ca 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -313,7 +313,7 @@ impl SandboxMemoryLayout { /// Both the scratch region and the snapshot region are bounded by /// this size. The value is arbitrary but chosen to be large enough /// for most workloads while preventing accidental resource exhaustion. - const MAX_MEMORY_SIZE: usize = (16 * 1024 * 1024 * 1024) - Self::BASE_ADDRESS; // 16 GiB - BASE_ADDRESS + pub(crate) const MAX_MEMORY_SIZE: usize = (16 * 1024 * 1024 * 1024) - Self::BASE_ADDRESS; // 16 GiB - BASE_ADDRESS /// The base address of the sandbox's memory. #[cfg(not(feature = "nanvix-unstable"))] From 829c25b7cbf4235eaaafaa391cfb368ec9d05e80 Mon Sep 17 00:00:00 2001 From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> Date: Fri, 27 Mar 2026 11:49:17 -0700 Subject: [PATCH 6/6] Try unmap/remap instead since prev didn't work Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com> --- .../src/hypervisor/virtual_machine/whp.rs | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index b908d4392..c0827d651 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -112,15 +112,10 @@ struct NptFlushPage { surrogate_addr: *mut c_void, /// Cached function pointer for `WHvMapGpaRange2`. map_gpa_range2: WHvMapGpaRange2Func, - /// Toggled on each flush to alternate the GPA permission between - /// READ and READ|WRITE. - toggle: bool, } impl NptFlushPage { const GPA_FLAGS_READ: WHV_MAP_GPA_RANGE_FLAGS = WHvMapGpaRangeFlagRead; - const GPA_FLAGS_READWRITE: WHV_MAP_GPA_RANGE_FLAGS = - WHV_MAP_GPA_RANGE_FLAGS(WHvMapGpaRangeFlagRead.0 | WHvMapGpaRangeFlagWrite.0); /// Allocate a dummy page, map it into the surrogate process, and /// create the initial GPA mapping at [`NPT_FLUSH_GPA`]. @@ -173,24 +168,22 @@ impl NptFlushPage { handle, surrogate_addr, map_gpa_range2, - toggle: false, }) } - /// Toggle the dummy page's GPA permission to force an NPT TLB - /// flush. VID skips the hypercall when permissions are unchanged, - /// so we alternate between READ and READ|WRITE. + /// Force an NPT TLB flush by unmapping and remapping the dummy + /// page. An unmap+remap cycle forces VID through the full + /// delete+create GPA range path (bypassing `TryUpdateInPlace`), + /// which guarantees the hypervisor issues `ValFlushNestedTb`. fn flush( &mut self, partition: WHV_PARTITION_HANDLE, surrogate_process: &SurrogateProcess, ) -> std::result::Result<(), RegisterError> { - self.toggle = !self.toggle; - let flags = if self.toggle { - Self::GPA_FLAGS_READWRITE - } else { - Self::GPA_FLAGS_READ - }; + unsafe { + WHvUnmapGpaRange(partition, NPT_FLUSH_GPA, PAGE_SIZE_USIZE as u64) + .map_err(|e| RegisterError::ResetPartition(e.into()))?; + } let res = unsafe { (self.map_gpa_range2)( partition, @@ -198,7 +191,7 @@ impl NptFlushPage { self.surrogate_addr, NPT_FLUSH_GPA, PAGE_SIZE_USIZE as u64, - flags, + Self::GPA_FLAGS_READ, ) }; if res.is_err() {