// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Implementation of the Underhill hypervisor backend, which uses
//! `/dev/mshv_vtl` to interact with the Microsoft hypervisor while running in
//! VTL2.

#![cfg(target_os = "linux")]

mod devmsr;

cfg_if::cfg_if!(
    if #[cfg(target_arch = "x86_64")] { // xtask-fmt allow-target-arch sys-crate
        mod cvm_cpuid;
        pub use processor::snp::SnpBacked;
        pub use processor::tdx::TdxBacked;
        use crate::processor::HardwareIsolatedBacking;
        pub use crate::processor::mshv::x64::HypervisorBackedX86 as HypervisorBacked;
        use crate::processor::mshv::x64::HypervisorBackedX86Shared as HypervisorBackedShared;
        use bitvec::prelude::BitArray;
        use bitvec::prelude::Lsb0;
        use devmsr::MsrDevice;
        use hv1_emulator::hv::ProcessorVtlHv;
        use processor::LapicState;
        use processor::snp::SnpBackedShared;
        use processor::tdx::TdxBackedShared;
        use std::arch::x86_64::CpuidResult;
        use virt::CpuidLeaf;
        use virt::state::StateElement;
        use virt::vp::MpState;
        /// Bitarray type for representing IRR bits in an x86-64 APIC.
        /// Each bit represents one of the 256 possible vectors.
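        ///
        /// A minimal usage sketch (illustrative only; not compiled as a doc
        /// test since the alias is crate-private):
        ///
        /// ```ignore
        /// // Mark hypothetical vector 0x30 pending, then read it back.
        /// let mut irr = IrrBitmap::default();
        /// irr.set(0x30, true);
        /// assert!(irr[0x30]);
        /// ```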
        type IrrBitmap = BitArray<[u32; 8], Lsb0>;
    } else if #[cfg(target_arch = "aarch64")] { // xtask-fmt allow-target-arch sys-crate
        pub use crate::processor::mshv::arm64::HypervisorBackedArm64 as HypervisorBacked;
        use crate::processor::mshv::arm64::HypervisorBackedArm64Shared as HypervisorBackedShared;
    }
);

mod processor;
pub use processor::Backing;
pub use processor::UhProcessor;

use anyhow::Context as AnyhowContext;
use bitfield_struct::bitfield;
use bitvec::boxed::BitBox;
use bitvec::vec::BitVec;
use cvm_tracing::CVM_ALLOWED;
use guestmem::GuestMemory;
use guestmem::GuestMemoryBackingError;
use hcl::GuestVtl;
use hcl::ioctl::Hcl;
use hcl::ioctl::SetVsmPartitionConfigError;
use hv1_emulator::hv::GlobalHv;
use hv1_emulator::message_queues::MessageQueues;
use hv1_emulator::synic::GlobalSynic;
use hv1_emulator::synic::SintProxied;
use hv1_structs::VtlArray;
use hvdef::GuestCrashCtl;
use hvdef::HV_PAGE_SHIFT;
use hvdef::HV_PAGE_SIZE;
use hvdef::HV_PAGE_SIZE_USIZE;
use hvdef::HvError;
use hvdef::HvMapGpaFlags;
use hvdef::HvRegisterName;
use hvdef::HvRegisterVsmPartitionConfig;
use hvdef::HvRegisterVsmPartitionStatus;
use hvdef::Vtl;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_EXECUTE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_NONE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_READ_WRITE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_WRITE;
use hvdef::hypercall::HostVisibilityType;
use hvdef::hypercall::HvGuestOsId;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::HvInterceptParameters;
use hvdef::hypercall::HvInterceptType;
use inspect::Inspect;
use inspect::InspectMut;
use memory_range::MemoryRange;
use pal::unix::affinity;
use pal::unix::affinity::CpuSet;
use pal_async::driver::Driver;
use pal_async::driver::SpawnDriver;
use pal_uring::IdleControl;
use parking_lot::Mutex;
use parking_lot::RwLock;
use processor::BackingSharedParams;
use processor::SidecarExitReason;
use sidecar_client::NewSidecarClientError;
use std::ops::RangeInclusive;
use std::os::fd::AsRawFd;
use std::sync::Arc;
use std::sync::Weak;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::task::Waker;
use thiserror::Error;
use user_driver::DmaClient;
use virt::IsolationType;
use virt::PartitionCapabilities;
use virt::VpIndex;
use virt::irqcon::IoApicRouting;
use virt::irqcon::MsiRequest;
use virt::x86::apic_software_device::ApicSoftwareDevices;
use virt_support_apic::LocalApicSet;
use vm_topology::memory::MemoryLayout;
use vm_topology::processor::ProcessorTopology;
use vm_topology::processor::TargetVpInfo;
use vmcore::monitor::MonitorPage;
use vmcore::reference_time::GetReferenceTime;
use vmcore::reference_time::ReferenceTimeResult;
use vmcore::reference_time::ReferenceTimeSource;
use vmcore::vmtime::VmTimeSource;
use x86defs::snp::REG_TWEAK_BITMAP_OFFSET;
use x86defs::snp::REG_TWEAK_BITMAP_SIZE;
use x86defs::tdx::TdCallResult;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// General error returned by operations.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum Error {
    #[error("hcl error")]
    Hcl(#[source] hcl::ioctl::Error),
    #[error("failed to open sidecar client")]
    Sidecar(#[source] NewSidecarClientError),
    #[error("failed to install {0:?} intercept: {1:?}")]
    InstallIntercept(HvInterceptType, HvError),
    #[error("failed to query hypervisor register {0:#x?}")]
    Register(HvRegisterName, #[source] HvError),
    #[error("failed to set vsm partition config register")]
    VsmPartitionConfig(#[source] SetVsmPartitionConfigError),
    #[error("failed to create virtual device")]
    NewDevice(#[source] virt::x86::apic_software_device::DeviceIdInUse),
    #[error("failed to create cpuid tables for cvm")]
    #[cfg(guest_arch = "x86_64")]
    CvmCpuid(#[source] cvm_cpuid::CpuidResultsError),
    #[error("failed to update hypercall msr")]
    UpdateHypercallMsr,
    #[error("failed to update reference tsc msr")]
    UpdateReferenceTsc,
    #[error("failed to map overlay page")]
    MapOverlay(#[source] std::io::Error),
    #[error("failed to allocate shared visibility pages for overlay")]
    AllocateSharedVisOverlay(#[source] anyhow::Error),
    #[error("failed to open msr device")]
    OpenMsr(#[source] std::io::Error),
    #[error("cpuid did not contain valid TSC frequency information")]
    BadCpuidTsc,
    #[error("failed to read tsc frequency")]
    ReadTscFrequency(#[source] std::io::Error),
    #[error(
        "tsc frequency mismatch between hypervisor ({hv}) and hardware ({hw}), exceeds allowed error {allowed_error}"
    )]
    TscFrequencyMismatch {
        hv: u64,
        hw: u64,
        allowed_error: u64,
    },
    #[error("failed to set L2 controls: {0:?}")]
    FailedToSetL2Ctls(TdCallResult),
    #[error("debugging is configured but the binary does not have the gdb feature")]
    InvalidDebugConfiguration,
    #[error("failed to allocate TLB flush page")]
    AllocateTlbFlushPage(#[source] anyhow::Error),
}

/// Error revoking guest VSM.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum RevokeGuestVsmError {
    #[error("failed to set vsm config")]
    SetGuestVsmConfig(#[source] hcl::ioctl::SetGuestVsmConfigError),
    #[error("VTL 1 is already enabled")]
    Vtl1AlreadyEnabled,
}

/// Underhill partition.
#[derive(Inspect)]
pub struct UhPartition {
    #[inspect(flatten)]
    inner: Arc<UhPartitionInner>,
    // TODO: remove this extra indirection by refactoring some traits.
    #[inspect(skip)]
    interrupt_targets: VtlArray<Arc<UhInterruptTarget>, 2>,
}

/// Underhill partition inner state.
#[derive(Inspect)]
#[inspect(extra = "UhPartitionInner::inspect_extra")]
struct UhPartitionInner {
    #[inspect(skip)]
    hcl: Hcl,
    #[inspect(skip)] // inspected separately
    vps: Vec<UhVpInner>,
    irq_routes: virt::irqcon::IrqRoutes,
    caps: PartitionCapabilities,
    #[inspect(skip)] // handled in `inspect_extra`
    enter_modes: Mutex<EnterModes>,
    #[inspect(skip)]
    enter_modes_atomic: AtomicU8,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
    lower_vtl_memory_layout: MemoryLayout,
    gm: VtlArray<GuestMemory, 2>,
    vtl0_kernel_exec_gm: GuestMemory,
    vtl0_user_exec_gm: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    crash_notification_send: mesh::Sender<VtlCrash>,
    monitor_page: MonitorPage,
    #[inspect(skip)]
    allocated_monitor_page: Mutex<Option<user_driver::memory::MemoryBlock>>,
    software_devices: Option<ApicSoftwareDevices>,
    #[inspect(skip)]
    vmtime: VmTimeSource,
    isolation: IsolationType,
    #[inspect(with = "inspect::AtomicMut")]
    no_sidecar_hotplug: AtomicBool,
    use_mmio_hypercalls: bool,
    backing_shared: BackingShared,
    intercept_debug_exceptions: bool,
    #[cfg(guest_arch = "x86_64")]
    // N.B. For now there is only one device vector table, i.e. for VTL0 only.
    #[inspect(hex, with = "|x| inspect::iter_by_index(x.read().into_inner())")]
    device_vector_table: RwLock<IrrBitmap>,
    vmbus_relay: bool,
}

#[derive(Inspect)]
#[inspect(untagged)]
enum BackingShared {
    Hypervisor(#[inspect(flatten)] HypervisorBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Snp(#[inspect(flatten)] SnpBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Tdx(#[inspect(flatten)] TdxBackedShared),
}

impl BackingShared {
    fn new(
        isolation: IsolationType,
        partition_params: &UhPartitionNewParams<'_>,
        backing_shared_params: BackingSharedParams<'_>,
    ) -> Result<BackingShared, Error> {
        Ok(match isolation {
            IsolationType::None | IsolationType::Vbs => {
                assert!(backing_shared_params.cvm_state.is_none());
                BackingShared::Hypervisor(HypervisorBackedShared::new(
                    partition_params,
                    backing_shared_params,
                )?)
            }
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Snp => BackingShared::Snp(SnpBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Tdx => BackingShared::Tdx(TdxBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(not(guest_arch = "x86_64"))]
            _ => unreachable!(),
        })
    }

    fn cvm_state(&self) -> Option<&UhCvmPartitionState> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => Some(cvm),
        }
    }

    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn guest_vsm_disabled(&self) -> bool {
        match self {
            BackingShared::Hypervisor(h) => {
                matches!(*h.guest_vsm.read(), GuestVsmState::NotPlatformSupported)
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                matches!(*cvm.guest_vsm.read(), GuestVsmState::NotPlatformSupported)
            }
        }
    }

    fn untrusted_synic(&self) -> Option<&GlobalSynic> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(s) => s.untrusted_synic.as_ref(),
        }
    }
}

#[derive(InspectMut, Copy, Clone)]
struct EnterModes {
    #[inspect(mut)]
    first: EnterMode,
    #[inspect(mut)]
    second: EnterMode,
}

impl Default for EnterModes {
    fn default() -> Self {
        Self {
            first: EnterMode::Fast,
            second: EnterMode::IdleToVtl0,
        }
    }
}

impl From<EnterModes> for hcl::protocol::EnterModes {
    fn from(value: EnterModes) -> Self {
        Self::new()
            .with_first(value.first.into())
            .with_second(value.second.into())
    }
}

#[derive(InspectMut, Copy, Clone)]
enum EnterMode {
    Fast,
    PlayIdle,
    IdleToVtl0,
}

impl From<EnterMode> for hcl::protocol::EnterMode {
    fn from(value: EnterMode) -> Self {
        match value {
            EnterMode::Fast => Self::FAST,
            EnterMode::PlayIdle => Self::PLAY_IDLE,
            EnterMode::IdleToVtl0 => Self::IDLE_TO_VTL0,
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
struct GuestVsmVpState {
    /// The pending event that VTL 1 wants to inject into VTL 0. Injected on
    /// next exit to VTL 0.
    #[inspect(with = "|x| x.as_ref().map(inspect::AsDebug)")]
    vtl0_exit_pending_event: Option<hvdef::HvX64PendingExceptionEvent>,
    reg_intercept: SecureRegisterInterceptState,
}

#[cfg(guest_arch = "x86_64")]
impl GuestVsmVpState {
    fn new() -> Self {
        GuestVsmVpState {
            vtl0_exit_pending_event: None,
            reg_intercept: Default::default(),
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
/// VP state for CVMs.
struct UhCvmVpState {
    /// Allocation handle for direct overlays.
    #[inspect(debug)]
    direct_overlay_handle: user_driver::memory::MemoryBlock,
    /// Used in VTL 2 exit code to determine which VTL to exit to.
    exit_vtl: GuestVtl,
    /// Hypervisor enlightenment emulator state.
    hv: VtlArray<ProcessorVtlHv, 2>,
    /// LAPIC state.
    lapics: VtlArray<LapicState, 2>,
    /// Guest VSM state for this vp. Some when VTL 1 is enabled.
    vtl1: Option<GuestVsmVpState>,
}

#[cfg(guest_arch = "x86_64")]
impl UhCvmVpState {
    /// Creates a new CVM VP state.
    pub(crate) fn new(
        cvm_partition: &UhCvmPartitionState,
        inner: &UhPartitionInner,
        vp_info: &TargetVpInfo,
        overlay_pages_required: usize,
    ) -> Result<Self, Error> {
        let direct_overlay_handle = cvm_partition
            .shared_dma_client
            .allocate_dma_buffer(overlay_pages_required * HV_PAGE_SIZE as usize)
            .map_err(Error::AllocateSharedVisOverlay)?;

        let apic_base = virt::vp::Apic::at_reset(&inner.caps, vp_info).apic_base;
        let lapics = VtlArray::from_fn(|vtl| {
            let apic_set = &cvm_partition.lapic[vtl];

            // The APIC is software-enabled after reset for secure VTLs, to
            // maintain compatibility with released versions of the secure kernel.
            let mut lapic = apic_set.add_apic(vp_info, vtl == Vtl::Vtl1);
            // Initialize APIC base to match the reset VM state.
            lapic.set_apic_base(apic_base).unwrap();
            // Only the VTL 0 non-BSP LAPICs should be in the WaitForSipi state.
            let activity = if vtl == Vtl::Vtl0 && !vp_info.base.is_bsp() {
                MpState::WaitForSipi
            } else {
                MpState::Running
            };
            LapicState::new(lapic, activity)
        });

        let hv = VtlArray::from_fn(|vtl| cvm_partition.hv.add_vp(vp_info.base.vp_index, vtl));

        Ok(Self {
            direct_overlay_handle,
            exit_vtl: GuestVtl::Vtl0,
            hv,
            lapics,
            vtl1: None,
        })
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect, Default)]
#[inspect(hex)]
/// Configuration of VTL 1 registration for intercepts on certain registers
pub struct SecureRegisterInterceptState {
    #[inspect(with = "|&x| u64::from(x)")]
    intercept_control: hvdef::HvRegisterCrInterceptControl,
    cr0_mask: u64,
    cr4_mask: u64,
    // Writes to X86X_IA32_MSR_MISC_ENABLE are dropped; this mask is kept only
    // so that get_vp_register returns the value most recently set via
    // set_vp_register.
    ia32_misc_enable_mask: u64,
}

#[derive(Inspect)]
/// Partition-wide state for CVMs.
struct UhCvmPartitionState {
    #[cfg(guest_arch = "x86_64")]
    vps_per_socket: u32,
    /// VPs that have locked their TLB.
    #[inspect(
        with = "|arr| inspect::iter_by_index(arr.iter()).map_value(|bb| inspect::iter_by_index(bb.iter().map(|v| *v)))"
    )]
    tlb_locked_vps: VtlArray<BitBox<AtomicU64>, 2>,
    #[inspect(with = "inspect::iter_by_index")]
    vps: Vec<UhCvmVpInner>,
    shared_memory: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// The emulated local APIC set.
    lapic: VtlArray<LocalApicSet, 2>,
    /// The emulated hypervisor state.
    hv: GlobalHv<2>,
    /// Guest VSM state.
    guest_vsm: RwLock<GuestVsmState<CvmVtl1State>>,
    /// Dma client for shared visibility pages.
    shared_dma_client: Arc<dyn DmaClient>,
    /// Dma client for private visibility pages.
    private_dma_client: Arc<dyn DmaClient>,
    hide_isolation: bool,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl UhCvmPartitionState {
    fn vp_inner(&self, vp_index: u32) -> &UhCvmVpInner {
        &self.vps[vp_index as usize]
    }

    fn is_lower_vtl_startup_denied(&self) -> bool {
        matches!(
            *self.guest_vsm.read(),
            GuestVsmState::Enabled {
                vtl1: CvmVtl1State {
                    deny_lower_vtl_startup: true,
                    ..
                }
            }
        )
    }
}

#[derive(Inspect)]
/// Per-vp state for CVMs.
struct UhCvmVpInner {
    /// The current status of TLB locks
    tlb_lock_info: VtlArray<TlbLockInfo, 2>,
    /// Whether EnableVpVtl for VTL 1 has been called on this VP.
    vtl1_enable_called: Mutex<bool>,
    /// Whether the VP has been started via the StartVp hypercall.
    started: AtomicBool,
    /// Start context for StartVp and EnableVpVtl calls.
    #[inspect(with = "|arr| inspect::iter_by_index(arr.iter().map(|v| v.lock().is_some()))")]
    hv_start_enable_vtl_vp: VtlArray<Mutex<Option<Box<VpStartEnableVtl>>>, 2>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
#[derive(Inspect)]
#[inspect(tag = "guest_vsm_state")]
/// Partition-wide state for guest vsm.
enum GuestVsmState<T: Inspect> {
    NotPlatformSupported,
    NotGuestEnabled,
    Enabled {
        #[inspect(flatten)]
        vtl1: T,
    },
}

impl<T: Inspect> GuestVsmState<T> {
    pub fn from_availability(guest_vsm_available: bool) -> Self {
        if guest_vsm_available {
            GuestVsmState::NotGuestEnabled
        } else {
            GuestVsmState::NotPlatformSupported
        }
    }
}

#[derive(Inspect)]
struct CvmVtl1State {
    /// Whether VTL 1 has been enabled on any vp
    enabled_on_any_vp: bool,
    /// Whether guest memory should be zeroed before it resets.
    zero_memory_on_reset: bool,
    /// Whether a vp can be started or reset by a lower vtl.
    deny_lower_vtl_startup: bool,
    /// Whether Mode-Based Execution Control should be enforced on lower VTLs.
    pub mbec_enabled: bool,
    /// Whether shadow supervisor stack is enabled.
    pub shadow_supervisor_stack_enabled: bool,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_read_intercepts: BitBox<u64>,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_write_intercepts: BitBox<u64>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl CvmVtl1State {
    fn new(mbec_enabled: bool) -> Self {
        Self {
            enabled_on_any_vp: false,
            zero_memory_on_reset: false,
            deny_lower_vtl_startup: false,
            mbec_enabled,
            shadow_supervisor_stack_enabled: false,
            io_read_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
            io_write_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
        }
    }
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
struct TscReferenceTimeSource {
    tsc_scale: u64,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl TscReferenceTimeSource {
    fn new(tsc_frequency: u64) -> Self {
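        // The scale is a 64.64 fixed-point multiplier converting TSC ticks to
        // 100ns units (the reference time rate is 10 MHz).
        //
        // Illustrative arithmetic (the 2.5 GHz frequency is an assumption of
        // this comment): tsc_scale = (10_000_000 << 64) / 2_500_000_000, so
        // `now` computes (tsc_scale * tsc) >> 64 = tsc / 250, i.e. one 100ns
        // unit per 250 TSC ticks.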
        TscReferenceTimeSource {
            tsc_scale: (((10_000_000_u128) << 64) / tsc_frequency as u128) as u64,
        }
    }
}

/// A time implementation based on TSC.
impl GetReferenceTime for TscReferenceTimeSource {
    fn now(&self) -> ReferenceTimeResult {
        #[cfg(guest_arch = "x86_64")]
        {
            let tsc = safe_intrinsics::rdtsc();
            let ref_time = ((self.tsc_scale as u128 * tsc as u128) >> 64) as u64;
            ReferenceTimeResult {
                ref_time,
                system_time: None,
            }
        }

        #[cfg(guest_arch = "aarch64")]
        {
            todo!("AARCH64_TODO");
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::irqcon::ControlGic for UhPartitionInner {
    fn set_spi_irq(&self, irq_id: u32, high: bool) {
        if let Err(err) = self.hcl.request_interrupt(
            hvdef::HvInterruptControl::new()
                .with_arm64_asserted(high)
                .with_interrupt_type(hvdef::HvInterruptType::HvArm64InterruptTypeFixed),
            0,
            irq_id,
            GuestVtl::Vtl0,
        ) {
            tracelimit::warn_ratelimited!(
                error = &err as &dyn std::error::Error,
                irq = irq_id,
                asserted = high,
                "failed to request spi"
            );
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::Aarch64Partition for UhPartition {
    fn control_gic(&self, vtl: Vtl) -> Arc<dyn virt::irqcon::ControlGic> {
        debug_assert!(vtl == Vtl::Vtl0);
        self.inner.clone()
    }
}

/// A wrapper around [`UhProcessor`] that is [`Send`].
///
/// This is used to instantiate the processor object on the correct thread,
/// since all lower VTL processor state accesses must occur from the same
/// processor at VTL2.
pub struct UhProcessorBox {
    partition: Arc<UhPartitionInner>,
    vp_info: TargetVpInfo,
}

impl UhProcessorBox {
    /// Returns the VP index.
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }

    /// Returns the base CPU that manages this processor, when it is a sidecar
    /// VP.
    pub fn sidecar_base_cpu(&self) -> Option<u32> {
        self.partition
            .hcl
            .sidecar_base_cpu(self.vp_info.base.vp_index.index())
    }

    /// Returns the processor object, bound to this thread.
    ///
    /// If `control` is provided, then this must be called on the VP's
    /// associated thread pool thread, and it will dispatch the VP directly.
    /// Otherwise, the returned object will control the processor via the
    /// sidecar kernel.
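    ///
    /// A minimal calling sketch (the names `vp_box`, `driver`, and
    /// `idle_control` are assumptions of this example):
    ///
    /// ```ignore
    /// // On the VP's thread pool thread, bind and dispatch the VP directly.
    /// let vp: UhProcessor<'_, HypervisorBacked> =
    ///     vp_box.bind_processor(&driver, Some(&mut idle_control))?;
    /// ```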
    pub fn bind_processor<'a, T: Backing>(
        &'a mut self,
        driver: &impl Driver,
        control: Option<&'a mut IdleControl>,
    ) -> Result<UhProcessor<'a, T>, Error> {
        if let Some(control) = &control {
            let vp_index = self.vp_info.base.vp_index;

            let mut current = Default::default();
            affinity::get_current_thread_affinity(&mut current).unwrap();
            assert_eq!(&current, CpuSet::new().set(vp_index.index()));

            self.partition
                .hcl
                .set_poll_file(
                    self.partition.vp(vp_index).unwrap().cpu_index,
                    control.ring_fd().as_raw_fd(),
                )
                .map_err(Error::Hcl)?;
        }

        UhProcessor::new(driver, &self.partition, self.vp_info, control)
    }

    /// Sets the sidecar exit reason for the processor to be due to a task
    /// running with the given name.
    ///
    /// This is useful for diagnostics.
    pub fn set_sidecar_exit_due_to_task(&self, task: Arc<str>) {
        self.partition
            .vp(self.vp_info.base.vp_index)
            .unwrap()
            .set_sidecar_exit_reason(SidecarExitReason::TaskRequest(task))
    }
}

#[derive(Debug, Inspect)]
struct UhVpInner {
    /// 32 bits per VTL: top bits are VTL 1, bottom bits are VTL 0.
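    ///
    /// A sketch of the packing described above (illustrative only; updates
    /// are performed atomically on this field elsewhere in the crate):
    ///
    /// ```ignore
    /// // vtl1_reasons/vtl0_reasons are the per-VTL `WakeReason` bits as u32.
    /// let packed = (u64::from(vtl1_reasons) << 32) | u64::from(vtl0_reasons);
    /// ```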
    wake_reasons: AtomicU64,
    #[inspect(skip)]
    waker: RwLock<Option<Waker>>,
    message_queues: VtlArray<MessageQueues, 2>,
    #[inspect(skip)]
    vp_info: TargetVpInfo,
    /// The Linux kernel's CPU index for this VP. This should be used instead of VpIndex
    /// when interacting with non-MSHV kernel interfaces.
    cpu_index: u32,
    sidecar_exit_reason: Mutex<Option<SidecarExitReason>>,
}

impl UhVpInner {
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// Which operation is setting the initial vp context
enum InitialVpContextOperation {
    /// The VP is being started via the StartVp hypercall.
    StartVp,
    /// The VP is being started via the EnableVpVtl hypercall.
    EnableVpVtl,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// State for handling StartVp/EnableVpVtl hypercalls.
struct VpStartEnableVtl {
    /// Which operation, StartVp or EnableVpVtl, is setting the initial vp
    /// context.
    operation: InitialVpContextOperation,
    #[inspect(skip)]
    context: hvdef::hypercall::InitialVpContextX64,
}

#[derive(Debug, Inspect)]
struct TlbLockInfo {
    /// The set of VPs that are waiting for this VP to release the TLB lock.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocked_vps: BitBox<AtomicU64>,
    /// The set of VPs that are holding the TLB lock and preventing this VP
    /// from proceeding.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocking_vps: BitBox<AtomicU64>,
    /// The count of blocking VPs. This should always be equivalent to
    /// `blocking_vps.count_ones()`, however it is accessible in a single
    /// atomic operation while counting is not.
    blocking_vp_count: AtomicU32,
    /// Whether the VP is sleeping due to a TLB lock.
    sleeping: AtomicBool,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
impl TlbLockInfo {
    fn new(vp_count: usize) -> Self {
        Self {
            blocked_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vp_count: AtomicU32::new(0),
            sleeping: false.into(),
        }
    }
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct WakeReason {
    extint: bool,
    message_queues: bool,
    hv_start_enable_vtl_vp: bool,
    intcon: bool,
    update_proxy_irr_filter: bool,
    #[bits(27)]
    _reserved: u32,
}

impl WakeReason {
    // Convenient constants.
    const EXTINT: Self = Self::new().with_extint(true);
    const MESSAGE_QUEUES: Self = Self::new().with_message_queues(true);
    #[cfg(guest_arch = "x86_64")]
    const HV_START_ENABLE_VP_VTL: Self = Self::new().with_hv_start_enable_vtl_vp(true); // StartVp/EnableVpVtl handling
    const INTCON: Self = Self::new().with_intcon(true);
    #[cfg(guest_arch = "x86_64")]
    const UPDATE_PROXY_IRR_FILTER: Self = Self::new().with_update_proxy_irr_filter(true);
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct ExitActivity {
    pending_event: bool,
    #[bits(31)]
    _reserved: u32,
}

/// Immutable access to useful bits of Partition state.
impl UhPartition {
    /// Revokes guest VSM.
    pub fn revoke_guest_vsm(&self) -> Result<(), RevokeGuestVsmError> {
        fn revoke<T: Inspect>(vsm_state: &mut GuestVsmState<T>) -> Result<(), RevokeGuestVsmError> {
            if matches!(vsm_state, GuestVsmState::Enabled { .. }) {
                return Err(RevokeGuestVsmError::Vtl1AlreadyEnabled);
            }
            *vsm_state = GuestVsmState::NotPlatformSupported;
            Ok(())
        }

        match &self.inner.backing_shared {
            BackingShared::Hypervisor(s) => {
                revoke(&mut *s.guest_vsm.write())?;
                self.inner
                    .hcl
                    .set_guest_vsm_partition_config(false)
                    .map_err(RevokeGuestVsmError::SetGuestVsmConfig)?;
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                revoke(&mut *cvm.guest_vsm.write())?;
            }
        };

        Ok(())
    }

    /// Returns the current hypervisor reference time, in 100ns units.
    pub fn reference_time(&self) -> u64 {
        if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().now().ref_time
        } else {
            self.inner
                .hcl
                .reference_time()
                .expect("should not fail to get the reference time")
        }
    }
}

impl virt::Partition for UhPartition {
    fn supports_reset(&self) -> Option<&dyn virt::ResetPartition<Error = Self::Error>> {
        None
    }

    fn caps(&self) -> &PartitionCapabilities {
        &self.inner.caps
    }

    fn request_msi(&self, vtl: Vtl, request: MsiRequest) {
        self.inner
            .request_msi(vtl.try_into().expect("higher vtl not configured"), request)
    }

    fn request_yield(&self, _vp_index: VpIndex) {
        unimplemented!()
    }
}

impl virt::X86Partition for UhPartition {
    fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
        self.inner.clone()
    }

    fn pulse_lint(&self, vp_index: VpIndex, vtl: Vtl, lint: u8) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        if let Some(apic) = &self.inner.lapic(vtl) {
            apic.lint(vp_index, lint.into(), |vp_index| {
                self.inner
                    .vp(vp_index)
                    .unwrap()
                    .wake(vtl, WakeReason::INTCON);
            });
        } else if lint == 0 {
            self.inner
                .vp(vp_index)
                .unwrap()
                .wake(vtl, WakeReason::EXTINT);
        } else {
            unimplemented!()
        }
    }
}

impl UhPartitionInner {
    fn vp(&self, index: VpIndex) -> Option<&'_ UhVpInner> {
        self.vps.get(index.index() as usize)
    }

    fn lapic(&self, vtl: GuestVtl) -> Option<&LocalApicSet> {
        self.backing_shared.cvm_state().map(|x| &x.lapic[vtl])
    }

    fn hv(&self) -> Option<&GlobalHv<2>> {
        self.backing_shared.cvm_state().map(|x| &x.hv)
    }

    /// For requester VP to issue `proxy_irr_blocked` update to other VPs
    #[cfg(guest_arch = "x86_64")]
    fn request_proxy_irr_filter_update(
        &self,
        vtl: GuestVtl,
        device_vector: u8,
        req_vp_index: VpIndex,
    ) {
        tracing::debug!(
            ?vtl,
            device_vector,
            req_vp_index = req_vp_index.index(),
            "request_proxy_irr_filter_update"
        );

        // Add given vector to partition global device vector table (VTL0 only for now)
        {
            let mut device_vector_table = self.device_vector_table.write();
            device_vector_table.set(device_vector as usize, true);
        }

        // Wake all other VPs for their `proxy_irr_blocked` filter update
        for vp in self.vps.iter() {
            if vp.vp_index() != req_vp_index {
                vp.wake(vtl, WakeReason::UPDATE_PROXY_IRR_FILTER);
            }
        }
    }

    /// Get current partition global device irr vectors (VTL0 for now)
    #[cfg(guest_arch = "x86_64")]
    fn fill_device_vectors(&self, _vtl: GuestVtl, irr_vectors: &mut IrrBitmap) {
        let device_vector_table = self.device_vector_table.read();
        for idx in device_vector_table.iter_ones() {
            irr_vectors.set(idx, true);
        }
    }

    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        let mut wake_vps = false;
        resp.field_mut(
            "enter_modes",
            &mut inspect::adhoc_mut(|req| {
                let update = req.is_update();
                {
                    let mut modes = self.enter_modes.lock();
                    modes.inspect_mut(req);
                    if update {
                        self.enter_modes_atomic.store(
                            hcl::protocol::EnterModes::from(*modes).into(),
                            Ordering::Relaxed,
                        );
                        wake_vps = true;
                    }
                }
            }),
        );

        // Wake VPs to propagate updates.
        if wake_vps {
            for vp in self.vps.iter() {
                vp.wake_vtl2();
            }
        }
    }

    // TODO VBS GUEST VSM: enable for aarch64
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn vsm_status(&self) -> Result<HvRegisterVsmPartitionStatus, hcl::ioctl::Error> {
        // TODO: It might be possible to cache VsmPartitionStatus.
        self.hcl.get_vsm_partition_status()
    }
}

impl virt::Synic for UhPartition {
    fn post_message(&self, vtl: Vtl, vp_index: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let Some(vp) = self.inner.vp(vp_index) else {
            tracelimit::warn_ratelimited!(
                CVM_ALLOWED,
                vp = vp_index.index(),
                "invalid vp target for post_message"
            );
            return;
        };

        vp.post_message(
            vtl,
            sint,
            &hvdef::HvMessage::new(hvdef::HvMessageType(typ), 0, payload),
        );
    }

    fn new_guest_event_port(
        &self,
        vtl: Vtl,
        vp: u32,
        sint: u8,
        flag: u16,
    ) -> Box<dyn vmcore::synic::GuestEventPort> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        Box::new(UhEventPort {
            partition: Arc::downgrade(&self.inner),
            params: Arc::new(Mutex::new(UhEventPortParams {
                vp: VpIndex::new(vp),
                sint,
                flag,
                vtl,
            })),
        })
    }

    fn prefer_os_events(&self) -> bool {
        false
    }

    fn monitor_support(&self) -> Option<&dyn virt::SynicMonitor> {
        Some(self)
    }
}

impl virt::SynicMonitor for UhPartition {
    fn set_monitor_page(&self, vtl: Vtl, gpa: Option<u64>) -> anyhow::Result<()> {
        // Keep this locked for the whole function to avoid racing with allocate_monitor_page.
        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        let old_gpa = self.inner.monitor_page.set_gpa(gpa);

        // Take ownership of any allocated monitor page so it will be freed on function exit.
        let allocated_page = allocated_block.take();
        if let Some(old_gpa) = old_gpa {
            let allocated_gpa = allocated_page
                .as_ref()
                .map(|b| b.pfns()[0] << HV_PAGE_SHIFT);

            // Revert the old page's permissions, using the appropriate method depending on
            // whether it was allocated or guest-supplied.
            let result = if allocated_gpa == Some(old_gpa) {
                let vtl = GuestVtl::try_from(vtl).unwrap();
                self.unregister_cvm_dma_overlay_page(vtl, old_gpa >> HV_PAGE_SHIFT)
            } else {
                self.inner
                    .hcl
                    .modify_vtl_protection_mask(
                        MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                        hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                        HvInputVtl::CURRENT_VTL,
                    )
                    .map_err(|err| anyhow::anyhow!(err))
            };

            result
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        if let Some(gpa) = gpa {
            // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
            // permissions must be enabled or this doesn't work correctly.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(gpa..gpa + HV_PAGE_SIZE),
                    HvMapGpaFlags::new().with_readable(true),
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to register monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(gpa, "registered monitor page");
        }

        Ok(())
    }

    fn register_monitor(
        &self,
        monitor_id: vmcore::monitor::MonitorId,
        connection_id: u32,
    ) -> Box<dyn Sync + Send> {
        self.inner
            .monitor_page
            .register_monitor(monitor_id, connection_id)
    }

    fn allocate_monitor_page(&self, vtl: Vtl) -> anyhow::Result<Option<u64>> {
        let vtl = GuestVtl::try_from(vtl).unwrap();

        // Allocating a monitor page is only supported for CVMs.
        let Some(state) = self.inner.backing_shared.cvm_state() else {
            return Ok(None);
        };

        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        if let Some(block) = allocated_block.as_ref() {
            // An allocated monitor page is already in use; no need to change it.
            let gpa = block.pfns()[0] << HV_PAGE_SHIFT;
            assert_eq!(self.inner.monitor_page.gpa(), Some(gpa));
            return Ok(Some(gpa));
        }

        let block = state
            .private_dma_client
            .allocate_dma_buffer(HV_PAGE_SIZE_USIZE)
            .context("failed to allocate monitor page")?;

        let gpn = block.pfns()[0];
        *allocated_block = Some(block);
        let gpa = gpn << HV_PAGE_SHIFT;
        let old_gpa = self.inner.monitor_page.set_gpa(Some(gpa));
        if let Some(old_gpa) = old_gpa {
            // The old GPA is guaranteed not to be allocated, since that was checked above, so
            // revert its permissions using the method for guest-supplied memory.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                    hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
        // permissions must be enabled or this doesn't work correctly.
        self.register_cvm_dma_overlay_page(vtl, gpn, HvMapGpaFlags::new().with_readable(true))
            .context("failed to register monitor page")
            .inspect_err(|_| {
                // Leave the page unset if returning a failure.
                self.inner.monitor_page.set_gpa(None);
            })?;

        tracing::debug!(gpa, "registered allocated monitor page");

        Ok(Some(gpa))
    }
}

impl UhPartitionInner {
    #[cfg(guest_arch = "x86_64")]
    pub(crate) fn synic_interrupt(
        &self,
        vp_index: VpIndex,
        vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        // TODO CVM: optimize for SNP with secure avic to avoid internal wake
        // and for TDX to avoid trip to user mode
        move |vector, auto_eoi| {
            self.lapic(vtl).unwrap().synic_interrupt(
                vp_index,
                vector as u8,
                auto_eoi,
                |vp_index| self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON),
            );
        }
    }

    #[cfg(guest_arch = "aarch64")]
    fn synic_interrupt(
        &self,
        _vp_index: VpIndex,
        _vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        move |_, _| {}
    }
}

#[derive(Debug)]
struct UhEventPort {
    partition: Weak<UhPartitionInner>,
    params: Arc<Mutex<UhEventPortParams>>,
}

#[derive(Debug, Copy, Clone)]
struct UhEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
    vtl: GuestVtl,
}

impl vmcore::synic::GuestEventPort for UhEventPort {
    fn interrupt(&self) -> vmcore::interrupt::Interrupt {
        let partition = self.partition.clone();
        let params = self.params.clone();
        vmcore::interrupt::Interrupt::from_fn(move || {
            let UhEventPortParams {
                vp,
                sint,
                flag,
                vtl,
            } = *params.lock();
            let Some(partition) = partition.upgrade() else {
                return;
            };
            tracing::trace!(vp = vp.index(), sint, flag, "signal_event");
            if let Some(hv) = partition.hv() {
                match hv.synic[vtl].signal_event(
                    vp,
                    sint,
                    flag,
                    &mut partition.synic_interrupt(vp, vtl),
                ) {
                    Ok(_) => {}
                    Err(SintProxied) => {
                        tracing::trace!(
                            vp = vp.index(),
                            sint,
                            flag,
                            "forwarding event to untrusted synic"
                        );
                        if let Some(synic) = partition.backing_shared.untrusted_synic() {
                            synic
                                .signal_event(
                                    vp,
                                    sint,
                                    flag,
                                    &mut partition.synic_interrupt(vp, vtl),
                                )
                                .ok();
                        } else {
                            partition.hcl.signal_event_direct(vp.index(), sint, flag)
                        }
                    }
                }
            } else {
                partition.hcl.signal_event_direct(vp.index(), sint, flag);
            }
        })
    }

    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}

impl virt::Hv1 for UhPartition {
    type Error = Error;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        Some(if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().clone()
        } else {
            ReferenceTimeSource::from(self.inner.clone() as Arc<_>)
        })
    }

    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        self.inner.software_devices.is_some().then_some(self)
    }
}

impl GetReferenceTime for UhPartitionInner {
    fn now(&self) -> ReferenceTimeResult {
        ReferenceTimeResult {
            ref_time: self.hcl.reference_time().unwrap(),
            system_time: None,
        }
    }
}

impl virt::DeviceBuilder for UhPartition {
    fn build(&self, vtl: Vtl, device_id: u64) -> Result<Self::Device, Self::Error> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let device = self
            .inner
            .software_devices
            .as_ref()
            .expect("checked in new_virtual_device")
            .new_device(self.interrupt_targets[vtl].clone(), device_id)
            .map_err(Error::NewDevice)?;

        Ok(device)
    }
}

struct UhInterruptTarget {
    partition: Arc<UhPartitionInner>,
    vtl: GuestVtl,
}

impl pci_core::msi::MsiInterruptTarget for UhInterruptTarget {
    fn new_interrupt(&self) -> Box<dyn pci_core::msi::MsiControl> {
        let partition = self.partition.clone();
        let vtl = self.vtl;
        Box::new(move |address, data| partition.request_msi(vtl, MsiRequest { address, data }))
    }
}

impl UhPartitionInner {
    fn request_msi(&self, vtl: GuestVtl, request: MsiRequest) {
        if let Some(lapic) = self.lapic(vtl) {
            tracing::trace!(?request, "interrupt");
            lapic.request_interrupt(request.address, request.data, |vp_index| {
                self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON)
            });
        } else {
            let (address, data) = request.as_x86();
            if let Err(err) = self.hcl.request_interrupt(
                request.hv_x86_interrupt_control(),
                address.virt_destination().into(),
                data.vector().into(),
                vtl,
            ) {
                tracelimit::warn_ratelimited!(
                    CVM_ALLOWED,
                    error = &err as &dyn std::error::Error,
                    address = request.address,
                    data = request.data,
                    "failed to request msi"
                );
            }
        }
    }
}

impl IoApicRouting for UhPartitionInner {
    fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
        self.irq_routes.set_irq_route(irq, request)
    }

    // The IO-APIC is always hooked up to VTL0.
    fn assert_irq(&self, irq: u8) {
        self.irq_routes
            .assert_irq(irq, |request| self.request_msi(GuestVtl::Vtl0, request))
    }
}

/// Configure the [`hvdef::HvRegisterVsmPartitionConfig`] register with the
/// values used by Underhill.
fn set_vtl2_vsm_partition_config(hcl: &Hcl) -> Result<(), Error> {
    // Read available capabilities to determine what to enable.
    let caps = hcl.get_vsm_capabilities().map_err(Error::Hcl)?;
    let hardware_isolated = hcl.isolation().is_hardware_isolated();
    let isolated = hcl.isolation().is_isolated();

    let config = HvRegisterVsmPartitionConfig::new()
        .with_default_vtl_protection_mask(0xF)
        .with_enable_vtl_protection(!hardware_isolated)
        .with_zero_memory_on_reset(!hardware_isolated)
        .with_intercept_cpuid_unimplemented(!hardware_isolated)
        .with_intercept_page(caps.intercept_page_available())
        .with_intercept_unrecoverable_exception(true)
        .with_intercept_not_present(caps.intercept_not_present_available() && !isolated)
        .with_intercept_acceptance(isolated)
        .with_intercept_enable_vtl_protection(isolated && !hardware_isolated)
        .with_intercept_system_reset(caps.intercept_system_reset_available());

    hcl.set_vtl2_vsm_partition_config(config)
        .map_err(Error::VsmPartitionConfig)
}

/// Configuration parameters supplied to [`UhProtoPartition::new`].
///
/// These do not include runtime resources.
pub struct UhPartitionNewParams<'a> {
    /// The isolation type for the partition.
    pub isolation: IsolationType,
    /// Hide isolation from the guest. The guest will run as if it is not
    /// isolated.
    pub hide_isolation: bool,
    /// The memory layout for lower VTLs.
    pub lower_vtl_memory_layout: &'a MemoryLayout,
    /// The guest processor topology.
    pub topology: &'a ProcessorTopology,
    /// The unparsed CVM cpuid info.
    // TODO: move parsing up a layer.
    pub cvm_cpuid_info: Option<&'a [u8]>,
    /// The unparsed CVM secrets page.
    pub snp_secrets: Option<&'a [u8]>,
    /// The virtual top of memory for hardware-isolated VMs.
    ///
    /// Must be a power of two.
    pub vtom: Option<u64>,
    /// Handle synic messages and events.
    ///
    /// On TDX, this prevents the hypervisor from getting vmtdcall exits.
    pub handle_synic: bool,
    /// Do not hotplug sidecar VPs on their first exit. Just continue running
    /// the VP remotely.
    pub no_sidecar_hotplug: bool,
    /// Use MMIO access hypercalls.
    pub use_mmio_hypercalls: bool,
    /// Intercept guest debug exceptions to support gdbstub.
    pub intercept_debug_exceptions: bool,
}

/// Parameters to [`UhProtoPartition::build`].
pub struct UhLateParams<'a> {
    /// Guest memory for lower VTLs.
    pub gm: VtlArray<GuestMemory, 2>,
    /// Guest memory for VTL 0 kernel execute access.
    pub vtl0_kernel_exec_gm: GuestMemory,
    /// Guest memory for VTL 0 user execute access.
    pub vtl0_user_exec_gm: GuestMemory,
    /// The CPUID leaves to expose to the guest.
    #[cfg(guest_arch = "x86_64")]
    pub cpuid: Vec<CpuidLeaf>,
    /// The mesh sender to use for crash notifications.
    // FUTURE: remove mesh dependency from this layer.
    pub crash_notification_send: mesh::Sender<VtlCrash>,
    /// The VM time source.
    pub vmtime: &'a VmTimeSource,
    /// Parameters for CVMs only.
    pub cvm_params: Option<CvmLateParams>,
    /// Whether the vmbus relay is enabled and active for the partition.
    pub vmbus_relay: bool,
}

/// CVM-only parameters to [`UhProtoPartition::build`].
pub struct CvmLateParams {
    /// Guest memory for untrusted devices, like overlay pages.
    pub shared_gm: GuestMemory,
    /// An object to call to change host visibility on guest memory.
    pub isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// Dma client for shared visibility pages.
    pub shared_dma_client: Arc<dyn DmaClient>,
    /// Allocator for private visibility pages.
    pub private_dma_client: Arc<dyn DmaClient>,
}

/// Represents a GPN that is either in guest memory or was allocated by dma_client.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GpnSource {
    /// The GPN is in regular guest RAM.
    GuestMemory,
    /// The GPN was allocated by dma_client and is not in guest RAM.
    Dma,
}

/// Trait for CVM-related protections on guest memory.
1445pub trait ProtectIsolatedMemory: Send + Sync {
1446    /// Changes host visibility on guest memory.
1447    fn change_host_visibility(
1448        &self,
1449        vtl: GuestVtl,
1450        shared: bool,
1451        gpns: &[u64],
1452        tlb_access: &mut dyn TlbFlushLockAccess,
1453    ) -> Result<(), (HvError, usize)>;
1454
1455    /// Queries host visibility on guest memory.
1456    fn query_host_visibility(
1457        &self,
1458        gpns: &[u64],
1459        host_visibility: &mut [HostVisibilityType],
1460    ) -> Result<(), (HvError, usize)>;
1461
1462    /// Gets the default protections/permissions for VTL 0.
1463    fn default_vtl0_protections(&self) -> HvMapGpaFlags;
1464
1465    /// Changes the default protections/permissions for a VTL. For VBS-isolated
1466    /// VMs, the protections apply to all vtls lower than the specified one. For
1467    /// hardware-isolated VMs, they apply just to the given vtl.
1468    fn change_default_vtl_protections(
1469        &self,
1470        target_vtl: GuestVtl,
1471        protections: HvMapGpaFlags,
1472        tlb_access: &mut dyn TlbFlushLockAccess,
1473    ) -> Result<(), HvError>;
1474
1475    /// Changes the vtl protections on a range of guest memory.
1476    fn change_vtl_protections(
1477        &self,
1478        target_vtl: GuestVtl,
1479        gpns: &[u64],
1480        protections: HvMapGpaFlags,
1481        tlb_access: &mut dyn TlbFlushLockAccess,
1482    ) -> Result<(), (HvError, usize)>;
1483
1484    /// Registers a page as an overlay page by first validating it has the
1485    /// required permissions, optionally modifying them, then locking them.
1486    fn register_overlay_page(
1487        &self,
1488        vtl: GuestVtl,
1489        gpn: u64,
1490        gpn_source: GpnSource,
1491        check_perms: HvMapGpaFlags,
1492        new_perms: Option<HvMapGpaFlags>,
1493        tlb_access: &mut dyn TlbFlushLockAccess,
1494    ) -> Result<(), HvError>;
1495
1496    /// Unregisters an overlay page, removing its permission lock and restoring
1497    /// the previous permissions.
1498    fn unregister_overlay_page(
1499        &self,
1500        vtl: GuestVtl,
1501        gpn: u64,
1502        tlb_access: &mut dyn TlbFlushLockAccess,
1503    ) -> Result<(), HvError>;
1504
1505    /// Checks whether a page is currently registered as an overlay page.
1506    fn is_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> bool;
1507
1508    /// Locks the permissions and mappings for a set of guest pages.
1509    fn lock_gpns(&self, vtl: GuestVtl, gpns: &[u64]) -> Result<(), GuestMemoryBackingError>;
1510
1511    /// Unlocks the permissions and mappings for a set of guest pages.
1512    ///
1513    /// Panics if asked to unlock a page that was not previously locked. The
1514    /// caller must ensure that the given slice has the same ordering as the
1515    /// one passed to `lock_gpns`.
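    ///
    /// A sketch of the required pairing, assuming hypothetical `protector`,
    /// `vtl`, `a`, and `b` bindings:
    ///
    /// ```ignore
    /// protector.lock_gpns(vtl, &[a, b])?;
    /// // ... access the pages while locked ...
    /// protector.unlock_gpns(vtl, &[a, b]); // same slice ordering as lock_gpns
    /// ```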
1516    fn unlock_gpns(&self, vtl: GuestVtl, gpns: &[u64]);
1517
1518    /// Alerts the memory protector that VTL 1 is ready to set VTL protections
1519    /// on lower-VTL memory, and that these protections should be enforced.
1520    fn set_vtl1_protections_enabled(&self);
1521
1522    /// Whether VTL 1 is prepared to modify VTL protections on lower-VTL memory,
1523    /// and therefore whether these protections should be enforced.
1524    fn vtl1_protections_enabled(&self) -> bool;
1525}
1526
1527/// Trait for access to TLB flush and lock machinery.
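///
/// A sketch of a typical flush-then-wait sequence, assuming hypothetical
/// `tlb: &mut dyn TlbFlushLockAccess` and `vtl` bindings:
///
/// ```ignore
/// tlb.flush(vtl); // flush the TLB for all VPs at this VTL
/// tlb.set_wait_for_tlb_locks(vtl); // then wait on outstanding TLB locks
/// ```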
1528pub trait TlbFlushLockAccess {
1529    /// Flush the entire TLB for all VPs for the given VTL.
1530    fn flush(&mut self, vtl: GuestVtl);
1531
1532    /// Flush the entire TLB for all VPs for all VTLs.
1533    fn flush_entire(&mut self);
1534
1535    /// Causes the specified VTL on the current VP to wait on all TLB locks.
1536    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl);
1537}
1538
1539/// A partially built partition. Used to allow querying partition capabilities
1540/// before fully instantiating the partition.
1541pub struct UhProtoPartition<'a> {
1542    params: UhPartitionNewParams<'a>,
1543    hcl: Hcl,
1544    guest_vsm_available: bool,
1545    #[cfg(guest_arch = "x86_64")]
1546    cpuid: virt::CpuidLeafSet,
1547}
1548
1549impl<'a> UhProtoPartition<'a> {
1550    /// Creates a new prototype partition.
1551    ///
1552    /// `driver(cpu)` returns the driver to use for polling the sidecar device
1553    /// whose base CPU is `cpu`.
1554    pub fn new<T: SpawnDriver>(
1555        params: UhPartitionNewParams<'a>,
1556        driver: impl FnMut(u32) -> T,
1557    ) -> Result<Self, Error> {
1558        let hcl_isolation = match params.isolation {
1559            IsolationType::None => hcl::ioctl::IsolationType::None,
1560            IsolationType::Vbs => hcl::ioctl::IsolationType::Vbs,
1561            IsolationType::Snp => hcl::ioctl::IsolationType::Snp,
1562            IsolationType::Tdx => hcl::ioctl::IsolationType::Tdx,
1563        };
1564
1565        // Try to open the sidecar device, if it is present.
1566        let sidecar = sidecar_client::SidecarClient::new(driver).map_err(Error::Sidecar)?;
1567
1568        let hcl = Hcl::new(hcl_isolation, sidecar).map_err(Error::Hcl)?;
1569
1570        // Set the hypercalls that this process will use.
1571        let mut allowed_hypercalls = vec![
1572            hvdef::HypercallCode::HvCallGetVpRegisters,
1573            hvdef::HypercallCode::HvCallSetVpRegisters,
1574            hvdef::HypercallCode::HvCallInstallIntercept,
1575            hvdef::HypercallCode::HvCallTranslateVirtualAddress,
1576            hvdef::HypercallCode::HvCallPostMessageDirect,
1577            hvdef::HypercallCode::HvCallSignalEventDirect,
1578            hvdef::HypercallCode::HvCallModifyVtlProtectionMask,
1579            hvdef::HypercallCode::HvCallTranslateVirtualAddressEx,
1580            hvdef::HypercallCode::HvCallCheckSparseGpaPageVtlAccess,
1581            hvdef::HypercallCode::HvCallAssertVirtualInterrupt,
1582            hvdef::HypercallCode::HvCallGetVpIndexFromApicId,
1583            hvdef::HypercallCode::HvCallAcceptGpaPages,
1584            hvdef::HypercallCode::HvCallModifySparseGpaPageHostVisibility,
1585        ];
1586
1587        if params.isolation.is_hardware_isolated() {
1588            allowed_hypercalls.extend(vec![
1589                hvdef::HypercallCode::HvCallEnablePartitionVtl,
1590                hvdef::HypercallCode::HvCallRetargetDeviceInterrupt,
1591                hvdef::HypercallCode::HvCallEnableVpVtl,
1592            ]);
1593        }
1594
1595        if params.use_mmio_hypercalls {
1596            allowed_hypercalls.extend(vec![
1597                hvdef::HypercallCode::HvCallMemoryMappedIoRead,
1598                hvdef::HypercallCode::HvCallMemoryMappedIoWrite,
1599            ]);
1600        }
1601
1602        hcl.set_allowed_hypercalls(allowed_hypercalls.as_slice());
1603
1604        set_vtl2_vsm_partition_config(&hcl)?;
1605
1606        let guest_vsm_available = Self::check_guest_vsm_support(&hcl)?;
1607
1608        #[cfg(guest_arch = "x86_64")]
1609        let cpuid = match params.isolation {
1610            IsolationType::Snp => cvm_cpuid::CpuidResultsIsolationType::Snp {
1611                cpuid_pages: params.cvm_cpuid_info.unwrap(),
1612                vtom: params.vtom.unwrap(),
1613                access_vsm: guest_vsm_available,
1614            }
1615            .build()
1616            .map_err(Error::CvmCpuid)?,
1617
1618            IsolationType::Tdx => cvm_cpuid::CpuidResultsIsolationType::Tdx {
1619                topology: params.topology,
1620                vtom: params.vtom.unwrap(),
1621                access_vsm: guest_vsm_available,
1622            }
1623            .build()
1624            .map_err(Error::CvmCpuid)?,
1625            IsolationType::Vbs | IsolationType::None => Default::default(),
1626        };
1627
1628        Ok(UhProtoPartition {
1629            hcl,
1630            params,
1631            guest_vsm_available,
1632            #[cfg(guest_arch = "x86_64")]
1633            cpuid,
1634        })
1635    }
1636
1637    /// Returns whether VSM support will be available to the guest.
1638    pub fn guest_vsm_available(&self) -> bool {
1639        self.guest_vsm_available
1640    }
1641
1642    /// Returns a new Underhill partition.
1643    pub async fn build(
1644        self,
1645        late_params: UhLateParams<'_>,
1646    ) -> Result<(UhPartition, Vec<UhProcessorBox>), Error> {
1647        let Self {
1648            mut hcl,
1649            params,
1650            guest_vsm_available,
1651            #[cfg(guest_arch = "x86_64")]
1652            cpuid,
1653        } = self;
1654        let isolation = params.isolation;
1655        let is_hardware_isolated = isolation.is_hardware_isolated();
1656
1657        // Intercept debug exceptions.
1658        // On TDX, because all OpenHCL TDs today have the debug policy bit
1659        // set, OpenHCL registers for the intercept itself. However, on
1660        // non-TDX platforms, the hypervisor installs the intercept on
1661        // behalf of the guest.
1662        if params.intercept_debug_exceptions {
1663            if !cfg!(feature = "gdb") {
1664                return Err(Error::InvalidDebugConfiguration);
1665            }
1666
1667            cfg_if::cfg_if! {
1668                if #[cfg(guest_arch = "x86_64")] {
1669                    if isolation != IsolationType::Tdx {
1670                        let debug_exception_vector = 0x1;
1671                        hcl.register_intercept(
1672                            HvInterceptType::HvInterceptTypeException,
1673                            HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1674                            HvInterceptParameters::new_exception(debug_exception_vector),
1675                        )
1676                        .map_err(|err| Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err))?;
1677                    }
1678                } else {
1679                    return Err(Error::InvalidDebugConfiguration);
1680                }
1681            }
1682        }
1683
1684        if !is_hardware_isolated {
1685            if cfg!(guest_arch = "x86_64") {
1686                hcl.register_intercept(
1687                    HvInterceptType::HvInterceptTypeX64Msr,
1688                    HV_INTERCEPT_ACCESS_MASK_READ_WRITE,
1689                    HvInterceptParameters::new_zeroed(),
1690                )
1691                .map_err(|err| {
1692                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64Msr, err)
1693                })?;
1694
1695                hcl.register_intercept(
1696                    HvInterceptType::HvInterceptTypeX64ApicEoi,
1697                    HV_INTERCEPT_ACCESS_MASK_WRITE,
1698                    HvInterceptParameters::new_zeroed(),
1699                )
1700                .map_err(|err| {
1701                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64ApicEoi, err)
1702                })?;
1703            } else {
1704                if false {
1705                    todo!("AARCH64_TODO");
1706                }
1707            }
1708        }
1709
1710        if isolation == IsolationType::Snp {
1711            // SNP VMs register for the #VC exception to support reflect-VC.
1712            hcl.register_intercept(
1713                HvInterceptType::HvInterceptTypeException,
1714                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1715                HvInterceptParameters::new_exception(0x1D),
1716            )
1717            .map_err(|err| {
1718                Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err)
1719            })?;
1720
1721            // Get the register tweak bitmap from secrets page.
1722            let mut bitmap = [0u8; 64];
1723            if let Some(secrets) = params.snp_secrets {
1724                bitmap.copy_from_slice(
1725                    &secrets
1726                        [REG_TWEAK_BITMAP_OFFSET..REG_TWEAK_BITMAP_OFFSET + REG_TWEAK_BITMAP_SIZE],
1727                );
1728            }
1729            hcl.set_snp_register_bitmap(bitmap);
1730        }
1731
1732        // Do per-VP HCL initialization.
1733        hcl.add_vps(
1734            params.topology.vp_count(),
1735            late_params
1736                .cvm_params
1737                .as_ref()
1738                .map(|x| &x.private_dma_client),
1739        )
1740        .map_err(Error::Hcl)?;
1741
1742        let vps: Vec<_> = params
1743            .topology
1744            .vps_arch()
1745            .map(|vp_info| {
1746                // TODO: determine CPU index, which in theory could be different
1747                // from the VP index, though this hasn't happened yet.
1748                let cpu_index = vp_info.base.vp_index.index();
1749                UhVpInner::new(cpu_index, vp_info)
1750            })
1751            .collect();
1752
1753        // Enable support for VPCI devices if the hypervisor supports it.
1754        #[cfg(guest_arch = "x86_64")]
1755        let software_devices = {
1756            let res = if !is_hardware_isolated {
1757                hcl.register_intercept(
1758                    HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
1759                    HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1760                    HvInterceptParameters::new_zeroed(),
1761                )
1762            } else {
1763                Ok(())
1764            };
1765            match res {
1766                Ok(()) => Some(ApicSoftwareDevices::new(
1767                    params.topology.vps_arch().map(|vp| vp.apic_id).collect(),
1768                )),
1769                Err(HvError::InvalidParameter | HvError::AccessDenied) => None,
1770                Err(err) => {
1771                    return Err(Error::InstallIntercept(
1772                        HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
1773                        err,
1774                    ));
1775                }
1776            }
1777        };
1778
1779        #[cfg(guest_arch = "aarch64")]
1780        let software_devices = None;
1781
1782        #[cfg(guest_arch = "aarch64")]
1783        let caps = virt::aarch64::Aarch64PartitionCapabilities {};
1784
1785        #[cfg(guest_arch = "x86_64")]
1786        let cpuid = UhPartition::construct_cpuid_results(
1787            cpuid,
1788            &late_params.cpuid,
1789            params.topology,
1790            isolation,
1791            params.hide_isolation,
1792        );
1793
1794        #[cfg(guest_arch = "x86_64")]
1795        let caps = UhPartition::construct_capabilities(
1796            params.topology,
1797            &cpuid,
1798            isolation,
1799            params.hide_isolation,
1800        );
1801
1802        if params.handle_synic && !matches!(isolation, IsolationType::Tdx) {
1803            // The hypervisor will manage the untrusted SINTs (or the whole
1804            // synic for non-hardware-isolated VMs), but some event ports
1805            // and message ports are implemented here. Register an intercept
1806            // to handle HvSignalEvent and HvPostMessage hypercalls when the
1807            // hypervisor doesn't recognize the connection ID.
1808            //
1809            // TDX manages this locally instead of through the hypervisor.
1810            hcl.register_intercept(
1811                HvInterceptType::HvInterceptTypeUnknownSynicConnection,
1812                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1813                HvInterceptParameters::new_zeroed(),
1814            )
1815            .expect("registering synic intercept cannot fail");
1816        }
1817
1818        #[cfg(guest_arch = "x86_64")]
1819        let cvm_state = if is_hardware_isolated {
1820            Some(Self::construct_cvm_state(
1821                &params,
1822                late_params.cvm_params.unwrap(),
1823                &caps,
1824                guest_vsm_available,
1825            )?)
1826        } else {
1827            None
1828        };
1829        #[cfg(guest_arch = "aarch64")]
1830        let cvm_state = None;
1831
1832        let backing_shared = BackingShared::new(
1833            isolation,
1834            &params,
1835            BackingSharedParams {
1836                cvm_state,
1837                #[cfg(guest_arch = "x86_64")]
1838                cpuid: &cpuid,
1839                hcl: &hcl,
1840                guest_vsm_available,
1841            },
1842        )?;
1843
1844        let enter_modes = EnterModes::default();
1845
1846        let partition = Arc::new(UhPartitionInner {
1847            hcl,
1848            vps,
1849            irq_routes: Default::default(),
1850            caps,
1851            enter_modes: Mutex::new(enter_modes),
1852            enter_modes_atomic: u8::from(hcl::protocol::EnterModes::from(enter_modes)).into(),
1853            gm: late_params.gm,
1854            vtl0_kernel_exec_gm: late_params.vtl0_kernel_exec_gm,
1855            vtl0_user_exec_gm: late_params.vtl0_user_exec_gm,
1856            #[cfg(guest_arch = "x86_64")]
1857            cpuid,
1858            crash_notification_send: late_params.crash_notification_send,
1859            monitor_page: MonitorPage::new(),
1860            allocated_monitor_page: Mutex::new(None),
1861            software_devices,
1862            lower_vtl_memory_layout: params.lower_vtl_memory_layout.clone(),
1863            vmtime: late_params.vmtime.clone(),
1864            isolation,
1865            no_sidecar_hotplug: params.no_sidecar_hotplug.into(),
1866            use_mmio_hypercalls: params.use_mmio_hypercalls,
1867            backing_shared,
1868            #[cfg(guest_arch = "x86_64")]
1869            device_vector_table: RwLock::new(IrrBitmap::new(Default::default())),
1870            intercept_debug_exceptions: params.intercept_debug_exceptions,
1871            vmbus_relay: late_params.vmbus_relay,
1872        });
1873
1874        if cfg!(guest_arch = "x86_64") {
1875            // Intercept all IOs unless opted out.
1876            partition.manage_io_port_intercept_region(0, !0, true);
1877        }
1878
1879        let vps = params
1880            .topology
1881            .vps_arch()
1882            .map(|vp_info| UhProcessorBox {
1883                partition: partition.clone(),
1884                vp_info,
1885            })
1886            .collect();
1887
1888        Ok((
1889            UhPartition {
1890                inner: partition.clone(),
1891                interrupt_targets: VtlArray::from_fn(|vtl| {
1892                    Arc::new(UhInterruptTarget {
1893                        partition: partition.clone(),
1894                        vtl: vtl.try_into().unwrap(),
1895                    })
1896                }),
1897            },
1898            vps,
1899        ))
1900    }
1901}
1902
1903impl UhPartition {
1904    /// Gets the guest OS ID for VTL 0.
1905    pub fn vtl0_guest_os_id(&self) -> Result<HvGuestOsId, Error> {
1906        // If Underhill is emulating the hypervisor interfaces, get this value
1907        // from the emulator. This happens when running under hardware isolation
1908        // or when configured for testing.
1909        let id = if let Some(hv) = self.inner.hv() {
1910            hv.guest_os_id(Vtl::Vtl0)
1911        } else {
1912            // Ask the hypervisor for this value.
1913            self.inner
1914                .hcl
1915                .get_guest_os_id(Vtl::Vtl0)
1916                .map_err(Error::Hcl)?
1917        };
1918        Ok(id)
1919    }
1920
1921    /// Configures guest accesses to IO ports in `range` to go directly to the
1922    /// host.
1923    ///
1924    /// When the return value is dropped, the ports will be unregistered.
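    ///
    /// A sketch of the RAII contract, assuming a hypothetical `partition:
    /// &UhPartition` binding:
    ///
    /// ```ignore
    /// let handle = partition.register_host_io_port_fast_path(0x3f8..=0x3ff);
    /// // Guest accesses to ports 0x3f8..=0x3ff now go directly to the host.
    /// drop(handle); // the intercept region is re-registered here
    /// ```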
1925    pub fn register_host_io_port_fast_path(
1926        &self,
1927        range: RangeInclusive<u16>,
1928    ) -> HostIoPortFastPathHandle {
1929        // There is no way to provide a fast path for some hardware-isolated
1930        // VM architectures. The devices that do use this facility are not
1931        // enabled on hardware-isolated VMs.
1932        assert!(!self.inner.isolation.is_hardware_isolated());
1933
1934        self.inner
1935            .manage_io_port_intercept_region(*range.start(), *range.end(), false);
1936        HostIoPortFastPathHandle {
1937            inner: Arc::downgrade(&self.inner),
1938            begin: *range.start(),
1939            end: *range.end(),
1940        }
1941    }
1942
1943    /// Enables or disables the PM timer assist.
1944    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
1945        self.inner.hcl.set_pm_timer_assist(port)
1946    }
1947
1948    /// Registers a DMA-allocated overlay page (e.g. the monitor page) with the given protections.
1949    fn register_cvm_dma_overlay_page(
1950        &self,
1951        vtl: GuestVtl,
1952        gpn: u64,
1953        new_perms: HvMapGpaFlags,
1954    ) -> anyhow::Result<()> {
1955        // How the monitor page is protected depends on the isolation type of the VM.
1956        match &self.inner.backing_shared {
1957            #[cfg(guest_arch = "x86_64")]
1958            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
1959                .cvm
1960                .isolated_memory_protector
1961                .register_overlay_page(
1962                    vtl,
1963                    gpn,
1964                    // On a CVM, the monitor page is always DMA-allocated.
1965                    GpnSource::Dma,
1966                    HvMapGpaFlags::new(),
1967                    Some(new_perms),
1968                    &mut SnpBacked::tlb_flush_lock_access(
1969                        None,
1970                        self.inner.as_ref(),
1971                        snp_backed_shared,
1972                    ),
1973                )
1974                .map_err(|e| anyhow::anyhow!(e)),
1975            #[cfg(guest_arch = "x86_64")]
1976            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
1977                .cvm
1978                .isolated_memory_protector
1979                .register_overlay_page(
1980                    vtl,
1981                    gpn,
1982                    GpnSource::Dma,
1983                    HvMapGpaFlags::new(),
1984                    Some(new_perms),
1985                    &mut TdxBacked::tlb_flush_lock_access(
1986                        None,
1987                        self.inner.as_ref(),
1988                        tdx_backed_shared,
1989                    ),
1990                )
1991                .map_err(|e| anyhow::anyhow!(e)),
1992            BackingShared::Hypervisor(_) => {
1993                let _ = (vtl, gpn, new_perms);
1994                unreachable!()
1995            }
1996        }
1997    }
1998
1999    /// Unregisters a DMA-allocated overlay page (e.g. the monitor page), reverting its protections.
2000    fn unregister_cvm_dma_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> anyhow::Result<()> {
2001        // How the monitor page is protected depends on the isolation type of the VM.
2002        match &self.inner.backing_shared {
2003            #[cfg(guest_arch = "x86_64")]
2004            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
2005                .cvm
2006                .isolated_memory_protector
2007                .unregister_overlay_page(
2008                    vtl,
2009                    gpn,
2010                    &mut SnpBacked::tlb_flush_lock_access(
2011                        None,
2012                        self.inner.as_ref(),
2013                        snp_backed_shared,
2014                    ),
2015                )
2016                .map_err(|e| anyhow::anyhow!(e)),
2017            #[cfg(guest_arch = "x86_64")]
2018            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
2019                .cvm
2020                .isolated_memory_protector
2021                .unregister_overlay_page(
2022                    vtl,
2023                    gpn,
2024                    &mut TdxBacked::tlb_flush_lock_access(
2025                        None,
2026                        self.inner.as_ref(),
2027                        tdx_backed_shared,
2028                    ),
2029                )
2030                .map_err(|e| anyhow::anyhow!(e)),
2031            BackingShared::Hypervisor(_) => {
2032                let _ = (vtl, gpn);
2033                unreachable!()
2034            }
2035        }
2036    }
2037}
2038
2039impl UhProtoPartition<'_> {
2040    /// Whether Guest VSM is available to the guest. If so, for hardware CVMs,
2041    /// it is safe to expose Guest VSM support via cpuid.
2042    fn check_guest_vsm_support(hcl: &Hcl) -> Result<bool, Error> {
2043        #[cfg(guest_arch = "x86_64")]
2044        let privs = {
2045            let result = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES, 0);
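            // The 64-bit partition privilege mask is reported split across
            // EAX (low 32 bits) and EBX (high 32 bits).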
2046            let num = result.eax as u64 | ((result.ebx as u64) << 32);
2047            hvdef::HvPartitionPrivilege::from(num)
2048        };
2049
2050        #[cfg(guest_arch = "aarch64")]
2051        let privs = hcl.get_privileges_and_features_info().map_err(Error::Hcl)?;
2052
2053        if !privs.access_vsm() {
2054            return Ok(false);
2055        }
2056        let guest_vsm_config = hcl.get_guest_vsm_partition_config().map_err(Error::Hcl)?;
2057        Ok(guest_vsm_config.maximum_vtl() >= u8::from(GuestVtl::Vtl1))
2058    }
2059
2060    #[cfg(guest_arch = "x86_64")]
2061    /// Constructs partition-wide CVM state.
2062    fn construct_cvm_state(
2063        params: &UhPartitionNewParams<'_>,
2064        late_params: CvmLateParams,
2065        caps: &PartitionCapabilities,
2066        guest_vsm_available: bool,
2067    ) -> Result<UhCvmPartitionState, Error> {
2068        use vmcore::reference_time::ReferenceTimeSource;
2069
2070        let vp_count = params.topology.vp_count() as usize;
2071        let vps = (0..vp_count)
2072            .map(|vp_index| UhCvmVpInner {
2073                tlb_lock_info: VtlArray::from_fn(|_| TlbLockInfo::new(vp_count)),
2074                vtl1_enable_called: Mutex::new(false),
2075                started: AtomicBool::new(vp_index == 0),
2076                hv_start_enable_vtl_vp: VtlArray::from_fn(|_| Mutex::new(None)),
2077            })
2078            .collect();
2079        let tlb_locked_vps =
2080            VtlArray::from_fn(|_| BitVec::repeat(false, vp_count).into_boxed_bitslice());
2081
2082        let lapic = VtlArray::from_fn(|_| {
2083            LocalApicSet::builder()
2084                .x2apic_capable(caps.x2apic)
2085                .hyperv_enlightenments(true)
2086                .build()
2087        });
2088
2089        let tsc_frequency = get_tsc_frequency(params.isolation)?;
2090        let ref_time = ReferenceTimeSource::new(TscReferenceTimeSource::new(tsc_frequency));
2091
2092        // If we're emulating the APIC, then we also must emulate the hypervisor
2093        // enlightenments, since the hypervisor can't support enlightenments
2094        // without also providing an APIC.
2095        //
2096        // Additionally, TDX provides hardware APIC emulation but we still need
2097        // to emulate the hypervisor enlightenments.
2098        let hv = GlobalHv::new(hv1_emulator::hv::GlobalHvParams {
2099            max_vp_count: params.topology.vp_count(),
2100            vendor: caps.vendor,
2101            tsc_frequency,
2102            ref_time,
2103            is_ref_time_backed_by_tsc: true,
2104        });
2105
2106        Ok(UhCvmPartitionState {
2107            vps_per_socket: params.topology.reserved_vps_per_socket(),
2108            tlb_locked_vps,
2109            vps,
2110            shared_memory: late_params.shared_gm,
2111            isolated_memory_protector: late_params.isolated_memory_protector,
2112            lapic,
2113            hv,
2114            guest_vsm: RwLock::new(GuestVsmState::from_availability(guest_vsm_available)),
2115            shared_dma_client: late_params.shared_dma_client,
2116            private_dma_client: late_params.private_dma_client,
2117            hide_isolation: params.hide_isolation,
2118        })
2119    }
2120}
2121
2122impl UhPartition {
2123    #[cfg(guest_arch = "x86_64")]
2124    /// Constructs the set of cpuid results to show to the guest.
2125    fn construct_cpuid_results(
2126        cpuid: virt::CpuidLeafSet,
2127        initial_cpuid: &[CpuidLeaf],
2128        topology: &ProcessorTopology<vm_topology::processor::x86::X86Topology>,
2129        isolation: IsolationType,
2130        hide_isolation: bool,
2131    ) -> virt::CpuidLeafSet {
2132        let mut cpuid = cpuid.into_leaves();
2133        if isolation.is_hardware_isolated() {
2134            // Update the x2apic leaf based on the topology.
2135            let x2apic = match topology.apic_mode() {
2136                vm_topology::processor::x86::ApicMode::XApic => false,
2137                vm_topology::processor::x86::ApicMode::X2ApicSupported => true,
2138                vm_topology::processor::x86::ApicMode::X2ApicEnabled => true,
2139            };
2140            let ecx = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(x2apic);
2141            let ecx_mask = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(true);
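            // The mask limits this override to the x2apic bit; all other
            // VersionAndFeatures bits pass through unmodified.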
2142            cpuid.push(
2143                CpuidLeaf::new(
2144                    x86defs::cpuid::CpuidFunction::VersionAndFeatures.0,
2145                    [0, 0, ecx.into(), 0],
2146                )
2147                .masked([0, 0, ecx_mask.into(), 0]),
2148            );
2149
2150            // Get the hypervisor version from the host. This is just for
2151            // reporting purposes, so it is safe even if the hypervisor is not
2152            // trusted.
2153            let hv_version = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_VERSION, 0);
2154
2155            // Perform final processing steps for synthetic leaves.
2156            hv1_emulator::cpuid::process_hv_cpuid_leaves(
2157                &mut cpuid,
2158                hide_isolation,
2159                [
2160                    hv_version.eax,
2161                    hv_version.ebx,
2162                    hv_version.ecx,
2163                    hv_version.edx,
2164                ],
2165            );
2166        }
2167        cpuid.extend(initial_cpuid);
2168        virt::CpuidLeafSet::new(cpuid)
2169    }
2170
2171    #[cfg(guest_arch = "x86_64")]
2172    /// Computes the partition capabilities.
2173    fn construct_capabilities(
2174        topology: &ProcessorTopology,
2175        cpuid: &virt::CpuidLeafSet,
2176        isolation: IsolationType,
2177        hide_isolation: bool,
2178    ) -> virt::x86::X86PartitionCapabilities {
2179        let mut native_cpuid_fn;
2180        let mut cvm_cpuid_fn;
2181
2182        // Determine the method to get cpuid results for the guest when
2183        // computing partition capabilities.
2184        let cpuid_fn: &mut dyn FnMut(u32, u32) -> [u32; 4] = if isolation.is_hardware_isolated() {
2185            // Use the filtered CPUID to determine capabilities.
2186            cvm_cpuid_fn = move |leaf, sub_leaf| cpuid.result(leaf, sub_leaf, &[0, 0, 0, 0]);
2187            &mut cvm_cpuid_fn
2188        } else {
2189            // Just use the native cpuid.
2190            native_cpuid_fn = |leaf, sub_leaf| {
2191                let CpuidResult { eax, ebx, ecx, edx } = safe_intrinsics::cpuid(leaf, sub_leaf);
2192                cpuid.result(leaf, sub_leaf, &[eax, ebx, ecx, edx])
2193            };
2194            &mut native_cpuid_fn
2195        };
2196
2197        // Compute and validate capabilities.
2198        let mut caps = virt::x86::X86PartitionCapabilities::from_cpuid(topology, cpuid_fn);
2199        match isolation {
2200            IsolationType::Tdx => {
2201                assert_eq!(caps.vtom.is_some(), !hide_isolation);
2202                // TDX 1.5 requires EFER.NXE to be set to 1, so set it at RESET/INIT.
2203                caps.nxe_forced_on = true;
2204            }
2205            IsolationType::Snp => {
2206                assert_eq!(caps.vtom.is_some(), !hide_isolation);
2207            }
2208            _ => {
2209                assert!(caps.vtom.is_none());
2210            }
2211        }
2212
2213        caps
2214    }
2215}
2216
2217#[cfg(guest_arch = "x86_64")]
2218/// Gets the TSC frequency for the current platform.
2219fn get_tsc_frequency(isolation: IsolationType) -> Result<u64, Error> {
2220    // Always get the frequency from the hypervisor. It's believed that, as long
2221    // as the hypervisor is behaving, it will provide the most precise and accurate frequency.
2222    let msr = MsrDevice::new(0).map_err(Error::OpenMsr)?;
2223    let hv_frequency = msr
2224        .read_msr(hvdef::HV_X64_MSR_TSC_FREQUENCY)
2225        .map_err(Error::ReadTscFrequency)?;
2226
2227    // Get the hardware-advertised frequency and validate that the
2228    // hypervisor frequency is not too far off.
2229    let hw_info = match isolation {
2230        IsolationType::Tdx => {
2231            // TDX provides the TSC frequency via cpuid.
2232            let max_function =
2233                safe_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VendorAndMaxFunction.0, 0)
2234                    .eax;
2235
2236            if max_function < x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0 {
2237                return Err(Error::BadCpuidTsc);
2238            }
2239            let result = safe_intrinsics::cpuid(
2240                x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0,
2241                0,
2242            );
2243            let ratio_denom = result.eax;
2244            let ratio_num = result.ebx;
2245            let clock = result.ecx;
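            // TSC frequency = clock * ratio_num / ratio_denom. As a purely
            // illustrative example: a 25 MHz crystal (clock = 25_000_000)
            // with ratio 88/1 yields 25_000_000 * 88 / 1 = 2.2 GHz.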
2246            if ratio_num == 0 || ratio_denom == 0 || clock == 0 {
2247                return Err(Error::BadCpuidTsc);
2248            }
2249            // TDX TSC is configurable in units of 25 MHz, so allow up to
2250            // 12.5 MHz of error.
2251            let allowed_error = 12_500_000;
2252            Some((
2253                clock as u64 * ratio_num as u64 / ratio_denom as u64,
2254                allowed_error,
2255            ))
2256        }
2257        IsolationType::Snp => {
2258            // SNP currently does not provide the frequency.
2259            None
2260        }
2261        IsolationType::Vbs | IsolationType::None => None,
2262    };
2263
2264    if let Some((hw_frequency, allowed_error)) = hw_info {
2265        // Don't allow the frequencies to be different by more than the hardware
2266        // precision.
2267        let delta = hw_frequency.abs_diff(hv_frequency);
2268        if delta > allowed_error {
2269            return Err(Error::TscFrequencyMismatch {
2270                hv: hv_frequency,
2271                hw: hw_frequency,
2272                allowed_error,
2273            });
2274        }
2275    }
2276
2277    Ok(hv_frequency)
2278}
2279
2280impl UhPartitionInner {
2281    fn manage_io_port_intercept_region(&self, begin: u16, end: u16, active: bool) {
2282        if self.isolation.is_hardware_isolated() {
2283            return;
2284        }
2285
2286        static SKIP_RANGE: AtomicBool = AtomicBool::new(false);
2287
2288        let access_type_mask = if active {
2289            HV_INTERCEPT_ACCESS_MASK_READ_WRITE
2290        } else {
2291            HV_INTERCEPT_ACCESS_MASK_NONE
2292        };
2293
2294        // Try to register the whole range at once.
2295        if !SKIP_RANGE.load(Ordering::Relaxed) {
2296            match self.hcl.register_intercept(
2297                HvInterceptType::HvInterceptTypeX64IoPortRange,
2298                access_type_mask,
2299                HvInterceptParameters::new_io_port_range(begin..=end),
2300            ) {
2301                Ok(()) => return,
2302                Err(HvError::InvalidParameter) => {
2303                    // Probably a build that doesn't support range wrapping yet.
2304                    // Don't try again.
2305                    SKIP_RANGE.store(true, Ordering::Relaxed);
2306                    tracing::warn!(
2307                        CVM_ALLOWED,
2308                        "old hypervisor build; using slow path for intercept ranges"
2309                    );
2310                }
2311                Err(err) => {
2312                    panic!("io port range registration failure: {err:?}");
2313                }
2314            }
2315        }
2316
2317        // Fall back to registering one port at a time.
2318        for port in begin..=end {
2319            self.hcl
2320                .register_intercept(
2321                    HvInterceptType::HvInterceptTypeX64IoPort,
2322                    access_type_mask,
2323                    HvInterceptParameters::new_io_port(port),
2324                )
2325                .expect("registering io intercept cannot fail");
2326        }
2327    }
2328
2329    fn is_gpa_lower_vtl_ram(&self, gpa: u64) -> bool {
2330        // TODO: this probably should reflect changes to the memory map via PAM
2331        // registers. Right now this isn't an issue because the relevant region,
2332        // VGA, is handled on the host.
2333        self.lower_vtl_memory_layout
2334            .ram()
2335            .iter()
2336            .any(|m| m.range.contains_addr(gpa))
2337    }
2338
2339    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
2340        // TODO: this probably should reflect changes to the memory map via PAM
2341        // registers. Right now this isn't an issue because the relevant region,
2342        // VGA, is handled on the host.
2343        if self.is_gpa_lower_vtl_ram(gpa) {
2344            // The monitor page is protected against lower VTL writes.
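            // `gpa & !(HV_PAGE_SIZE - 1)` rounds the address down to its
            // containing page boundary before comparing.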
2345            !write || self.monitor_page.gpa() != Some(gpa & !(HV_PAGE_SIZE - 1))
2346        } else {
2347            false
2348        }
2349    }
2350
2351    /// Gets the CPUID result, applying any necessary runtime modifications.
2352    #[cfg(guest_arch = "x86_64")]
2353    fn cpuid_result(&self, eax: u32, ecx: u32, default: &[u32; 4]) -> [u32; 4] {
2354        let r = self.cpuid.result(eax, ecx, default);
2355        if eax == hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES {
2356            // Update the VSM access privilege.
2357            //
2358            // FUTURE: Investigate if this is really necessary for non-CVM--the
2359            // hypervisor should already update this correctly.
2360            //
2361            // If it is only for CVM, then it should be moved to the
2362            // CVM-specific cpuid fixups.
2363            let mut features = hvdef::HvFeatures::from_cpuid(r);
2364            if self.backing_shared.guest_vsm_disabled() {
2365                features.set_privileges(features.privileges().with_access_vsm(false));
2366            }
2367            features.into_cpuid()
2368        } else {
2369            r
2370        }
2371    }
2372}
2373
2374/// Handle returned by [`UhPartition::register_host_io_port_fast_path`].
2375///
2376/// When dropped, unregisters the IO ports so that they are no longer forwarded
2377/// to the host.
2378#[must_use]
2379pub struct HostIoPortFastPathHandle {
2380    inner: Weak<UhPartitionInner>,
2381    begin: u16,
2382    end: u16,
2383}
2384
2385impl Drop for HostIoPortFastPathHandle {
2386    fn drop(&mut self) {
2387        if let Some(inner) = self.inner.upgrade() {
2388            inner.manage_io_port_intercept_region(self.begin, self.end, true);
2389        }
2390    }
2391}
2392
2393/// The application level VTL crash data not suited for putting
2394/// on the wire.
2395///
2396/// FUTURE: move/remove this to standardize across virt backends.
2397#[derive(Copy, Clone, Debug)]
2398pub struct VtlCrash {
2399    /// The VP that crashed.
2400    pub vp_index: VpIndex,
2401    /// The VTL that crashed.
2402    pub last_vtl: GuestVtl,
2403    /// The crash control information.
2404    pub control: GuestCrashCtl,
2405    /// The crash parameters.
2406    pub parameters: [u64; 5],
2407}
2408
2409/// Validates that `flags` is a valid setting for VTL memory protection when
2410/// applied to VTL 1.
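///
/// A sketch of the intended behavior, with illustrative flag values:
///
/// ```ignore
/// // Read/write without execute is valid regardless of MBEC.
/// let rw = HvMapGpaFlags::new().with_readable(true).with_writable(true);
/// assert!(validate_vtl_gpa_flags(rw, false, false));
///
/// // Writable but not readable is always rejected.
/// let w = HvMapGpaFlags::new().with_writable(true);
/// assert!(!validate_vtl_gpa_flags(w, false, false));
/// ```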
2411#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
2412fn validate_vtl_gpa_flags(
2413    flags: HvMapGpaFlags,
2414    mbec_enabled: bool,
2415    shadow_supervisor_stack_enabled: bool,
2416) -> bool {
2417    // Adjust is not allowed for VTL1.
2418    if flags.adjustable() {
2419        return false;
2420    }
2421
2422    // KX must equal UX unless MBEC is enabled. KX && !UX is invalid.
2423    if flags.kernel_executable() != flags.user_executable() {
2424        if (flags.kernel_executable() && !flags.user_executable()) || !mbec_enabled {
2425            return false;
2426        }
2427    }
2428
2429    // Read must be specified if anything else is specified.
2430    if flags.writable()
2431        || flags.kernel_executable()
2432        || flags.user_executable()
2433        || flags.supervisor_shadow_stack()
2434        || flags.paging_writability()
2435        || flags.verify_paging_writability()
2436    {
2437        if !flags.readable() {
2438            return false;
2439        }
2440    }
2441
2442    // Supervisor shadow stack protection is invalid if shadow stacks are disabled
2443    // or if execute is not specified.
2444    if flags.supervisor_shadow_stack()
2445        && ((!flags.kernel_executable() && !flags.user_executable())
2446            || !shadow_supervisor_stack_enabled)
2447    {
2448        return false;
2449    }
2450
2451    true
2452}