// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Implementation of the Underhill hypervisor backend, which uses
//! `/dev/mshv_vtl` to interact with the Microsoft hypervisor while running in
//! VTL2.

#![cfg(all(guest_is_native, target_os = "linux"))]

mod devmsr;

cfg_if::cfg_if!(
    if #[cfg(guest_arch = "x86_64")] {
        mod cvm_cpuid;
        pub use processor::snp::SnpBacked;
        pub use processor::tdx::TdxBacked;
        use crate::processor::HardwareIsolatedBacking;
        pub use crate::processor::mshv::x64::HypervisorBackedX86 as HypervisorBacked;
        use crate::processor::mshv::x64::HypervisorBackedX86Shared as HypervisorBackedShared;
        use bitvec::prelude::BitArray;
        use bitvec::prelude::Lsb0;
        use devmsr::MsrDevice;
        use hv1_emulator::hv::ProcessorVtlHv;
        use processor::LapicState;
        use processor::snp::SnpBackedShared;
        use processor::tdx::TdxBackedShared;
        use std::arch::x86_64::CpuidResult;
        use virt::CpuidLeaf;
        use virt::state::StateElement;
        use virt::vp::MpState;
        /// Bitarray type for representing the IRR bits in an x86-64 APIC.
        /// Each bit represents one of the 256 possible vectors.
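        ///
        /// A minimal sketch of how the bitmap is indexed (illustrative only):
        ///
        /// ```ignore
        /// let mut irr = IrrBitmap::default();
        /// irr.set(0x31, true); // mark vector 0x31 as pending
        /// assert!(irr[0x31]);
        /// ```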
        type IrrBitmap = BitArray<[u32; 8], Lsb0>;
    } else if #[cfg(guest_arch = "aarch64")] {
        pub use crate::processor::mshv::arm64::HypervisorBackedArm64 as HypervisorBacked;
        use crate::processor::mshv::arm64::HypervisorBackedArm64Shared as HypervisorBackedShared;
    }
);

mod processor;
pub use processor::Backing;
pub use processor::UhProcessor;

use anyhow::Context as AnyhowContext;
use bitfield_struct::bitfield;
use bitvec::boxed::BitBox;
use bitvec::vec::BitVec;
use cvm_tracing::CVM_ALLOWED;
use guestmem::GuestMemory;
use guestmem::GuestMemoryBackingError;
use hcl::GuestVtl;
use hcl::ioctl::Hcl;
use hcl::ioctl::SetVsmPartitionConfigError;
use hv1_emulator::hv::GlobalHv;
use hv1_emulator::message_queues::MessageQueues;
use hv1_emulator::synic::GlobalSynic;
use hv1_emulator::synic::SintProxied;
use hv1_structs::VtlArray;
use hvdef::GuestCrashCtl;
use hvdef::HV_PAGE_SHIFT;
use hvdef::HV_PAGE_SIZE;
use hvdef::HV_PAGE_SIZE_USIZE;
use hvdef::HvError;
use hvdef::HvMapGpaFlags;
use hvdef::HvRegisterName;
use hvdef::HvRegisterVsmPartitionConfig;
use hvdef::HvRegisterVsmPartitionStatus;
use hvdef::Vtl;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_EXECUTE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_NONE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_READ_WRITE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_WRITE;
use hvdef::hypercall::HostVisibilityType;
use hvdef::hypercall::HvGuestOsId;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::HvInterceptParameters;
use hvdef::hypercall::HvInterceptType;
use inspect::Inspect;
use inspect::InspectMut;
use memory_range::MemoryRange;
use pal::unix::affinity;
use pal::unix::affinity::CpuSet;
use pal_async::driver::Driver;
use pal_async::driver::SpawnDriver;
use pal_uring::IdleControl;
use parking_lot::Mutex;
use parking_lot::RwLock;
use processor::BackingSharedParams;
use processor::SidecarExitReason;
use sidecar_client::NewSidecarClientError;
use std::ops::RangeInclusive;
use std::os::fd::AsRawFd;
use std::sync::Arc;
use std::sync::Weak;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::task::Waker;
use thiserror::Error;
use user_driver::DmaClient;
use virt::IsolationType;
use virt::PartitionCapabilities;
use virt::VpIndex;
use virt::irqcon::IoApicRouting;
use virt::irqcon::MsiRequest;
use virt::x86::apic_software_device::ApicSoftwareDevices;
use virt_support_apic::LocalApicSet;
use vm_topology::memory::MemoryLayout;
use vm_topology::processor::ProcessorTopology;
use vm_topology::processor::TargetVpInfo;
use vmcore::monitor::MonitorPage;
use vmcore::reference_time::GetReferenceTime;
use vmcore::reference_time::ReferenceTimeResult;
use vmcore::reference_time::ReferenceTimeSource;
use vmcore::vmtime::VmTimeSource;
use x86defs::snp::REG_TWEAK_BITMAP_OFFSET;
use x86defs::snp::REG_TWEAK_BITMAP_SIZE;
use x86defs::tdx::TdCallResult;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// General error returned by operations.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum Error {
    #[error("hcl error")]
    Hcl(#[source] hcl::ioctl::Error),
    #[error("failed to open sidecar client")]
    Sidecar(#[source] NewSidecarClientError),
    #[error("failed to install {0:?} intercept: {1:?}")]
    InstallIntercept(HvInterceptType, HvError),
    #[error("failed to query hypervisor register {0:#x?}")]
    Register(HvRegisterName, #[source] HvError),
    #[error("failed to set vsm partition config register")]
    VsmPartitionConfig(#[source] SetVsmPartitionConfigError),
    #[error("failed to create virtual device")]
    NewDevice(#[source] virt::x86::apic_software_device::DeviceIdInUse),
    #[error("failed to create cpuid tables for cvm")]
    #[cfg(guest_arch = "x86_64")]
    CvmCpuid(#[source] cvm_cpuid::CpuidResultsError),
    #[error("failed to update hypercall msr")]
    UpdateHypercallMsr,
    #[error("failed to update reference tsc msr")]
    UpdateReferenceTsc,
    #[error("failed to map overlay page")]
    MapOverlay(#[source] std::io::Error),
    #[error("failed to allocate shared visibility pages for overlay")]
    AllocateSharedVisOverlay(#[source] anyhow::Error),
    #[error("failed to open msr device")]
    OpenMsr(#[source] std::io::Error),
    #[error("cpuid did not contain valid TSC frequency information")]
    BadCpuidTsc,
    #[error("failed to read tsc frequency")]
    ReadTscFrequency(#[source] std::io::Error),
    #[error(
        "tsc frequency mismatch between hypervisor ({hv}) and hardware ({hw}), exceeds allowed error {allowed_error}"
    )]
    TscFrequencyMismatch {
        hv: u64,
        hw: u64,
        allowed_error: u64,
    },
    #[error("failed to set tdx l2 controls: {0:?}")]
    FailedToSetL2Ctls(TdCallResult),
    #[error("debugging is configured but the binary does not have the gdb feature")]
    InvalidDebugConfiguration,
    #[error("failed to allocate TLB flush page")]
    AllocateTlbFlushPage(#[source] anyhow::Error),
    #[error("host does not support required cpu capabilities")]
    Capabilities(virt::PartitionCapabilitiesError),
}

/// Error revoking guest VSM.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum RevokeGuestVsmError {
    #[error("failed to set vsm config")]
    SetGuestVsmConfig(#[source] hcl::ioctl::SetGuestVsmConfigError),
    #[error("VTL 1 is already enabled")]
    Vtl1AlreadyEnabled,
}

/// Underhill partition.
#[derive(Inspect)]
pub struct UhPartition {
    #[inspect(flatten)]
    inner: Arc<UhPartitionInner>,
    // TODO: remove this extra indirection by refactoring some traits.
    #[inspect(skip)]
    interrupt_targets: VtlArray<Arc<UhInterruptTarget>, 2>,
}

/// Inner state of an Underhill partition, shared with the per-VP objects.
#[derive(Inspect)]
#[inspect(extra = "UhPartitionInner::inspect_extra")]
struct UhPartitionInner {
    #[inspect(skip)]
    hcl: Hcl,
    #[inspect(skip)] // inspected separately
    vps: Vec<UhVpInner>,
    irq_routes: virt::irqcon::IrqRoutes,
    caps: PartitionCapabilities,
    #[inspect(skip)] // handled in `inspect_extra`
    enter_modes: Mutex<EnterModes>,
    #[inspect(skip)]
    enter_modes_atomic: AtomicU8,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
    lower_vtl_memory_layout: MemoryLayout,
    gm: VtlArray<GuestMemory, 2>,
    vtl0_kernel_exec_gm: GuestMemory,
    vtl0_user_exec_gm: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    crash_notification_send: mesh::Sender<VtlCrash>,
    monitor_page: MonitorPage,
    #[inspect(skip)]
    allocated_monitor_page: Mutex<Option<user_driver::memory::MemoryBlock>>,
    software_devices: Option<ApicSoftwareDevices>,
    #[inspect(skip)]
    vmtime: VmTimeSource,
    isolation: IsolationType,
    #[inspect(with = "inspect::AtomicMut")]
    no_sidecar_hotplug: AtomicBool,
    use_mmio_hypercalls: bool,
    backing_shared: BackingShared,
    intercept_debug_exceptions: bool,
    #[cfg(guest_arch = "x86_64")]
    // N.B. For now there is only one device vector table, i.e. for VTL0 only.
    #[inspect(hex, with = "|x| inspect::iter_by_index(x.read().into_inner())")]
    device_vector_table: RwLock<IrrBitmap>,
    vmbus_relay: bool,
}

#[derive(Inspect)]
#[inspect(untagged)]
enum BackingShared {
    Hypervisor(#[inspect(flatten)] HypervisorBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Snp(#[inspect(flatten)] SnpBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Tdx(#[inspect(flatten)] TdxBackedShared),
}

impl BackingShared {
    fn new(
        isolation: IsolationType,
        partition_params: &UhPartitionNewParams<'_>,
        backing_shared_params: BackingSharedParams<'_>,
    ) -> Result<BackingShared, Error> {
        Ok(match isolation {
            IsolationType::None | IsolationType::Vbs => {
                assert!(backing_shared_params.cvm_state.is_none());
                BackingShared::Hypervisor(HypervisorBackedShared::new(
                    partition_params,
                    backing_shared_params,
                )?)
            }
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Snp => BackingShared::Snp(SnpBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Tdx => BackingShared::Tdx(TdxBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(not(guest_arch = "x86_64"))]
            _ => unreachable!(),
        })
    }

    fn cvm_state(&self) -> Option<&UhCvmPartitionState> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => Some(cvm),
        }
    }

    fn untrusted_synic(&self) -> Option<&GlobalSynic> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(s) => s.untrusted_synic.as_ref(),
        }
    }
}

#[derive(InspectMut, Copy, Clone)]
struct EnterModes {
    #[inspect(mut)]
    first: EnterMode,
    #[inspect(mut)]
    second: EnterMode,
}

impl Default for EnterModes {
    fn default() -> Self {
        Self {
            first: EnterMode::Fast,
            second: EnterMode::IdleToVtl0,
        }
    }
}

impl From<EnterModes> for hcl::protocol::EnterModes {
    fn from(value: EnterModes) -> Self {
        Self::new()
            .with_first(value.first.into())
            .with_second(value.second.into())
    }
}

#[derive(InspectMut, Copy, Clone)]
enum EnterMode {
    Fast,
    PlayIdle,
    IdleToVtl0,
}

impl From<EnterMode> for hcl::protocol::EnterMode {
    fn from(value: EnterMode) -> Self {
        match value {
            EnterMode::Fast => Self::FAST,
            EnterMode::PlayIdle => Self::PLAY_IDLE,
            EnterMode::IdleToVtl0 => Self::IDLE_TO_VTL0,
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
struct GuestVsmVpState {
    /// The pending event that VTL 1 wants to inject into VTL 0. Injected on
    /// next exit to VTL 0.
    #[inspect(with = "|x| x.as_ref().map(inspect::AsDebug)")]
    vtl0_exit_pending_event: Option<hvdef::HvX64PendingExceptionEvent>,
    reg_intercept: SecureRegisterInterceptState,
}

#[cfg(guest_arch = "x86_64")]
impl GuestVsmVpState {
    fn new() -> Self {
        GuestVsmVpState {
            vtl0_exit_pending_event: None,
            reg_intercept: Default::default(),
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
/// VP state for CVMs.
struct UhCvmVpState {
    // Allocation handle for direct overlays.
    #[inspect(debug)]
    direct_overlay_handle: user_driver::memory::MemoryBlock,
    /// Used in VTL 2 exit code to determine which VTL to exit to.
    exit_vtl: GuestVtl,
    /// Hypervisor enlightenment emulator state.
    hv: VtlArray<ProcessorVtlHv, 2>,
    /// LAPIC state.
    lapics: VtlArray<LapicState, 2>,
    /// Guest VSM state for this VP. `Some` when VTL 1 is enabled.
    vtl1: Option<GuestVsmVpState>,
}

#[cfg(guest_arch = "x86_64")]
impl UhCvmVpState {
    /// Creates a new CVM VP state.
    pub(crate) fn new(
        cvm_partition: &UhCvmPartitionState,
        inner: &UhPartitionInner,
        vp_info: &TargetVpInfo,
        overlay_pages_required: usize,
    ) -> Result<Self, Error> {
        let direct_overlay_handle = cvm_partition
            .shared_dma_client
            .allocate_dma_buffer(overlay_pages_required * HV_PAGE_SIZE as usize)
            .map_err(Error::AllocateSharedVisOverlay)?;

        let apic_base = virt::vp::Apic::at_reset(&inner.caps, vp_info).apic_base;
        let lapics = VtlArray::from_fn(|vtl| {
            let apic_set = &cvm_partition.lapic[vtl];

            // The APIC is software-enabled after reset for secure VTLs, to
            // maintain compatibility with released versions of the secure
            // kernel.
            let mut lapic = apic_set.add_apic(vp_info, vtl == Vtl::Vtl1);
            // Initialize the APIC base to match the reset VM state.
            lapic.set_apic_base(apic_base).unwrap();
            // Only the VTL 0 non-BSP LAPICs should be in the WaitForSipi state.
            let activity = if vtl == Vtl::Vtl0 && !vp_info.base.is_bsp() {
                MpState::WaitForSipi
            } else {
                MpState::Running
            };
            LapicState::new(lapic, activity)
        });

        let hv = VtlArray::from_fn(|vtl| cvm_partition.hv.add_vp(vp_info.base.vp_index, vtl));

        Ok(Self {
            direct_overlay_handle,
            exit_vtl: GuestVtl::Vtl0,
            hv,
            lapics,
            vtl1: None,
        })
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect, Default)]
#[inspect(hex)]
/// Configuration of VTL 1 registration for intercepts on certain registers.
pub struct SecureRegisterInterceptState {
    #[inspect(with = "|&x| u64::from(x)")]
    intercept_control: hvdef::HvRegisterCrInterceptControl,
    cr0_mask: u64,
    cr4_mask: u64,
    // Writes to X86X_IA32_MSR_MISC_ENABLE are dropped, so this is only used
    // so that get_vp_register returns the value set by a previous
    // set_vp_register.
    ia32_misc_enable_mask: u64,
}

#[derive(Inspect)]
/// Partition-wide state for CVMs.
struct UhCvmPartitionState {
    #[cfg(guest_arch = "x86_64")]
    vps_per_socket: u32,
    /// VPs that have locked their TLB.
    #[inspect(
        with = "|arr| inspect::iter_by_index(arr.iter()).map_value(|bb| inspect::iter_by_index(bb.iter().map(|v| *v)))"
    )]
    tlb_locked_vps: VtlArray<BitBox<AtomicU64>, 2>,
    #[inspect(with = "inspect::iter_by_index")]
    vps: Vec<UhCvmVpInner>,
    shared_memory: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// The emulated local APIC set.
    lapic: VtlArray<LocalApicSet, 2>,
    /// The emulated hypervisor state.
    hv: GlobalHv<2>,
    /// Guest VSM state.
    guest_vsm: RwLock<GuestVsmState<CvmVtl1State>>,
    /// Dma client for shared visibility pages.
    shared_dma_client: Arc<dyn DmaClient>,
    /// Dma client for private visibility pages.
    private_dma_client: Arc<dyn DmaClient>,
    hide_isolation: bool,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl UhCvmPartitionState {
    fn vp_inner(&self, vp_index: u32) -> &UhCvmVpInner {
        &self.vps[vp_index as usize]
    }

    fn is_lower_vtl_startup_denied(&self) -> bool {
        matches!(
            *self.guest_vsm.read(),
            GuestVsmState::Enabled {
                vtl1: CvmVtl1State {
                    deny_lower_vtl_startup: true,
                    ..
                }
            }
        )
    }
}

#[derive(Inspect)]
/// Per-VP state for CVMs.
struct UhCvmVpInner {
    /// The current status of TLB locks.
    tlb_lock_info: VtlArray<TlbLockInfo, 2>,
    /// Whether EnableVpVtl for VTL 1 has been called on this VP.
    vtl1_enable_called: Mutex<bool>,
    /// Whether the VP has been started via the StartVp hypercall.
    started: AtomicBool,
    /// Start context for StartVp and EnableVpVtl calls.
    #[inspect(with = "|arr| inspect::iter_by_index(arr.iter().map(|v| v.lock().is_some()))")]
    hv_start_enable_vtl_vp: VtlArray<Mutex<Option<Box<VpStartEnableVtl>>>, 2>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
#[derive(Inspect)]
#[inspect(tag = "guest_vsm_state")]
/// Partition-wide state for guest VSM.
enum GuestVsmState<T: Inspect> {
    NotPlatformSupported,
    NotGuestEnabled,
    Enabled {
        #[inspect(flatten)]
        vtl1: T,
    },
}

impl<T: Inspect> GuestVsmState<T> {
    pub fn from_availability(guest_vsm_available: bool) -> Self {
        if guest_vsm_available {
            GuestVsmState::NotGuestEnabled
        } else {
            GuestVsmState::NotPlatformSupported
        }
    }
}

#[derive(Inspect)]
struct CvmVtl1State {
    /// Whether VTL 1 has been enabled on any VP.
    enabled_on_any_vp: bool,
    /// Whether guest memory should be zeroed before it resets.
    zero_memory_on_reset: bool,
    /// Whether a VP can be started or reset by a lower VTL.
    deny_lower_vtl_startup: bool,
    /// Whether Mode-Based Execution Control should be enforced on lower VTLs.
    pub mbec_enabled: bool,
    /// Whether shadow supervisor stack is enabled.
    pub shadow_supervisor_stack_enabled: bool,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_read_intercepts: BitBox<u64>,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_write_intercepts: BitBox<u64>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl CvmVtl1State {
    fn new(mbec_enabled: bool) -> Self {
        Self {
            enabled_on_any_vp: false,
            zero_memory_on_reset: false,
            deny_lower_vtl_startup: false,
            mbec_enabled,
            shadow_supervisor_stack_enabled: false,
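            // One bit per 16-bit I/O port number (0..=u16::MAX).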
            io_read_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
            io_write_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
        }
    }
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
struct TscReferenceTimeSource {
    tsc_scale: u64,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl TscReferenceTimeSource {
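    /// Computes a 64.64 fixed-point scale factor that converts TSC ticks to
    /// 100ns units: `tsc_scale = (10_000_000 << 64) / tsc_frequency`.
    ///
    /// A minimal sketch of the conversion, using an illustrative 2 GHz
    /// frequency (not a value taken from this codebase):
    ///
    /// ```ignore
    /// let tsc_frequency = 2_000_000_000u64; // hypothetical 2 GHz TSC
    /// let tsc_scale = ((10_000_000u128 << 64) / tsc_frequency as u128) as u64;
    /// let tsc = 1_000_000u64; // 1M ticks = 0.5ms at 2 GHz
    /// let ref_time = ((tsc_scale as u128 * tsc as u128) >> 64) as u64;
    /// assert_eq!(ref_time, 5_000); // 0.5ms in 100ns units
    /// ```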
    fn new(tsc_frequency: u64) -> Self {
        TscReferenceTimeSource {
            tsc_scale: (((10_000_000_u128) << 64) / tsc_frequency as u128) as u64,
        }
    }
}

/// A time implementation based on TSC.
impl GetReferenceTime for TscReferenceTimeSource {
    fn now(&self) -> ReferenceTimeResult {
        #[cfg(guest_arch = "x86_64")]
        {
            let tsc = safe_intrinsics::rdtsc();
            let ref_time = ((self.tsc_scale as u128 * tsc as u128) >> 64) as u64;
            ReferenceTimeResult {
                ref_time,
                system_time: None,
            }
        }

        #[cfg(guest_arch = "aarch64")]
        {
            todo!("AARCH64_TODO");
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::irqcon::ControlGic for UhPartitionInner {
    fn set_spi_irq(&self, irq_id: u32, high: bool) {
        if let Err(err) = self.hcl.request_interrupt(
            hvdef::HvInterruptControl::new()
                .with_arm64_asserted(high)
                .with_interrupt_type(hvdef::HvInterruptType::HvArm64InterruptTypeFixed),
            0,
            irq_id,
            GuestVtl::Vtl0,
        ) {
            tracelimit::warn_ratelimited!(
                error = &err as &dyn std::error::Error,
                irq = irq_id,
                asserted = high,
                "failed to request spi"
            );
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::Aarch64Partition for UhPartition {
    fn control_gic(&self, vtl: Vtl) -> Arc<dyn virt::irqcon::ControlGic> {
        debug_assert!(vtl == Vtl::Vtl0);
        self.inner.clone()
    }
}

/// A wrapper around [`UhProcessor`] that is [`Send`].
///
/// This is used to instantiate the processor object on the correct thread,
/// since all lower VTL processor state accesses must occur from the same
/// processor at VTL2.
pub struct UhProcessorBox {
    partition: Arc<UhPartitionInner>,
    vp_info: TargetVpInfo,
}

impl UhProcessorBox {
    /// Returns the VP index.
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }

    /// Returns the base CPU that manages this processor, when it is a sidecar
    /// VP.
    pub fn sidecar_base_cpu(&self) -> Option<u32> {
        self.partition
            .hcl
            .sidecar_base_cpu(self.vp_info.base.vp_index.index())
    }

    /// Returns the processor object, bound to this thread.
    ///
    /// If `control` is provided, then this must be called on the VP's
    /// associated thread pool thread, and it will dispatch the VP directly.
    /// Otherwise, the returned object will control the processor remotely via
    /// the sidecar kernel.
    pub fn bind_processor<'a, T: Backing>(
        &'a mut self,
        driver: &impl Driver,
        control: Option<&'a mut IdleControl>,
    ) -> Result<UhProcessor<'a, T>, Error> {
        if let Some(control) = &control {
            let vp_index = self.vp_info.base.vp_index;

            let mut current = Default::default();
            affinity::get_current_thread_affinity(&mut current).unwrap();
            assert_eq!(&current, CpuSet::new().set(vp_index.index()));

            self.partition
                .hcl
                .set_poll_file(
                    self.partition.vp(vp_index).unwrap().cpu_index,
                    control.ring_fd().as_raw_fd(),
                )
                .map_err(Error::Hcl)?;
        }

        UhProcessor::new(driver, &self.partition, self.vp_info, control)
    }

    /// Sets the sidecar exit reason for the processor, recording that it was
    /// due to a task running with the given name.
    ///
    /// This is useful for diagnostics.
    pub fn set_sidecar_exit_due_to_task(&self, task: Arc<str>) {
        self.partition
            .vp(self.vp_info.base.vp_index)
            .unwrap()
            .set_sidecar_exit_reason(SidecarExitReason::TaskRequest(task))
    }
}

#[derive(Debug, Inspect)]
struct UhVpInner {
    /// 32 bits per VTL: top bits are VTL 1, bottom bits are VTL 0.
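    ///
    /// A sketch of the packing this implies (illustrative only):
    ///
    /// ```ignore
    /// let vtl0_bits = u64::from(u32::from(WakeReason::INTCON));
    /// let vtl1_bits = u64::from(u32::from(WakeReason::INTCON)) << 32;
    /// ```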
    wake_reasons: AtomicU64,
    #[inspect(skip)]
    waker: RwLock<Option<Waker>>,
    message_queues: VtlArray<MessageQueues, 2>,
    #[inspect(skip)]
    vp_info: TargetVpInfo,
    /// The Linux kernel's CPU index for this VP. This should be used instead of VpIndex
    /// when interacting with non-MSHV kernel interfaces.
    cpu_index: u32,
    sidecar_exit_reason: Mutex<Option<SidecarExitReason>>,
}

impl UhVpInner {
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// Which operation is setting the initial VP context.
enum InitialVpContextOperation {
    /// The VP is being started via the StartVp hypercall.
    StartVp,
    /// The VP is being started via the EnableVpVtl hypercall.
    EnableVpVtl,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// State for handling StartVp/EnableVpVtl hypercalls.
struct VpStartEnableVtl {
    /// Which operation, StartVp or EnableVpVtl, is setting the initial VP
    /// context.
    operation: InitialVpContextOperation,
    #[inspect(skip)]
    context: hvdef::hypercall::InitialVpContextX64,
}

#[derive(Debug, Inspect)]
struct TlbLockInfo {
    /// The set of VPs that are waiting for this VP to release the TLB lock.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocked_vps: BitBox<AtomicU64>,
    /// The set of VPs that are holding the TLB lock and preventing this VP
    /// from proceeding.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocking_vps: BitBox<AtomicU64>,
    /// The count of blocking VPs. This should always be equivalent to
    /// `blocking_vps.count_ones()`; however, it is accessible in a single
    /// atomic operation, while counting is not.
    blocking_vp_count: AtomicU32,
    /// Whether the VP is sleeping due to a TLB lock.
    sleeping: AtomicBool,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
impl TlbLockInfo {
    fn new(vp_count: usize) -> Self {
        Self {
            blocked_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vp_count: AtomicU32::new(0),
            sleeping: false.into(),
        }
    }
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct WakeReason {
    extint: bool,
    message_queues: bool,
    hv_start_enable_vtl_vp: bool,
    intcon: bool,
    update_proxy_irr_filter: bool,
    #[bits(27)]
    _reserved: u32,
}

impl WakeReason {
    // Convenient constants.
    const EXTINT: Self = Self::new().with_extint(true);
    const MESSAGE_QUEUES: Self = Self::new().with_message_queues(true);
    #[cfg(guest_arch = "x86_64")]
    const HV_START_ENABLE_VP_VTL: Self = Self::new().with_hv_start_enable_vtl_vp(true); // StartVp/EnableVpVtl handling
    const INTCON: Self = Self::new().with_intcon(true);
    #[cfg(guest_arch = "x86_64")]
    const UPDATE_PROXY_IRR_FILTER: Self = Self::new().with_update_proxy_irr_filter(true);
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct ExitActivity {
    pending_event: bool,
    #[bits(31)]
    _reserved: u32,
}

/// Immutable access to useful bits of Partition state.
impl UhPartition {
    /// Revokes guest VSM.
    pub fn revoke_guest_vsm(&self) -> Result<(), RevokeGuestVsmError> {
        fn revoke<T: Inspect>(vsm_state: &mut GuestVsmState<T>) -> Result<(), RevokeGuestVsmError> {
            if matches!(vsm_state, GuestVsmState::Enabled { .. }) {
                return Err(RevokeGuestVsmError::Vtl1AlreadyEnabled);
            }
            *vsm_state = GuestVsmState::NotPlatformSupported;
            Ok(())
        }

        match &self.inner.backing_shared {
            BackingShared::Hypervisor(s) => {
                revoke(&mut *s.guest_vsm.write())?;
                self.inner
                    .hcl
                    .set_guest_vsm_partition_config(false)
                    .map_err(RevokeGuestVsmError::SetGuestVsmConfig)?;
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                revoke(&mut *cvm.guest_vsm.write())?;
            }
        };

        Ok(())
    }

    /// Returns the current hypervisor reference time, in 100ns units.
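    ///
    /// At 100ns per unit there are 10,000,000 units per second; for example,
    /// a delta of 5,000 units corresponds to 0.5ms.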
    pub fn reference_time(&self) -> u64 {
        if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().now().ref_time
        } else {
            self.inner
                .hcl
                .reference_time()
                .expect("should not fail to get the reference time")
        }
    }
}

impl virt::Partition for UhPartition {
    fn supports_reset(&self) -> Option<&dyn virt::ResetPartition<Error = Self::Error>> {
        None
    }

    fn caps(&self) -> &PartitionCapabilities {
        &self.inner.caps
    }

    fn request_msi(&self, vtl: Vtl, request: MsiRequest) {
        self.inner
            .request_msi(vtl.try_into().expect("higher vtl not configured"), request)
    }

    fn request_yield(&self, _vp_index: VpIndex) {
        unimplemented!()
    }
}

impl virt::X86Partition for UhPartition {
    fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
        self.inner.clone()
    }

    fn pulse_lint(&self, vp_index: VpIndex, vtl: Vtl, lint: u8) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        if let Some(apic) = &self.inner.lapic(vtl) {
            apic.lint(vp_index, lint.into(), |vp_index| {
                self.inner
                    .vp(vp_index)
                    .unwrap()
                    .wake(vtl, WakeReason::INTCON);
            });
        } else if lint == 0 {
            self.inner
                .vp(vp_index)
                .unwrap()
                .wake(vtl, WakeReason::EXTINT);
        } else {
            unimplemented!()
        }
    }
}

impl UhPartitionInner {
    fn vp(&self, index: VpIndex) -> Option<&'_ UhVpInner> {
        self.vps.get(index.index() as usize)
    }

    fn lapic(&self, vtl: GuestVtl) -> Option<&LocalApicSet> {
        self.backing_shared.cvm_state().map(|x| &x.lapic[vtl])
    }

    fn hv(&self) -> Option<&GlobalHv<2>> {
        self.backing_shared.cvm_state().map(|x| &x.hv)
    }

    /// Used by a requester VP to issue a `proxy_irr_blocked` update to the
    /// other VPs.
    #[cfg(guest_arch = "x86_64")]
    fn request_proxy_irr_filter_update(
        &self,
        vtl: GuestVtl,
        device_vector: u8,
        req_vp_index: VpIndex,
    ) {
        tracing::debug!(
            ?vtl,
            device_vector,
            req_vp_index = req_vp_index.index(),
            "request_proxy_irr_filter_update"
        );

        // Add the given vector to the partition-global device vector table
        // (VTL0 only for now).
        {
            let mut device_vector_table = self.device_vector_table.write();
            device_vector_table.set(device_vector as usize, true);
        }

        // Wake all other VPs for their `proxy_irr_blocked` filter update.
        for vp in self.vps.iter() {
            if vp.vp_index() != req_vp_index {
                vp.wake(vtl, WakeReason::UPDATE_PROXY_IRR_FILTER);
            }
        }
    }

    /// Gets the current partition-global device IRR vectors (VTL0 only for
    /// now).
    #[cfg(guest_arch = "x86_64")]
    fn fill_device_vectors(&self, _vtl: GuestVtl, irr_vectors: &mut IrrBitmap) {
        let device_vector_table = self.device_vector_table.read();
        for idx in device_vector_table.iter_ones() {
            irr_vectors.set(idx, true);
        }
    }

    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        let mut wake_vps = false;
        resp.field_mut(
            "enter_modes",
            &mut inspect::adhoc_mut(|req| {
                let update = req.is_update();
                {
                    let mut modes = self.enter_modes.lock();
                    modes.inspect_mut(req);
                    if update {
                        self.enter_modes_atomic.store(
                            hcl::protocol::EnterModes::from(*modes).into(),
                            Ordering::Relaxed,
                        );
                        wake_vps = true;
                    }
                }
            }),
        );

        // Wake VPs to propagate updates.
        if wake_vps {
            for vp in self.vps.iter() {
                vp.wake_vtl2();
            }
        }
    }

    // TODO VBS GUEST VSM: enable for aarch64
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn vsm_status(&self) -> Result<HvRegisterVsmPartitionStatus, hcl::ioctl::Error> {
        // TODO: It might be possible to cache VsmPartitionStatus.
        self.hcl.get_vsm_partition_status()
    }
}

impl virt::Synic for UhPartition {
    fn post_message(&self, vtl: Vtl, vp_index: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let Some(vp) = self.inner.vp(vp_index) else {
            tracelimit::warn_ratelimited!(
                CVM_ALLOWED,
                vp = vp_index.index(),
                "invalid vp target for post_message"
            );
            return;
        };

        vp.post_message(
            vtl,
            sint,
            &hvdef::HvMessage::new(hvdef::HvMessageType(typ), 0, payload),
        );
    }

    fn new_guest_event_port(
        &self,
        vtl: Vtl,
        vp: u32,
        sint: u8,
        flag: u16,
    ) -> Box<dyn vmcore::synic::GuestEventPort> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        Box::new(UhEventPort {
            partition: Arc::downgrade(&self.inner),
            params: Arc::new(Mutex::new(UhEventPortParams {
                vp: VpIndex::new(vp),
                sint,
                flag,
                vtl,
            })),
        })
    }

    fn prefer_os_events(&self) -> bool {
        false
    }

    fn monitor_support(&self) -> Option<&dyn virt::SynicMonitor> {
        Some(self)
    }
}

impl virt::SynicMonitor for UhPartition {
    fn set_monitor_page(&self, vtl: Vtl, gpa: Option<u64>) -> anyhow::Result<()> {
        // Keep this locked the whole function to avoid racing with allocate_monitor_page.
        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        let old_gpa = self.inner.monitor_page.set_gpa(gpa);

        // Take ownership of any allocated monitor page so it will be freed on function exit.
        let allocated_page = allocated_block.take();
        if let Some(old_gpa) = old_gpa {
            let allocated_gpa = allocated_page
                .as_ref()
                .map(|b| b.pfns()[0] << HV_PAGE_SHIFT);

            // Revert the old page's permissions, using the appropriate method depending on
            // whether it was allocated or guest-supplied.
            let result = if allocated_gpa == Some(old_gpa) {
                let vtl = GuestVtl::try_from(vtl).unwrap();
                self.unregister_cvm_dma_overlay_page(vtl, old_gpa >> HV_PAGE_SHIFT)
            } else {
                self.inner
                    .hcl
                    .modify_vtl_protection_mask(
                        MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                        hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                        HvInputVtl::CURRENT_VTL,
                    )
                    .map_err(|err| anyhow::anyhow!(err))
            };

            result
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        if let Some(gpa) = gpa {
            // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
            // permissions must be enabled or this doesn't work correctly.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(gpa..gpa + HV_PAGE_SIZE),
                    HvMapGpaFlags::new().with_readable(true),
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to register monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(gpa, "registered monitor page");
        }

        Ok(())
    }

    fn register_monitor(
        &self,
        monitor_id: vmcore::monitor::MonitorId,
        connection_id: u32,
    ) -> Box<dyn Sync + Send> {
        self.inner
            .monitor_page
            .register_monitor(monitor_id, connection_id)
    }

    fn allocate_monitor_page(&self, vtl: Vtl) -> anyhow::Result<Option<u64>> {
        let vtl = GuestVtl::try_from(vtl).unwrap();

        // Allocating a monitor page is only supported for CVMs.
        let Some(state) = self.inner.backing_shared.cvm_state() else {
            return Ok(None);
        };

        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        if let Some(block) = allocated_block.as_ref() {
            // An allocated monitor page is already in use; no need to change it.
            let gpa = block.pfns()[0] << HV_PAGE_SHIFT;
            assert_eq!(self.inner.monitor_page.gpa(), Some(gpa));
            return Ok(Some(gpa));
        }

        let block = state
            .private_dma_client
            .allocate_dma_buffer(HV_PAGE_SIZE_USIZE)
            .context("failed to allocate monitor page")?;

        let gpn = block.pfns()[0];
        *allocated_block = Some(block);
        let gpa = gpn << HV_PAGE_SHIFT;
        let old_gpa = self.inner.monitor_page.set_gpa(Some(gpa));
        if let Some(old_gpa) = old_gpa {
            // The old GPA is guaranteed not to be allocated, since that was checked above, so
            // revert its permissions using the method for guest-supplied memory.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                    hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
        // permissions must be enabled or this doesn't work correctly.
        self.register_cvm_dma_overlay_page(vtl, gpn, HvMapGpaFlags::new().with_readable(true))
            .context("failed to register monitor page")
            .inspect_err(|_| {
                // Leave the page unset if returning a failure.
                self.inner.monitor_page.set_gpa(None);
            })?;

        tracing::debug!(gpa, "registered allocated monitor page");

        Ok(Some(gpa))
    }
}

impl UhPartitionInner {
    #[cfg(guest_arch = "x86_64")]
    pub(crate) fn synic_interrupt(
        &self,
        vp_index: VpIndex,
        vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        // TODO CVM: optimize for SNP with secure avic to avoid internal wake
        // and for TDX to avoid trip to user mode
        move |vector, auto_eoi| {
            self.lapic(vtl).unwrap().synic_interrupt(
                vp_index,
                vector as u8,
                auto_eoi,
                |vp_index| self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON),
            );
        }
    }

    #[cfg(guest_arch = "aarch64")]
    fn synic_interrupt(
        &self,
        _vp_index: VpIndex,
        _vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        move |_, _| {}
    }
}

#[derive(Debug)]
struct UhEventPort {
    partition: Weak<UhPartitionInner>,
    params: Arc<Mutex<UhEventPortParams>>,
}

#[derive(Debug, Copy, Clone)]
struct UhEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
    vtl: GuestVtl,
}

impl vmcore::synic::GuestEventPort for UhEventPort {
    fn interrupt(&self) -> vmcore::interrupt::Interrupt {
        let partition = self.partition.clone();
        let params = self.params.clone();
        vmcore::interrupt::Interrupt::from_fn(move || {
            let UhEventPortParams {
                vp,
                sint,
                flag,
                vtl,
            } = *params.lock();
            let Some(partition) = partition.upgrade() else {
                return;
            };
            tracing::trace!(vp = vp.index(), sint, flag, "signal_event");
            if let Some(hv) = partition.hv() {
                match hv.synic[vtl].signal_event(
                    vp,
                    sint,
                    flag,
                    &mut partition.synic_interrupt(vp, vtl),
                ) {
                    Ok(_) => {}
                    Err(SintProxied) => {
                        tracing::trace!(
                            vp = vp.index(),
                            sint,
                            flag,
                            "forwarding event to untrusted synic"
                        );
                        if let Some(synic) = partition.backing_shared.untrusted_synic() {
                            synic
                                .signal_event(
                                    vp,
                                    sint,
                                    flag,
                                    &mut partition.synic_interrupt(vp, vtl),
                                )
                                .ok();
                        } else {
                            partition.hcl.signal_event_direct(vp.index(), sint, flag)
                        }
                    }
                }
            } else {
                partition.hcl.signal_event_direct(vp.index(), sint, flag);
            }
        })
    }

    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}

impl virt::Hv1 for UhPartition {
    type Error = Error;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        Some(if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().clone()
        } else {
            ReferenceTimeSource::from(self.inner.clone() as Arc<_>)
        })
    }

    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        self.inner.software_devices.is_some().then_some(self)
    }
}

impl GetReferenceTime for UhPartitionInner {
    fn now(&self) -> ReferenceTimeResult {
        ReferenceTimeResult {
            ref_time: self.hcl.reference_time().unwrap(),
            system_time: None,
        }
    }
}

impl virt::DeviceBuilder for UhPartition {
    fn build(&self, vtl: Vtl, device_id: u64) -> Result<Self::Device, Self::Error> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let device = self
            .inner
            .software_devices
            .as_ref()
            .expect("checked in new_virtual_device")
            .new_device(self.interrupt_targets[vtl].clone(), device_id)
            .map_err(Error::NewDevice)?;

        Ok(device)
    }
}

struct UhInterruptTarget {
    partition: Arc<UhPartitionInner>,
    vtl: GuestVtl,
}

impl pci_core::msi::MsiInterruptTarget for UhInterruptTarget {
    fn new_interrupt(&self) -> Box<dyn pci_core::msi::MsiControl> {
        let partition = self.partition.clone();
        let vtl = self.vtl;
        Box::new(move |address, data| partition.request_msi(vtl, MsiRequest { address, data }))
    }
}

impl UhPartitionInner {
    fn request_msi(&self, vtl: GuestVtl, request: MsiRequest) {
        if let Some(lapic) = self.lapic(vtl) {
            tracing::trace!(?request, "interrupt");
            lapic.request_interrupt(request.address, request.data, |vp_index| {
                self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON)
            });
        } else {
            let (address, data) = request.as_x86();
            if let Err(err) = self.hcl.request_interrupt(
                request.hv_x86_interrupt_control(),
                address.virt_destination().into(),
                data.vector().into(),
                vtl,
            ) {
                tracelimit::warn_ratelimited!(
                    CVM_ALLOWED,
                    error = &err as &dyn std::error::Error,
                    address = request.address,
                    data = request.data,
                    "failed to request msi"
                );
            }
        }
    }
}

impl IoApicRouting for UhPartitionInner {
    fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
        self.irq_routes.set_irq_route(irq, request)
    }

    // The IO-APIC is always hooked up to VTL0.
    fn assert_irq(&self, irq: u8) {
        self.irq_routes
            .assert_irq(irq, |request| self.request_msi(GuestVtl::Vtl0, request))
    }
}

/// Configure the [`hvdef::HvRegisterVsmPartitionConfig`] register with the
/// values used by Underhill.
fn set_vtl2_vsm_partition_config(hcl: &Hcl) -> Result<(), Error> {
    // Read available capabilities to determine what to enable.
    let caps = hcl.get_vsm_capabilities().map_err(Error::Hcl)?;
    let hardware_isolated = hcl.isolation().is_hardware_isolated();
    let isolated = hcl.isolation().is_isolated();

    let config = HvRegisterVsmPartitionConfig::new()
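        // A mask of 0xF covers all four GPA access bits (read, write, and
        // kernel/user execute).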
        .with_default_vtl_protection_mask(0xF)
        .with_enable_vtl_protection(!hardware_isolated)
        .with_zero_memory_on_reset(!hardware_isolated)
        .with_intercept_cpuid_unimplemented(!hardware_isolated)
        .with_intercept_page(caps.intercept_page_available())
        .with_intercept_unrecoverable_exception(true)
        .with_intercept_not_present(caps.intercept_not_present_available() && !isolated)
        .with_intercept_acceptance(isolated)
        .with_intercept_enable_vtl_protection(isolated && !hardware_isolated)
        .with_intercept_system_reset(caps.intercept_system_reset_available());

    hcl.set_vtl2_vsm_partition_config(config)
        .map_err(Error::VsmPartitionConfig)
}

/// Configuration parameters supplied to [`UhProtoPartition::new`].
///
/// These do not include runtime resources.
pub struct UhPartitionNewParams<'a> {
    /// The isolation type for the partition.
    pub isolation: IsolationType,
    /// Hide isolation from the guest. The guest will run as if it is not
    /// isolated.
    pub hide_isolation: bool,
    /// The memory layout for lower VTLs.
    pub lower_vtl_memory_layout: &'a MemoryLayout,
    /// The guest processor topology.
    pub topology: &'a ProcessorTopology,
    /// The unparsed CVM cpuid info.
    // TODO: move parsing up a layer.
    pub cvm_cpuid_info: Option<&'a [u8]>,
    /// The unparsed CVM secrets page.
    pub snp_secrets: Option<&'a [u8]>,
    /// The virtual top of memory for hardware-isolated VMs.
    ///
    /// Must be a power of two.
    pub vtom: Option<u64>,
    /// Handle synic messages and events.
    ///
    /// On TDX, this prevents the hypervisor from getting vmtdcall exits.
    pub handle_synic: bool,
    /// Do not hotplug sidecar VPs on their first exit. Just continue running
    /// the VP remotely.
    pub no_sidecar_hotplug: bool,
    /// Use MMIO access hypercalls.
    pub use_mmio_hypercalls: bool,
    /// Intercept guest debug exceptions to support gdbstub.
    pub intercept_debug_exceptions: bool,
}

/// Parameters to [`UhProtoPartition::build`].
pub struct UhLateParams<'a> {
    /// Guest memory for lower VTLs.
    pub gm: VtlArray<GuestMemory, 2>,
    /// Guest memory for VTL 0 kernel execute access.
    pub vtl0_kernel_exec_gm: GuestMemory,
    /// Guest memory for VTL 0 user execute access.
    pub vtl0_user_exec_gm: GuestMemory,
    /// The CPUID leaves to expose to the guest.
    #[cfg(guest_arch = "x86_64")]
    pub cpuid: Vec<CpuidLeaf>,
    /// The mesh sender to use for crash notifications.
    // FUTURE: remove mesh dependency from this layer.
    pub crash_notification_send: mesh::Sender<VtlCrash>,
    /// The VM time source.
    pub vmtime: &'a VmTimeSource,
    /// Parameters for CVMs only.
    pub cvm_params: Option<CvmLateParams>,
    /// Whether the vmbus relay is enabled and active for the partition.
    pub vmbus_relay: bool,
}

/// CVM-only parameters to [`UhProtoPartition::build`].
pub struct CvmLateParams {
    /// Guest memory for untrusted devices, like overlay pages.
    pub shared_gm: GuestMemory,
    /// An object to call to change host visibility on guest memory.
    pub isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// Dma client for shared visibility pages.
    pub shared_dma_client: Arc<dyn DmaClient>,
    /// Allocator for private visibility pages.
    pub private_dma_client: Arc<dyn DmaClient>,
}

/// Represents a GPN that is either in guest memory or was allocated by a DMA client.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GpnSource {
    /// The GPN is in regular guest RAM.
    GuestMemory,
    /// The GPN was allocated by a DMA client and is not in guest RAM.
    Dma,
}

/// Trait for CVM-related protections on guest memory.
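///
/// A minimal sketch of a typical overlay-page flow, assuming a
/// `protector: &dyn ProtectIsolatedMemory` and a `tlb` implementing
/// [`TlbFlushLockAccess`] (names illustrative):
///
/// ```ignore
/// protector.register_overlay_page(
///     GuestVtl::Vtl0,
///     gpn,
///     GpnSource::GuestMemory,
///     hvdef::HV_MAP_GPA_PERMISSIONS_ALL, // permissions the page must already have
///     Some(HvMapGpaFlags::new().with_readable(true)), // read-only while registered
///     &mut tlb,
/// )?;
/// // ... the page is now locked and usable as an overlay ...
/// protector.unregister_overlay_page(GuestVtl::Vtl0, gpn, &mut tlb)?;
/// ```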
pub trait ProtectIsolatedMemory: Send + Sync {
    /// Changes host visibility on guest memory.
    fn change_host_visibility(
        &self,
        vtl: GuestVtl,
        shared: bool,
        gpns: &[u64],
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), (HvError, usize)>;

    /// Queries host visibility on guest memory.
    fn query_host_visibility(
        &self,
        gpns: &[u64],
        host_visibility: &mut [HostVisibilityType],
    ) -> Result<(), (HvError, usize)>;

    /// Gets the default protections/permissions for VTL 0.
    fn default_vtl0_protections(&self) -> HvMapGpaFlags;

    /// Changes the default protections/permissions for a VTL. For VBS-isolated
    /// VMs, the protections apply to all VTLs lower than the specified one. For
    /// hardware-isolated VMs, they apply just to the given VTL.
    fn change_default_vtl_protections(
        &self,
        target_vtl: GuestVtl,
        protections: HvMapGpaFlags,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Changes the VTL protections on a range of guest memory.
    fn change_vtl_protections(
        &self,
        target_vtl: GuestVtl,
        gpns: &[u64],
        protections: HvMapGpaFlags,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), (HvError, usize)>;

    /// Registers a page as an overlay page by first validating that it has the
    /// required permissions, optionally modifying them, and then locking them.
    fn register_overlay_page(
        &self,
        vtl: GuestVtl,
        gpn: u64,
        gpn_source: GpnSource,
        check_perms: HvMapGpaFlags,
        new_perms: Option<HvMapGpaFlags>,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Unregisters an overlay page, removing its permission lock and restoring
    /// the previous permissions.
    fn unregister_overlay_page(
        &self,
        vtl: GuestVtl,
        gpn: u64,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Checks whether a page is currently registered as an overlay page.
    fn is_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> bool;

    /// Locks the permissions and mappings for a set of guest pages.
    fn lock_gpns(&self, vtl: GuestVtl, gpns: &[u64]) -> Result<(), GuestMemoryBackingError>;

    /// Unlocks the permissions and mappings for a set of guest pages.
    ///
    /// Panics if asked to unlock a page that was not previously locked. The
    /// caller must ensure that the given slice has the same ordering as the
    /// one passed to `lock_gpns`.
    fn unlock_gpns(&self, vtl: GuestVtl, gpns: &[u64]);

    /// Alerts the memory protector that VTL 1 is ready to set VTL protections
    /// on lower-VTL memory, and that these protections should be enforced.
    fn set_vtl1_protections_enabled(&self);

    /// Whether VTL 1 is prepared to modify VTL protections on lower-VTL memory,
    /// and therefore whether these protections should be enforced.
    fn vtl1_protections_enabled(&self) -> bool;
}

/// Trait for access to TLB flush and lock machinery.
pub trait TlbFlushLockAccess {
    /// Flushes the entire TLB for all VPs for the given VTL.
    fn flush(&mut self, vtl: GuestVtl);

    /// Flushes the entire TLB for all VPs for all VTLs.
    fn flush_entire(&mut self);

    /// Causes the specified VTL on the current VP to wait on all TLB locks.
    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl);
}
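
// A combined usage sketch for the two traits above (hypothetical caller; real
// callers obtain the protector and the TLB-access handle from the active
// backing):
//
//     fn pin_overlay(
//         protector: &dyn ProtectIsolatedMemory,
//         tlb_access: &mut dyn TlbFlushLockAccess,
//         vtl: GuestVtl,
//         gpn: u64,
//         perms: HvMapGpaFlags,
//     ) -> Result<(), HvError> {
//         // Validate the page's current permissions, apply `perms`, and lock it.
//         protector.register_overlay_page(
//             vtl,
//             gpn,
//             GpnSource::GuestMemory,
//             HvMapGpaFlags::new(),
//             Some(perms),
//             tlb_access,
//         )?;
//         // ... use the overlay page ...
//         // Restore the previous permissions and drop the lock.
//         protector.unregister_overlay_page(vtl, gpn, tlb_access)
//     }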

/// A partially built partition. Used to allow querying partition capabilities
/// before fully instantiating the partition.
pub struct UhProtoPartition<'a> {
    params: UhPartitionNewParams<'a>,
    hcl: Hcl,
    guest_vsm_available: bool,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
}

impl<'a> UhProtoPartition<'a> {
    /// Creates a new prototype partition.
    ///
    /// `driver(cpu)` returns the driver to use for polling the sidecar device
    /// whose base CPU is `cpu`.
    pub fn new<T: SpawnDriver>(
        params: UhPartitionNewParams<'a>,
        driver: impl FnMut(u32) -> T,
    ) -> Result<Self, Error> {
        let hcl_isolation = match params.isolation {
            IsolationType::None => hcl::ioctl::IsolationType::None,
            IsolationType::Vbs => hcl::ioctl::IsolationType::Vbs,
            IsolationType::Snp => hcl::ioctl::IsolationType::Snp,
            IsolationType::Tdx => hcl::ioctl::IsolationType::Tdx,
        };

        // Try to open the sidecar device, if it is present.
        let sidecar = sidecar_client::SidecarClient::new(driver).map_err(Error::Sidecar)?;

        let hcl = Hcl::new(hcl_isolation, sidecar).map_err(Error::Hcl)?;

        // Set the hypercalls that this process will use.
        let mut allowed_hypercalls = vec![
            hvdef::HypercallCode::HvCallGetVpRegisters,
            hvdef::HypercallCode::HvCallSetVpRegisters,
            hvdef::HypercallCode::HvCallInstallIntercept,
            hvdef::HypercallCode::HvCallTranslateVirtualAddress,
            hvdef::HypercallCode::HvCallPostMessageDirect,
            hvdef::HypercallCode::HvCallSignalEventDirect,
            hvdef::HypercallCode::HvCallModifyVtlProtectionMask,
            hvdef::HypercallCode::HvCallTranslateVirtualAddressEx,
            hvdef::HypercallCode::HvCallCheckSparseGpaPageVtlAccess,
            hvdef::HypercallCode::HvCallAssertVirtualInterrupt,
            hvdef::HypercallCode::HvCallGetVpIndexFromApicId,
            hvdef::HypercallCode::HvCallAcceptGpaPages,
            hvdef::HypercallCode::HvCallModifySparseGpaPageHostVisibility,
        ];

        if params.isolation.is_hardware_isolated() {
            allowed_hypercalls.extend(vec![
                hvdef::HypercallCode::HvCallEnablePartitionVtl,
                hvdef::HypercallCode::HvCallRetargetDeviceInterrupt,
                hvdef::HypercallCode::HvCallEnableVpVtl,
            ]);
        }

        if params.use_mmio_hypercalls {
            allowed_hypercalls.extend(vec![
                hvdef::HypercallCode::HvCallMemoryMappedIoRead,
                hvdef::HypercallCode::HvCallMemoryMappedIoWrite,
            ]);
        }

        hcl.set_allowed_hypercalls(allowed_hypercalls.as_slice());

        set_vtl2_vsm_partition_config(&hcl)?;

        let guest_vsm_available = Self::check_guest_vsm_support(&hcl)?;

        #[cfg(guest_arch = "x86_64")]
        let cpuid = match params.isolation {
            IsolationType::Snp => cvm_cpuid::CpuidResultsIsolationType::Snp {
                cpuid_pages: params.cvm_cpuid_info.unwrap(),
                vtom: params.vtom.unwrap(),
                access_vsm: guest_vsm_available,
            }
            .build()
            .map_err(Error::CvmCpuid)?,

            IsolationType::Tdx => cvm_cpuid::CpuidResultsIsolationType::Tdx {
                topology: params.topology,
                vtom: params.vtom.unwrap(),
                access_vsm: guest_vsm_available,
            }
            .build()
            .map_err(Error::CvmCpuid)?,
            IsolationType::Vbs | IsolationType::None => Default::default(),
        };

        Ok(UhProtoPartition {
            hcl,
            params,
            guest_vsm_available,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
        })
    }

    /// Returns whether VSM support will be available to the guest.
    pub fn guest_vsm_available(&self) -> bool {
        self.guest_vsm_available
    }

    /// Returns a new Underhill partition.
    pub async fn build(
        self,
        late_params: UhLateParams<'_>,
    ) -> Result<(UhPartition, Vec<UhProcessorBox>), Error> {
        let Self {
            mut hcl,
            params,
            guest_vsm_available,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
        } = self;
        let isolation = params.isolation;
        let is_hardware_isolated = isolation.is_hardware_isolated();

        // Intercept debug exceptions.
        // On TDX, because all OpenHCL TDs today have the debug policy bit set,
        // OpenHCL registers for the intercepts itself. However, on non-TDX
        // platforms, the hypervisor installs the intercept on behalf of the
        // guest.
        if params.intercept_debug_exceptions {
            if !cfg!(feature = "gdb") {
                return Err(Error::InvalidDebugConfiguration);
            }

            cfg_if::cfg_if! {
                if #[cfg(guest_arch = "x86_64")] {
                    if isolation != IsolationType::Tdx {
                        let debug_exception_vector = 0x1;
                        hcl.register_intercept(
                            HvInterceptType::HvInterceptTypeException,
                            HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                            HvInterceptParameters::new_exception(debug_exception_vector),
                        )
                        .map_err(|err| Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err))?;
                    }
                } else {
                    return Err(Error::InvalidDebugConfiguration);
                }
            }
        }

        if !is_hardware_isolated {
            if cfg!(guest_arch = "x86_64") {
                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeX64Msr,
                    HV_INTERCEPT_ACCESS_MASK_READ_WRITE,
                    HvInterceptParameters::new_zeroed(),
                )
                .map_err(|err| {
                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64Msr, err)
                })?;

                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeX64ApicEoi,
                    HV_INTERCEPT_ACCESS_MASK_WRITE,
                    HvInterceptParameters::new_zeroed(),
                )
                .map_err(|err| {
                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64ApicEoi, err)
                })?;
            } else {
                if false {
                    todo!("AARCH64_TODO");
                }
            }
        }

        if isolation == IsolationType::Snp {
            // SNP VMs register for the #VC exception to support reflect-VC.
            hcl.register_intercept(
                HvInterceptType::HvInterceptTypeException,
                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                HvInterceptParameters::new_exception(0x1D),
            )
            .map_err(|err| {
                Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err)
            })?;

            // Get the register tweak bitmap from the secrets page.
            let mut bitmap = [0u8; 64];
            if let Some(secrets) = params.snp_secrets {
                bitmap.copy_from_slice(
                    &secrets
                        [REG_TWEAK_BITMAP_OFFSET..REG_TWEAK_BITMAP_OFFSET + REG_TWEAK_BITMAP_SIZE],
                );
            }
            hcl.set_snp_register_bitmap(bitmap);
        }

        // Do per-VP HCL initialization.
        hcl.add_vps(
            params.topology.vp_count(),
            late_params
                .cvm_params
                .as_ref()
                .map(|x| &x.private_dma_client),
        )
        .map_err(Error::Hcl)?;

        let vps: Vec<_> = params
            .topology
            .vps_arch()
            .map(|vp_info| {
                // TODO: determine CPU index, which in theory could be different
                // from the VP index, though this hasn't happened yet.
                let cpu_index = vp_info.base.vp_index.index();
                UhVpInner::new(cpu_index, vp_info)
            })
            .collect();

        // Enable support for VPCI devices if the hypervisor supports it.
        #[cfg(guest_arch = "x86_64")]
        let software_devices = {
            let res = if !is_hardware_isolated {
                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
                    HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                    HvInterceptParameters::new_zeroed(),
                )
            } else {
                Ok(())
            };
            match res {
                Ok(()) => Some(ApicSoftwareDevices::new(
                    params.topology.vps_arch().map(|vp| vp.apic_id).collect(),
                )),
                Err(HvError::InvalidParameter | HvError::AccessDenied) => None,
                Err(err) => {
                    return Err(Error::InstallIntercept(
                        HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
                        err,
                    ));
                }
            }
        };

        #[cfg(guest_arch = "aarch64")]
        let software_devices = None;

        #[cfg(guest_arch = "aarch64")]
        let caps = virt::aarch64::Aarch64PartitionCapabilities {};

        #[cfg(guest_arch = "x86_64")]
        let cpuid = UhPartition::construct_cpuid_results(
            cpuid,
            &late_params.cpuid,
            params.topology,
            isolation,
            params.hide_isolation,
        );

        #[cfg(guest_arch = "x86_64")]
        let caps = UhPartition::construct_capabilities(
            params.topology,
            &cpuid,
            isolation,
            params.hide_isolation,
        )
        .map_err(Error::Capabilities)?;

        if params.handle_synic && !matches!(isolation, IsolationType::Tdx) {
            // The hypervisor will manage the untrusted SINTs (or the whole
            // synic for non-hardware-isolated VMs), but some event ports
            // and message ports are implemented here. Register an intercept
            // to handle HvSignalEvent and HvPostMessage hypercalls when the
            // hypervisor doesn't recognize the connection ID.
            //
            // TDX manages this locally instead of through the hypervisor.
            hcl.register_intercept(
                HvInterceptType::HvInterceptTypeUnknownSynicConnection,
                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                HvInterceptParameters::new_zeroed(),
            )
            .expect("registering synic intercept cannot fail");
        }

        #[cfg(guest_arch = "x86_64")]
        let cvm_state = if is_hardware_isolated {
            Some(Self::construct_cvm_state(
                &params,
                late_params.cvm_params.unwrap(),
                &caps,
                guest_vsm_available,
            )?)
        } else {
            None
        };
        #[cfg(guest_arch = "aarch64")]
        let cvm_state = None;

        let backing_shared = BackingShared::new(
            isolation,
            &params,
            BackingSharedParams {
                cvm_state,
                #[cfg(guest_arch = "x86_64")]
                cpuid: &cpuid,
                hcl: &hcl,
                guest_vsm_available,
            },
        )?;

        let enter_modes = EnterModes::default();

        let partition = Arc::new(UhPartitionInner {
            hcl,
            vps,
            irq_routes: Default::default(),
            caps,
            enter_modes: Mutex::new(enter_modes),
            enter_modes_atomic: u8::from(hcl::protocol::EnterModes::from(enter_modes)).into(),
            gm: late_params.gm,
            vtl0_kernel_exec_gm: late_params.vtl0_kernel_exec_gm,
            vtl0_user_exec_gm: late_params.vtl0_user_exec_gm,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
            crash_notification_send: late_params.crash_notification_send,
            monitor_page: MonitorPage::new(),
            allocated_monitor_page: Mutex::new(None),
            software_devices,
            lower_vtl_memory_layout: params.lower_vtl_memory_layout.clone(),
            vmtime: late_params.vmtime.clone(),
            isolation,
            no_sidecar_hotplug: params.no_sidecar_hotplug.into(),
            use_mmio_hypercalls: params.use_mmio_hypercalls,
            backing_shared,
            #[cfg(guest_arch = "x86_64")]
            device_vector_table: RwLock::new(IrrBitmap::new(Default::default())),
            intercept_debug_exceptions: params.intercept_debug_exceptions,
            vmbus_relay: late_params.vmbus_relay,
        });

        if cfg!(guest_arch = "x86_64") {
            // Intercept all IOs unless opted out.
            partition.manage_io_port_intercept_region(0, !0, true);
        }

        let vps = params
            .topology
            .vps_arch()
            .map(|vp_info| UhProcessorBox {
                partition: partition.clone(),
                vp_info,
            })
            .collect();

        Ok((
            UhPartition {
                inner: partition.clone(),
                interrupt_targets: VtlArray::from_fn(|vtl| {
                    Arc::new(UhInterruptTarget {
                        partition: partition.clone(),
                        vtl: vtl.try_into().unwrap(),
                    })
                }),
            },
            vps,
        ))
    }
}

impl UhPartition {
    /// Gets the guest OS ID for VTL0.
    pub fn vtl0_guest_os_id(&self) -> Result<HvGuestOsId, Error> {
        // If Underhill is emulating the hypervisor interfaces, get this value
        // from the emulator. This happens when running under hardware isolation
        // or when configured for testing.
        let id = if let Some(hv) = self.inner.hv() {
            hv.guest_os_id(Vtl::Vtl0)
        } else {
            // Ask the hypervisor for this value.
            self.inner
                .hcl
                .get_guest_os_id(Vtl::Vtl0)
                .map_err(Error::Hcl)?
        };
        Ok(id)
    }

    /// Configures guest accesses to IO ports in `range` to go directly to the
    /// host.
    ///
    /// When the return value is dropped, the ports will be unregistered.
    pub fn register_host_io_port_fast_path(
        &self,
        range: RangeInclusive<u16>,
    ) -> HostIoPortFastPathHandle {
        // There is no way to provide a fast path for some hardware-isolated
        // VM architectures. The devices that do use this facility are not
        // enabled on hardware-isolated VMs.
        assert!(!self.inner.isolation.is_hardware_isolated());

        self.inner
            .manage_io_port_intercept_region(*range.start(), *range.end(), false);
        HostIoPortFastPathHandle {
            inner: Arc::downgrade(&self.inner),
            begin: *range.start(),
            end: *range.end(),
        }
    }
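
    // A usage sketch (hypothetical port range): the returned handle keeps the
    // fast path active, and dropping it re-registers the intercepts.
    //
    //     let com1 = partition.register_host_io_port_fast_path(0x3f8..=0x3ff);
    //     // ... guest accesses to these ports now go directly to the host ...
    //     drop(com1); // the ports are intercepted again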

    /// Enables or disables the PM timer assist.
    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
        self.inner.hcl.set_pm_timer_assist(port)
    }

    /// Sets guest memory protections for a DMA-backed overlay page, such as
    /// the monitor page.
    fn register_cvm_dma_overlay_page(
        &self,
        vtl: GuestVtl,
        gpn: u64,
        new_perms: HvMapGpaFlags,
    ) -> anyhow::Result<()> {
        // How the monitor page is protected depends on the isolation type of the VM.
        match &self.inner.backing_shared {
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
                .cvm
                .isolated_memory_protector
                .register_overlay_page(
                    vtl,
                    gpn,
                    // On a CVM, the monitor page is always DMA-allocated.
                    GpnSource::Dma,
                    HvMapGpaFlags::new(),
                    Some(new_perms),
                    &mut SnpBacked::tlb_flush_lock_access(
                        None,
                        self.inner.as_ref(),
                        snp_backed_shared,
                    ),
                )
                .map_err(|e| anyhow::anyhow!(e)),
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
                .cvm
                .isolated_memory_protector
                .register_overlay_page(
                    vtl,
                    gpn,
                    GpnSource::Dma,
                    HvMapGpaFlags::new(),
                    Some(new_perms),
                    &mut TdxBacked::tlb_flush_lock_access(
                        None,
                        self.inner.as_ref(),
                        tdx_backed_shared,
                    ),
                )
                .map_err(|e| anyhow::anyhow!(e)),
            BackingShared::Hypervisor(_) => {
                let _ = (vtl, gpn, new_perms);
                unreachable!()
            }
        }
    }

    /// Reverts guest memory protections for a DMA-backed overlay page, such as
    /// the monitor page.
    fn unregister_cvm_dma_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> anyhow::Result<()> {
        // How the monitor page is protected depends on the isolation type of the VM.
        match &self.inner.backing_shared {
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
                .cvm
                .isolated_memory_protector
                .unregister_overlay_page(
                    vtl,
                    gpn,
                    &mut SnpBacked::tlb_flush_lock_access(
                        None,
                        self.inner.as_ref(),
                        snp_backed_shared,
                    ),
                )
                .map_err(|e| anyhow::anyhow!(e)),
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
                .cvm
                .isolated_memory_protector
                .unregister_overlay_page(
                    vtl,
                    gpn,
                    &mut TdxBacked::tlb_flush_lock_access(
                        None,
                        self.inner.as_ref(),
                        tdx_backed_shared,
                    ),
                )
                .map_err(|e| anyhow::anyhow!(e)),
            BackingShared::Hypervisor(_) => {
                let _ = (vtl, gpn);
                unreachable!()
            }
        }
    }
}

impl UhProtoPartition<'_> {
    /// Returns whether Guest VSM is available to the guest. If so, for
    /// hardware CVMs, it is safe to expose Guest VSM support via cpuid.
    fn check_guest_vsm_support(hcl: &Hcl) -> Result<bool, Error> {
        #[cfg(guest_arch = "x86_64")]
        let privs = {
            let result = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES, 0);
            let num = result.eax as u64 | ((result.ebx as u64) << 32);
            hvdef::HvPartitionPrivilege::from(num)
        };

        #[cfg(guest_arch = "aarch64")]
        let privs = hcl.get_privileges_and_features_info().map_err(Error::Hcl)?;

        if !privs.access_vsm() {
            return Ok(false);
        }
        let guest_vsm_config = hcl.get_guest_vsm_partition_config().map_err(Error::Hcl)?;
        Ok(guest_vsm_config.maximum_vtl() >= u8::from(GuestVtl::Vtl1))
    }
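
    // The privilege computation above packs the 64-bit privilege mask from two
    // 32-bit CPUID registers: eax supplies bits 0..32 and ebx bits 32..64.
    // With illustrative values:
    //
    //     let (eax, ebx) = (0x0000_0002u32, 0x0000_0010u32);
    //     let num = eax as u64 | ((ebx as u64) << 32);
    //     assert_eq!(num, 0x0000_0010_0000_0002);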

    #[cfg(guest_arch = "x86_64")]
    /// Constructs partition-wide CVM state.
    fn construct_cvm_state(
        params: &UhPartitionNewParams<'_>,
        late_params: CvmLateParams,
        caps: &PartitionCapabilities,
        guest_vsm_available: bool,
    ) -> Result<UhCvmPartitionState, Error> {
        use vmcore::reference_time::ReferenceTimeSource;

        let vp_count = params.topology.vp_count() as usize;
        let vps = (0..vp_count)
            .map(|vp_index| UhCvmVpInner {
                tlb_lock_info: VtlArray::from_fn(|_| TlbLockInfo::new(vp_count)),
                vtl1_enable_called: Mutex::new(false),
                started: AtomicBool::new(vp_index == 0),
                hv_start_enable_vtl_vp: VtlArray::from_fn(|_| Mutex::new(None)),
            })
            .collect();
        let tlb_locked_vps =
            VtlArray::from_fn(|_| BitVec::repeat(false, vp_count).into_boxed_bitslice());

        let lapic = VtlArray::from_fn(|_| {
            LocalApicSet::builder()
                .x2apic_capable(caps.x2apic)
                .hyperv_enlightenments(true)
                .build()
        });

        let tsc_frequency = get_tsc_frequency(params.isolation)?;
        let ref_time = ReferenceTimeSource::new(TscReferenceTimeSource::new(tsc_frequency));

        // If we're emulating the APIC, then we also must emulate the hypervisor
        // enlightenments, since the hypervisor can't support enlightenments
        // without also providing an APIC.
        //
        // Additionally, TDX provides hardware APIC emulation but we still need
        // to emulate the hypervisor enlightenments.
        let hv = GlobalHv::new(hv1_emulator::hv::GlobalHvParams {
            max_vp_count: params.topology.vp_count(),
            vendor: caps.vendor,
            tsc_frequency,
            ref_time,
            is_ref_time_backed_by_tsc: true,
        });

        Ok(UhCvmPartitionState {
            vps_per_socket: params.topology.reserved_vps_per_socket(),
            tlb_locked_vps,
            vps,
            shared_memory: late_params.shared_gm,
            isolated_memory_protector: late_params.isolated_memory_protector,
            lapic,
            hv,
            guest_vsm: RwLock::new(GuestVsmState::from_availability(guest_vsm_available)),
            shared_dma_client: late_params.shared_dma_client,
            private_dma_client: late_params.private_dma_client,
            hide_isolation: params.hide_isolation,
        })
    }
}

impl UhPartition {
    #[cfg(guest_arch = "x86_64")]
    /// Constructs the set of cpuid results to show to the guest.
    fn construct_cpuid_results(
        cpuid: virt::CpuidLeafSet,
        initial_cpuid: &[CpuidLeaf],
        topology: &ProcessorTopology<vm_topology::processor::x86::X86Topology>,
        isolation: IsolationType,
        hide_isolation: bool,
    ) -> virt::CpuidLeafSet {
        let mut cpuid = cpuid.into_leaves();
        if isolation.is_hardware_isolated() {
            // Update the x2apic leaf based on the topology.
            let x2apic = match topology.apic_mode() {
                vm_topology::processor::x86::ApicMode::XApic => false,
                vm_topology::processor::x86::ApicMode::X2ApicSupported => true,
                vm_topology::processor::x86::ApicMode::X2ApicEnabled => true,
            };
            let ecx = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(x2apic);
            let ecx_mask = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(true);
            cpuid.push(
                CpuidLeaf::new(
                    x86defs::cpuid::CpuidFunction::VersionAndFeatures.0,
                    [0, 0, ecx.into(), 0],
                )
                .masked([0, 0, ecx_mask.into(), 0]),
            );

            // Get the hypervisor version from the host. This is just for
            // reporting purposes, so it is safe even if the hypervisor is not
            // trusted.
            let hv_version = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_VERSION, 0);

            // Perform final processing steps for synthetic leaves.
            hv1_emulator::cpuid::process_hv_cpuid_leaves(
                &mut cpuid,
                hide_isolation,
                [
                    hv_version.eax,
                    hv_version.ebx,
                    hv_version.ecx,
                    hv_version.edx,
                ],
            );
        }
        cpuid.extend(initial_cpuid);
        virt::CpuidLeafSet::new(cpuid)
    }

    #[cfg(guest_arch = "x86_64")]
    /// Computes the partition capabilities.
    fn construct_capabilities(
        topology: &ProcessorTopology,
        cpuid: &virt::CpuidLeafSet,
        isolation: IsolationType,
        hide_isolation: bool,
    ) -> Result<virt::x86::X86PartitionCapabilities, virt::x86::X86PartitionCapabilitiesError> {
        let mut native_cpuid_fn;
        let mut cvm_cpuid_fn;

        // Determine the method to get cpuid results for the guest when
        // computing partition capabilities.
        let cpuid_fn: &mut dyn FnMut(u32, u32) -> [u32; 4] = if isolation.is_hardware_isolated() {
            // Use the filtered CPUID to determine capabilities.
            cvm_cpuid_fn = move |leaf, sub_leaf| cpuid.result(leaf, sub_leaf, &[0, 0, 0, 0]);
            &mut cvm_cpuid_fn
        } else {
            // Just use the native cpuid.
            native_cpuid_fn = |leaf, sub_leaf| {
                let CpuidResult { eax, ebx, ecx, edx } = safe_intrinsics::cpuid(leaf, sub_leaf);
                cpuid.result(leaf, sub_leaf, &[eax, ebx, ecx, edx])
            };
            &mut native_cpuid_fn
        };

        // Compute and validate capabilities.
        let mut caps = virt::x86::X86PartitionCapabilities::from_cpuid(topology, cpuid_fn)?;
        match isolation {
            IsolationType::Tdx => {
                assert_eq!(caps.vtom.is_some(), !hide_isolation);
                // TDX 1.5 requires EFER.NXE to be set to 1, so set it at RESET/INIT.
                caps.nxe_forced_on = true;
            }
            IsolationType::Snp => {
                assert_eq!(caps.vtom.is_some(), !hide_isolation);
            }
            _ => {
                assert!(caps.vtom.is_none());
            }
        }

        Ok(caps)
    }
}
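
// The closure-selection pattern in `construct_capabilities`, in miniature:
// storage for both closures is declared up front so a single `&mut dyn FnMut`
// borrow can outlive the branch that initializes one of them (a sketch with
// made-up closures, not tied to cpuid):
//
//     fn select(double: bool, x: u32) -> u32 {
//         let mut double_fn;
//         let mut halve_fn;
//         let f: &mut dyn FnMut(u32) -> u32 = if double {
//             double_fn = |v| v * 2;
//             &mut double_fn
//         } else {
//             halve_fn = |v| v / 2;
//             &mut halve_fn
//         };
//         f(x)
//     }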

#[cfg(guest_arch = "x86_64")]
/// Gets the TSC frequency for the current platform.
fn get_tsc_frequency(isolation: IsolationType) -> Result<u64, Error> {
    // Always get the frequency from the hypervisor. It's believed that, as long
    // as the hypervisor is behaving, it will provide the most precise and accurate frequency.
    let msr = MsrDevice::new(0).map_err(Error::OpenMsr)?;
    let hv_frequency = msr
        .read_msr(hvdef::HV_X64_MSR_TSC_FREQUENCY)
        .map_err(Error::ReadTscFrequency)?;

    // Get the hardware-advertised frequency and validate that the
    // hypervisor frequency is not too far off.
    let hw_info = match isolation {
        IsolationType::Tdx => {
            // TDX provides the TSC frequency via cpuid.
            let max_function =
                safe_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VendorAndMaxFunction.0, 0)
                    .eax;

            if max_function < x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0 {
                return Err(Error::BadCpuidTsc);
            }
            let result = safe_intrinsics::cpuid(
                x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0,
                0,
            );
            let ratio_denom = result.eax;
            let ratio_num = result.ebx;
            let clock = result.ecx;
            if ratio_num == 0 || ratio_denom == 0 || clock == 0 {
                return Err(Error::BadCpuidTsc);
            }
            // TDX TSC is configurable in units of 25MHz, so allow up to 12.5MHz
            // error.
            let allowed_error = 12_500_000;
            Some((
                clock as u64 * ratio_num as u64 / ratio_denom as u64,
                allowed_error,
            ))
        }
        IsolationType::Snp => {
            // SNP currently does not provide the frequency.
            None
        }
        IsolationType::Vbs | IsolationType::None => None,
    };

    if let Some((hw_frequency, allowed_error)) = hw_info {
        // Don't allow the frequencies to be different by more than the hardware
        // precision.
        let delta = hw_frequency.abs_diff(hv_frequency);
        if delta > allowed_error {
            return Err(Error::TscFrequencyMismatch {
                hv: hv_frequency,
                hw: hw_frequency,
                allowed_error,
            });
        }
    }

    Ok(hv_frequency)
}
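
// A worked instance of the CPUID arithmetic above (illustrative values only):
// with a 25 MHz crystal (ecx = 25_000_000) and a ratio of 188/2 (ebx = 188,
// eax = 2), the derived frequency is 25_000_000 * 188 / 2 = 2_350_000_000 Hz,
// and a hypervisor-reported value within the 12.5 MHz tolerance passes:
//
//     assert_eq!(25_000_000u64 * 188 / 2, 2_350_000_000);
//     assert!(2_350_000_000u64.abs_diff(2_360_000_000) <= 12_500_000);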

impl UhPartitionInner {
    fn manage_io_port_intercept_region(&self, begin: u16, end: u16, active: bool) {
        if self.isolation.is_hardware_isolated() {
            return;
        }

        static SKIP_RANGE: AtomicBool = AtomicBool::new(false);

        let access_type_mask = if active {
            HV_INTERCEPT_ACCESS_MASK_READ_WRITE
        } else {
            HV_INTERCEPT_ACCESS_MASK_NONE
        };

        // Try to register the whole range at once.
        if !SKIP_RANGE.load(Ordering::Relaxed) {
            match self.hcl.register_intercept(
                HvInterceptType::HvInterceptTypeX64IoPortRange,
                access_type_mask,
                HvInterceptParameters::new_io_port_range(begin..=end),
            ) {
                Ok(()) => return,
                Err(HvError::InvalidParameter) => {
                    // Probably a build that doesn't support range wrapping yet.
                    // Don't try again.
                    SKIP_RANGE.store(true, Ordering::Relaxed);
                    tracing::warn!(
                        CVM_ALLOWED,
                        "old hypervisor build; using slow path for intercept ranges"
                    );
                }
                Err(err) => {
                    panic!("io port range registration failure: {err:?}");
                }
            }
        }

        // Fall back to registering one port at a time.
        for port in begin..=end {
            self.hcl
                .register_intercept(
                    HvInterceptType::HvInterceptTypeX64IoPort,
                    access_type_mask,
                    HvInterceptParameters::new_io_port(port),
                )
                .expect("registering io intercept cannot fail");
        }
    }

    fn is_gpa_lower_vtl_ram(&self, gpa: u64) -> bool {
        // TODO: this probably should reflect changes to the memory map via PAM
        // registers. Right now this isn't an issue because the relevant region,
        // VGA, is handled on the host.
        self.lower_vtl_memory_layout
            .ram()
            .iter()
            .any(|m| m.range.contains_addr(gpa))
    }

    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
        // TODO: this probably should reflect changes to the memory map via PAM
        // registers. Right now this isn't an issue because the relevant region,
        // VGA, is handled on the host.
        if self.is_gpa_lower_vtl_ram(gpa) {
            // The monitor page is protected against lower VTL writes.
            !write || self.monitor_page.gpa() != Some(gpa & !(HV_PAGE_SIZE - 1))
        } else {
            false
        }
    }
}
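
// The monitor-page comparison above page-aligns the GPA by clearing its low
// bits; with the 4 KiB HV_PAGE_SIZE this amounts to:
//
//     assert_eq!(0x1234_5678u64 & !(4096 - 1), 0x1234_5000);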

/// Handle returned by [`UhPartition::register_host_io_port_fast_path`].
///
/// When dropped, unregisters the IO ports so that they are no longer forwarded
/// to the host.
#[must_use]
pub struct HostIoPortFastPathHandle {
    inner: Weak<UhPartitionInner>,
    begin: u16,
    end: u16,
}

impl Drop for HostIoPortFastPathHandle {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.upgrade() {
            inner.manage_io_port_intercept_region(self.begin, self.end, true);
        }
    }
}

/// The application-level VTL crash data, which is not suited for putting
/// on the wire.
///
/// FUTURE: move/remove this to standardize across virt backends.
#[derive(Copy, Clone, Debug)]
pub struct VtlCrash {
    /// The VP that crashed.
    pub vp_index: VpIndex,
    /// The VTL that crashed.
    pub last_vtl: GuestVtl,
    /// The crash control information.
    pub control: GuestCrashCtl,
    /// The crash parameters.
    pub parameters: [u64; 5],
}

/// Validates that `flags` is a valid setting for VTL memory protection when
/// applied to VTL 1.
#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
fn validate_vtl_gpa_flags(
    flags: HvMapGpaFlags,
    mbec_enabled: bool,
    shadow_supervisor_stack_enabled: bool,
) -> bool {
    // Adjust is not allowed for VTL1.
    if flags.adjustable() {
        return false;
    }

    // KX must equal UX unless MBEC is enabled. KX && !UX is invalid.
    if flags.kernel_executable() != flags.user_executable() {
        if (flags.kernel_executable() && !flags.user_executable()) || !mbec_enabled {
            return false;
        }
    }

    // Read must be specified if anything else is specified.
    if flags.writable()
        || flags.kernel_executable()
        || flags.user_executable()
        || flags.supervisor_shadow_stack()
        || flags.paging_writability()
        || flags.verify_paging_writability()
    {
        if !flags.readable() {
            return false;
        }
    }

    // Supervisor shadow stack protection is invalid if shadow stacks are disabled
    // or if execute is not specified.
    if flags.supervisor_shadow_stack()
        && ((!flags.kernel_executable() && !flags.user_executable())
            || !shadow_supervisor_stack_enabled)
    {
        return false;
    }

    true
}
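
// Illustrative checks against the rules above (assuming the generated `with_*`
// setters on `HvMapGpaFlags` from `hvdef`):
//
//     let rw = HvMapGpaFlags::new().with_readable(true).with_writable(true);
//     assert!(validate_vtl_gpa_flags(rw, false, false));
//
//     // Write access without read access is rejected.
//     let w = HvMapGpaFlags::new().with_writable(true);
//     assert!(!validate_vtl_gpa_flags(w, false, false));
//
//     // UX without KX is only valid when MBEC is enabled.
//     let ux = HvMapGpaFlags::new().with_readable(true).with_user_executable(true);
//     assert!(!validate_vtl_gpa_flags(ux, false, false));
//     assert!(validate_vtl_gpa_flags(ux, true, false));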