virt_mshv_vtl/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Implementation of the Underhill hypervisor backend, which uses
//! `/dev/mshv_vtl` to interact with the Microsoft hypervisor while running in
//! VTL2.

#![cfg(all(guest_is_native, target_os = "linux"))]

mod devmsr;

cfg_if::cfg_if!(
    if #[cfg(guest_arch = "x86_64")] {
        mod cvm_cpuid;
        pub use processor::snp::SnpBacked;
        pub use processor::tdx::TdxBacked;
        use crate::processor::HardwareIsolatedBacking;
        pub use crate::processor::mshv::x64::HypervisorBackedX86 as HypervisorBacked;
        use crate::processor::mshv::x64::HypervisorBackedX86Shared as HypervisorBackedShared;
        use bitvec::prelude::BitArray;
        use bitvec::prelude::Lsb0;
        use devmsr::MsrDevice;
        use hv1_emulator::hv::ProcessorVtlHv;
        use processor::LapicState;
        use processor::snp::SnpBackedShared;
        use processor::tdx::TdxBackedShared;
        use std::arch::x86_64::CpuidResult;
        use virt::CpuidLeaf;
        use virt::state::StateElement;
        use virt::vp::MpState;
        /// Bitarray type for representing IRR bits in an x86-64 APIC.
        /// Each bit represents one of the 256 possible vectors.
        type IrrBitmap = BitArray<[u32; 8], Lsb0>;
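        // Illustrative sketch (not from the original source): the bitmap is
        // indexed directly by vector number, so marking vector 0x30 pending
        // is a single bit set:
        //
        //     let mut irr = IrrBitmap::default();
        //     irr.set(0x30, true);
        //     assert!(irr[0x30]);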
    } else if #[cfg(guest_arch = "aarch64")] {
        pub use crate::processor::mshv::arm64::HypervisorBackedArm64 as HypervisorBacked;
        use crate::processor::mshv::arm64::HypervisorBackedArm64Shared as HypervisorBackedShared;
    }
);

mod processor;
pub use processor::Backing;
pub use processor::UhProcessor;

use anyhow::Context as AnyhowContext;
use bitfield_struct::bitfield;
use bitvec::boxed::BitBox;
use bitvec::vec::BitVec;
use cvm_tracing::CVM_ALLOWED;
use guestmem::GuestMemory;
use guestmem::GuestMemoryBackingError;
use hcl::GuestVtl;
use hcl::ioctl::Hcl;
use hcl::ioctl::SetVsmPartitionConfigError;
use hv1_emulator::hv::GlobalHv;
use hv1_emulator::message_queues::MessageQueues;
use hv1_emulator::synic::GlobalSynic;
use hv1_emulator::synic::SintProxied;
use hv1_structs::VtlArray;
use hvdef::GuestCrashCtl;
use hvdef::HV_PAGE_SHIFT;
use hvdef::HV_PAGE_SIZE;
use hvdef::HV_PAGE_SIZE_USIZE;
use hvdef::HvError;
use hvdef::HvMapGpaFlags;
use hvdef::HvRegisterName;
use hvdef::HvRegisterVsmPartitionConfig;
use hvdef::HvRegisterVsmPartitionStatus;
use hvdef::Vtl;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_EXECUTE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_NONE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_READ_WRITE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_WRITE;
use hvdef::hypercall::HostVisibilityType;
use hvdef::hypercall::HvGuestOsId;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::HvInterceptParameters;
use hvdef::hypercall::HvInterceptType;
use inspect::Inspect;
use inspect::InspectMut;
use memory_range::MemoryRange;
use pal::unix::affinity;
use pal::unix::affinity::CpuSet;
use pal_async::driver::Driver;
use pal_async::driver::SpawnDriver;
use pal_uring::IdleControl;
use parking_lot::Mutex;
use parking_lot::RwLock;
use processor::BackingSharedParams;
use processor::SidecarExitReason;
use sidecar_client::NewSidecarClientError;
use std::ops::RangeInclusive;
use std::os::fd::AsRawFd;
use std::sync::Arc;
use std::sync::Weak;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::task::Waker;
use thiserror::Error;
use user_driver::DmaClient;
use virt::IsolationType;
use virt::PartitionCapabilities;
use virt::VpIndex;
use virt::X86Partition;
use virt::irqcon::IoApicRouting;
use virt::irqcon::MsiRequest;
use virt::x86::apic_software_device::ApicSoftwareDevices;
use virt_support_apic::LocalApicSet;
use vm_topology::memory::MemoryLayout;
use vm_topology::processor::ProcessorTopology;
use vm_topology::processor::TargetVpInfo;
use vmcore::monitor::MonitorPage;
use vmcore::reference_time::GetReferenceTime;
use vmcore::reference_time::ReferenceTimeResult;
use vmcore::reference_time::ReferenceTimeSource;
use vmcore::vmtime::VmTimeSource;
use x86defs::snp::REG_TWEAK_BITMAP_OFFSET;
use x86defs::snp::REG_TWEAK_BITMAP_SIZE;
use x86defs::tdx::TdCallResult;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// General error returned by operations.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum Error {
    #[error("hcl error")]
    Hcl(#[source] hcl::ioctl::Error),
    #[error("failed to open sidecar client")]
    Sidecar(#[source] NewSidecarClientError),
    #[error("failed to install {0:?} intercept: {1:?}")]
    InstallIntercept(HvInterceptType, HvError),
    #[error("failed to query hypervisor register {0:#x?}")]
    Register(HvRegisterName, #[source] HvError),
    #[error("failed to set vsm partition config register")]
    VsmPartitionConfig(#[source] SetVsmPartitionConfigError),
    #[error("failed to create virtual device")]
    NewDevice(#[source] virt::x86::apic_software_device::DeviceIdInUse),
    #[error("failed to create cpuid tables for cvm")]
    #[cfg(guest_arch = "x86_64")]
    CvmCpuid(#[source] cvm_cpuid::CpuidResultsError),
    #[error("failed to update hypercall msr")]
    UpdateHypercallMsr,
    #[error("failed to update reference tsc msr")]
    UpdateReferenceTsc,
    #[error("failed to map overlay page")]
    MapOverlay(#[source] std::io::Error),
    #[error("failed to allocate shared visibility pages for overlay")]
    AllocateSharedVisOverlay(#[source] anyhow::Error),
    #[error("failed to open msr device")]
    OpenMsr(#[source] std::io::Error),
    #[error("cpuid did not contain valid TSC frequency information")]
    BadCpuidTsc,
    #[error("failed to read tsc frequency")]
    ReadTscFrequency(#[source] std::io::Error),
    #[error(
        "tsc frequency mismatch between hypervisor ({hv}) and hardware ({hw}), exceeds allowed error {allowed_error}"
    )]
    TscFrequencyMismatch {
        hv: u64,
        hw: u64,
        allowed_error: u64,
    },
    #[error("failed to set tdx L2 controls: {0:?}")]
    FailedToSetL2Ctls(TdCallResult),
    #[error("debugging is configured but the binary does not have the gdb feature")]
    InvalidDebugConfiguration,
    #[error("failed to allocate TLB flush page")]
    AllocateTlbFlushPage(#[source] anyhow::Error),
    #[error("host does not support required cpu capabilities")]
    Capabilities(virt::PartitionCapabilitiesError),
}

/// Error revoking guest VSM.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum RevokeGuestVsmError {
    #[error("failed to set vsm config")]
    SetGuestVsmConfig(#[source] hcl::ioctl::SetGuestVsmConfigError),
    #[error("VTL 1 is already enabled")]
    Vtl1AlreadyEnabled,
}

/// Underhill partition.
#[derive(Inspect)]
pub struct UhPartition {
    #[inspect(flatten)]
    inner: Arc<UhPartitionInner>,
    // TODO: remove this extra indirection by refactoring some traits.
    #[inspect(skip)]
    interrupt_targets: VtlArray<Arc<UhInterruptTarget>, 2>,
}

/// Inner state of an Underhill partition, shared via `Arc`.
#[derive(Inspect)]
#[inspect(extra = "UhPartitionInner::inspect_extra")]
struct UhPartitionInner {
    #[inspect(skip)]
    hcl: Hcl,
    #[inspect(skip)] // inspected separately
    vps: Vec<UhVpInner>,
    irq_routes: virt::irqcon::IrqRoutes,
    caps: PartitionCapabilities,
    #[inspect(skip)] // handled in `inspect_extra`
    enter_modes: Mutex<EnterModes>,
    #[inspect(skip)]
    enter_modes_atomic: AtomicU8,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
    lower_vtl_memory_layout: MemoryLayout,
    gm: VtlArray<GuestMemory, 2>,
    vtl0_kernel_exec_gm: GuestMemory,
    vtl0_user_exec_gm: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    crash_notification_send: mesh::Sender<VtlCrash>,
    monitor_page: MonitorPage,
    #[inspect(skip)]
    allocated_monitor_page: Mutex<Option<user_driver::memory::MemoryBlock>>,
    software_devices: Option<ApicSoftwareDevices>,
    #[inspect(skip)]
    vmtime: VmTimeSource,
    isolation: IsolationType,
    #[inspect(with = "inspect::AtomicMut")]
    no_sidecar_hotplug: AtomicBool,
    use_mmio_hypercalls: bool,
    backing_shared: BackingShared,
    intercept_debug_exceptions: bool,
    #[cfg(guest_arch = "x86_64")]
    // N.B. For now there is only one device vector table, i.e., for VTL 0 only.
    #[inspect(hex, with = "|x| inspect::iter_by_index(x.read().into_inner())")]
    device_vector_table: RwLock<IrrBitmap>,
    vmbus_relay: bool,
}

#[derive(Inspect)]
#[inspect(untagged)]
enum BackingShared {
    Hypervisor(#[inspect(flatten)] HypervisorBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Snp(#[inspect(flatten)] SnpBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Tdx(#[inspect(flatten)] TdxBackedShared),
}

impl BackingShared {
    fn new(
        isolation: IsolationType,
        partition_params: &UhPartitionNewParams<'_>,
        backing_shared_params: BackingSharedParams<'_>,
    ) -> Result<BackingShared, Error> {
        Ok(match isolation {
            IsolationType::None | IsolationType::Vbs => {
                assert!(backing_shared_params.cvm_state.is_none());
                BackingShared::Hypervisor(HypervisorBackedShared::new(
                    partition_params,
                    backing_shared_params,
                )?)
            }
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Snp => BackingShared::Snp(SnpBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Tdx => BackingShared::Tdx(TdxBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(not(guest_arch = "x86_64"))]
            _ => unreachable!(),
        })
    }

    fn cvm_state(&self) -> Option<&UhCvmPartitionState> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => Some(cvm),
        }
    }

    fn untrusted_synic(&self) -> Option<&GlobalSynic> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(s) => s.untrusted_synic.as_ref(),
        }
    }
}

#[derive(InspectMut, Copy, Clone)]
struct EnterModes {
    #[inspect(mut)]
    first: EnterMode,
    #[inspect(mut)]
    second: EnterMode,
}

impl Default for EnterModes {
    fn default() -> Self {
        Self {
            first: EnterMode::Fast,
            second: EnterMode::IdleToVtl0,
        }
    }
}

impl From<EnterModes> for hcl::protocol::EnterModes {
    fn from(value: EnterModes) -> Self {
        Self::new()
            .with_first(value.first.into())
            .with_second(value.second.into())
    }
}
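
// Illustrative flow (not from the original source) of how `EnterModes` reaches
// the kernel interface: the modes are packed into a `u8` via the
// `hcl::protocol` bitfield and stored in `UhPartitionInner::enter_modes_atomic`
// (see `inspect_extra` below).
//
//     let modes = EnterModes::default(); // first: Fast, second: IdleToVtl0
//     let packed: u8 = hcl::protocol::EnterModes::from(modes).into();
//     enter_modes_atomic.store(packed, Ordering::Relaxed);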

#[derive(InspectMut, Copy, Clone)]
enum EnterMode {
    Fast,
    PlayIdle,
    IdleToVtl0,
}

impl From<EnterMode> for hcl::protocol::EnterMode {
    fn from(value: EnterMode) -> Self {
        match value {
            EnterMode::Fast => Self::FAST,
            EnterMode::PlayIdle => Self::PLAY_IDLE,
            EnterMode::IdleToVtl0 => Self::IDLE_TO_VTL0,
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
struct GuestVsmVpState {
    /// The pending event that VTL 1 wants to inject into VTL 0. Injected on
    /// next exit to VTL 0.
    #[inspect(with = "|x| x.as_ref().map(inspect::AsDebug)")]
    vtl0_exit_pending_event: Option<hvdef::HvX64PendingExceptionEvent>,
    reg_intercept: SecureRegisterInterceptState,
}

#[cfg(guest_arch = "x86_64")]
impl GuestVsmVpState {
    fn new() -> Self {
        GuestVsmVpState {
            vtl0_exit_pending_event: None,
            reg_intercept: Default::default(),
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
/// VP state for CVMs.
struct UhCvmVpState {
    // Allocation handle for direct overlays
    #[inspect(debug)]
    direct_overlay_handle: user_driver::memory::MemoryBlock,
    /// Used in VTL 2 exit code to determine which VTL to exit to.
    exit_vtl: GuestVtl,
    /// Hypervisor enlightenment emulator state.
    hv: VtlArray<ProcessorVtlHv, 2>,
    /// LAPIC state.
    lapics: VtlArray<LapicState, 2>,
    /// Guest VSM state for this vp. Some when VTL 1 is enabled.
    vtl1: Option<GuestVsmVpState>,
}

#[cfg(guest_arch = "x86_64")]
impl UhCvmVpState {
    /// Creates a new CVM VP state.
    pub(crate) fn new(
        cvm_partition: &UhCvmPartitionState,
        inner: &UhPartitionInner,
        vp_info: &TargetVpInfo,
        overlay_pages_required: usize,
    ) -> Result<Self, Error> {
        let direct_overlay_handle = cvm_partition
            .shared_dma_client
            .allocate_dma_buffer(overlay_pages_required * HV_PAGE_SIZE as usize)
            .map_err(Error::AllocateSharedVisOverlay)?;

        let apic_base = virt::vp::Apic::at_reset(&inner.caps, vp_info).apic_base;
        let lapics = VtlArray::from_fn(|vtl| {
            let apic_set = &cvm_partition.lapic[vtl];

            // The APIC is software-enabled after reset for secure VTLs, to
            // maintain compatibility with released versions of the secure
            // kernel.
            let mut lapic = apic_set.add_apic(vp_info, vtl == Vtl::Vtl1);
            // Initialize APIC base to match the reset VM state.
            lapic.set_apic_base(apic_base).unwrap();
            // Only the VTL 0 non-BSP LAPICs should be in the WaitForSipi state.
            let activity = if vtl == Vtl::Vtl0 && !vp_info.base.is_bsp() {
                MpState::WaitForSipi
            } else {
                MpState::Running
            };
            LapicState::new(lapic, activity)
        });

        let hv = VtlArray::from_fn(|vtl| cvm_partition.hv.add_vp(vp_info.base.vp_index, vtl));

        Ok(Self {
            direct_overlay_handle,
            exit_vtl: GuestVtl::Vtl0,
            hv,
            lapics,
            vtl1: None,
        })
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect, Default)]
#[inspect(hex)]
/// Configuration of VTL 1 registration for intercepts on certain registers
pub struct SecureRegisterInterceptState {
    #[inspect(with = "|&x| u64::from(x)")]
    intercept_control: hvdef::HvRegisterCrInterceptControl,
    cr0_mask: u64,
    cr4_mask: u64,
    // Writes to X86X_IA32_MSR_MISC_ENABLE are dropped, so this mask is only
    // used so that get_vp_register returns the value most recently written by
    // set_vp_register.
    ia32_misc_enable_mask: u64,
}

#[derive(Inspect)]
/// Partition-wide state for CVMs.
struct UhCvmPartitionState {
    #[cfg(guest_arch = "x86_64")]
    vps_per_socket: u32,
    /// VPs that have locked their TLB.
    #[inspect(
        with = "|arr| inspect::iter_by_index(arr.iter()).map_value(|bb| inspect::iter_by_index(bb.iter().map(|v| *v)))"
    )]
    tlb_locked_vps: VtlArray<BitBox<AtomicU64>, 2>,
    #[inspect(with = "inspect::iter_by_index")]
    vps: Vec<UhCvmVpInner>,
    shared_memory: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// The emulated local APIC set.
    lapic: VtlArray<LocalApicSet, 2>,
    /// The emulated hypervisor state.
    hv: GlobalHv<2>,
    /// Guest VSM state.
    guest_vsm: RwLock<GuestVsmState<CvmVtl1State>>,
    /// Dma client for shared visibility pages.
    shared_dma_client: Arc<dyn DmaClient>,
    /// Dma client for private visibility pages.
    private_dma_client: Arc<dyn DmaClient>,
    hide_isolation: bool,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl UhCvmPartitionState {
    fn vp_inner(&self, vp_index: u32) -> &UhCvmVpInner {
        &self.vps[vp_index as usize]
    }

    fn is_lower_vtl_startup_denied(&self) -> bool {
        matches!(
            *self.guest_vsm.read(),
            GuestVsmState::Enabled {
                vtl1: CvmVtl1State {
                    deny_lower_vtl_startup: true,
                    ..
                }
            }
        )
    }
}

#[derive(Inspect)]
/// Per-vp state for CVMs.
struct UhCvmVpInner {
    /// The current status of TLB locks
    tlb_lock_info: VtlArray<TlbLockInfo, 2>,
    /// Whether EnableVpVtl for VTL 1 has been called on this VP.
    vtl1_enable_called: Mutex<bool>,
    /// Whether the VP has been started via the StartVp hypercall.
    started: AtomicBool,
    /// Start context for StartVp and EnableVpVtl calls.
    #[inspect(with = "|arr| inspect::iter_by_index(arr.iter().map(|v| v.lock().is_some()))")]
    hv_start_enable_vtl_vp: VtlArray<Mutex<Option<Box<VpStartEnableVtl>>>, 2>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
#[derive(Inspect)]
#[inspect(tag = "guest_vsm_state")]
/// Partition-wide state for guest vsm.
enum GuestVsmState<T: Inspect> {
    NotPlatformSupported,
    NotGuestEnabled,
    Enabled {
        #[inspect(flatten)]
        vtl1: T,
    },
}

impl<T: Inspect> GuestVsmState<T> {
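    /// Creates the initial guest VSM state from whether the platform makes
    /// guest VSM available.
    ///
    /// A usage sketch (`guest_vsm_available` is an assumed caller-side value):
    ///
    /// ```ignore
    /// let vsm = GuestVsmState::<CvmVtl1State>::from_availability(guest_vsm_available);
    /// assert!(!matches!(vsm, GuestVsmState::Enabled { .. }));
    /// ```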
    pub fn from_availability(guest_vsm_available: bool) -> Self {
        if guest_vsm_available {
            GuestVsmState::NotGuestEnabled
        } else {
            GuestVsmState::NotPlatformSupported
        }
    }
}

#[derive(Inspect)]
struct CvmVtl1State {
    /// Whether VTL 1 has been enabled on any vp
    enabled_on_any_vp: bool,
    /// Whether guest memory should be zeroed before it resets.
    zero_memory_on_reset: bool,
    /// Whether a vp can be started or reset by a lower vtl.
    deny_lower_vtl_startup: bool,
    /// Whether Mode-Based Execution Control should be enforced on lower VTLs.
    pub mbec_enabled: bool,
    /// Whether shadow supervisor stack is enabled.
    pub shadow_supervisor_stack_enabled: bool,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_read_intercepts: BitBox<u64>,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_write_intercepts: BitBox<u64>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl CvmVtl1State {
    fn new(mbec_enabled: bool) -> Self {
        Self {
            enabled_on_any_vp: false,
            zero_memory_on_reset: false,
            deny_lower_vtl_startup: false,
            mbec_enabled,
            shadow_supervisor_stack_enabled: false,
            io_read_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
            io_write_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
        }
    }
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
struct TscReferenceTimeSource {
    tsc_scale: u64,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl TscReferenceTimeSource {
    fn new(tsc_frequency: u64) -> Self {
        TscReferenceTimeSource {
            tsc_scale: (((10_000_000_u128) << 64) / tsc_frequency as u128) as u64,
        }
    }
}
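
// A worked sketch of the fixed-point scaling above (not from the original
// source): with a 1 GHz TSC, `tsc_scale` is ((10_000_000 << 64) / 1e9), i.e.
// 2^64 / 100, so `(tsc * tsc_scale) >> 64` divides the TSC by 100, converting
// ticks to 100ns units:
//
//     let ts = TscReferenceTimeSource::new(1_000_000_000);
//     let tsc: u64 = 3_000_000_000; // 3 seconds of ticks at 1 GHz
//     let ref_time = ((ts.tsc_scale as u128 * tsc as u128) >> 64) as u64;
//     assert_eq!(ref_time, 30_000_000); // 3 seconds in 100ns units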

/// A time implementation based on TSC.
impl GetReferenceTime for TscReferenceTimeSource {
    fn now(&self) -> ReferenceTimeResult {
        #[cfg(guest_arch = "x86_64")]
        {
            let tsc = safe_intrinsics::rdtsc();
            let ref_time = ((self.tsc_scale as u128 * tsc as u128) >> 64) as u64;
            ReferenceTimeResult {
                ref_time,
                system_time: None,
            }
        }

        #[cfg(guest_arch = "aarch64")]
        {
            todo!("AARCH64_TODO");
        }
    }
}

impl virt::irqcon::ControlGic for UhPartitionInner {
    fn set_spi_irq(&self, irq_id: u32, high: bool) {
        if let Err(err) = self.hcl.request_interrupt(
            hvdef::HvInterruptControl::new()
                .with_arm64_asserted(high)
                .with_interrupt_type(hvdef::HvInterruptType::HvArm64InterruptTypeFixed),
            0,
            irq_id,
            GuestVtl::Vtl0,
        ) {
            tracelimit::warn_ratelimited!(
                error = &err as &dyn std::error::Error,
                irq = irq_id,
                asserted = high,
                "failed to request spi"
            );
        }
    }
}

impl virt::Aarch64Partition for UhPartition {
    fn control_gic(&self, vtl: Vtl) -> Arc<dyn virt::irqcon::ControlGic> {
        debug_assert!(vtl == Vtl::Vtl0);
        self.inner.clone()
    }
}

/// A wrapper around [`UhProcessor`] that is [`Send`].
///
/// This is used to instantiate the processor object on the correct thread,
/// since all lower VTL processor state accesses must occur from the same
/// processor at VTL2.
pub struct UhProcessorBox {
    partition: Arc<UhPartitionInner>,
    vp_info: TargetVpInfo,
}

impl UhProcessorBox {
    /// Returns the VP index.
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }

    /// Returns the base CPU that manages this processor, when it is a sidecar
    /// VP.
    pub fn sidecar_base_cpu(&self) -> Option<u32> {
        self.partition
            .hcl
            .sidecar_base_cpu(self.vp_info.base.vp_index.index())
    }

    /// Returns the processor object, bound to this thread.
    ///
    /// If `control` is provided, then this must be called on the VP's
    /// associated thread pool thread, and it will dispatch the VP directly.
    /// Otherwise, the returned object will control the VP via the sidecar
    /// kernel.
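    ///
    /// A minimal usage sketch (the driver, idle control, backing type, and
    /// error handling are assumed to come from the caller's thread pool
    /// setup):
    ///
    /// ```ignore
    /// // On the VP's thread pool thread:
    /// let vp = vp_box.bind_processor::<HypervisorBacked>(&driver, Some(&mut idle_control))?;
    /// ```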
    pub fn bind_processor<'a, T: Backing>(
        &'a mut self,
        driver: &impl Driver,
        control: Option<&'a mut IdleControl>,
    ) -> Result<UhProcessor<'a, T>, Error> {
        if let Some(control) = &control {
            let vp_index = self.vp_info.base.vp_index;

            let mut current = Default::default();
            affinity::get_current_thread_affinity(&mut current).unwrap();
            assert_eq!(&current, CpuSet::new().set(vp_index.index()));

            self.partition
                .hcl
                .set_poll_file(
                    self.partition.vp(vp_index).unwrap().cpu_index,
                    control.ring_fd().as_raw_fd(),
                )
                .map_err(Error::Hcl)?;
        }

        UhProcessor::new(driver, &self.partition, self.vp_info, control)
    }

    /// Sets the sidecar exit reason for the processor to record that a task
    /// with the given name is running.
    ///
    /// This is useful for diagnostics.
    pub fn set_sidecar_exit_due_to_task(&self, task: Arc<str>) {
        self.partition
            .vp(self.vp_info.base.vp_index)
            .unwrap()
            .set_sidecar_exit_reason(SidecarExitReason::TaskRequest(task))
    }
}

#[derive(Debug, Inspect)]
struct UhVpInner {
    /// 32 bits per VTL: top bits are VTL 1, bottom bits are VTL 0.
    wake_reasons: AtomicU64,
    #[inspect(skip)]
    waker: RwLock<Option<Waker>>,
    message_queues: VtlArray<MessageQueues, 2>,
    #[inspect(skip)]
    vp_info: TargetVpInfo,
    /// The Linux kernel's CPU index for this VP. This should be used instead of VpIndex
    /// when interacting with non-MSHV kernel interfaces.
    cpu_index: u32,
    sidecar_exit_reason: Mutex<Option<SidecarExitReason>>,
}

impl UhVpInner {
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// Which operation is setting the initial vp context.
enum InitialVpContextOperation {
    /// The VP is being started via the StartVp hypercall.
    StartVp,
    /// The VP is being started via the EnableVpVtl hypercall.
    EnableVpVtl,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// State for handling StartVp/EnableVpVtl hypercalls.
struct VpStartEnableVtl {
    /// Which operation, StartVp or EnableVpVtl, is setting the initial vp
    /// context.
    operation: InitialVpContextOperation,
    #[inspect(skip)]
    context: hvdef::hypercall::InitialVpContextX64,
}

#[derive(Debug, Inspect)]
struct TlbLockInfo {
    /// The set of VPs that are waiting for this VP to release the TLB lock.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocked_vps: BitBox<AtomicU64>,
    /// The set of VPs that are holding the TLB lock and preventing this VP
    /// from proceeding.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocking_vps: BitBox<AtomicU64>,
    /// The count of blocking VPs. This should always be equivalent to
    /// `blocking_vps.count_ones()`, however it is accessible in a single
    /// atomic operation while counting is not.
    blocking_vp_count: AtomicU32,
    /// Whether the VP is sleeping due to a TLB lock.
    sleeping: AtomicBool,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
impl TlbLockInfo {
    fn new(vp_count: usize) -> Self {
        Self {
            blocked_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vp_count: AtomicU32::new(0),
            sleeping: false.into(),
        }
    }
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct WakeReason {
    extint: bool,
    message_queues: bool,
    hv_start_enable_vtl_vp: bool,
    intcon: bool,
    update_proxy_irr_filter: bool,
    #[bits(27)]
    _reserved: u32,
}

impl WakeReason {
    // Convenient constants.
    const EXTINT: Self = Self::new().with_extint(true);
    const MESSAGE_QUEUES: Self = Self::new().with_message_queues(true);
    #[cfg(guest_arch = "x86_64")]
    const HV_START_ENABLE_VP_VTL: Self = Self::new().with_hv_start_enable_vtl_vp(true); // StartVp/EnableVpVtl handling
    const INTCON: Self = Self::new().with_intcon(true);
    #[cfg(guest_arch = "x86_64")]
    const UPDATE_PROXY_IRR_FILTER: Self = Self::new().with_update_proxy_irr_filter(true);
}
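
// Illustrative sketch (not from the original source) of how these constants
// land in `UhVpInner::wake_reasons`: the u64 packs two u32 halves, VTL 1 in
// the top bits and VTL 0 in the bottom bits, so queueing a VTL 1
// message-queue wake amounts to:
//
//     let bits = u64::from(u32::from(WakeReason::MESSAGE_QUEUES)) << 32;
//     wake_reasons.fetch_or(bits, Ordering::Relaxed);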
#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct ExitActivity {
    pending_event: bool,
    #[bits(31)]
    _reserved: u32,
}

/// Immutable access to useful bits of Partition state.
impl UhPartition {
    /// Revokes guest VSM.
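    ///
    /// Revocation only succeeds while the guest has not yet enabled VTL 1. A
    /// usage sketch:
    ///
    /// ```ignore
    /// match partition.revoke_guest_vsm() {
    ///     Ok(()) => {} // guest VSM is now reported as unsupported
    ///     Err(RevokeGuestVsmError::Vtl1AlreadyEnabled) => { /* too late to revoke */ }
    ///     Err(err) => return Err(err),
    /// }
    /// ```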
    pub fn revoke_guest_vsm(&self) -> Result<(), RevokeGuestVsmError> {
        fn revoke<T: Inspect>(vsm_state: &mut GuestVsmState<T>) -> Result<(), RevokeGuestVsmError> {
            if matches!(vsm_state, GuestVsmState::Enabled { .. }) {
                return Err(RevokeGuestVsmError::Vtl1AlreadyEnabled);
            }
            *vsm_state = GuestVsmState::NotPlatformSupported;
            Ok(())
        }

        match &self.inner.backing_shared {
            BackingShared::Hypervisor(s) => {
                revoke(&mut *s.guest_vsm.write())?;
                self.inner
                    .hcl
                    .set_guest_vsm_partition_config(false)
                    .map_err(RevokeGuestVsmError::SetGuestVsmConfig)?;
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                revoke(&mut *cvm.guest_vsm.write())?;
            }
        };

        Ok(())
    }

    /// Returns the current hypervisor reference time, in 100ns units.
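    ///
    /// For example, a returned value of `10_000_000` corresponds to one
    /// second of reference time.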
    pub fn reference_time(&self) -> u64 {
        if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().now().ref_time
        } else {
            self.inner
                .hcl
                .reference_time()
                .expect("should not fail to get the reference time")
        }
    }
}

impl virt::Partition for UhPartition {
    fn supports_reset(&self) -> Option<&dyn virt::ResetPartition<Error = Self::Error>> {
        None
    }

    fn caps(&self) -> &PartitionCapabilities {
        &self.inner.caps
    }

    fn request_msi(&self, vtl: Vtl, request: MsiRequest) {
        self.inner
            .request_msi(vtl.try_into().expect("higher vtl not configured"), request)
    }

    fn request_yield(&self, _vp_index: VpIndex) {
        unimplemented!()
    }
}

impl X86Partition for UhPartition {
    fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
        self.inner.clone()
    }

    fn pulse_lint(&self, vp_index: VpIndex, vtl: Vtl, lint: u8) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        if let Some(apic) = &self.inner.lapic(vtl) {
            apic.lint(vp_index, lint.into(), |vp_index| {
                self.inner
                    .vp(vp_index)
                    .unwrap()
                    .wake(vtl, WakeReason::INTCON);
            });
        } else if lint == 0 {
            self.inner
                .vp(vp_index)
                .unwrap()
                .wake(vtl, WakeReason::EXTINT);
        } else {
            unimplemented!()
        }
    }
}

impl UhPartitionInner {
    fn vp(&self, index: VpIndex) -> Option<&'_ UhVpInner> {
        self.vps.get(index.index() as usize)
    }

    fn lapic(&self, vtl: GuestVtl) -> Option<&LocalApicSet> {
        self.backing_shared.cvm_state().map(|x| &x.lapic[vtl])
    }

    fn hv(&self) -> Option<&GlobalHv<2>> {
        self.backing_shared.cvm_state().map(|x| &x.hv)
    }

    /// For requester VP to issue `proxy_irr_blocked` update to other VPs
    #[cfg(guest_arch = "x86_64")]
    fn request_proxy_irr_filter_update(
        &self,
        vtl: GuestVtl,
        device_vector: u8,
        req_vp_index: VpIndex,
    ) {
        tracing::debug!(
            ?vtl,
            device_vector,
            req_vp_index = req_vp_index.index(),
            "request_proxy_irr_filter_update"
        );

        // Add given vector to partition global device vector table (VTL0 only for now)
        {
            let mut device_vector_table = self.device_vector_table.write();
            device_vector_table.set(device_vector as usize, true);
        }

        // Wake all other VPs for their `proxy_irr_blocked` filter update
        for vp in self.vps.iter() {
            if vp.vp_index() != req_vp_index {
                vp.wake(vtl, WakeReason::UPDATE_PROXY_IRR_FILTER);
            }
        }
    }

    /// Get current partition global device irr vectors (VTL0 for now)
    #[cfg(guest_arch = "x86_64")]
    fn fill_device_vectors(&self, _vtl: GuestVtl, irr_vectors: &mut IrrBitmap) {
        let device_vector_table = self.device_vector_table.read();
        for idx in device_vector_table.iter_ones() {
            irr_vectors.set(idx, true);
        }
    }

    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        let mut wake_vps = false;
        resp.field_mut(
            "enter_modes",
            &mut inspect::adhoc_mut(|req| {
                let update = req.is_update();
                {
                    let mut modes = self.enter_modes.lock();
                    modes.inspect_mut(req);
                    if update {
                        self.enter_modes_atomic.store(
                            hcl::protocol::EnterModes::from(*modes).into(),
                            Ordering::Relaxed,
                        );
                        wake_vps = true;
                    }
                }
            }),
        );

        // Wake VPs to propagate updates.
        if wake_vps {
            for vp in self.vps.iter() {
                vp.wake_vtl2();
            }
        }
    }

    // TODO VBS GUEST VSM: enable for aarch64
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn vsm_status(&self) -> Result<HvRegisterVsmPartitionStatus, hcl::ioctl::Error> {
        // TODO: It might be possible to cache VsmPartitionStatus.
        self.hcl.get_vsm_partition_status()
    }
}

impl virt::Synic for UhPartition {
    fn post_message(&self, vtl: Vtl, vp_index: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let Some(vp) = self.inner.vp(vp_index) else {
            tracelimit::warn_ratelimited!(
                CVM_ALLOWED,
                vp = vp_index.index(),
                "invalid vp target for post_message"
            );
            return;
        };

        vp.post_message(
            vtl,
            sint,
            &hvdef::HvMessage::new(hvdef::HvMessageType(typ), 0, payload),
        );
    }

    fn new_guest_event_port(
        &self,
        vtl: Vtl,
        vp: u32,
        sint: u8,
        flag: u16,
    ) -> Box<dyn vmcore::synic::GuestEventPort> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        Box::new(UhEventPort {
            partition: Arc::downgrade(&self.inner),
            params: Arc::new(Mutex::new(UhEventPortParams {
                vp: VpIndex::new(vp),
                sint,
                flag,
                vtl,
            })),
        })
    }

    fn prefer_os_events(&self) -> bool {
        false
    }

    fn monitor_support(&self) -> Option<&dyn virt::SynicMonitor> {
        Some(self)
    }
}

impl virt::SynicMonitor for UhPartition {
    fn set_monitor_page(&self, vtl: Vtl, gpa: Option<u64>) -> anyhow::Result<()> {
        // Keep this locked for the whole function to avoid racing with allocate_monitor_page.
        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        let old_gpa = self.inner.monitor_page.set_gpa(gpa);

        // Take ownership of any allocated monitor page so it will be freed on function exit.
        let allocated_page = allocated_block.take();
        if let Some(old_gpa) = old_gpa {
            let allocated_gpa = allocated_page
                .as_ref()
                .map(|b| b.pfns()[0] << HV_PAGE_SHIFT);

            // Revert the old page's permissions, using the appropriate method depending on
            // whether it was allocated or guest-supplied.
            let result = if allocated_gpa == Some(old_gpa) {
                let vtl = GuestVtl::try_from(vtl).unwrap();
                self.unregister_cvm_dma_overlay_page(vtl, old_gpa >> HV_PAGE_SHIFT)
            } else {
                self.inner
                    .hcl
                    .modify_vtl_protection_mask(
                        MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                        hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                        HvInputVtl::CURRENT_VTL,
                    )
                    .map_err(|err| anyhow::anyhow!(err))
            };

            result
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        if let Some(gpa) = gpa {
            // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
            // permissions must be enabled or this doesn't work correctly.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(gpa..gpa + HV_PAGE_SIZE),
                    HvMapGpaFlags::new().with_readable(true),
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to register monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(gpa, "registered monitor page");
        }

        Ok(())
    }

    fn register_monitor(
        &self,
        monitor_id: vmcore::monitor::MonitorId,
        connection_id: u32,
    ) -> Box<dyn Sync + Send> {
        self.inner
            .monitor_page
            .register_monitor(monitor_id, connection_id)
    }

    fn allocate_monitor_page(&self, vtl: Vtl) -> anyhow::Result<Option<u64>> {
        let vtl = GuestVtl::try_from(vtl).unwrap();

        // Allocating a monitor page is only supported for CVMs.
        let Some(state) = self.inner.backing_shared.cvm_state() else {
            return Ok(None);
        };

        let mut allocated_block = self.inner.allocated_monitor_page.lock();
        if let Some(block) = allocated_block.as_ref() {
            // An allocated monitor page is already in use; no need to change it.
            let gpa = block.pfns()[0] << HV_PAGE_SHIFT;
            assert_eq!(self.inner.monitor_page.gpa(), Some(gpa));
            return Ok(Some(gpa));
        }

        let block = state
            .private_dma_client
            .allocate_dma_buffer(HV_PAGE_SIZE_USIZE)
            .context("failed to allocate monitor page")?;

        let gpn = block.pfns()[0];
        *allocated_block = Some(block);
        let gpa = gpn << HV_PAGE_SHIFT;
        let old_gpa = self.inner.monitor_page.set_gpa(Some(gpa));
        if let Some(old_gpa) = old_gpa {
            // The old GPA is guaranteed not to be allocated, since that was checked above, so
            // revert its permissions using the method for guest-supplied memory.
            self.inner
                .hcl
                .modify_vtl_protection_mask(
                    MemoryRange::new(old_gpa..old_gpa + HV_PAGE_SIZE),
                    hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                    HvInputVtl::CURRENT_VTL,
                )
                .context("failed to unregister old monitor page")
                .inspect_err(|_| {
                    // Leave the page unset if returning a failure.
                    self.inner.monitor_page.set_gpa(None);
                })?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
        // permissions must be enabled or this doesn't work correctly.
        self.register_cvm_dma_overlay_page(vtl, gpn, HvMapGpaFlags::new().with_readable(true))
            .context("failed to register monitor page")
            .inspect_err(|_| {
                // Leave the page unset if returning a failure.
                self.inner.monitor_page.set_gpa(None);
            })?;

        tracing::debug!(gpa, "registered allocated monitor page");

        Ok(Some(gpa))
    }
}

impl UhPartitionInner {
    #[cfg(guest_arch = "x86_64")]
    pub(crate) fn synic_interrupt(
        &self,
        vp_index: VpIndex,
        vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        // TODO CVM: optimize for SNP with secure avic to avoid internal wake
        // and for TDX to avoid trip to user mode
        move |vector, auto_eoi| {
            self.lapic(vtl).unwrap().synic_interrupt(
                vp_index,
                vector as u8,
                auto_eoi,
                |vp_index| self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON),
            );
        }
    }

    #[cfg(guest_arch = "aarch64")]
    fn synic_interrupt(
        &self,
        _vp_index: VpIndex,
        _vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        move |_, _| {}
    }
}

#[derive(Debug)]
struct UhEventPort {
    partition: Weak<UhPartitionInner>,
    params: Arc<Mutex<UhEventPortParams>>,
}

#[derive(Debug, Copy, Clone)]
struct UhEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
    vtl: GuestVtl,
}

impl vmcore::synic::GuestEventPort for UhEventPort {
    fn interrupt(&self) -> vmcore::interrupt::Interrupt {
        let partition = self.partition.clone();
        let params = self.params.clone();
        vmcore::interrupt::Interrupt::from_fn(move || {
            let UhEventPortParams {
                vp,
                sint,
                flag,
                vtl,
            } = *params.lock();
            let Some(partition) = partition.upgrade() else {
                return;
            };
            tracing::trace!(vp = vp.index(), sint, flag, "signal_event");
            if let Some(hv) = partition.hv() {
                match hv.synic[vtl].signal_event(
                    vp,
                    sint,
                    flag,
                    &mut partition.synic_interrupt(vp, vtl),
                ) {
                    Ok(_) => {}
                    Err(SintProxied) => {
                        tracing::trace!(
                            vp = vp.index(),
                            sint,
                            flag,
                            "forwarding event to untrusted synic"
                        );
                        if let Some(synic) = partition.backing_shared.untrusted_synic() {
                            synic
                                .signal_event(
                                    vp,
                                    sint,
                                    flag,
                                    &mut partition.synic_interrupt(vp, vtl),
                                )
                                .ok();
                        } else {
                            partition.hcl.signal_event_direct(vp.index(), sint, flag)
                        }
                    }
                }
            } else {
                partition.hcl.signal_event_direct(vp.index(), sint, flag);
            }
        })
    }

    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}

impl virt::Hv1 for UhPartition {
    type Error = Error;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        Some(if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().clone()
        } else {
            ReferenceTimeSource::from(self.inner.clone() as Arc<_>)
        })
    }

    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        self.inner.software_devices.is_some().then_some(self)
    }
}

impl GetReferenceTime for UhPartitionInner {
    fn now(&self) -> ReferenceTimeResult {
        ReferenceTimeResult {
            ref_time: self.hcl.reference_time().unwrap(),
            system_time: None,
        }
    }
}

impl virt::DeviceBuilder for UhPartition {
    fn build(&self, vtl: Vtl, device_id: u64) -> Result<Self::Device, Self::Error> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let device = self
            .inner
            .software_devices
            .as_ref()
            .expect("checked in new_virtual_device")
            .new_device(self.interrupt_targets[vtl].clone(), device_id)
            .map_err(Error::NewDevice)?;

        Ok(device)
    }
}

struct UhInterruptTarget {
    partition: Arc<UhPartitionInner>,
    vtl: GuestVtl,
}

impl pci_core::msi::MsiInterruptTarget for UhInterruptTarget {
    fn new_interrupt(&self) -> Box<dyn pci_core::msi::MsiControl> {
        let partition = self.partition.clone();
        let vtl = self.vtl;
        Box::new(move |address, data| partition.request_msi(vtl, MsiRequest { address, data }))
    }
}

impl UhPartitionInner {
    fn request_msi(&self, vtl: GuestVtl, request: MsiRequest) {
        if let Some(lapic) = self.lapic(vtl) {
            tracing::trace!(?request, "interrupt");
            lapic.request_interrupt(request.address, request.data, |vp_index| {
                self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON)
            });
        } else {
            let (address, data) = request.as_x86();
            if let Err(err) = self.hcl.request_interrupt(
                request.hv_x86_interrupt_control(),
                address.virt_destination().into(),
                data.vector().into(),
                vtl,
            ) {
                tracelimit::warn_ratelimited!(
                    CVM_ALLOWED,
                    error = &err as &dyn std::error::Error,
                    address = request.address,
                    data = request.data,
                    "failed to request msi"
                );
            }
        }
    }
}

impl IoApicRouting for UhPartitionInner {
    fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
        self.irq_routes.set_irq_route(irq, request)
    }

    // The IO-APIC is always hooked up to VTL0.
    fn assert_irq(&self, irq: u8) {
        self.irq_routes
            .assert_irq(irq, |request| self.request_msi(GuestVtl::Vtl0, request))
    }
}

/// Configure the [`hvdef::HvRegisterVsmPartitionConfig`] register with the
/// values used by Underhill.
fn set_vtl2_vsm_partition_config(hcl: &Hcl) -> Result<(), Error> {
    // Read available capabilities to determine what to enable.
    let caps = hcl.get_vsm_capabilities().map_err(Error::Hcl)?;
    let hardware_isolated = hcl.isolation().is_hardware_isolated();
    let isolated = hcl.isolation().is_isolated();

    let config = HvRegisterVsmPartitionConfig::new()
        .with_default_vtl_protection_mask(0xF)
        .with_enable_vtl_protection(!hardware_isolated)
        .with_zero_memory_on_reset(!hardware_isolated)
        .with_intercept_cpuid_unimplemented(!hardware_isolated)
        .with_intercept_page(caps.intercept_page_available())
        .with_intercept_unrecoverable_exception(true)
        .with_intercept_not_present(caps.intercept_not_present_available() && !isolated)
        .with_intercept_acceptance(isolated)
        .with_intercept_enable_vtl_protection(isolated && !hardware_isolated)
        .with_intercept_system_reset(caps.intercept_system_reset_available());

    hcl.set_vtl2_vsm_partition_config(config)
        .map_err(Error::VsmPartitionConfig)
}

/// Configuration parameters supplied to [`UhProtoPartition::new`].
///
/// These do not include runtime resources.
pub struct UhPartitionNewParams<'a> {
    /// The isolation type for the partition.
    pub isolation: IsolationType,
    /// Hide isolation from the guest. The guest will run as if it is not
    /// isolated.
    pub hide_isolation: bool,
    /// The memory layout for lower VTLs.
    pub lower_vtl_memory_layout: &'a MemoryLayout,
    /// The guest processor topology.
    pub topology: &'a ProcessorTopology,
    /// The unparsed CVM cpuid info.
    // TODO: move parsing up a layer.
    pub cvm_cpuid_info: Option<&'a [u8]>,
    /// The unparsed CVM secrets page.
    pub snp_secrets: Option<&'a [u8]>,
    /// The virtual top of memory for hardware-isolated VMs.
    ///
    /// Must be a power of two.
    pub vtom: Option<u64>,
    /// Handle synic messages and events.
    ///
    /// On TDX, this prevents the hypervisor from getting vmtdcall exits.
    pub handle_synic: bool,
    /// Do not hotplug sidecar VPs on their first exit. Just continue running
    /// the VP remotely.
    pub no_sidecar_hotplug: bool,
    /// Use MMIO access hypercalls.
    pub use_mmio_hypercalls: bool,
    /// Intercept guest debug exceptions to support gdbstub.
    pub intercept_debug_exceptions: bool,
}

/// Parameters to [`UhProtoPartition::build`].
pub struct UhLateParams<'a> {
    /// Guest memory for lower VTLs.
    pub gm: VtlArray<GuestMemory, 2>,
    /// Guest memory for VTL 0 kernel execute access.
    pub vtl0_kernel_exec_gm: GuestMemory,
    /// Guest memory for VTL 0 user execute access.
    pub vtl0_user_exec_gm: GuestMemory,
    /// The CPUID leaves to expose to the guest.
    #[cfg(guest_arch = "x86_64")]
    pub cpuid: Vec<CpuidLeaf>,
    /// The mesh sender to use for crash notifications.
    // FUTURE: remove mesh dependency from this layer.
    pub crash_notification_send: mesh::Sender<VtlCrash>,
    /// The VM time source.
    pub vmtime: &'a VmTimeSource,
    /// Parameters for CVMs only.
    pub cvm_params: Option<CvmLateParams>,
    /// Whether the vmbus relay is enabled and active for the partition.
    pub vmbus_relay: bool,
}

/// CVM-only parameters to [`UhProtoPartition::build`].
pub struct CvmLateParams {
    /// Guest memory for untrusted devices, like overlay pages.
    pub shared_gm: GuestMemory,
    /// An object to call to change host visibility on guest memory.
    pub isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// Dma client for shared visibility pages.
    pub shared_dma_client: Arc<dyn DmaClient>,
    /// Allocator for private visibility pages.
    pub private_dma_client: Arc<dyn DmaClient>,
}

/// Represents a GPN that is either in guest memory or was allocated by dma_client.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GpnSource {
    /// The GPN is in regular guest RAM.
    GuestMemory,
    /// The GPN was allocated by dma_client and is not in guest RAM.
    Dma,
}

/// Trait for CVM-related protections on guest memory.
pub trait ProtectIsolatedMemory: Send + Sync {
    /// Changes host visibility on guest memory.
    fn change_host_visibility(
        &self,
        vtl: GuestVtl,
        shared: bool,
        gpns: &[u64],
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), (HvError, usize)>;

    /// Queries host visibility on guest memory.
    fn query_host_visibility(
        &self,
        gpns: &[u64],
        host_visibility: &mut [HostVisibilityType],
    ) -> Result<(), (HvError, usize)>;

    /// Gets the default protections/permissions for VTL 0.
    fn default_vtl0_protections(&self) -> HvMapGpaFlags;

    /// Changes the default protections/permissions for a VTL. For VBS-isolated
    /// VMs, the protections apply to all vtls lower than the specified one. For
    /// hardware-isolated VMs, they apply just to the given vtl.
    fn change_default_vtl_protections(
        &self,
        target_vtl: GuestVtl,
        protections: HvMapGpaFlags,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Changes the vtl protections on a range of guest memory.
    fn change_vtl_protections(
        &self,
        target_vtl: GuestVtl,
        gpns: &[u64],
        protections: HvMapGpaFlags,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), (HvError, usize)>;

    /// Registers a page as an overlay page by first validating it has the
    /// required permissions, optionally modifying them, then locking them.
    fn register_overlay_page(
        &self,
        vtl: GuestVtl,
        gpn: u64,
        gpn_source: GpnSource,
        check_perms: HvMapGpaFlags,
        new_perms: Option<HvMapGpaFlags>,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Unregisters an overlay page, removing its permission lock and restoring
    /// the previous permissions.
    fn unregister_overlay_page(
        &self,
        vtl: GuestVtl,
        gpn: u64,
        tlb_access: &mut dyn TlbFlushLockAccess,
    ) -> Result<(), HvError>;

    /// Checks whether a page is currently registered as an overlay page.
    fn is_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> bool;

    /// Locks the permissions and mappings for a set of guest pages.
    fn lock_gpns(&self, vtl: GuestVtl, gpns: &[u64]) -> Result<(), GuestMemoryBackingError>;

    /// Unlocks the permissions and mappings for a set of guest pages.
    ///
    /// Panics if asked to unlock a page that was not previously locked. The
    /// caller must ensure that the given slice has the same ordering as the
    /// one passed to `lock_gpns`.
    fn unlock_gpns(&self, vtl: GuestVtl, gpns: &[u64]);

    /// Alerts the memory protector that vtl 1 is ready to set vtl protections
    /// on lower-vtl memory, and that these protections should be enforced.
    fn set_vtl1_protections_enabled(&self);

    /// Whether VTL 1 is prepared to modify vtl protections on lower-vtl memory,
    /// and therefore whether these protections should be enforced.
    fn vtl1_protections_enabled(&self) -> bool;
}
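
// Call-pattern sketch for the visibility APIs above (not from the original
// source; `protector`, `gpns`, and `tlb_access` are assumed caller-side
// values):
//
//     // Make the pages host-visible (shared); the protector uses
//     // `tlb_access` to flush and synchronize translations as needed.
//     protector
//         .change_host_visibility(GuestVtl::Vtl0, true, &gpns, tlb_access)
//         .map_err(|(err, _failed_index)| err)?;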
1513
1514/// Trait for access to TLB flush and lock machinery.
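///
/// An illustrative implementation sketch (not part of this crate): a no-op
/// backend for a hypothetical harness where TLB maintenance is unnecessary.
///
/// ```ignore
/// struct NoopTlbAccess;
///
/// impl TlbFlushLockAccess for NoopTlbAccess {
///     fn flush(&mut self, _vtl: GuestVtl) {}
///     fn flush_entire(&mut self) {}
///     fn set_wait_for_tlb_locks(&mut self, _vtl: GuestVtl) {}
/// }
/// ```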
1515pub trait TlbFlushLockAccess {
1516    /// Flush the entire TLB for all VPs for the given VTL.
1517    fn flush(&mut self, vtl: GuestVtl);
1518
1519    /// Flush the entire TLB for all VPs for all VTLs.
1520    fn flush_entire(&mut self);
1521
1522    /// Causes the specified VTL on the current VP to wait on all TLB locks.
1523    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl);
1524}
1525
1526/// A partially built partition. Used to allow querying partition capabilities
1527/// before fully instantiating the partition.
1528pub struct UhProtoPartition<'a> {
1529    params: UhPartitionNewParams<'a>,
1530    hcl: Hcl,
1531    guest_vsm_available: bool,
1532    #[cfg(guest_arch = "x86_64")]
1533    cpuid: virt::CpuidLeafSet,
1534}
1535
1536impl<'a> UhProtoPartition<'a> {
1537    /// Creates a new prototype partition.
1538    ///
1539    /// `driver(cpu)` returns the driver to use for polling the sidecar device
1540    /// whose base CPU is `cpu`.
1541    pub fn new<T: SpawnDriver>(
1542        params: UhPartitionNewParams<'a>,
1543        driver: impl FnMut(u32) -> T,
1544    ) -> Result<Self, Error> {
1545        let hcl_isolation = match params.isolation {
1546            IsolationType::None => hcl::ioctl::IsolationType::None,
1547            IsolationType::Vbs => hcl::ioctl::IsolationType::Vbs,
1548            IsolationType::Snp => hcl::ioctl::IsolationType::Snp,
1549            IsolationType::Tdx => hcl::ioctl::IsolationType::Tdx,
1550        };
1551
1552        // Try to open the sidecar device, if it is present.
1553        let sidecar = sidecar_client::SidecarClient::new(driver).map_err(Error::Sidecar)?;
1554
1555        let hcl = Hcl::new(hcl_isolation, sidecar).map_err(Error::Hcl)?;
1556
1557        // Set the hypercalls that this process will use.
1558        let mut allowed_hypercalls = vec![
1559            hvdef::HypercallCode::HvCallGetVpRegisters,
1560            hvdef::HypercallCode::HvCallSetVpRegisters,
1561            hvdef::HypercallCode::HvCallInstallIntercept,
1562            hvdef::HypercallCode::HvCallTranslateVirtualAddress,
1563            hvdef::HypercallCode::HvCallPostMessageDirect,
1564            hvdef::HypercallCode::HvCallSignalEventDirect,
1565            hvdef::HypercallCode::HvCallModifyVtlProtectionMask,
1566            hvdef::HypercallCode::HvCallTranslateVirtualAddressEx,
1567            hvdef::HypercallCode::HvCallCheckSparseGpaPageVtlAccess,
1568            hvdef::HypercallCode::HvCallAssertVirtualInterrupt,
1569            hvdef::HypercallCode::HvCallGetVpIndexFromApicId,
1570            hvdef::HypercallCode::HvCallAcceptGpaPages,
1571            hvdef::HypercallCode::HvCallModifySparseGpaPageHostVisibility,
1572        ];
1573
1574        if params.isolation.is_hardware_isolated() {
1575            allowed_hypercalls.extend(vec![
1576                hvdef::HypercallCode::HvCallEnablePartitionVtl,
1577                hvdef::HypercallCode::HvCallRetargetDeviceInterrupt,
1578                hvdef::HypercallCode::HvCallEnableVpVtl,
1579            ]);
1580        }
1581
1582        if params.use_mmio_hypercalls {
1583            allowed_hypercalls.extend(vec![
1584                hvdef::HypercallCode::HvCallMemoryMappedIoRead,
1585                hvdef::HypercallCode::HvCallMemoryMappedIoWrite,
1586            ]);
1587        }
1588
1589        hcl.set_allowed_hypercalls(allowed_hypercalls.as_slice());
1590
1591        set_vtl2_vsm_partition_config(&hcl)?;
1592
1593        let guest_vsm_available = Self::check_guest_vsm_support(&hcl)?;
1594
1595        #[cfg(guest_arch = "x86_64")]
1596        let cpuid = match params.isolation {
1597            IsolationType::Snp => cvm_cpuid::CpuidResultsIsolationType::Snp {
1598                cpuid_pages: params.cvm_cpuid_info.unwrap(),
1599                vtom: params.vtom.unwrap(),
1600                access_vsm: guest_vsm_available,
1601            }
1602            .build()
1603            .map_err(Error::CvmCpuid)?,
1604
1605            IsolationType::Tdx => cvm_cpuid::CpuidResultsIsolationType::Tdx {
1606                topology: params.topology,
1607                vtom: params.vtom.unwrap(),
1608                access_vsm: guest_vsm_available,
1609            }
1610            .build()
1611            .map_err(Error::CvmCpuid)?,
1612            IsolationType::Vbs | IsolationType::None => Default::default(),
1613        };
1614
1615        Ok(UhProtoPartition {
1616            hcl,
1617            params,
1618            guest_vsm_available,
1619            #[cfg(guest_arch = "x86_64")]
1620            cpuid,
1621        })
1622    }
1623
1624    /// Returns whether VSM support will be available to the guest.
1625    pub fn guest_vsm_available(&self) -> bool {
1626        self.guest_vsm_available
1627    }
1628
1629    /// Returns a new Underhill partition.
1630    pub async fn build(
1631        self,
1632        late_params: UhLateParams<'_>,
1633    ) -> Result<(UhPartition, Vec<UhProcessorBox>), Error> {
1634        let Self {
1635            mut hcl,
1636            params,
1637            guest_vsm_available,
1638            #[cfg(guest_arch = "x86_64")]
1639            cpuid,
1640        } = self;
1641        let isolation = params.isolation;
1642        let is_hardware_isolated = isolation.is_hardware_isolated();
1643
1644        // Intercept debug exceptions.
1645        // On TDX, because all OpenHCL TDs today have the debug policy bit set,
1646        // OpenHCL registers for the intercepts itself.
1647        // However, on non-TDX platforms the hypervisor installs the
1648        // intercept on behalf of the guest.
1649        if params.intercept_debug_exceptions {
1650            if !cfg!(feature = "gdb") {
1651                return Err(Error::InvalidDebugConfiguration);
1652            }
1653
1654            cfg_if::cfg_if! {
1655                if #[cfg(guest_arch = "x86_64")] {
1656                    if isolation != IsolationType::Tdx {
1657                        let debug_exception_vector = 0x1;
1658                        hcl.register_intercept(
1659                            HvInterceptType::HvInterceptTypeException,
1660                            HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1661                            HvInterceptParameters::new_exception(debug_exception_vector),
1662                        )
1663                        .map_err(|err| Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err))?;
1664                    }
1665                } else {
1666                    return Err(Error::InvalidDebugConfiguration);
1667                }
1668            }
1669        }
1670
1671        if !is_hardware_isolated {
1672            if cfg!(guest_arch = "x86_64") {
1673                hcl.register_intercept(
1674                    HvInterceptType::HvInterceptTypeX64Msr,
1675                    HV_INTERCEPT_ACCESS_MASK_READ_WRITE,
1676                    HvInterceptParameters::new_zeroed(),
1677                )
1678                .map_err(|err| {
1679                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64Msr, err)
1680                })?;
1681
1682                hcl.register_intercept(
1683                    HvInterceptType::HvInterceptTypeX64ApicEoi,
1684                    HV_INTERCEPT_ACCESS_MASK_WRITE,
1685                    HvInterceptParameters::new_zeroed(),
1686                )
1687                .map_err(|err| {
1688                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64ApicEoi, err)
1689                })?;
1690            } else {
1691                if false {
1692                    todo!("AARCH64_TODO");
1693                }
1694            }
1695        }
1696
1697        if isolation == IsolationType::Snp {
1698            // SNP VMs register for the #VC exception to support reflect-VC.
1699            hcl.register_intercept(
1700                HvInterceptType::HvInterceptTypeException,
1701                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1702                HvInterceptParameters::new_exception(0x1D),
1703            )
1704            .map_err(|err| {
1705                Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err)
1706            })?;
1707
1708            // Get the register tweak bitmap from secrets page.
1709            let mut bitmap = [0u8; 64];
1710            if let Some(secrets) = params.snp_secrets {
1711                bitmap.copy_from_slice(
1712                    &secrets
1713                        [REG_TWEAK_BITMAP_OFFSET..REG_TWEAK_BITMAP_OFFSET + REG_TWEAK_BITMAP_SIZE],
1714                );
1715            }
1716            hcl.set_snp_register_bitmap(bitmap);
1717        }
1718
1719        // Do per-VP HCL initialization.
1720        hcl.add_vps(
1721            params.topology.vp_count(),
1722            late_params
1723                .cvm_params
1724                .as_ref()
1725                .map(|x| &x.private_dma_client),
1726        )
1727        .map_err(Error::Hcl)?;
1728
1729        let vps: Vec<_> = params
1730            .topology
1731            .vps_arch()
1732            .map(|vp_info| {
1733                // TODO: determine CPU index, which in theory could be different
1734                // from the VP index, though this hasn't happened yet.
1735                let cpu_index = vp_info.base.vp_index.index();
1736                UhVpInner::new(cpu_index, vp_info)
1737            })
1738            .collect();
1739
1740        // Enable support for VPCI devices if the hypervisor supports it.
1741        #[cfg(guest_arch = "x86_64")]
1742        let software_devices = {
1743            let res = if !is_hardware_isolated {
1744                hcl.register_intercept(
1745                    HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
1746                    HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1747                    HvInterceptParameters::new_zeroed(),
1748                )
1749            } else {
1750                Ok(())
1751            };
1752            match res {
1753                Ok(()) => Some(ApicSoftwareDevices::new(
1754                    params.topology.vps_arch().map(|vp| vp.apic_id).collect(),
1755                )),
1756                Err(HvError::InvalidParameter | HvError::AccessDenied) => None,
1757                Err(err) => {
1758                    return Err(Error::InstallIntercept(
1759                        HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
1760                        err,
1761                    ));
1762                }
1763            }
1764        };
1765
1766        #[cfg(guest_arch = "aarch64")]
1767        let software_devices = None;
1768
1769        #[cfg(guest_arch = "aarch64")]
1770        let caps = virt::aarch64::Aarch64PartitionCapabilities {};
1771
1772        #[cfg(guest_arch = "x86_64")]
1773        let cpuid = UhPartition::construct_cpuid_results(
1774            cpuid,
1775            &late_params.cpuid,
1776            params.topology,
1777            isolation,
1778            params.hide_isolation,
1779        );
1780
1781        #[cfg(guest_arch = "x86_64")]
1782        let caps = UhPartition::construct_capabilities(
1783            params.topology,
1784            &cpuid,
1785            isolation,
1786            params.hide_isolation,
1787        )
1788        .map_err(Error::Capabilities)?;
1789
1790        if params.handle_synic && !matches!(isolation, IsolationType::Tdx) {
1791            // The hypervisor will manage the untrusted SINTs (or the whole
1792            // synic for non-hardware-isolated VMs), but some event ports
1793            // and message ports are implemented here. Register an intercept
1794            // to handle HvSignalEvent and HvPostMessage hypercalls when the
1795            // hypervisor doesn't recognize the connection ID.
1796            //
1797            // TDX manages this locally instead of through the hypervisor.
1798            hcl.register_intercept(
1799                HvInterceptType::HvInterceptTypeUnknownSynicConnection,
1800                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
1801                HvInterceptParameters::new_zeroed(),
1802            )
1803            .expect("registering synic intercept cannot fail");
1804        }
1805
1806        #[cfg(guest_arch = "x86_64")]
1807        let cvm_state = if is_hardware_isolated {
1808            Some(Self::construct_cvm_state(
1809                &params,
1810                late_params.cvm_params.unwrap(),
1811                &caps,
1812                guest_vsm_available,
1813            )?)
1814        } else {
1815            None
1816        };
1817        #[cfg(guest_arch = "aarch64")]
1818        let cvm_state = None;
1819
1820        let backing_shared = BackingShared::new(
1821            isolation,
1822            &params,
1823            BackingSharedParams {
1824                cvm_state,
1825                #[cfg(guest_arch = "x86_64")]
1826                cpuid: &cpuid,
1827                hcl: &hcl,
1828                guest_vsm_available,
1829            },
1830        )?;
1831
1832        let enter_modes = EnterModes::default();
1833
1834        let partition = Arc::new(UhPartitionInner {
1835            hcl,
1836            vps,
1837            irq_routes: Default::default(),
1838            caps,
1839            enter_modes: Mutex::new(enter_modes),
1840            enter_modes_atomic: u8::from(hcl::protocol::EnterModes::from(enter_modes)).into(),
1841            gm: late_params.gm,
1842            vtl0_kernel_exec_gm: late_params.vtl0_kernel_exec_gm,
1843            vtl0_user_exec_gm: late_params.vtl0_user_exec_gm,
1844            #[cfg(guest_arch = "x86_64")]
1845            cpuid,
1846            crash_notification_send: late_params.crash_notification_send,
1847            monitor_page: MonitorPage::new(),
1848            allocated_monitor_page: Mutex::new(None),
1849            software_devices,
1850            lower_vtl_memory_layout: params.lower_vtl_memory_layout.clone(),
1851            vmtime: late_params.vmtime.clone(),
1852            isolation,
1853            no_sidecar_hotplug: params.no_sidecar_hotplug.into(),
1854            use_mmio_hypercalls: params.use_mmio_hypercalls,
1855            backing_shared,
1856            #[cfg(guest_arch = "x86_64")]
1857            device_vector_table: RwLock::new(IrrBitmap::new(Default::default())),
1858            intercept_debug_exceptions: params.intercept_debug_exceptions,
1859            vmbus_relay: late_params.vmbus_relay,
1860        });
1861
1862        if cfg!(guest_arch = "x86_64") {
1863            // Intercept all IOs unless opted out.
1864            partition.manage_io_port_intercept_region(0, !0, true);
1865        }
1866
1867        let vps = params
1868            .topology
1869            .vps_arch()
1870            .map(|vp_info| UhProcessorBox {
1871                partition: partition.clone(),
1872                vp_info,
1873            })
1874            .collect();
1875
1876        Ok((
1877            UhPartition {
1878                inner: partition.clone(),
1879                interrupt_targets: VtlArray::from_fn(|vtl| {
1880                    Arc::new(UhInterruptTarget {
1881                        partition: partition.clone(),
1882                        vtl: vtl.try_into().unwrap(),
1883                    })
1884                }),
1885            },
1886            vps,
1887        ))
1888    }
1889}
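
// A condensed construction sketch (hypothetical driver and params, details
// elided) showing the two-phase pattern: query capabilities on the proto
// partition, then commit with `build`.
//
//     let proto = UhProtoPartition::new(params, |_cpu| driver.clone())?;
//     let guest_vsm = proto.guest_vsm_available();
//     let (partition, vps) = proto.build(late_params).await?;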
1890
1891impl UhPartition {
1892    /// Gets the guest OS ID for VTL0.
1893    pub fn vtl0_guest_os_id(&self) -> Result<HvGuestOsId, Error> {
1894        // If Underhill is emulating the hypervisor interfaces, get this value
1895        // from the emulator. This happens when running under hardware isolation
1896        // or when configured for testing.
1897        let id = if let Some(hv) = self.inner.hv() {
1898            hv.guest_os_id(Vtl::Vtl0)
1899        } else {
1900            // Ask the hypervisor for this value.
1901            self.inner
1902                .hcl
1903                .get_guest_os_id(Vtl::Vtl0)
1904                .map_err(Error::Hcl)?
1905        };
1906        Ok(id)
1907    }
1908
1909    /// Configures guest accesses to IO ports in `range` to go directly to the
1910    /// host.
1911    ///
1912    /// When the return value is dropped, the ports will be unregistered.
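    ///
    /// A minimal usage sketch (hypothetical caller; the COM1 range is just an
    /// illustration):
    ///
    /// ```ignore
    /// let handle = partition.register_host_io_port_fast_path(0x3f8..=0x3ff);
    /// // Guest accesses to ports 0x3f8-0x3ff now go directly to the host.
    /// drop(handle); // restores the intercepts
    /// ```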
1913    pub fn register_host_io_port_fast_path(
1914        &self,
1915        range: RangeInclusive<u16>,
1916    ) -> HostIoPortFastPathHandle {
1917        // There is no way to provide a fast path for some hardware-isolated
1918        // VM architectures. The devices that do use this facility are not
1919        // enabled on hardware-isolated VMs.
1920        assert!(!self.inner.isolation.is_hardware_isolated());
1921
1922        self.inner
1923            .manage_io_port_intercept_region(*range.start(), *range.end(), false);
1924        HostIoPortFastPathHandle {
1925            inner: Arc::downgrade(&self.inner),
1926            begin: *range.start(),
1927            end: *range.end(),
1928        }
1929    }
1930
1931    /// Triggers the LINT1 interrupt vector on the LAPIC of the BSP.
1932    pub fn assert_debug_interrupt(&self, _vtl: u8) {
1933        #[cfg(guest_arch = "x86_64")]
1934        const LINT_INDEX_1: u8 = 1;
1935        #[cfg(guest_arch = "x86_64")]
1936        match self.inner.isolation {
1937            IsolationType::Snp => {
1938                tracing::error!(?_vtl, "Debug interrupts cannot be injected into SNP VMs",);
1939            }
1940            _ => {
1941                let bsp_index = VpIndex::new(0);
1942                self.pulse_lint(bsp_index, Vtl::try_from(_vtl).unwrap(), LINT_INDEX_1)
1943            }
1944        }
1945    }
1946
1947    /// Enables or disables the PM timer assist.
1948    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
1949        self.inner.hcl.set_pm_timer_assist(port)
1950    }
1951
1952    /// Sets guest memory protections for a monitor page by registering it as an overlay page.
1953    fn register_cvm_dma_overlay_page(
1954        &self,
1955        vtl: GuestVtl,
1956        gpn: u64,
1957        new_perms: HvMapGpaFlags,
1958    ) -> anyhow::Result<()> {
1959        // How the monitor page is protected depends on the isolation type of the VM.
1960        match &self.inner.backing_shared {
1961            #[cfg(guest_arch = "x86_64")]
1962            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
1963                .cvm
1964                .isolated_memory_protector
1965                .register_overlay_page(
1966                    vtl,
1967                    gpn,
1968                    // On a CVM, the monitor page is always DMA-allocated.
1969                    GpnSource::Dma,
1970                    HvMapGpaFlags::new(),
1971                    Some(new_perms),
1972                    &mut SnpBacked::tlb_flush_lock_access(
1973                        None,
1974                        self.inner.as_ref(),
1975                        snp_backed_shared,
1976                    ),
1977                )
1978                .map_err(|e| anyhow::anyhow!(e)),
1979            #[cfg(guest_arch = "x86_64")]
1980            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
1981                .cvm
1982                .isolated_memory_protector
1983                .register_overlay_page(
1984                    vtl,
1985                    gpn,
1986                    GpnSource::Dma,
1987                    HvMapGpaFlags::new(),
1988                    Some(new_perms),
1989                    &mut TdxBacked::tlb_flush_lock_access(
1990                        None,
1991                        self.inner.as_ref(),
1992                        tdx_backed_shared,
1993                    ),
1994                )
1995                .map_err(|e| anyhow::anyhow!(e)),
1996            BackingShared::Hypervisor(_) => {
1997                let _ = (vtl, gpn, new_perms);
1998                unreachable!()
1999            }
2000        }
2001    }
2002
2003    /// Reverts guest memory protections for a monitor page by unregistering the overlay page.
2004    fn unregister_cvm_dma_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> anyhow::Result<()> {
2005        // How the monitor page is protected depends on the isolation type of the VM.
2006        match &self.inner.backing_shared {
2007            #[cfg(guest_arch = "x86_64")]
2008            BackingShared::Snp(snp_backed_shared) => snp_backed_shared
2009                .cvm
2010                .isolated_memory_protector
2011                .unregister_overlay_page(
2012                    vtl,
2013                    gpn,
2014                    &mut SnpBacked::tlb_flush_lock_access(
2015                        None,
2016                        self.inner.as_ref(),
2017                        snp_backed_shared,
2018                    ),
2019                )
2020                .map_err(|e| anyhow::anyhow!(e)),
2021            #[cfg(guest_arch = "x86_64")]
2022            BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
2023                .cvm
2024                .isolated_memory_protector
2025                .unregister_overlay_page(
2026                    vtl,
2027                    gpn,
2028                    &mut TdxBacked::tlb_flush_lock_access(
2029                        None,
2030                        self.inner.as_ref(),
2031                        tdx_backed_shared,
2032                    ),
2033                )
2034                .map_err(|e| anyhow::anyhow!(e)),
2035            BackingShared::Hypervisor(_) => {
2036                let _ = (vtl, gpn);
2037                unreachable!()
2038            }
2039        }
2040    }
2041}
2042
2043impl UhProtoPartition<'_> {
2044    /// Whether Guest VSM is available to the guest. If so, for hardware CVMs,
2045    /// it is safe to expose Guest VSM support via cpuid.
2046    fn check_guest_vsm_support(hcl: &Hcl) -> Result<bool, Error> {
2047        #[cfg(guest_arch = "x86_64")]
2048        let privs = {
2049            let result = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES, 0);
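            // The features leaf reports the partition privilege mask as a
            // 64-bit value split across eax (low 32 bits) and ebx (high 32
            // bits).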
2050            let num = result.eax as u64 | ((result.ebx as u64) << 32);
2051            hvdef::HvPartitionPrivilege::from(num)
2052        };
2053
2054        #[cfg(guest_arch = "aarch64")]
2055        let privs = hcl.get_privileges_and_features_info().map_err(Error::Hcl)?;
2056
2057        if !privs.access_vsm() {
2058            return Ok(false);
2059        }
2060        let guest_vsm_config = hcl.get_guest_vsm_partition_config().map_err(Error::Hcl)?;
2061        Ok(guest_vsm_config.maximum_vtl() >= u8::from(GuestVtl::Vtl1))
2062    }
2063
2064    #[cfg(guest_arch = "x86_64")]
2065    /// Constructs partition-wide CVM state.
2066    fn construct_cvm_state(
2067        params: &UhPartitionNewParams<'_>,
2068        late_params: CvmLateParams,
2069        caps: &PartitionCapabilities,
2070        guest_vsm_available: bool,
2071    ) -> Result<UhCvmPartitionState, Error> {
2072        use vmcore::reference_time::ReferenceTimeSource;
2073
2074        let vp_count = params.topology.vp_count() as usize;
2075        let vps = (0..vp_count)
2076            .map(|vp_index| UhCvmVpInner {
2077                tlb_lock_info: VtlArray::from_fn(|_| TlbLockInfo::new(vp_count)),
2078                vtl1_enable_called: Mutex::new(false),
2079                started: AtomicBool::new(vp_index == 0),
2080                hv_start_enable_vtl_vp: VtlArray::from_fn(|_| Mutex::new(None)),
2081            })
2082            .collect();
2083        let tlb_locked_vps =
2084            VtlArray::from_fn(|_| BitVec::repeat(false, vp_count).into_boxed_bitslice());
2085
2086        let lapic = VtlArray::from_fn(|_| {
2087            LocalApicSet::builder()
2088                .x2apic_capable(caps.x2apic)
2089                .hyperv_enlightenments(true)
2090                .build()
2091        });
2092
2093        let tsc_frequency = get_tsc_frequency(params.isolation)?;
2094        let ref_time = ReferenceTimeSource::new(TscReferenceTimeSource::new(tsc_frequency));
2095
2096        // If we're emulating the APIC, then we also must emulate the hypervisor
2097        // enlightenments, since the hypervisor can't support enlightenments
2098        // without also providing an APIC.
2099        //
2100        // Additionally, TDX provides hardware APIC emulation but we still need
2101        // to emulate the hypervisor enlightenments.
2102        let hv = GlobalHv::new(hv1_emulator::hv::GlobalHvParams {
2103            max_vp_count: params.topology.vp_count(),
2104            vendor: caps.vendor,
2105            tsc_frequency,
2106            ref_time,
2107            is_ref_time_backed_by_tsc: true,
2108        });
2109
2110        Ok(UhCvmPartitionState {
2111            vps_per_socket: params.topology.reserved_vps_per_socket(),
2112            tlb_locked_vps,
2113            vps,
2114            shared_memory: late_params.shared_gm,
2115            isolated_memory_protector: late_params.isolated_memory_protector,
2116            lapic,
2117            hv,
2118            guest_vsm: RwLock::new(GuestVsmState::from_availability(guest_vsm_available)),
2119            shared_dma_client: late_params.shared_dma_client,
2120            private_dma_client: late_params.private_dma_client,
2121            hide_isolation: params.hide_isolation,
2122        })
2123    }
2124}
2125
2126impl UhPartition {
2127    #[cfg(guest_arch = "x86_64")]
2128    /// Constructs the set of cpuid results to show to the guest.
2129    fn construct_cpuid_results(
2130        cpuid: virt::CpuidLeafSet,
2131        initial_cpuid: &[CpuidLeaf],
2132        topology: &ProcessorTopology<vm_topology::processor::x86::X86Topology>,
2133        isolation: IsolationType,
2134        hide_isolation: bool,
2135    ) -> virt::CpuidLeafSet {
2136        let mut cpuid = cpuid.into_leaves();
2137        if isolation.is_hardware_isolated() {
2138            // Update the x2apic leaf based on the topology.
2139            let x2apic = match topology.apic_mode() {
2140                vm_topology::processor::x86::ApicMode::XApic => false,
2141                vm_topology::processor::x86::ApicMode::X2ApicSupported => true,
2142                vm_topology::processor::x86::ApicMode::X2ApicEnabled => true,
2143            };
2144            let ecx = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(x2apic);
2145            let ecx_mask = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(true);
2146            cpuid.push(
2147                CpuidLeaf::new(
2148                    x86defs::cpuid::CpuidFunction::VersionAndFeatures.0,
2149                    [0, 0, ecx.into(), 0],
2150                )
2151                .masked([0, 0, ecx_mask.into(), 0]),
2152            );
2153
2154            // Get the hypervisor version from the host. This is just for
2155            // reporting purposes, so it is safe even if the hypervisor is not
2156            // trusted.
2157            let hv_version = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_VERSION, 0);
2158
2159            // Perform final processing steps for synthetic leaves.
2160            hv1_emulator::cpuid::process_hv_cpuid_leaves(
2161                &mut cpuid,
2162                hide_isolation,
2163                [
2164                    hv_version.eax,
2165                    hv_version.ebx,
2166                    hv_version.ecx,
2167                    hv_version.edx,
2168                ],
2169            );
2170        }
2171        cpuid.extend(initial_cpuid);
2172        virt::CpuidLeafSet::new(cpuid)
2173    }
2174
2175    #[cfg(guest_arch = "x86_64")]
2176    /// Computes the partition capabilities.
2177    fn construct_capabilities(
2178        topology: &ProcessorTopology,
2179        cpuid: &virt::CpuidLeafSet,
2180        isolation: IsolationType,
2181        hide_isolation: bool,
2182    ) -> Result<virt::x86::X86PartitionCapabilities, virt::x86::X86PartitionCapabilitiesError> {
2183        let mut native_cpuid_fn;
2184        let mut cvm_cpuid_fn;
2185
2186        // Determine the method to get cpuid results for the guest when
2187        // computing partition capabilities.
2188        let cpuid_fn: &mut dyn FnMut(u32, u32) -> [u32; 4] = if isolation.is_hardware_isolated() {
2189            // Use the filtered CPUID to determine capabilities.
2190            cvm_cpuid_fn = move |leaf, sub_leaf| cpuid.result(leaf, sub_leaf, &[0, 0, 0, 0]);
2191            &mut cvm_cpuid_fn
2192        } else {
2193            // Just use the native cpuid.
2194            native_cpuid_fn = |leaf, sub_leaf| {
2195                let CpuidResult { eax, ebx, ecx, edx } = safe_intrinsics::cpuid(leaf, sub_leaf);
2196                cpuid.result(leaf, sub_leaf, &[eax, ebx, ecx, edx])
2197            };
2198            &mut native_cpuid_fn
2199        };
2200
2201        // Compute and validate capabilities.
2202        let mut caps = virt::x86::X86PartitionCapabilities::from_cpuid(topology, cpuid_fn)?;
2203        match isolation {
2204            IsolationType::Tdx => {
2205                assert_eq!(caps.vtom.is_some(), !hide_isolation);
2206                // TDX 1.5 requires EFER.NXE to be set to 1, so set it at RESET/INIT.
2207                caps.nxe_forced_on = true;
2208            }
2209            IsolationType::Snp => {
2210                assert_eq!(caps.vtom.is_some(), !hide_isolation);
2211            }
2212            _ => {
2213                assert!(caps.vtom.is_none());
2214            }
2215        }
2216
2217        Ok(caps)
2218    }
2219}
2220
2221#[cfg(guest_arch = "x86_64")]
2222/// Gets the TSC frequency for the current platform.
2223fn get_tsc_frequency(isolation: IsolationType) -> Result<u64, Error> {
2224    // Always get the frequency from the hypervisor. It's believed that, as long
2225    // as the hypervisor is behaving, it will provide the most precise and accurate frequency.
2226    let msr = MsrDevice::new(0).map_err(Error::OpenMsr)?;
2227    let hv_frequency = msr
2228        .read_msr(hvdef::HV_X64_MSR_TSC_FREQUENCY)
2229        .map_err(Error::ReadTscFrequency)?;
2230
2231    // Get the hardware-advertised frequency and validate that the
2232    // hypervisor frequency is not too far off.
2233    let hw_info = match isolation {
2234        IsolationType::Tdx => {
2235            // TDX provides the TSC frequency via cpuid.
2236            let max_function =
2237                safe_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VendorAndMaxFunction.0, 0)
2238                    .eax;
2239
2240            if max_function < x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0 {
2241                return Err(Error::BadCpuidTsc);
2242            }
2243            let result = safe_intrinsics::cpuid(
2244                x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0,
2245                0,
2246            );
2247            let ratio_denom = result.eax;
2248            let ratio_num = result.ebx;
2249            let clock = result.ecx;
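            // TSC frequency = crystal clock * num / denom. For example, a
            // 25 MHz crystal (ecx = 25_000_000) with a 100/1 ratio (ebx/eax)
            // yields 25_000_000 * 100 / 1 = 2.5 GHz.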
2250            if ratio_num == 0 || ratio_denom == 0 || clock == 0 {
2251                return Err(Error::BadCpuidTsc);
2252            }
2253            // TDX TSC is configurable in units of 25 MHz, so allow up to
2254            // 12.5 MHz of error.
2255            let allowed_error = 12_500_000;
2256            Some((
2257                clock as u64 * ratio_num as u64 / ratio_denom as u64,
2258                allowed_error,
2259            ))
2260        }
2261        IsolationType::Snp => {
2262            // SNP currently does not provide the frequency.
2263            None
2264        }
2265        IsolationType::Vbs | IsolationType::None => None,
2266    };
2267
2268    if let Some((hw_frequency, allowed_error)) = hw_info {
2269        // Don't allow the frequencies to be different by more than the hardware
2270        // precision.
2271        let delta = hw_frequency.abs_diff(hv_frequency);
2272        if delta > allowed_error {
2273            return Err(Error::TscFrequencyMismatch {
2274                hv: hv_frequency,
2275                hw: hw_frequency,
2276                allowed_error,
2277            });
2278        }
2279    }
2280
2281    Ok(hv_frequency)
2282}
2283
2284impl UhPartitionInner {
2285    fn manage_io_port_intercept_region(&self, begin: u16, end: u16, active: bool) {
2286        if self.isolation.is_hardware_isolated() {
2287            return;
2288        }
2289
2290        static SKIP_RANGE: AtomicBool = AtomicBool::new(false);
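        // Relaxed ordering suffices: this flag is only a hint to skip a
        // hypercall that is known to fail, and it guards no other data.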
2291
2292        let access_type_mask = if active {
2293            HV_INTERCEPT_ACCESS_MASK_READ_WRITE
2294        } else {
2295            HV_INTERCEPT_ACCESS_MASK_NONE
2296        };
2297
2298        // Try to register the whole range at once.
2299        if !SKIP_RANGE.load(Ordering::Relaxed) {
2300            match self.hcl.register_intercept(
2301                HvInterceptType::HvInterceptTypeX64IoPortRange,
2302                access_type_mask,
2303                HvInterceptParameters::new_io_port_range(begin..=end),
2304            ) {
2305                Ok(()) => return,
2306                Err(HvError::InvalidParameter) => {
2307                    // Probably a build that doesn't support range wrapping yet.
2308                    // Don't try again.
2309                    SKIP_RANGE.store(true, Ordering::Relaxed);
2310                    tracing::warn!(
2311                        CVM_ALLOWED,
2312                        "old hypervisor build; using slow path for intercept ranges"
2313                    );
2314                }
2315                Err(err) => {
2316                    panic!("io port range registration failure: {err:?}");
2317                }
2318            }
2319        }
2320
2321        // Fall back to registering one port at a time.
2322        for port in begin..=end {
2323            self.hcl
2324                .register_intercept(
2325                    HvInterceptType::HvInterceptTypeX64IoPort,
2326                    access_type_mask,
2327                    HvInterceptParameters::new_io_port(port),
2328                )
2329                .expect("registering io intercept cannot fail");
2330        }
2331    }
2332
2333    fn is_gpa_lower_vtl_ram(&self, gpa: u64) -> bool {
2334        // TODO: this probably should reflect changes to the memory map via PAM
2335        // registers. Right now this isn't an issue because the relevant region,
2336        // VGA, is handled on the host.
2337        self.lower_vtl_memory_layout
2338            .ram()
2339            .iter()
2340            .any(|m| m.range.contains_addr(gpa))
2341    }
2342
2343    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
2344        // TODO: this probably should reflect changes to the memory map via PAM
2345        // registers. Right now this isn't an issue because the relevant region,
2346        // VGA, is handled on the host.
2347        if self.is_gpa_lower_vtl_ram(gpa) {
2348            // The monitor page is protected against lower VTL writes.
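            // `gpa & !(HV_PAGE_SIZE - 1)` rounds the address down to its page
            // base for comparison with the monitor page's GPA.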
2349            !write || self.monitor_page.gpa() != Some(gpa & !(HV_PAGE_SIZE - 1))
2350        } else {
2351            false
2352        }
2353    }
2354}
2355
2356/// Handle returned by [`UhPartition::register_host_io_port_fast_path`].
2357///
2358/// When dropped, unregisters the IO ports so that they are no longer forwarded
2359/// to the host.
2360#[must_use]
2361pub struct HostIoPortFastPathHandle {
2362    inner: Weak<UhPartitionInner>,
2363    begin: u16,
2364    end: u16,
2365}
2366
2367impl Drop for HostIoPortFastPathHandle {
2368    fn drop(&mut self) {
2369        if let Some(inner) = self.inner.upgrade() {
2370            inner.manage_io_port_intercept_region(self.begin, self.end, true);
2371        }
2372    }
2373}
2374
2375/// Application-level VTL crash data that is not suitable for putting
2376/// on the wire.
2377///
2378/// FUTURE: move/remove this to standardize across virt backends.
2379#[derive(Copy, Clone, Debug)]
2380pub struct VtlCrash {
2381    /// The VP that crashed.
2382    pub vp_index: VpIndex,
2383    /// The VTL that crashed.
2384    pub last_vtl: GuestVtl,
2385    /// The crash control information.
2386    pub control: GuestCrashCtl,
2387    /// The crash parameters.
2388    pub parameters: [u64; 5],
2389}
2390
2391/// Validates that `flags` is a valid setting for VTL memory protection when
2392/// applied to VTL 1.
2393#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
2394fn validate_vtl_gpa_flags(
2395    flags: HvMapGpaFlags,
2396    mbec_enabled: bool,
2397    shadow_supervisor_stack_enabled: bool,
2398) -> bool {
2399    // Adjust is not allowed for VTL1.
2400    if flags.adjustable() {
2401        return false;
2402    }
2403
2404    // KX must equal UX unless MBEC is enabled. KX && !UX is invalid.
2405    if flags.kernel_executable() != flags.user_executable() {
2406        if (flags.kernel_executable() && !flags.user_executable()) || !mbec_enabled {
2407            return false;
2408        }
2409    }
2410
2411    // Read must be specified if anything else is specified.
2412    if flags.writable()
2413        || flags.kernel_executable()
2414        || flags.user_executable()
2415        || flags.supervisor_shadow_stack()
2416        || flags.paging_writability()
2417        || flags.verify_paging_writability()
2418    {
2419        if !flags.readable() {
2420            return false;
2421        }
2422    }
2423
2424    // Supervisor shadow stack protection is invalid if shadow stacks are disabled
2425    // or if execute is not specified.
2426    if flags.supervisor_shadow_stack()
2427        && ((!flags.kernel_executable() && !flags.user_executable())
2428            || !shadow_supervisor_stack_enabled)
2429    {
2430        return false;
2431    }
2432
2433    true
2434}
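
// An illustrative exercise of the rules above (hypothetical checks, assuming
// the `with_*` setters generated for `HvMapGpaFlags`):
//
//     // RWX with KX == UX is valid even without MBEC.
//     let rwx = HvMapGpaFlags::new()
//         .with_readable(true)
//         .with_writable(true)
//         .with_kernel_executable(true)
//         .with_user_executable(true);
//     assert!(validate_vtl_gpa_flags(rwx, false, false));
//
//     // UX without KX is valid only when MBEC is enabled.
//     let ux_only = HvMapGpaFlags::new().with_readable(true).with_user_executable(true);
//     assert!(!validate_vtl_gpa_flags(ux_only, false, false));
//     assert!(validate_vtl_gpa_flags(ux_only, true, false));
//
//     // Write (or any other flag) without read is always invalid.
//     assert!(!validate_vtl_gpa_flags(HvMapGpaFlags::new().with_writable(true), true, true));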