// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Implementation of the Underhill hypervisor backend, which uses
//! `/dev/mshv_vtl` to interact with the Microsoft hypervisor while running in
//! VTL2.

#![cfg(target_os = "linux")]

mod devmsr;

cfg_if::cfg_if!(
    if #[cfg(target_arch = "x86_64")] { // xtask-fmt allow-target-arch sys-crate
        mod cvm_cpuid;
        pub use processor::mshv::x64::HypervisorBackedX86 as HypervisorBacked;
        pub use processor::snp::SnpBacked;
        pub use processor::tdx::TdxBacked;
        use bitvec::prelude::BitArray;
        use bitvec::prelude::Lsb0;
        use devmsr::MsrDevice;
        use hv1_emulator::hv::ProcessorVtlHv;
        use processor::HardwareIsolatedBacking;
        use processor::LapicState;
        use processor::mshv::x64::HypervisorBackedX86Shared as HypervisorBackedShared;
        use processor::snp::SnpBackedShared;
        use processor::tdx::TdxBackedShared;
        use std::arch::x86_64::CpuidResult;
        use virt::CpuidLeaf;
        use virt::state::StateElement;
        use virt::vp::MpState;
        /// Bitarray type for representing IRR bits in an x86-64 APIC.
        /// Each bit represents one of the 256 possible interrupt vectors.
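        ///
        /// A minimal usage sketch (illustrative only; assumes `bitvec`'s
        /// `BitSlice` indexing API):
        ///
        /// ```ignore
        /// let mut irr = IrrBitmap::default();
        /// irr.set(0x31, true); // mark vector 0x31 as pending
        /// assert!(irr[0x31]);
        /// ```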
        type IrrBitmap = BitArray<[u32; 8], Lsb0>;
    } else if #[cfg(target_arch = "aarch64")] { // xtask-fmt allow-target-arch sys-crate
        pub use processor::mshv::arm64::HypervisorBackedArm64 as HypervisorBacked;
        use processor::mshv::arm64::HypervisorBackedArm64Shared as HypervisorBackedShared;
    }
);

mod processor;
pub use processor::Backing;
pub use processor::UhProcessor;

use anyhow::Context as AnyhowContext;
use bitfield_struct::bitfield;
use bitvec::boxed::BitBox;
use bitvec::vec::BitVec;
use cvm_tracing::CVM_ALLOWED;
use guestmem::GuestMemory;
use guestmem::GuestMemoryBackingError;
use hcl::GuestVtl;
use hcl::ioctl::Hcl;
use hcl::ioctl::SetVsmPartitionConfigError;
use hv1_emulator::hv::GlobalHv;
use hv1_emulator::message_queues::MessageQueues;
use hv1_emulator::synic::GlobalSynic;
use hv1_emulator::synic::SintProxied;
use hv1_structs::VtlArray;
use hvdef::GuestCrashCtl;
use hvdef::HV_PAGE_SIZE;
use hvdef::HvError;
use hvdef::HvMapGpaFlags;
use hvdef::HvRegisterName;
use hvdef::HvRegisterVsmPartitionConfig;
use hvdef::HvRegisterVsmPartitionStatus;
use hvdef::Vtl;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_EXECUTE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_NONE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_READ_WRITE;
use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_WRITE;
use hvdef::hypercall::HostVisibilityType;
use hvdef::hypercall::HvGuestOsId;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::HvInterceptParameters;
use hvdef::hypercall::HvInterceptType;
use inspect::Inspect;
use inspect::InspectMut;
use memory_range::MemoryRange;
use pal::unix::affinity;
use pal::unix::affinity::CpuSet;
use pal_async::driver::Driver;
use pal_async::driver::SpawnDriver;
use pal_uring::IdleControl;
use parking_lot::Mutex;
use parking_lot::RwLock;
use processor::BackingSharedParams;
use processor::SidecarExitReason;
use sidecar_client::NewSidecarClientError;
use std::ops::RangeInclusive;
use std::os::fd::AsRawFd;
use std::sync::Arc;
use std::sync::Weak;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::task::Waker;
use thiserror::Error;
use user_driver::DmaClient;
use virt::IsolationType;
use virt::PartitionCapabilities;
use virt::VpIndex;
use virt::irqcon::IoApicRouting;
use virt::irqcon::MsiRequest;
use virt::x86::apic_software_device::ApicSoftwareDevices;
use virt_support_apic::LocalApicSet;
use vm_topology::memory::MemoryLayout;
use vm_topology::processor::ProcessorTopology;
use vm_topology::processor::TargetVpInfo;
use vmcore::monitor::MonitorPage;
use vmcore::reference_time::GetReferenceTime;
use vmcore::reference_time::ReferenceTimeResult;
use vmcore::reference_time::ReferenceTimeSource;
use vmcore::vmtime::VmTimeSource;
use x86defs::snp::REG_TWEAK_BITMAP_OFFSET;
use x86defs::snp::REG_TWEAK_BITMAP_SIZE;
use x86defs::tdx::TdCallResult;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// General error returned by operations.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum Error {
    #[error("hcl error")]
    Hcl(#[source] hcl::ioctl::Error),
    #[error("failed to open sidecar client")]
    Sidecar(#[source] NewSidecarClientError),
    #[error("failed to install {0:?} intercept: {1:?}")]
    InstallIntercept(HvInterceptType, HvError),
    #[error("failed to query hypervisor register {0:#x?}")]
    Register(HvRegisterName, #[source] HvError),
    #[error("failed to set vsm partition config register")]
    VsmPartitionConfig(#[source] SetVsmPartitionConfigError),
    #[error("failed to create virtual device")]
    NewDevice(#[source] virt::x86::apic_software_device::DeviceIdInUse),
    #[error("failed to create cpuid tables for cvm")]
    #[cfg(guest_arch = "x86_64")]
    CvmCpuid(#[source] cvm_cpuid::CpuidResultsError),
    #[error("failed to update hypercall msr")]
    UpdateHypercallMsr,
    #[error("failed to update reference tsc msr")]
    UpdateReferenceTsc,
    #[error("failed to map overlay page")]
    MapOverlay(#[source] std::io::Error),
    #[error("failed to allocate shared visibility pages for overlay")]
    AllocateSharedVisOverlay(#[source] anyhow::Error),
    #[error("failed to open msr device")]
    OpenMsr(#[source] std::io::Error),
    #[error("cpuid did not contain valid TSC frequency information")]
    BadCpuidTsc,
    #[error("failed to read tsc frequency")]
    ReadTscFrequency(#[source] std::io::Error),
    #[error(
        "tsc frequency mismatch between hypervisor ({hv}) and hardware ({hw}), exceeds allowed error ({allowed_error})"
    )]
    TscFrequencyMismatch {
        hv: u64,
        hw: u64,
        allowed_error: u64,
    },
    #[error("failed to set vsm partition config: {0:?}")]
    FailedToSetL2Ctls(TdCallResult),
    #[error("debugging is configured but the binary does not have the gdb feature")]
    InvalidDebugConfiguration,
    #[error("failed to allocate TLB flush page")]
    AllocateTlbFlushPage(#[source] anyhow::Error),
}

/// Error revoking guest VSM.
#[derive(Error, Debug)]
#[expect(missing_docs)]
pub enum RevokeGuestVsmError {
    #[error("failed to set vsm config")]
    SetGuestVsmConfig(#[source] hcl::ioctl::SetGuestVsmConfigError),
    #[error("VTL 1 is already enabled")]
    Vtl1AlreadyEnabled,
}

/// Underhill partition.
#[derive(Inspect)]
pub struct UhPartition {
    #[inspect(flatten)]
    inner: Arc<UhPartitionInner>,
    // TODO: remove this extra indirection by refactoring some traits.
    #[inspect(skip)]
    interrupt_targets: VtlArray<Arc<UhInterruptTarget>, 2>,
}

/// Inner state for an Underhill partition, shared between the partition and
/// its processors.
#[derive(Inspect)]
#[inspect(extra = "UhPartitionInner::inspect_extra")]
struct UhPartitionInner {
    #[inspect(skip)]
    hcl: Hcl,
    #[inspect(skip)] // inspected separately
    vps: Vec<UhVpInner>,
    irq_routes: virt::irqcon::IrqRoutes,
    caps: PartitionCapabilities,
    #[inspect(skip)] // handled in `inspect_extra`
    enter_modes: Mutex<EnterModes>,
    #[inspect(skip)]
    enter_modes_atomic: AtomicU8,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
    lower_vtl_memory_layout: MemoryLayout,
    gm: VtlArray<GuestMemory, 2>,
    vtl0_kernel_exec_gm: GuestMemory,
    vtl0_user_exec_gm: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    crash_notification_send: mesh::Sender<VtlCrash>,
    monitor_page: MonitorPage,
    software_devices: Option<ApicSoftwareDevices>,
    #[inspect(skip)]
    vmtime: VmTimeSource,
    isolation: IsolationType,
    #[inspect(with = "inspect::AtomicMut")]
    no_sidecar_hotplug: AtomicBool,
    use_mmio_hypercalls: bool,
    backing_shared: BackingShared,
    intercept_debug_exceptions: bool,
    #[cfg(guest_arch = "x86_64")]
    // N.B. For now there is only one device vector table, i.e. for VTL 0 only.
    #[inspect(hex, with = "|x| inspect::iter_by_index(x.read().into_inner())")]
    device_vector_table: RwLock<IrrBitmap>,
    vmbus_relay: bool,
}

#[derive(Inspect)]
#[inspect(untagged)]
enum BackingShared {
    Hypervisor(#[inspect(flatten)] HypervisorBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Snp(#[inspect(flatten)] SnpBackedShared),
    #[cfg(guest_arch = "x86_64")]
    Tdx(#[inspect(flatten)] TdxBackedShared),
}

impl BackingShared {
    fn new(
        isolation: IsolationType,
        partition_params: &UhPartitionNewParams<'_>,
        backing_shared_params: BackingSharedParams<'_>,
    ) -> Result<BackingShared, Error> {
        Ok(match isolation {
            IsolationType::None | IsolationType::Vbs => {
                assert!(backing_shared_params.cvm_state.is_none());
                BackingShared::Hypervisor(HypervisorBackedShared::new(
                    partition_params,
                    backing_shared_params,
                )?)
            }
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Snp => BackingShared::Snp(SnpBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(guest_arch = "x86_64")]
            IsolationType::Tdx => BackingShared::Tdx(TdxBackedShared::new(
                partition_params,
                backing_shared_params,
            )?),
            #[cfg(not(guest_arch = "x86_64"))]
            _ => unreachable!(),
        })
    }

    fn cvm_state(&self) -> Option<&UhCvmPartitionState> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => Some(cvm),
        }
    }

    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn guest_vsm_disabled(&self) -> bool {
        match self {
            BackingShared::Hypervisor(h) => {
                matches!(*h.guest_vsm.read(), GuestVsmState::NotPlatformSupported)
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                matches!(*cvm.guest_vsm.read(), GuestVsmState::NotPlatformSupported)
            }
        }
    }

    fn untrusted_synic(&self) -> Option<&GlobalSynic> {
        match self {
            BackingShared::Hypervisor(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(_) => None,
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Tdx(s) => s.untrusted_synic.as_ref(),
        }
    }
}

#[derive(InspectMut, Copy, Clone)]
struct EnterModes {
    #[inspect(mut)]
    first: EnterMode,
    #[inspect(mut)]
    second: EnterMode,
}

impl Default for EnterModes {
    fn default() -> Self {
        Self {
            first: EnterMode::Fast,
            second: EnterMode::IdleToVtl0,
        }
    }
}

impl From<EnterModes> for hcl::protocol::EnterModes {
    fn from(value: EnterModes) -> Self {
        Self::new()
            .with_first(value.first.into())
            .with_second(value.second.into())
    }
}

#[derive(InspectMut, Copy, Clone)]
enum EnterMode {
    Fast,
    PlayIdle,
    IdleToVtl0,
}

impl From<EnterMode> for hcl::protocol::EnterMode {
    fn from(value: EnterMode) -> Self {
        match value {
            EnterMode::Fast => Self::FAST,
            EnterMode::PlayIdle => Self::PLAY_IDLE,
            EnterMode::IdleToVtl0 => Self::IDLE_TO_VTL0,
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
struct GuestVsmVpState {
    /// The pending event that VTL 1 wants to inject into VTL 0. Injected on
    /// next exit to VTL 0.
    #[inspect(with = "|x| x.as_ref().map(inspect::AsDebug)")]
    vtl0_exit_pending_event: Option<hvdef::HvX64PendingExceptionEvent>,
    reg_intercept: SecureRegisterInterceptState,
}

#[cfg(guest_arch = "x86_64")]
impl GuestVsmVpState {
    fn new() -> Self {
        GuestVsmVpState {
            vtl0_exit_pending_event: None,
            reg_intercept: Default::default(),
        }
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect)]
/// VP state for CVMs.
struct UhCvmVpState {
    /// Allocation handle for direct overlays.
    #[inspect(debug)]
    direct_overlay_handle: user_driver::memory::MemoryBlock,
    /// Used in VTL 2 exit code to determine which VTL to exit to.
    exit_vtl: GuestVtl,
    /// Hypervisor enlightenment emulator state.
    hv: VtlArray<ProcessorVtlHv, 2>,
    /// LAPIC state.
    lapics: VtlArray<LapicState, 2>,
    /// Guest VSM state for this vp. Some when VTL 1 is enabled.
    vtl1: Option<GuestVsmVpState>,
}

#[cfg(guest_arch = "x86_64")]
impl UhCvmVpState {
    /// Creates a new CVM VP state.
    pub(crate) fn new(
        cvm_partition: &UhCvmPartitionState,
        inner: &UhPartitionInner,
        vp_info: &TargetVpInfo,
        overlay_pages_required: usize,
    ) -> Result<Self, Error> {
        let direct_overlay_handle = cvm_partition
            .shared_dma_client
            .allocate_dma_buffer(overlay_pages_required * HV_PAGE_SIZE as usize)
            .map_err(Error::AllocateSharedVisOverlay)?;

        let apic_base = virt::vp::Apic::at_reset(&inner.caps, vp_info).apic_base;
        let lapics = VtlArray::from_fn(|vtl| {
            let apic_set = &cvm_partition.lapic[vtl];

            // The APIC is software-enabled after reset for secure VTLs, to
            // maintain compatibility with released versions of the secure kernel.
            let mut lapic = apic_set.add_apic(vp_info, vtl == Vtl::Vtl1);
            // Initialize APIC base to match the reset VM state.
            lapic.set_apic_base(apic_base).unwrap();
            // Only the VTL 0 non-BSP LAPICs should be in the WaitForSipi state.
            let activity = if vtl == Vtl::Vtl0 && !vp_info.base.is_bsp() {
                MpState::WaitForSipi
            } else {
                MpState::Running
            };
            LapicState::new(lapic, activity)
        });

        let hv = VtlArray::from_fn(|vtl| cvm_partition.hv.add_vp(vp_info.base.vp_index, vtl));

        Ok(Self {
            direct_overlay_handle,
            exit_vtl: GuestVtl::Vtl0,
            hv,
            lapics,
            vtl1: None,
        })
    }
}

#[cfg(guest_arch = "x86_64")]
#[derive(Inspect, Default)]
#[inspect(hex)]
/// Configuration of VTL 1 registration for intercepts on certain registers
pub struct SecureRegisterInterceptState {
    #[inspect(with = "|&x| u64::from(x)")]
    intercept_control: hvdef::HvRegisterCrInterceptControl,
    cr0_mask: u64,
    cr4_mask: u64,
    // Writes to X86X_IA32_MSR_MISC_ENABLE are dropped, so this is only tracked
    // so that get_vp_register returns the value last written by set_vp_register.
    ia32_misc_enable_mask: u64,
}

#[derive(Inspect)]
/// Partition-wide state for CVMs.
struct UhCvmPartitionState {
    #[cfg(guest_arch = "x86_64")]
    vps_per_socket: u32,
    /// VPs that have locked their TLB.
    #[inspect(
        with = "|arr| inspect::iter_by_index(arr.iter()).map_value(|bb| inspect::iter_by_index(bb.iter().map(|v| *v)))"
    )]
    tlb_locked_vps: VtlArray<BitBox<AtomicU64>, 2>,
    #[inspect(with = "inspect::iter_by_index")]
    vps: Vec<UhCvmVpInner>,
    shared_memory: GuestMemory,
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    #[inspect(skip)]
    isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
    /// The emulated local APIC set.
    lapic: VtlArray<LocalApicSet, 2>,
    /// The emulated hypervisor state.
    hv: GlobalHv<2>,
    /// Guest VSM state.
    guest_vsm: RwLock<GuestVsmState<CvmVtl1State>>,
    /// Dma client for shared visibility pages.
    shared_dma_client: Arc<dyn DmaClient>,
    /// Dma client for private visibility pages.
    private_dma_client: Arc<dyn DmaClient>,
    hide_isolation: bool,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl UhCvmPartitionState {
    fn vp_inner(&self, vp_index: u32) -> &UhCvmVpInner {
        &self.vps[vp_index as usize]
    }

    fn is_lower_vtl_startup_denied(&self) -> bool {
        matches!(
            *self.guest_vsm.read(),
            GuestVsmState::Enabled {
                vtl1: CvmVtl1State {
                    deny_lower_vtl_startup: true,
                    ..
                }
            }
        )
    }
}

#[derive(Inspect)]
/// Per-vp state for CVMs.
struct UhCvmVpInner {
    /// The current status of TLB locks
    tlb_lock_info: VtlArray<TlbLockInfo, 2>,
    /// Whether EnableVpVtl for VTL 1 has been called on this VP.
    vtl1_enable_called: Mutex<bool>,
    /// Whether the VP has been started via the StartVp hypercall.
    started: AtomicBool,
    /// Start context for StartVp and EnableVpVtl calls.
    #[inspect(with = "|arr| inspect::iter_by_index(arr.iter().map(|v| v.lock().is_some()))")]
    hv_start_enable_vtl_vp: VtlArray<Mutex<Option<Box<VpStartEnableVtl>>>, 2>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
#[derive(Inspect)]
#[inspect(tag = "guest_vsm_state")]
/// Partition-wide state for guest vsm.
enum GuestVsmState<T: Inspect> {
    NotPlatformSupported,
    NotGuestEnabled,
    Enabled {
        #[inspect(flatten)]
        vtl1: T,
    },
}

impl<T: Inspect> GuestVsmState<T> {
    pub fn from_availability(guest_vsm_available: bool) -> Self {
        if guest_vsm_available {
            GuestVsmState::NotGuestEnabled
        } else {
            GuestVsmState::NotPlatformSupported
        }
    }
}
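
// Summary of the transitions for this state machine: `from_availability`
// chooses the initial state (`NotGuestEnabled` if the platform supports guest
// VSM, otherwise `NotPlatformSupported`); a guest's successful enablement of
// VTL 1 (handled elsewhere in this crate) moves it to `Enabled { vtl1 }`; and
// `UhPartition::revoke_guest_vsm` below moves a not-yet-enabled partition to
// `NotPlatformSupported`.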

#[derive(Inspect)]
struct CvmVtl1State {
    /// Whether VTL 1 has been enabled on any vp
    enabled_on_any_vp: bool,
    /// Whether guest memory should be zeroed before it resets.
    zero_memory_on_reset: bool,
    /// Whether a vp can be started or reset by a lower vtl.
    deny_lower_vtl_startup: bool,
    /// Whether Mode-Based Execution Control should be enforced on lower VTLs.
    pub mbec_enabled: bool,
    /// Whether shadow supervisor stack is enabled.
    pub shadow_supervisor_stack_enabled: bool,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_read_intercepts: BitBox<u64>,
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    io_write_intercepts: BitBox<u64>,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl CvmVtl1State {
    fn new(mbec_enabled: bool) -> Self {
        Self {
            enabled_on_any_vp: false,
            zero_memory_on_reset: false,
            deny_lower_vtl_startup: false,
            mbec_enabled,
            shadow_supervisor_stack_enabled: false,
            io_read_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
            io_write_intercepts: BitVec::repeat(false, u16::MAX as usize + 1).into_boxed_bitslice(),
        }
    }
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
struct TscReferenceTimeSource {
    tsc_scale: u64,
}

#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
impl TscReferenceTimeSource {
    fn new(tsc_frequency: u64) -> Self {
        TscReferenceTimeSource {
            tsc_scale: (((10_000_000_u128) << 64) / tsc_frequency as u128) as u64,
        }
    }
}
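
// Worked example of the 64.64 fixed-point conversion above: for a 2.5 GHz
// TSC, `tsc_scale` encodes 10^7 / 2.5e9 = 0.004 reference ticks per TSC
// tick. A TSC reading of 2_500_000_000 (one second) then yields
// (tsc_scale * tsc) >> 64 = 10_000_000, i.e. one second in 100ns units.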

/// A time implementation based on TSC.
impl GetReferenceTime for TscReferenceTimeSource {
    fn now(&self) -> ReferenceTimeResult {
        #[cfg(guest_arch = "x86_64")]
        {
            let tsc = safe_intrinsics::rdtsc();
            let ref_time = ((self.tsc_scale as u128 * tsc as u128) >> 64) as u64;
            ReferenceTimeResult {
                ref_time,
                system_time: None,
            }
        }

        #[cfg(guest_arch = "aarch64")]
        {
            todo!("AARCH64_TODO");
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::irqcon::ControlGic for UhPartitionInner {
    fn set_spi_irq(&self, irq_id: u32, high: bool) {
        if let Err(err) = self.hcl.request_interrupt(
            hvdef::HvInterruptControl::new()
                .with_arm64_asserted(high)
                .with_interrupt_type(hvdef::HvInterruptType::HvArm64InterruptTypeFixed),
            0,
            irq_id,
            GuestVtl::Vtl0,
        ) {
            tracelimit::warn_ratelimited!(
                error = &err as &dyn std::error::Error,
                irq = irq_id,
                asserted = high,
                "failed to request spi"
            );
        }
    }
}

#[cfg(guest_arch = "aarch64")]
impl virt::Aarch64Partition for UhPartition {
    fn control_gic(&self, vtl: Vtl) -> Arc<dyn virt::irqcon::ControlGic> {
        debug_assert!(vtl == Vtl::Vtl0);
        self.inner.clone()
    }
}

/// A wrapper around [`UhProcessor`] that is [`Send`].
///
/// This is used to instantiate the processor object on the correct thread,
/// since all lower VTL processor state accesses must occur from the same
/// processor at VTL2.
pub struct UhProcessorBox {
    partition: Arc<UhPartitionInner>,
    vp_info: TargetVpInfo,
}

impl UhProcessorBox {
    /// Returns the VP index.
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }

    /// Returns the base CPU that manages this processor, when it is a sidecar
    /// VP.
    pub fn sidecar_base_cpu(&self) -> Option<u32> {
        self.partition
            .hcl
            .sidecar_base_cpu(self.vp_info.base.vp_index.index())
    }

    /// Returns the processor object, bound to this thread.
    ///
    /// If `control` is provided, then this must be called on the VP's
    /// associated thread pool thread, and it will dispatch the VP directly.
    /// Otherwise, the VP will be controlled remotely via the sidecar
    /// kernel.
    pub fn bind_processor<'a, T: Backing>(
        &'a mut self,
        driver: &impl Driver,
        control: Option<&'a mut IdleControl>,
    ) -> Result<UhProcessor<'a, T>, Error> {
        if let Some(control) = &control {
            let vp_index = self.vp_info.base.vp_index;

            let mut current = Default::default();
            affinity::get_current_thread_affinity(&mut current).unwrap();
            assert_eq!(&current, CpuSet::new().set(vp_index.index()));

            self.partition
                .hcl
                .set_poll_file(
                    self.partition.vp(vp_index).unwrap().cpu_index,
                    control.ring_fd().as_raw_fd(),
                )
                .map_err(Error::Hcl)?;
        }

        UhProcessor::new(driver, &self.partition, self.vp_info, control)
    }

    /// Sets the sidecar exit reason for the processor, recording that it was
    /// removed due to a task running with the given name.
    ///
    /// This is useful for diagnostics.
    pub fn set_sidecar_exit_due_to_task(&self, task: Arc<str>) {
        self.partition
            .vp(self.vp_info.base.vp_index)
            .unwrap()
            .set_sidecar_exit_reason(SidecarExitReason::TaskRequest(task))
    }
}

#[derive(Debug, Inspect)]
struct UhVpInner {
    /// 32 bits per VTL: top bits are VTL 1, bottom bits are VTL 0.
    wake_reasons: AtomicU64,
    #[inspect(skip)]
    waker: RwLock<Option<Waker>>,
    message_queues: VtlArray<MessageQueues, 2>,
    #[inspect(skip)]
    vp_info: TargetVpInfo,
    /// The Linux kernel's CPU index for this VP. This should be used instead of VpIndex
    /// when interacting with non-MSHV kernel interfaces.
    cpu_index: u32,
    sidecar_exit_reason: Mutex<Option<SidecarExitReason>>,
}

impl UhVpInner {
    pub fn vp_index(&self) -> VpIndex {
        self.vp_info.base.vp_index
    }
}
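
// Sketch of how `wake_reasons` above is packed (illustrative only; the
// actual wake path lives in the processor code): each `WakeReason` below is
// a u32 bitfield, and the VTL selects which half of the `AtomicU64` it
// occupies.
//
//     let reason = u32::from(WakeReason::INTCON) as u64;
//     let shift = if vtl == GuestVtl::Vtl1 { 32 } else { 0 };
//     wake_reasons.fetch_or(reason << shift, Ordering::Relaxed);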

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// Which operation is setting the initial vp context
enum InitialVpContextOperation {
    /// The VP is being started via the StartVp hypercall.
    StartVp,
    /// The VP is being started via the EnableVpVtl hypercall.
    EnableVpVtl,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
#[derive(Debug, Inspect)]
/// State for handling StartVp/EnableVpVtl hypercalls.
struct VpStartEnableVtl {
    /// Which operation, StartVp or EnableVpVtl, is setting the initial vp
    /// context.
    operation: InitialVpContextOperation,
    #[inspect(skip)]
    context: hvdef::hypercall::InitialVpContextX64,
}

#[derive(Debug, Inspect)]
struct TlbLockInfo {
    /// The set of VPs that are waiting for this VP to release the TLB lock.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocked_vps: BitBox<AtomicU64>,
    /// The set of VPs that are holding the TLB lock and preventing this VP
    /// from proceeding.
    #[inspect(with = "|bb| inspect::iter_by_index(bb.iter().map(|v| *v))")]
    blocking_vps: BitBox<AtomicU64>,
    /// The count of blocking VPs. This should always be equivalent to
    /// `blocking_vps.count_ones()`, however it is accessible in a single
    /// atomic operation while counting is not.
    blocking_vp_count: AtomicU32,
    /// Whether the VP is sleeping due to a TLB lock.
    sleeping: AtomicBool,
}

#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
impl TlbLockInfo {
    fn new(vp_count: usize) -> Self {
        Self {
            blocked_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vps: BitVec::repeat(false, vp_count).into_boxed_bitslice(),
            blocking_vp_count: AtomicU32::new(0),
            sleeping: false.into(),
        }
    }
}
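
// Example of the bookkeeping above: if VP 3 must wait for VP 7's TLB lock,
// bit 3 is set in VP 7's `blocked_vps`, bit 7 is set in VP 3's
// `blocking_vps`, and VP 3's `blocking_vp_count` becomes 1. VP 3's
// `sleeping` flag is set while it waits; the wait ends when its
// `blocking_vp_count` drops back to zero. (The lock/unlock paths that
// maintain these fields live in the processor code.)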

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct WakeReason {
    extint: bool,
    message_queues: bool,
    hv_start_enable_vtl_vp: bool,
    intcon: bool,
    update_proxy_irr_filter: bool,
    #[bits(27)]
    _reserved: u32,
}

impl WakeReason {
    // Convenient constants.
    const EXTINT: Self = Self::new().with_extint(true);
    const MESSAGE_QUEUES: Self = Self::new().with_message_queues(true);
    #[cfg(guest_arch = "x86_64")]
    const HV_START_ENABLE_VP_VTL: Self = Self::new().with_hv_start_enable_vtl_vp(true); // StartVp/EnableVpVtl handling
    const INTCON: Self = Self::new().with_intcon(true);
    #[cfg(guest_arch = "x86_64")]
    const UPDATE_PROXY_IRR_FILTER: Self = Self::new().with_update_proxy_irr_filter(true);
}

#[bitfield(u32)]
#[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
struct ExitActivity {
    pending_event: bool,
    #[bits(31)]
    _reserved: u32,
}

/// Immutable access to useful bits of Partition state.
impl UhPartition {
    /// Revokes guest VSM.
    pub fn revoke_guest_vsm(&self) -> Result<(), RevokeGuestVsmError> {
        fn revoke<T: Inspect>(vsm_state: &mut GuestVsmState<T>) -> Result<(), RevokeGuestVsmError> {
            if matches!(vsm_state, GuestVsmState::Enabled { .. }) {
                return Err(RevokeGuestVsmError::Vtl1AlreadyEnabled);
            }
            *vsm_state = GuestVsmState::NotPlatformSupported;
            Ok(())
        }

        match &self.inner.backing_shared {
            BackingShared::Hypervisor(s) => {
                revoke(&mut *s.guest_vsm.write())?;
                self.inner
                    .hcl
                    .set_guest_vsm_partition_config(false)
                    .map_err(RevokeGuestVsmError::SetGuestVsmConfig)?;
            }
            #[cfg(guest_arch = "x86_64")]
            BackingShared::Snp(SnpBackedShared { cvm, .. })
            | BackingShared::Tdx(TdxBackedShared { cvm, .. }) => {
                revoke(&mut *cvm.guest_vsm.write())?;
            }
        };

        Ok(())
    }

    /// Returns the current hypervisor reference time, in 100ns units.
    pub fn reference_time(&self) -> u64 {
        if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().now().ref_time
        } else {
            self.inner
                .hcl
                .reference_time()
                .expect("should not fail to get the reference time")
        }
    }
}

impl virt::Partition for UhPartition {
    fn supports_reset(&self) -> Option<&dyn virt::ResetPartition<Error = Self::Error>> {
        None
    }

    fn caps(&self) -> &PartitionCapabilities {
        &self.inner.caps
    }

    fn request_msi(&self, vtl: Vtl, request: MsiRequest) {
        self.inner
            .request_msi(vtl.try_into().expect("higher vtl not configured"), request)
    }

    fn request_yield(&self, _vp_index: VpIndex) {
        unimplemented!()
    }
}

impl virt::X86Partition for UhPartition {
    fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
        self.inner.clone()
    }

    fn pulse_lint(&self, vp_index: VpIndex, vtl: Vtl, lint: u8) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        if let Some(apic) = &self.inner.lapic(vtl) {
            apic.lint(vp_index, lint.into(), |vp_index| {
                self.inner
                    .vp(vp_index)
                    .unwrap()
                    .wake(vtl, WakeReason::INTCON);
            });
        } else if lint == 0 {
            self.inner
                .vp(vp_index)
                .unwrap()
                .wake(vtl, WakeReason::EXTINT);
        } else {
            unimplemented!()
        }
    }
}

impl UhPartitionInner {
    fn vp(&self, index: VpIndex) -> Option<&'_ UhVpInner> {
        self.vps.get(index.index() as usize)
    }

    fn lapic(&self, vtl: GuestVtl) -> Option<&LocalApicSet> {
        self.backing_shared.cvm_state().map(|x| &x.lapic[vtl])
    }

    fn hv(&self) -> Option<&GlobalHv<2>> {
        self.backing_shared.cvm_state().map(|x| &x.hv)
    }

    /// Used by a requester VP to ask all other VPs to update their
    /// `proxy_irr_blocked` filters.
    #[cfg(guest_arch = "x86_64")]
    fn request_proxy_irr_filter_update(
        &self,
        vtl: GuestVtl,
        device_vector: u8,
        req_vp_index: VpIndex,
    ) {
        tracing::debug!(
            ?vtl,
            device_vector,
            req_vp_index = req_vp_index.index(),
            "request_proxy_irr_filter_update"
        );

        // Add given vector to partition global device vector table (VTL0 only for now)
        {
            let mut device_vector_table = self.device_vector_table.write();
            device_vector_table.set(device_vector as usize, true);
        }

        // Wake all other VPs for their `proxy_irr_blocked` filter update
        for vp in self.vps.iter() {
            if vp.vp_index() != req_vp_index {
                vp.wake(vtl, WakeReason::UPDATE_PROXY_IRR_FILTER);
            }
        }
    }

    /// Get current partition global device irr vectors (VTL0 for now)
    #[cfg(guest_arch = "x86_64")]
    fn fill_device_vectors(&self, _vtl: GuestVtl, irr_vectors: &mut IrrBitmap) {
        let device_vector_table = self.device_vector_table.read();
        for idx in device_vector_table.iter_ones() {
            irr_vectors.set(idx, true);
        }
    }

    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        let mut wake_vps = false;
        resp.field_mut(
            "enter_modes",
            &mut inspect::adhoc_mut(|req| {
                let update = req.is_update();
                {
                    let mut modes = self.enter_modes.lock();
                    modes.inspect_mut(req);
                    if update {
                        self.enter_modes_atomic.store(
                            hcl::protocol::EnterModes::from(*modes).into(),
                            Ordering::Relaxed,
                        );
                        wake_vps = true;
                    }
                }
            }),
        );

        // Wake VPs to propagate updates.
        if wake_vps {
            for vp in self.vps.iter() {
                vp.wake_vtl2();
            }
        }
    }

    // TODO VBS GUEST VSM: enable for aarch64
    #[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
    fn vsm_status(&self) -> Result<HvRegisterVsmPartitionStatus, hcl::ioctl::Error> {
        // TODO: It might be possible to cache VsmPartitionStatus.
        self.hcl.get_vsm_partition_status()
    }
}

impl virt::Synic for UhPartition {
    fn post_message(&self, vtl: Vtl, vp_index: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let Some(vp) = self.inner.vp(vp_index) else {
            tracelimit::warn_ratelimited!(
                CVM_ALLOWED,
                vp = vp_index.index(),
                "invalid vp target for post_message"
            );
            return;
        };

        vp.post_message(
            vtl,
            sint,
            &hvdef::HvMessage::new(hvdef::HvMessageType(typ), 0, payload),
        );
    }

    fn new_guest_event_port(
        &self,
        vtl: Vtl,
        vp: u32,
        sint: u8,
        flag: u16,
    ) -> Box<dyn vmcore::synic::GuestEventPort> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        Box::new(UhEventPort {
            partition: Arc::downgrade(&self.inner),
            params: Arc::new(Mutex::new(UhEventPortParams {
                vp: VpIndex::new(vp),
                sint,
                flag,
                vtl,
            })),
        })
    }

    fn prefer_os_events(&self) -> bool {
        false
    }

    fn monitor_support(&self) -> Option<&dyn virt::SynicMonitor> {
        Some(self)
    }
}

impl virt::SynicMonitor for UhPartition {
    fn set_monitor_page(&self, vtl: Vtl, gpa: Option<u64>) -> anyhow::Result<()> {
        let _vtl = GuestVtl::try_from(vtl).unwrap();
        let old_gpa = self.inner.monitor_page.set_gpa(gpa);

        if let Some(old_gpa) = old_gpa {
            let old_gpn = old_gpa.checked_div(HV_PAGE_SIZE).unwrap();

            match &self.inner.backing_shared {
                #[cfg(guest_arch = "x86_64")]
                BackingShared::Snp(snp_backed_shared) => snp_backed_shared
                    .cvm
                    .isolated_memory_protector
                    .unregister_overlay_page(
                        _vtl,
                        old_gpn,
                        &mut SnpBacked::tlb_flush_lock_access(
                            None,
                            self.inner.as_ref(),
                            snp_backed_shared,
                        ),
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
                #[cfg(guest_arch = "x86_64")]
                BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
                    .cvm
                    .isolated_memory_protector
                    .unregister_overlay_page(
                        _vtl,
                        old_gpn,
                        &mut TdxBacked::tlb_flush_lock_access(
                            None,
                            self.inner.as_ref(),
                            tdx_backed_shared,
                        ),
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
                BackingShared::Hypervisor(_) => self
                    .inner
                    .hcl
                    .modify_vtl_protection_mask(
                        MemoryRange::from_4k_gpn_range(old_gpn..old_gpn + 1),
                        hvdef::HV_MAP_GPA_PERMISSIONS_ALL,
                        HvInputVtl::CURRENT_VTL,
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
            }
            .context("failed to unregister old monitor page")?;

            tracing::debug!(old_gpa, "unregistered monitor page");
        }

        if let Some(gpa) = gpa {
            let gpn = gpa.checked_div(HV_PAGE_SIZE).unwrap();
            let _check_perms = HvMapGpaFlags::new().with_readable(true).with_writable(true);
            // Disallow VTL0 from writing to the page, so we'll get an intercept. Note that read
            // permissions must be enabled or this doesn't work correctly.
            let new_perms = HvMapGpaFlags::new()
                .with_readable(true)
                .with_writable(false);

            let result = match &self.inner.backing_shared {
                #[cfg(guest_arch = "x86_64")]
                BackingShared::Snp(snp_backed_shared) => snp_backed_shared
                    .cvm
                    .isolated_memory_protector
                    .register_overlay_page(
                        _vtl,
                        gpn,
                        _check_perms,
                        Some(new_perms),
                        &mut SnpBacked::tlb_flush_lock_access(
                            None,
                            self.inner.as_ref(),
                            snp_backed_shared,
                        ),
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
                #[cfg(guest_arch = "x86_64")]
                BackingShared::Tdx(tdx_backed_shared) => tdx_backed_shared
                    .cvm
                    .isolated_memory_protector
                    .register_overlay_page(
                        _vtl,
                        gpn,
                        _check_perms,
                        Some(new_perms),
                        &mut TdxBacked::tlb_flush_lock_access(
                            None,
                            self.inner.as_ref(),
                            tdx_backed_shared,
                        ),
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
                BackingShared::Hypervisor(_) => self
                    .inner
                    .hcl
                    .modify_vtl_protection_mask(
                        MemoryRange::from_4k_gpn_range(gpn..gpn + 1),
                        new_perms,
                        HvInputVtl::CURRENT_VTL,
                    )
                    .map_err(|e| anyhow::anyhow!(e)),
            }
            .context("failed to register monitor page");

            if result.is_err() {
                // Unset the page so trying to remove it later won't fail too.
                self.inner.monitor_page.set_gpa(None);
                return result;
            }

            tracing::debug!(gpa, "registered monitor page");
        }

        Ok(())
    }

    fn register_monitor(
        &self,
        monitor_id: vmcore::monitor::MonitorId,
        connection_id: u32,
    ) -> Box<dyn Sync + Send> {
        self.inner
            .monitor_page
            .register_monitor(monitor_id, connection_id)
    }
}

impl UhPartitionInner {
    #[cfg(guest_arch = "x86_64")]
    pub(crate) fn synic_interrupt(
        &self,
        vp_index: VpIndex,
        vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        // TODO CVM: optimize for SNP with secure avic to avoid internal wake
        // and for TDX to avoid trip to user mode
        move |vector, auto_eoi| {
            self.lapic(vtl).unwrap().synic_interrupt(
                vp_index,
                vector as u8,
                auto_eoi,
                |vp_index| self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON),
            );
        }
    }

    #[cfg(guest_arch = "aarch64")]
    fn synic_interrupt(
        &self,
        _vp_index: VpIndex,
        _vtl: GuestVtl,
    ) -> impl '_ + hv1_emulator::RequestInterrupt {
        move |_, _| {}
    }
}

#[derive(Debug)]
struct UhEventPort {
    partition: Weak<UhPartitionInner>,
    params: Arc<Mutex<UhEventPortParams>>,
}

#[derive(Debug, Copy, Clone)]
struct UhEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
    vtl: GuestVtl,
}

impl vmcore::synic::GuestEventPort for UhEventPort {
    fn interrupt(&self) -> vmcore::interrupt::Interrupt {
        let partition = self.partition.clone();
        let params = self.params.clone();
        vmcore::interrupt::Interrupt::from_fn(move || {
            let UhEventPortParams {
                vp,
                sint,
                flag,
                vtl,
            } = *params.lock();
            let Some(partition) = partition.upgrade() else {
                return;
            };
            tracing::trace!(vp = vp.index(), sint, flag, "signal_event");
            if let Some(hv) = partition.hv() {
                match hv.synic[vtl].signal_event(
                    vp,
                    sint,
                    flag,
                    &mut partition.synic_interrupt(vp, vtl),
                ) {
                    Ok(_) => {}
                    Err(SintProxied) => {
                        tracing::trace!(
                            vp = vp.index(),
                            sint,
                            flag,
                            "forwarding event to untrusted synic"
                        );
                        if let Some(synic) = partition.backing_shared.untrusted_synic() {
                            synic
                                .signal_event(
                                    vp,
                                    sint,
                                    flag,
                                    &mut partition.synic_interrupt(vp, vtl),
                                )
                                .ok();
                        } else {
                            partition.hcl.signal_event_direct(vp.index(), sint, flag)
                        }
                    }
                }
            } else {
                partition.hcl.signal_event_direct(vp.index(), sint, flag);
            }
        })
    }

    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}

impl virt::Hv1 for UhPartition {
    type Error = Error;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        Some(if let Some(hv) = self.inner.hv() {
            hv.ref_time_source().clone()
        } else {
            ReferenceTimeSource::from(self.inner.clone() as Arc<_>)
        })
    }

    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        self.inner.software_devices.is_some().then_some(self)
    }
}

impl GetReferenceTime for UhPartitionInner {
    fn now(&self) -> ReferenceTimeResult {
        ReferenceTimeResult {
            ref_time: self.hcl.reference_time().unwrap(),
            system_time: None,
        }
    }
}

impl virt::DeviceBuilder for UhPartition {
    fn build(&self, vtl: Vtl, device_id: u64) -> Result<Self::Device, Self::Error> {
        let vtl = GuestVtl::try_from(vtl).expect("higher vtl not configured");
        let device = self
            .inner
            .software_devices
            .as_ref()
            .expect("checked in new_virtual_device")
            .new_device(self.interrupt_targets[vtl].clone(), device_id)
            .map_err(Error::NewDevice)?;

        Ok(device)
    }
}

struct UhInterruptTarget {
    partition: Arc<UhPartitionInner>,
    vtl: GuestVtl,
}

impl pci_core::msi::MsiInterruptTarget for UhInterruptTarget {
    fn new_interrupt(&self) -> Box<dyn pci_core::msi::MsiControl> {
        let partition = self.partition.clone();
        let vtl = self.vtl;
        Box::new(move |address, data| partition.request_msi(vtl, MsiRequest { address, data }))
    }
}

impl UhPartitionInner {
    fn request_msi(&self, vtl: GuestVtl, request: MsiRequest) {
        if let Some(lapic) = self.lapic(vtl) {
            tracing::trace!(?request, "interrupt");
            lapic.request_interrupt(request.address, request.data, |vp_index| {
                self.vp(vp_index).unwrap().wake(vtl, WakeReason::INTCON)
            });
        } else {
            let (address, data) = request.as_x86();
            if let Err(err) = self.hcl.request_interrupt(
                request.hv_x86_interrupt_control(),
                address.virt_destination().into(),
                data.vector().into(),
                vtl,
            ) {
                tracelimit::warn_ratelimited!(
                    CVM_ALLOWED,
                    error = &err as &dyn std::error::Error,
                    address = request.address,
                    data = request.data,
                    "failed to request msi"
                );
            }
        }
    }
}

impl IoApicRouting for UhPartitionInner {
    fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
        self.irq_routes.set_irq_route(irq, request)
    }

    // The IO-APIC is always hooked up to VTL0.
    fn assert_irq(&self, irq: u8) {
        self.irq_routes
            .assert_irq(irq, |request| self.request_msi(GuestVtl::Vtl0, request))
    }
}

/// Configure the [`hvdef::HvRegisterVsmPartitionConfig`] register with the
/// values used by Underhill.
fn set_vtl2_vsm_partition_config(hcl: &Hcl) -> Result<(), Error> {
    // Read available capabilities to determine what to enable.
    let caps = hcl.get_vsm_capabilities().map_err(Error::Hcl)?;
    let hardware_isolated = hcl.isolation().is_hardware_isolated();
    let isolated = hcl.isolation().is_isolated();

    let config = HvRegisterVsmPartitionConfig::new()
        .with_default_vtl_protection_mask(0xF)
        .with_enable_vtl_protection(!hardware_isolated)
        .with_zero_memory_on_reset(!hardware_isolated)
        .with_intercept_cpuid_unimplemented(!hardware_isolated)
        .with_intercept_page(caps.intercept_page_available())
        .with_intercept_unrecoverable_exception(true)
        .with_intercept_not_present(caps.intercept_not_present_available() && !isolated)
        .with_intercept_acceptance(isolated)
        .with_intercept_enable_vtl_protection(isolated && !hardware_isolated)
        .with_intercept_system_reset(caps.intercept_system_reset_available());

    hcl.set_vtl2_vsm_partition_config(config)
        .map_err(Error::VsmPartitionConfig)
}

/// Configuration parameters supplied to [`UhProtoPartition::new`].
///
/// These do not include runtime resources.
pub struct UhPartitionNewParams<'a> {
    /// The isolation type for the partition.
    pub isolation: IsolationType,
    /// Hide isolation from the guest. The guest will run as if it is not
    /// isolated.
    pub hide_isolation: bool,
    /// The memory layout for lower VTLs.
    pub lower_vtl_memory_layout: &'a MemoryLayout,
    /// The guest processor topology.
    pub topology: &'a ProcessorTopology,
    /// The unparsed CVM cpuid info.
    // TODO: move parsing up a layer.
    pub cvm_cpuid_info: Option<&'a [u8]>,
    /// The unparsed CVM secrets page.
    pub snp_secrets: Option<&'a [u8]>,
    /// The virtual top of memory for hardware-isolated VMs.
    ///
    /// Must be a power of two.
    pub vtom: Option<u64>,
    /// Handle synic messages and events.
    ///
    /// On TDX, this prevents the hypervisor from getting tdvmcall exits.
    pub handle_synic: bool,
    /// Do not hotplug sidecar VPs on their first exit. Just continue running
    /// the VP remotely.
    pub no_sidecar_hotplug: bool,
    /// Use MMIO access hypercalls.
    pub use_mmio_hypercalls: bool,
    /// Intercept guest debug exceptions to support gdbstub.
    pub intercept_debug_exceptions: bool,
}

/// Parameters to [`UhProtoPartition::build`].
pub struct UhLateParams<'a> {
    /// Guest memory for lower VTLs.
    pub gm: VtlArray<GuestMemory, 2>,
    /// Guest memory for VTL 0 kernel execute access.
    pub vtl0_kernel_exec_gm: GuestMemory,
    /// Guest memory for VTL 0 user execute access.
    pub vtl0_user_exec_gm: GuestMemory,
    /// The CPUID leaves to expose to the guest.
    #[cfg(guest_arch = "x86_64")]
    pub cpuid: Vec<CpuidLeaf>,
    /// The mesh sender to use for crash notifications.
    // FUTURE: remove mesh dependency from this layer.
    pub crash_notification_send: mesh::Sender<VtlCrash>,
    /// The VM time source.
    pub vmtime: &'a VmTimeSource,
    /// Parameters for CVMs only.
    pub cvm_params: Option<CvmLateParams>,
    /// Whether the vmbus relay is enabled and active for the partition.
1415    pub vmbus_relay: bool,
1416}
1417
1418/// CVM-only parameters to [`UhProtoPartition::build`].
1419pub struct CvmLateParams {
1420    /// Guest memory for untrusted devices, like overlay pages.
1421    pub shared_gm: GuestMemory,
1422    /// An object to call to change host visibility on guest memory.
1423    pub isolated_memory_protector: Arc<dyn ProtectIsolatedMemory>,
1424    /// Dma client for shared visibility pages.
1425    pub shared_dma_client: Arc<dyn DmaClient>,
1426    /// Allocator for private visibility pages.
1427    pub private_dma_client: Arc<dyn DmaClient>,
1428}
1429
1430/// Trait for CVM-related protections on guest memory.
1431pub trait ProtectIsolatedMemory: Send + Sync {
1432    /// Changes host visibility on guest memory.
1433    fn change_host_visibility(
1434        &self,
1435        vtl: GuestVtl,
1436        shared: bool,
1437        gpns: &[u64],
1438        tlb_access: &mut dyn TlbFlushLockAccess,
1439    ) -> Result<(), (HvError, usize)>;
1440
1441    /// Queries host visibility on guest memory.
1442    fn query_host_visibility(
1443        &self,
1444        gpns: &[u64],
1445        host_visibility: &mut [HostVisibilityType],
1446    ) -> Result<(), (HvError, usize)>;
1447
1448    /// Gets the default protections/permissions for VTL 0.
1449    fn default_vtl0_protections(&self) -> HvMapGpaFlags;
1450
1451    /// Changes the default protections/permissions for a VTL. For VBS-isolated
1452    /// VMs, the protections apply to all vtls lower than the specified one. For
1453    /// hardware-isolated VMs, they apply just to the given vtl.
1454    fn change_default_vtl_protections(
1455        &self,
1456        calling_vtl: Vtl,
1457        target_vtl: GuestVtl,
1458        protections: HvMapGpaFlags,
1459        tlb_access: &mut dyn TlbFlushLockAccess,
1460    ) -> Result<(), HvError>;
1461
1462    /// Changes the vtl protections on a range of guest memory.
1463    fn change_vtl_protections(
1464        &self,
1465        calling_vtl: Vtl,
1466        target_vtl: GuestVtl,
1467        gpns: &[u64],
1468        protections: HvMapGpaFlags,
1469        tlb_access: &mut dyn TlbFlushLockAccess,
1470    ) -> Result<(), (HvError, usize)>;
1471
1472    /// Registers a page as an overlay page by first validating it has the
1473    /// required permissions, optionally modifying them, then locking them.
1474    fn register_overlay_page(
1475        &self,
1476        vtl: GuestVtl,
1477        gpn: u64,
1478        check_perms: HvMapGpaFlags,
1479        new_perms: Option<HvMapGpaFlags>,
1480        tlb_access: &mut dyn TlbFlushLockAccess,
1481    ) -> Result<(), HvError>;
1482
1483    /// Unregisters an overlay page, removing its permission lock and restoring
1484    /// the previous permissions.
1485    fn unregister_overlay_page(
1486        &self,
1487        vtl: GuestVtl,
1488        gpn: u64,
1489        tlb_access: &mut dyn TlbFlushLockAccess,
1490    ) -> Result<(), HvError>;
1491
1492    /// Checks whether a page is currently registered as an overlay page.
1493    fn is_overlay_page(&self, vtl: GuestVtl, gpn: u64) -> bool;
1494
1495    /// Locks the permissions and mappings for a set of guest pages.
1496    fn lock_gpns(&self, vtl: GuestVtl, gpns: &[u64]) -> Result<(), GuestMemoryBackingError>;
1497
1498    /// Unlocks the permissions and mappings for a set of guest pages.
1499    ///
1500    /// Panics if asked to unlock a page that was not previously locked. The
1501    /// caller must ensure that the given slice has the same ordering as the
1502    /// one passed to `lock_gpns`.
1503    fn unlock_gpns(&self, vtl: GuestVtl, gpns: &[u64]);
1504
1505    /// Alerts the memory protector that vtl 1 is ready to set vtl protections
1506    /// on lower-vtl memory, and that these protections should be enforced.
1507    fn set_vtl1_protections_enabled(&self);
1508
1509    /// Whether VTL 1 is prepared to modify vtl protections on lower-vtl memory,
1510    /// and therefore whether these protections should be enforced.
1511    fn vtl1_protections_enabled(&self) -> bool;
1512}

/// Trait for access to TLB flush and lock machinery.
pub trait TlbFlushLockAccess {
    /// Flush the entire TLB for all VPs for the given VTL.
    fn flush(&mut self, vtl: GuestVtl);

    /// Flush the entire TLB for all VPs for all VTLs.
    fn flush_entire(&mut self);

    /// Causes the specified VTL on the current VP to wait on all TLB locks.
    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl);
}
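
// A minimal sketch, for illustration only, of a no-op implementation such as
// a unit test might use; real implementations issue hypervisor or hardware
// TLB flushes and interact with the per-VP TLB locks.
//
//     struct NoopTlbAccess;
//
//     impl TlbFlushLockAccess for NoopTlbAccess {
//         fn flush(&mut self, _vtl: GuestVtl) {}
//         fn flush_entire(&mut self) {}
//         fn set_wait_for_tlb_locks(&mut self, _vtl: GuestVtl) {}
//     }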

/// A partially built partition. Used to allow querying partition capabilities
/// before fully instantiating the partition.
pub struct UhProtoPartition<'a> {
    params: UhPartitionNewParams<'a>,
    hcl: Hcl,
    guest_vsm_available: bool,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
}

impl<'a> UhProtoPartition<'a> {
    /// Creates a new prototype partition.
    ///
    /// `driver(cpu)` returns the driver to use for polling the sidecar device
    /// whose base CPU is `cpu`.
    pub fn new<T: SpawnDriver>(
        params: UhPartitionNewParams<'a>,
        driver: impl FnMut(u32) -> T,
    ) -> Result<Self, Error> {
        let hcl_isolation = match params.isolation {
            IsolationType::None => hcl::ioctl::IsolationType::None,
            IsolationType::Vbs => hcl::ioctl::IsolationType::Vbs,
            IsolationType::Snp => hcl::ioctl::IsolationType::Snp,
            IsolationType::Tdx => hcl::ioctl::IsolationType::Tdx,
        };

        // Try to open the sidecar device, if it is present.
        let sidecar = sidecar_client::SidecarClient::new(driver).map_err(Error::Sidecar)?;

        let hcl = Hcl::new(hcl_isolation, sidecar).map_err(Error::Hcl)?;

        // Set the hypercalls that this process will use.
        let mut allowed_hypercalls = vec![
            hvdef::HypercallCode::HvCallGetVpRegisters,
            hvdef::HypercallCode::HvCallSetVpRegisters,
            hvdef::HypercallCode::HvCallInstallIntercept,
            hvdef::HypercallCode::HvCallTranslateVirtualAddress,
            hvdef::HypercallCode::HvCallPostMessageDirect,
            hvdef::HypercallCode::HvCallSignalEventDirect,
            hvdef::HypercallCode::HvCallModifyVtlProtectionMask,
            hvdef::HypercallCode::HvCallTranslateVirtualAddressEx,
            hvdef::HypercallCode::HvCallCheckSparseGpaPageVtlAccess,
            hvdef::HypercallCode::HvCallAssertVirtualInterrupt,
            hvdef::HypercallCode::HvCallGetVpIndexFromApicId,
            hvdef::HypercallCode::HvCallAcceptGpaPages,
            hvdef::HypercallCode::HvCallModifySparseGpaPageHostVisibility,
        ];

        if params.isolation.is_hardware_isolated() {
            allowed_hypercalls.extend(vec![
                hvdef::HypercallCode::HvCallEnablePartitionVtl,
                hvdef::HypercallCode::HvCallRetargetDeviceInterrupt,
                hvdef::HypercallCode::HvCallEnableVpVtl,
            ]);
        }

        if params.use_mmio_hypercalls {
            allowed_hypercalls.extend(vec![
                hvdef::HypercallCode::HvCallMemoryMappedIoRead,
                hvdef::HypercallCode::HvCallMemoryMappedIoWrite,
            ]);
        }

        hcl.set_allowed_hypercalls(allowed_hypercalls.as_slice());

        set_vtl2_vsm_partition_config(&hcl)?;

        let guest_vsm_available = Self::check_guest_vsm_support(&hcl)?;

        #[cfg(guest_arch = "x86_64")]
        let cpuid = match params.isolation {
            IsolationType::Snp => cvm_cpuid::CpuidResultsIsolationType::Snp {
                cpuid_pages: params.cvm_cpuid_info.unwrap(),
                vtom: params.vtom.unwrap(),
                access_vsm: guest_vsm_available,
            }
            .build()
            .map_err(Error::CvmCpuid)?,

            IsolationType::Tdx => cvm_cpuid::CpuidResultsIsolationType::Tdx {
                topology: params.topology,
                vtom: params.vtom.unwrap(),
                access_vsm: guest_vsm_available,
            }
            .build()
            .map_err(Error::CvmCpuid)?,
            IsolationType::Vbs | IsolationType::None => Default::default(),
        };

        Ok(UhProtoPartition {
            hcl,
            params,
            guest_vsm_available,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
        })
    }

    /// Returns whether VSM support will be available to the guest.
    pub fn guest_vsm_available(&self) -> bool {
        self.guest_vsm_available
    }

    /// Builds the partition, returning the new Underhill partition and its
    /// virtual processors.
    pub async fn build(
        self,
        late_params: UhLateParams<'_>,
    ) -> Result<(UhPartition, Vec<UhProcessorBox>), Error> {
        let Self {
            mut hcl,
            params,
            guest_vsm_available,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
        } = self;
        let isolation = params.isolation;
        let is_hardware_isolated = isolation.is_hardware_isolated();

        // Intercept debug exceptions.
        // On TDX, all OpenHCL TDs today have the debug policy bit set, so
        // the TD receives these intercepts directly and no registration is
        // needed here. On non-TDX platforms, the hypervisor installs the
        // intercept on behalf of the guest, so request it below.
        if params.intercept_debug_exceptions {
            if !cfg!(feature = "gdb") {
                return Err(Error::InvalidDebugConfiguration);
            }

            cfg_if::cfg_if! {
                if #[cfg(guest_arch = "x86_64")] {
                    if isolation != IsolationType::Tdx {
                        let debug_exception_vector = 0x1;
                        hcl.register_intercept(
                            HvInterceptType::HvInterceptTypeException,
                            HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                            HvInterceptParameters::new_exception(debug_exception_vector),
                        )
                        .map_err(|err| {
                            Error::InstallIntercept(
                                HvInterceptType::HvInterceptTypeException,
                                err,
                            )
                        })?;
                    }
                } else {
                    return Err(Error::InvalidDebugConfiguration);
                }
            }
        }

        if !is_hardware_isolated {
            if cfg!(guest_arch = "x86_64") {
                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeX64Msr,
                    HV_INTERCEPT_ACCESS_MASK_READ_WRITE,
                    HvInterceptParameters::new_zeroed(),
                )
                .map_err(|err| {
                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64Msr, err)
                })?;

                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeX64ApicEoi,
                    HV_INTERCEPT_ACCESS_MASK_WRITE,
                    HvInterceptParameters::new_zeroed(),
                )
                .map_err(|err| {
                    Error::InstallIntercept(HvInterceptType::HvInterceptTypeX64ApicEoi, err)
                })?;
            } else if false {
                todo!("AARCH64_TODO");
            }
        }

        if isolation == IsolationType::Snp {
            // SNP VMs register for the #VC exception to support reflect-VC.
            hcl.register_intercept(
                HvInterceptType::HvInterceptTypeException,
                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                HvInterceptParameters::new_exception(0x1D),
            )
            .map_err(|err| {
                Error::InstallIntercept(HvInterceptType::HvInterceptTypeException, err)
            })?;

            // Get the register tweak bitmap from the secrets page.
            let mut bitmap = [0u8; 64];
            if let Some(secrets) = params.snp_secrets {
                bitmap.copy_from_slice(
                    &secrets
                        [REG_TWEAK_BITMAP_OFFSET..REG_TWEAK_BITMAP_OFFSET + REG_TWEAK_BITMAP_SIZE],
                );
            }
            hcl.set_snp_register_bitmap(bitmap);
        }

        // Do per-VP HCL initialization.
        hcl.add_vps(
            params.topology.vp_count(),
            late_params
                .cvm_params
                .as_ref()
                .map(|x| &x.private_dma_client),
        )
        .map_err(Error::Hcl)?;

        let vps: Vec<_> = params
            .topology
            .vps_arch()
            .map(|vp_info| {
                // TODO: determine CPU index, which in theory could be different
                // from the VP index, though this hasn't happened yet.
                let cpu_index = vp_info.base.vp_index.index();
                UhVpInner::new(cpu_index, vp_info)
            })
            .collect();

        // Enable support for VPCI devices if the hypervisor supports it.
        #[cfg(guest_arch = "x86_64")]
        let software_devices = {
            let res = if !is_hardware_isolated {
                hcl.register_intercept(
                    HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
                    HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                    HvInterceptParameters::new_zeroed(),
                )
            } else {
                Ok(())
            };
            match res {
                Ok(()) => Some(ApicSoftwareDevices::new(
                    params.topology.vps_arch().map(|vp| vp.apic_id).collect(),
                )),
                Err(HvError::InvalidParameter | HvError::AccessDenied) => None,
                Err(err) => {
                    return Err(Error::InstallIntercept(
                        HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId,
                        err,
                    ));
                }
            }
        };

        #[cfg(guest_arch = "aarch64")]
        let software_devices = None;

        #[cfg(guest_arch = "aarch64")]
        let caps = virt::aarch64::Aarch64PartitionCapabilities {};

        #[cfg(guest_arch = "x86_64")]
        let cpuid = UhPartition::construct_cpuid_results(
            cpuid,
            &late_params.cpuid,
            params.topology,
            isolation,
            params.hide_isolation,
        );

        #[cfg(guest_arch = "x86_64")]
        let caps = UhPartition::construct_capabilities(
            params.topology,
            &cpuid,
            isolation,
            params.hide_isolation,
        );

        if params.handle_synic && !matches!(isolation, IsolationType::Tdx) {
            // The hypervisor will manage the untrusted SINTs (or the whole
            // synic for non-hardware-isolated VMs), but some event ports
            // and message ports are implemented here. Register an intercept
            // to handle HvSignalEvent and HvPostMessage hypercalls when the
            // hypervisor doesn't recognize the connection ID.
            //
            // TDX manages this locally instead of through the hypervisor.
            hcl.register_intercept(
                HvInterceptType::HvInterceptTypeUnknownSynicConnection,
                HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                HvInterceptParameters::new_zeroed(),
            )
            .expect("registering synic intercept cannot fail");
        }

        #[cfg(guest_arch = "x86_64")]
        let cvm_state = if is_hardware_isolated {
            Some(Self::construct_cvm_state(
                &params,
                late_params.cvm_params.unwrap(),
                &caps,
                guest_vsm_available,
            )?)
        } else {
            None
        };
        #[cfg(guest_arch = "aarch64")]
        let cvm_state = None;

        let backing_shared = BackingShared::new(
            isolation,
            &params,
            BackingSharedParams {
                cvm_state,
                #[cfg(guest_arch = "x86_64")]
                cpuid: &cpuid,
                hcl: &hcl,
                guest_vsm_available,
            },
        )?;

        let enter_modes = EnterModes::default();

        let partition = Arc::new(UhPartitionInner {
            hcl,
            vps,
            irq_routes: Default::default(),
            caps,
            enter_modes: Mutex::new(enter_modes),
            enter_modes_atomic: u8::from(hcl::protocol::EnterModes::from(enter_modes)).into(),
            gm: late_params.gm,
            vtl0_kernel_exec_gm: late_params.vtl0_kernel_exec_gm,
            vtl0_user_exec_gm: late_params.vtl0_user_exec_gm,
            #[cfg(guest_arch = "x86_64")]
            cpuid,
            crash_notification_send: late_params.crash_notification_send,
            monitor_page: MonitorPage::new(),
            software_devices,
            lower_vtl_memory_layout: params.lower_vtl_memory_layout.clone(),
            vmtime: late_params.vmtime.clone(),
            isolation,
            no_sidecar_hotplug: params.no_sidecar_hotplug.into(),
            use_mmio_hypercalls: params.use_mmio_hypercalls,
            backing_shared,
            #[cfg(guest_arch = "x86_64")]
            device_vector_table: RwLock::new(IrrBitmap::new(Default::default())),
            intercept_debug_exceptions: params.intercept_debug_exceptions,
            vmbus_relay: late_params.vmbus_relay,
        });

        if cfg!(guest_arch = "x86_64") {
            // Intercept all IOs unless opted out.
            partition.manage_io_port_intercept_region(0, !0, true);
        }

        let vps = params
            .topology
            .vps_arch()
            .map(|vp_info| UhProcessorBox {
                partition: partition.clone(),
                vp_info,
            })
            .collect();

        Ok((
            UhPartition {
                inner: partition.clone(),
                interrupt_targets: VtlArray::from_fn(|vtl| {
                    Arc::new(UhInterruptTarget {
                        partition: partition.clone(),
                        vtl: vtl.try_into().unwrap(),
                    })
                }),
            },
            vps,
        ))
    }
}
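
// A minimal sketch (illustrative; parameter construction and error handling
// are elided, and `spawn_driver_for` is a hypothetical helper) of the
// two-phase bring-up that `UhProtoPartition` enables:
//
//     let proto = UhProtoPartition::new(params, |cpu| spawn_driver_for(cpu))?;
//     let guest_vsm = proto.guest_vsm_available(); // query before building
//     let (partition, vps) = proto.build(late_params).await?;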

impl UhPartition {
    /// Gets the guest OS ID for VTL0.
    pub fn vtl0_guest_os_id(&self) -> Result<HvGuestOsId, Error> {
        // If Underhill is emulating the hypervisor interfaces, get this value
        // from the emulator. This happens when running under hardware isolation
        // or when configured for testing.
        let id = if let Some(hv) = self.inner.hv() {
            hv.guest_os_id(Vtl::Vtl0)
        } else {
            // Ask the hypervisor for this value.
            self.inner
                .hcl
                .get_guest_os_id(Vtl::Vtl0)
                .map_err(Error::Hcl)?
        };
        Ok(id)
    }

    /// Configures guest accesses to IO ports in `range` to go directly to the
    /// host.
    ///
    /// When the return value is dropped, the ports will be unregistered.
    pub fn register_host_io_port_fast_path(
        &self,
        range: RangeInclusive<u16>,
    ) -> HostIoPortFastPathHandle {
        // There is no way to provide a fast path for some hardware-isolated
        // VM architectures. The devices that do use this facility are not
        // enabled on hardware-isolated VMs.
        assert!(!self.inner.isolation.is_hardware_isolated());

        self.inner
            .manage_io_port_intercept_region(*range.start(), *range.end(), false);
        HostIoPortFastPathHandle {
            inner: Arc::downgrade(&self.inner),
            begin: *range.start(),
            end: *range.end(),
        }
    }

    /// Enables or disables the PM timer assist.
    pub fn set_pm_timer_assist(&self, port: Option<u16>) -> Result<(), HvError> {
        self.inner.hcl.set_pm_timer_assist(port)
    }
}

impl UhProtoPartition<'_> {
    /// Returns whether Guest VSM is available to the guest. If so, for
    /// hardware CVMs, it is safe to expose Guest VSM support via cpuid.
    fn check_guest_vsm_support(hcl: &Hcl) -> Result<bool, Error> {
        #[cfg(guest_arch = "x86_64")]
        let privs = {
            let result = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES, 0);
            let num = result.eax as u64 | ((result.ebx as u64) << 32);
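            // For example (illustrative values): eax = 0x2 and ebx = 0x1
            // combine into num = 0x0000_0001_0000_0002; the privilege mask
            // spans both registers, eax low and ebx high.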
            hvdef::HvPartitionPrivilege::from(num)
        };

        #[cfg(guest_arch = "aarch64")]
        let privs = hcl.get_privileges_and_features_info().map_err(Error::Hcl)?;

        if !privs.access_vsm() {
            return Ok(false);
        }
        let guest_vsm_config = hcl.get_guest_vsm_partition_config().map_err(Error::Hcl)?;
        Ok(guest_vsm_config.maximum_vtl() >= u8::from(GuestVtl::Vtl1))
    }

    #[cfg(guest_arch = "x86_64")]
    /// Constructs partition-wide CVM state.
    fn construct_cvm_state(
        params: &UhPartitionNewParams<'_>,
        late_params: CvmLateParams,
        caps: &PartitionCapabilities,
        guest_vsm_available: bool,
    ) -> Result<UhCvmPartitionState, Error> {
        use vmcore::reference_time::ReferenceTimeSource;

        let vp_count = params.topology.vp_count() as usize;
        let vps = (0..vp_count)
            .map(|vp_index| UhCvmVpInner {
                tlb_lock_info: VtlArray::from_fn(|_| TlbLockInfo::new(vp_count)),
                vtl1_enable_called: Mutex::new(false),
                started: AtomicBool::new(vp_index == 0),
                hv_start_enable_vtl_vp: VtlArray::from_fn(|_| Mutex::new(None)),
            })
            .collect();
        let tlb_locked_vps =
            VtlArray::from_fn(|_| BitVec::repeat(false, vp_count).into_boxed_bitslice());

        let lapic = VtlArray::from_fn(|_| {
            LocalApicSet::builder()
                .x2apic_capable(caps.x2apic)
                .hyperv_enlightenments(true)
                .build()
        });

        let tsc_frequency = get_tsc_frequency(params.isolation)?;
        let ref_time = ReferenceTimeSource::new(TscReferenceTimeSource::new(tsc_frequency));

        // If we're emulating the APIC, then we also must emulate the hypervisor
        // enlightenments, since the hypervisor can't support enlightenments
        // without also providing an APIC.
        //
        // Additionally, TDX provides hardware APIC emulation but we still need
        // to emulate the hypervisor enlightenments.
        let hv = GlobalHv::new(hv1_emulator::hv::GlobalHvParams {
            max_vp_count: params.topology.vp_count(),
            vendor: caps.vendor,
            tsc_frequency,
            ref_time,
            is_ref_time_backed_by_tsc: true,
        });

        Ok(UhCvmPartitionState {
            vps_per_socket: params.topology.reserved_vps_per_socket(),
            tlb_locked_vps,
            vps,
            shared_memory: late_params.shared_gm,
            isolated_memory_protector: late_params.isolated_memory_protector,
            lapic,
            hv,
            guest_vsm: RwLock::new(GuestVsmState::from_availability(guest_vsm_available)),
            shared_dma_client: late_params.shared_dma_client,
            private_dma_client: late_params.private_dma_client,
            hide_isolation: params.hide_isolation,
        })
    }
}

impl UhPartition {
    #[cfg(guest_arch = "x86_64")]
    /// Constructs the set of cpuid results to show to the guest.
    fn construct_cpuid_results(
        cpuid: virt::CpuidLeafSet,
        initial_cpuid: &[CpuidLeaf],
        topology: &ProcessorTopology<vm_topology::processor::x86::X86Topology>,
        isolation: IsolationType,
        hide_isolation: bool,
    ) -> virt::CpuidLeafSet {
        let mut cpuid = cpuid.into_leaves();
        if isolation.is_hardware_isolated() {
            // Update the x2apic leaf based on the topology.
            let x2apic = match topology.apic_mode() {
                vm_topology::processor::x86::ApicMode::XApic => false,
                vm_topology::processor::x86::ApicMode::X2ApicSupported => true,
                vm_topology::processor::x86::ApicMode::X2ApicEnabled => true,
            };
            let ecx = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(x2apic);
            let ecx_mask = x86defs::cpuid::VersionAndFeaturesEcx::new().with_x2_apic(true);
            cpuid.push(
                CpuidLeaf::new(
                    x86defs::cpuid::CpuidFunction::VersionAndFeatures.0,
                    [0, 0, ecx.into(), 0],
                )
                .masked([0, 0, ecx_mask.into(), 0]),
            );

            // Get the hypervisor version from the host. This is just for
            // reporting purposes, so it is safe even if the hypervisor is not
            // trusted.
            let hv_version = safe_intrinsics::cpuid(hvdef::HV_CPUID_FUNCTION_MS_HV_VERSION, 0);

            // Perform final processing steps for synthetic leaves.
            hv1_emulator::cpuid::process_hv_cpuid_leaves(
                &mut cpuid,
                hide_isolation,
                [
                    hv_version.eax,
                    hv_version.ebx,
                    hv_version.ecx,
                    hv_version.edx,
                ],
            );
        }
        cpuid.extend(initial_cpuid);
        virt::CpuidLeafSet::new(cpuid)
    }

    #[cfg(guest_arch = "x86_64")]
    /// Computes the partition capabilities.
    fn construct_capabilities(
        topology: &ProcessorTopology,
        cpuid: &virt::CpuidLeafSet,
        isolation: IsolationType,
        hide_isolation: bool,
    ) -> virt::x86::X86PartitionCapabilities {
        let mut native_cpuid_fn;
        let mut cvm_cpuid_fn;

        // Determine the method to get cpuid results for the guest when
        // computing partition capabilities.
        let cpuid_fn: &mut dyn FnMut(u32, u32) -> [u32; 4] = if isolation.is_hardware_isolated() {
            // Use the filtered CPUID to determine capabilities.
            cvm_cpuid_fn = move |leaf, sub_leaf| cpuid.result(leaf, sub_leaf, &[0, 0, 0, 0]);
            &mut cvm_cpuid_fn
        } else {
            // Just use the native cpuid.
            native_cpuid_fn = |leaf, sub_leaf| {
                let CpuidResult { eax, ebx, ecx, edx } = safe_intrinsics::cpuid(leaf, sub_leaf);
                cpuid.result(leaf, sub_leaf, &[eax, ebx, ecx, edx])
            };
            &mut native_cpuid_fn
        };

        // Compute and validate capabilities.
        let mut caps = virt::x86::X86PartitionCapabilities::from_cpuid(topology, cpuid_fn);
        match isolation {
            IsolationType::Tdx => {
                assert_eq!(caps.vtom.is_some(), !hide_isolation);
                // TDX 1.5 requires EFER.NXE to be set to 1, so set it at RESET/INIT.
                caps.nxe_forced_on = true;
            }
            IsolationType::Snp => {
                assert_eq!(caps.vtom.is_some(), !hide_isolation);
            }
            _ => {
                assert!(caps.vtom.is_none());
            }
        }

        caps
    }
}

#[cfg(guest_arch = "x86_64")]
/// Gets the TSC frequency for the current platform.
fn get_tsc_frequency(isolation: IsolationType) -> Result<u64, Error> {
    // Always get the frequency from the hypervisor: as long as the hypervisor
    // is behaving, it should provide the most precise and accurate frequency.
    let msr = MsrDevice::new(0).map_err(Error::OpenMsr)?;
    let hv_frequency = msr
        .read_msr(hvdef::HV_X64_MSR_TSC_FREQUENCY)
        .map_err(Error::ReadTscFrequency)?;

    // Get the hardware-advertised frequency and validate that the
    // hypervisor frequency is not too far off.
    let hw_info = match isolation {
        IsolationType::Tdx => {
            // TDX provides the TSC frequency via cpuid.
            let max_function =
                safe_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VendorAndMaxFunction.0, 0)
                    .eax;

            if max_function < x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0 {
                return Err(Error::BadCpuidTsc);
            }
            let result = safe_intrinsics::cpuid(
                x86defs::cpuid::CpuidFunction::CoreCrystalClockInformation.0,
                0,
            );
            let ratio_denom = result.eax;
            let ratio_num = result.ebx;
            let clock = result.ecx;
            if ratio_num == 0 || ratio_denom == 0 || clock == 0 {
                return Err(Error::BadCpuidTsc);
            }
            // TDX TSC is configurable in units of 25 MHz, so allow up to
            // 12.5 MHz of error.
            let allowed_error = 12_500_000;
            Some((
                clock as u64 * ratio_num as u64 / ratio_denom as u64,
                allowed_error,
            ))
        }
        IsolationType::Snp => {
            // SNP currently does not provide the frequency.
            None
        }
        IsolationType::Vbs | IsolationType::None => None,
    };

    if let Some((hw_frequency, allowed_error)) = hw_info {
        // Don't allow the frequencies to be different by more than the hardware
        // precision.
        let delta = hw_frequency.abs_diff(hv_frequency);
        if delta > allowed_error {
            return Err(Error::TscFrequencyMismatch {
                hv: hv_frequency,
                hw: hw_frequency,
                allowed_error,
            });
        }
    }

    Ok(hv_frequency)
}
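
// Worked example for the TDX path above (illustrative values, not from any
// particular part): CPUID leaf 0x15 reporting eax = 1 (ratio denominator),
// ebx = 100 (ratio numerator), and ecx = 25_000_000 (crystal clock in Hz)
// yields 25_000_000 * 100 / 1 = 2.5 GHz, and a hypervisor-reported frequency
// within 12.5 MHz of that is accepted.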

impl UhPartitionInner {
    fn manage_io_port_intercept_region(&self, begin: u16, end: u16, active: bool) {
        if self.isolation.is_hardware_isolated() {
            return;
        }

        static SKIP_RANGE: AtomicBool = AtomicBool::new(false);

        let access_type_mask = if active {
            HV_INTERCEPT_ACCESS_MASK_READ_WRITE
        } else {
            HV_INTERCEPT_ACCESS_MASK_NONE
        };

        // Try to register the whole range at once.
        if !SKIP_RANGE.load(Ordering::Relaxed) {
            match self.hcl.register_intercept(
                HvInterceptType::HvInterceptTypeX64IoPortRange,
                access_type_mask,
                HvInterceptParameters::new_io_port_range(begin..=end),
            ) {
                Ok(()) => return,
                Err(HvError::InvalidParameter) => {
                    // Probably an old build that doesn't support port range
                    // intercepts yet. Don't try again.
                    SKIP_RANGE.store(true, Ordering::Relaxed);
                    tracing::warn!(
                        CVM_ALLOWED,
                        "old hypervisor build; using slow path for intercept ranges"
                    );
                }
                Err(err) => {
                    panic!("io port range registration failure: {err:?}");
                }
            }
        }

        // Fall back to registering one port at a time.
        for port in begin..=end {
            self.hcl
                .register_intercept(
                    HvInterceptType::HvInterceptTypeX64IoPort,
                    access_type_mask,
                    HvInterceptParameters::new_io_port(port),
                )
                .expect("registering io intercept cannot fail");
        }
    }

    fn is_gpa_lower_vtl_ram(&self, gpa: u64) -> bool {
        // TODO: this probably should reflect changes to the memory map via PAM
        // registers. Right now this isn't an issue because the relevant region,
        // VGA, is handled on the host.
        self.lower_vtl_memory_layout
            .ram()
            .iter()
            .any(|m| m.range.contains_addr(gpa))
    }

    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
        // TODO: this probably should reflect changes to the memory map via PAM
        // registers. Right now this isn't an issue because the relevant region,
        // VGA, is handled on the host.
        if self.is_gpa_lower_vtl_ram(gpa) {
            // The monitor page is protected against lower VTL writes.
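            // `gpa & !(HV_PAGE_SIZE - 1)` rounds down to the containing page
            // base: with 4 KiB pages, e.g. 0x1234_5678 -> 0x1234_5000.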
            !write || self.monitor_page.gpa() != Some(gpa & !(HV_PAGE_SIZE - 1))
        } else {
            false
        }
    }

    /// Gets the CPUID result, applying any necessary runtime modifications.
    #[cfg(guest_arch = "x86_64")]
    fn cpuid_result(&self, eax: u32, ecx: u32, default: &[u32; 4]) -> [u32; 4] {
        let r = self.cpuid.result(eax, ecx, default);
        if eax == hvdef::HV_CPUID_FUNCTION_MS_HV_FEATURES {
            // Update the VSM access privilege.
            //
            // FUTURE: Investigate if this is really necessary for non-CVM; the
            // hypervisor should already update this correctly.
            //
            // If it is only for CVM, then it should be moved to the
            // CVM-specific cpuid fixups.
            let mut features = hvdef::HvFeatures::from_cpuid(r);
            if self.backing_shared.guest_vsm_disabled() {
                features.set_privileges(features.privileges().with_access_vsm(false));
            }
            features.into_cpuid()
        } else {
            r
        }
    }
}

/// Handle returned by [`UhPartition::register_host_io_port_fast_path`].
///
/// When dropped, unregisters the IO ports so that they are no longer forwarded
/// to the host.
#[must_use]
pub struct HostIoPortFastPathHandle {
    inner: Weak<UhPartitionInner>,
    begin: u16,
    end: u16,
}

impl Drop for HostIoPortFastPathHandle {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.upgrade() {
            inner.manage_io_port_intercept_region(self.begin, self.end, true);
        }
    }
}

/// The application-level VTL crash data, in a form not suited for putting
/// on the wire.
///
/// FUTURE: move/remove this to standardize across virt backends.
#[derive(Copy, Clone, Debug)]
pub struct VtlCrash {
    /// The VP that crashed.
    pub vp_index: VpIndex,
    /// The VTL that crashed.
    pub last_vtl: GuestVtl,
    /// The crash control information.
    pub control: GuestCrashCtl,
    /// The crash parameters.
    pub parameters: [u64; 5],
}

/// Validates that `flags` is a valid setting for VTL memory protection when
/// applied to VTL 1.
#[cfg_attr(guest_arch = "aarch64", expect(dead_code))]
fn validate_vtl_gpa_flags(
    flags: HvMapGpaFlags,
    mbec_enabled: bool,
    shadow_supervisor_stack_enabled: bool,
) -> bool {
    // Adjust is not allowed for VTL 1.
    if flags.adjustable() {
        return false;
    }

    // KX must equal UX unless MBEC is enabled. KX && !UX is invalid.
    if flags.kernel_executable() != flags.user_executable() {
        if (flags.kernel_executable() && !flags.user_executable()) || !mbec_enabled {
            return false;
        }
    }

    // Read must be specified if anything else is specified.
    if flags.writable()
        || flags.kernel_executable()
        || flags.user_executable()
        || flags.supervisor_shadow_stack()
        || flags.paging_writability()
        || flags.verify_paging_writability()
    {
        if !flags.readable() {
            return false;
        }
    }

    // Supervisor shadow stack protection is invalid if shadow stacks are disabled
    // or if execute is not specified.
    if flags.supervisor_shadow_stack()
        && ((!flags.kernel_executable() && !flags.user_executable())
            || !shadow_supervisor_stack_enabled)
    {
        return false;
    }

    true
}
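
// For example (illustrative only, assuming the usual bitfield-struct
// `new`/`with_*` constructors on `HvMapGpaFlags`): read+write with
// `adjustable` clear is valid for VTL 1, while write-without-read is not.
//
//     let rw = HvMapGpaFlags::new().with_readable(true).with_writable(true);
//     assert!(validate_vtl_gpa_flags(rw, false, false));
//
//     let w_only = HvMapGpaFlags::new().with_writable(true);
//     assert!(!validate_vtl_gpa_flags(w_only, false, false));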