
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Linux /dev/mshv implementation of the virt::generic interfaces.
5
6#![cfg(all(target_os = "linux", guest_is_native))]
7// UNSAFETY: Calling HV APIs and manually managing memory.
8#![expect(unsafe_code)]
9
10#[cfg(guest_arch = "aarch64")]
11mod aarch64;
12#[cfg(guest_arch = "x86_64")]
13mod x86_64;
14
15#[cfg(guest_arch = "aarch64")]
16use aarch64 as arch;
17#[cfg(guest_arch = "x86_64")]
18use x86_64 as arch;
19
20// irqfd is arch-independent but only wired up on x86_64 for now.
21// TODO: wire up on aarch64 once MSI signaling is implemented.
22#[cfg(guest_arch = "x86_64")]
23pub mod irqfd;
24
25use guestmem::DoorbellRegistration;
26use guestmem::GuestMemory;
27use hv1_emulator::message_queues::MessageQueues;
28use hvdef::HV_PAGE_SHIFT;
29use hvdef::HvDeliverabilityNotificationsRegister;
30use hvdef::HvError;
31use hvdef::HvMessage;
32use hvdef::HvMessageType;
33use hvdef::HvPartitionPropertyCode;
34use hvdef::Vtl;
35use hvdef::hypercall::HV_INTERCEPT_ACCESS_MASK_EXECUTE;
36use hvdef::hypercall::HvRegisterAssoc;
37use inspect::Inspect;
38use inspect::InspectMut;
39use mshv_bindings::MSHV_SET_MEM_BIT_EXECUTABLE;
40use mshv_bindings::MSHV_SET_MEM_BIT_WRITABLE;
41use mshv_bindings::mshv_install_intercept;
42use mshv_bindings::mshv_user_mem_region;
43use mshv_ioctls::Mshv;
44use mshv_ioctls::MshvError;
45use mshv_ioctls::VcpuFd;
46use mshv_ioctls::VmFd;
47use mshv_ioctls::set_bits;
48use pal::unix::pthread::*;
49use pal_event::Event;
50use parking_lot::Mutex;
51use parking_lot::RwLock;
52use std::convert::Infallible;
53use std::future::poll_fn;
54use std::io;
55use std::os::fd::AsFd;
56use std::os::fd::AsRawFd;
57use std::os::fd::IntoRawFd as _;
58use std::sync::Arc;
59use std::sync::Once;
60use std::sync::Weak;
61use std::sync::atomic::AtomicBool;
62use std::sync::atomic::Ordering;
63use std::task::Waker;
64use thiserror::Error;
65use virt::NeedsYield;
66use virt::PartitionAccessState;
67use virt::ProtoPartitionConfig;
68use virt::StopVp;
69use virt::VpHaltReason;
70use virt::VpIndex;
71use virt::io::CpuIo;
72use vmcore::interrupt::Interrupt;
73use vmcore::reference_time::GetReferenceTime;
74use vmcore::reference_time::ReferenceTimeResult;
75use vmcore::synic::GuestEventPort;
76
/// Extension trait for [`VcpuFd`] to accept hvdef register types directly,
/// so callers do not have to convert between the hvdef and mshv_bindings
/// representations of a register association.
trait VcpuFdExt {
    /// Reads the registers named in `regs`, filling in each entry's value.
    fn get_hvdef_regs(&self, regs: &mut [HvRegisterAssoc]) -> Result<(), KernelError>;
    /// Writes the register name/value pairs in `regs` to the VP.
    fn set_hvdef_regs(&self, regs: &[HvRegisterAssoc]) -> Result<(), KernelError>;
}
82
impl VcpuFdExt for VcpuFd {
    fn get_hvdef_regs(&self, regs: &mut [HvRegisterAssoc]) -> Result<(), KernelError> {
        use mshv_bindings::hv_register_assoc;
        // Compile-time layout-compatibility check backing the transmute below.
        const {
            assert!(size_of::<HvRegisterAssoc>() == size_of::<hv_register_assoc>());
            assert!(align_of::<HvRegisterAssoc>() >= align_of::<hv_register_assoc>());
        }
        // SAFETY: HvRegisterAssoc and hv_register_assoc have the same layout.
        self.get_reg(unsafe {
            std::mem::transmute::<&mut [HvRegisterAssoc], &mut [hv_register_assoc]>(regs)
        })?;
        Ok(())
    }

    fn set_hvdef_regs(&self, regs: &[HvRegisterAssoc]) -> Result<(), KernelError> {
        use mshv_bindings::hv_register_assoc;
        // Compile-time layout-compatibility check backing the transmute below.
        const {
            assert!(size_of::<HvRegisterAssoc>() == size_of::<hv_register_assoc>());
            assert!(align_of::<HvRegisterAssoc>() >= align_of::<hv_register_assoc>());
        }
        // SAFETY: HvRegisterAssoc and hv_register_assoc have the same layout.
        self.set_reg(unsafe {
            std::mem::transmute::<&[HvRegisterAssoc], &[hv_register_assoc]>(regs)
        })?;
        Ok(())
    }
}
110
/// Hypervisor backend for Linux /dev/mshv.
///
/// Wraps the `mshv-ioctls` handle to the `/dev/mshv` device node.
#[derive(Debug)]
pub struct LinuxMshv {
    mshv: Mshv,
}
116
117impl LinuxMshv {
118    /// Creates a new instance of the LinuxMshv hypervisor backend.
119    pub fn new() -> io::Result<Self> {
120        let file = fs_err::File::open("/dev/mshv")?;
121        Ok(Self::from(std::fs::File::from(file)))
122    }
123}
124
125impl From<std::fs::File> for LinuxMshv {
126    fn from(file: std::fs::File) -> Self {
127        LinuxMshv {
128            // SAFETY: We take ownership of the file descriptor and pass it to Mshv.
129            // TODO: fix mshv_bindings to not need this unsafe code.
130            mshv: unsafe { Mshv::new_with_fd_number(file.into_raw_fd()) },
131        }
132    }
133}
134
impl<'a> MshvProtoPartition<'a> {
    /// Performs the post-init partition setup common to both architectures:
    /// creates VPs, BSP, installs intercepts, sets up the signal handler,
    /// and checks for unsupported VTL2 configuration.
    fn new(config: ProtoPartitionConfig<'a>, vmfd: VmFd) -> Result<Self, Error> {
        // Reject topologies whose VP indices would not fit in a u8.
        if config.processor_topology.vp_count() > u8::MAX as u32 {
            return Err(ErrorInner::TooManyVps(config.processor_topology.vp_count()).into());
        }

        // Build the per-VP bookkeeping state, indexed by VP index. The
        // kernel vcpu fds (other than the BSP's) are created later.
        let vps = config
            .processor_topology
            .vps_arch()
            .map(|vp| MshvVpInner {
                vp_info: vp,
                thread: RwLock::new(None),
                needs_yield: NeedsYield::new(),
                message_queues: MessageQueues::new(),
                message_queues_pending: AtomicBool::new(false),
                waker: RwLock::new(None),
            })
            .collect();

        // Create the BSP (VP 0) eagerly; it is retained for partition-level
        // register access as well as for running VP 0.
        let bsp = vmfd
            .create_vcpu(0)
            .map_err(|e| ErrorInner::CreateVcpu(e.into()))?;

        // Install intercepts required by both architectures.
        vmfd.install_intercept(mshv_install_intercept {
            access_type_mask: HV_INTERCEPT_ACCESS_MASK_EXECUTE,
            intercept_type: hvdef::hypercall::HvInterceptType::HvInterceptTypeHypercall.0,
            intercept_parameter: Default::default(),
        })
        .map_err(|e| ErrorInner::InstallIntercept(e.into()))?;

        vmfd.install_intercept(mshv_install_intercept {
            access_type_mask: HV_INTERCEPT_ACCESS_MASK_EXECUTE,
            intercept_type:
                hvdef::hypercall::HvInterceptType::HvInterceptTypeUnknownSynicConnection.0,
            intercept_parameter: Default::default(),
        })
        .map_err(|e| ErrorInner::InstallIntercept(e.into()))?;

        vmfd.install_intercept(mshv_install_intercept {
            access_type_mask: HV_INTERCEPT_ACCESS_MASK_EXECUTE,
            intercept_type:
                hvdef::hypercall::HvInterceptType::HvInterceptTypeRetargetInterruptWithUnknownDeviceId.0,
            intercept_parameter: Default::default(),
        })
        .map_err(|e| ErrorInner::InstallIntercept(e.into()))?;

        // Set up a signal for forcing vcpufd.run() to exit with EINTR.
        // Registered at most once per process.
        static SIGNAL_HANDLER_INIT: Once = Once::new();
        // SAFETY: The signal handler does not perform any actions that are
        // forbidden for signal handlers to perform, as it performs nothing.
        SIGNAL_HANDLER_INIT.call_once(|| unsafe {
            signal_hook::low_level::register(libc::SIGRTMIN(), || {
                // Signal handler does nothing other than enabling run_fd()
                // ioctl to return with EINTR, when the associated signal is
                // sent to run_fd() thread.
            })
            .unwrap();
        });

        // This backend only supports VTL0.
        if let Some(hv_config) = &config.hv_config {
            if hv_config.vtl2.is_some() {
                return Err(ErrorInner::Vtl2NotSupported.into());
            }
        }

        Ok(MshvProtoPartition {
            config,
            vmfd,
            vps,
            bsp,
        })
    }
}
212
213/// Returns whether MSHV is available on this machine.
214pub fn is_available() -> Result<bool, Error> {
215    match std::fs::metadata("/dev/mshv") {
216        Ok(_) => Ok(true),
217        Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(false),
218        Err(err) => Err(ErrorInner::AvailableCheck(err).into()),
219    }
220}
221
/// Prototype partition: the intermediate state built by
/// [`MshvProtoPartition::new`] before the final partition is constructed.
pub struct MshvProtoPartition<'a> {
    /// Configuration used to construct the partition.
    config: ProtoPartitionConfig<'a>,
    /// The kernel VM file descriptor.
    vmfd: VmFd,
    /// Per-VP bookkeeping state, indexed by VP index.
    vps: Vec<MshvVpInner>,
    /// The BSP's (VP 0's) vcpu file descriptor.
    bsp: VcpuFd,
}
229
/// A partition running on the /dev/mshv hypervisor.
#[derive(Inspect)]
pub struct MshvPartition {
    #[inspect(flatten)]
    inner: Arc<MshvPartitionInner>,
    // NOTE(review): distinct from `MshvPartitionInner::synic_ports` (a
    // `SynicPortMap`); this field's usage is not visible in this chunk.
    #[inspect(skip)]
    synic_ports: Arc<virt::synic::SynicPorts<MshvPartitionInner>>,
}
238
/// Shared partition state, referenced by the partition and by each VP.
#[derive(Inspect)]
struct MshvPartitionInner {
    #[inspect(skip)]
    vmfd: VmFd,
    /// The BSP's VcpuFd, retained for partition-level register access
    /// (VM state get/set). Only used while VPs are stopped.
    #[inspect(skip)]
    bsp_vcpufd: VcpuFd,
    /// Bookkeeping for user memory regions mapped into the partition; see
    /// the `PartitionMemoryMap` impl below.
    #[inspect(skip)]
    memory: Mutex<MshvMemoryRangeState>,
    gm: GuestMemory,
    mem_layout: vm_topology::memory::MemoryLayout,
    /// Per-VP state, indexed by VP index.
    #[inspect(skip)]
    vps: Vec<MshvVpInner>,
    #[cfg(guest_arch = "x86_64")]
    irq_routes: virt::irqcon::IrqRoutes,
    /// Per-GSI irqfd state; see the `irqfd` module.
    #[cfg(guest_arch = "x86_64")]
    #[inspect(skip)]
    gsi_states: Mutex<Box<[irqfd::GsiState; irqfd::NUM_GSIS]>>,
    caps: virt::PartitionCapabilities,
    /// Connection-id-keyed port map used to route guest post-message and
    /// signal-event hypercalls.
    synic_ports: virt::synic::SynicPortMap,
    #[cfg(guest_arch = "x86_64")]
    cpuid: virt::CpuidLeafSet,
    #[cfg(guest_arch = "x86_64")]
    software_devices: virt::x86::apic_software_device::ApicSoftwareDevices,
    /// Set to `true` when partition time is frozen (e.g. during reset).
    /// The first VP to enter `run_vp` after a freeze will thaw time.
    time_frozen: Mutex<bool>,
}
268
/// Per-VP bookkeeping state shared between the partition and the VP's run
/// loop.
struct MshvVpInner {
    /// Identity/topology info for this VP.
    vp_info: vm_topology::processor::TargetVpInfo,
    /// The thread currently running this VP; set for the duration of
    /// `run_vp`. Presumably used with the SIGRTMIN handler registered during
    /// partition setup to interrupt `vcpufd.run()` — the signaling site is
    /// outside this chunk.
    thread: RwLock<Option<Pthread>>,
    needs_yield: NeedsYield,
    /// Queued synic messages awaiting delivery, keyed by sint.
    message_queues: MessageQueues,
    /// Set by device threads after enqueuing a message to signal the VP
    /// thread to flush its message queues.
    message_queues_pending: AtomicBool,
    /// Waker for the VP run loop task. Set by the VP thread, used by device
    /// threads to re-poll the run loop when new messages are enqueued.
    waker: RwLock<Option<Waker>>,
}
281
/// Drop guard that clears a VP's recorded run thread. Held by `run_vp` so
/// the thread handle is cleared even if the run loop exits by error or
/// panic.
struct MshvVpInnerCleaner<'a> {
    vpinner: &'a MshvVpInner,
}
285
286impl Drop for MshvVpInnerCleaner<'_> {
287    fn drop(&mut self) {
288        self.vpinner.thread.write().take();
289    }
290}
291
292impl GetReferenceTime for MshvPartitionInner {
293    fn now(&self) -> ReferenceTimeResult {
294        // Use the partition property instead of a VP register to avoid
295        // deadlocking when VPs are running.
296        let ref_time = self
297            .vmfd
298            .get_partition_property(HvPartitionPropertyCode::ReferenceTime.0)
299            .unwrap();
300        ReferenceTimeResult {
301            ref_time,
302            system_time: None,
303        }
304    }
305}
306
307impl MshvPartitionInner {
308    fn vp(&self, vp_index: VpIndex) -> &MshvVpInner {
309        &self.vps[vp_index.index() as usize]
310    }
311
312    /// Freezes partition time. Time will remain frozen until [`thaw_time`] is
313    /// called (typically on the first VP run after reset).
314    fn freeze_time(&self) -> Result<(), Error> {
315        let mut frozen = self.time_frozen.lock();
316        if !*frozen {
317            self.vmfd
318                .set_partition_property(HvPartitionPropertyCode::TimeFreeze.0, 1)
319                .map_err(|e| ErrorInner::SetPartitionProperty(e.into()))?;
320            *frozen = true;
321        }
322        Ok(())
323    }
324
325    /// Thaws partition time if it is currently frozen. This is a no-op if
326    /// time is already running.
327    fn thaw_time(&self) -> Result<(), Error> {
328        let mut frozen = self.time_frozen.lock();
329        if *frozen {
330            self.vmfd
331                .set_partition_property(HvPartitionPropertyCode::TimeFreeze.0, 0)
332                .map_err(|e| ErrorInner::SetPartitionProperty(e.into()))?;
333            *frozen = false;
334        }
335        Ok(())
336    }
337
338    fn post_message(&self, vp_index: VpIndex, sint: u8, message: &HvMessage) {
339        let vp = self.vp(vp_index);
340        let wake = vp.message_queues.enqueue_message(sint, message);
341        // Signal the VP thread to flush message queues.
342        if wake && !vp.message_queues_pending.swap(true, Ordering::Release) {
343            if let Some(waker) = &*vp.waker.read() {
344                waker.wake_by_ref();
345            }
346        }
347    }
348
349    /// Posts a message directly to a VP's SynIC sint.
350    ///
351    /// This wraps the HvCallPostMessageDirect hypercall via the raw hvcall
352    /// interface. This is used instead of the `mshv-ioctls` method because
353    /// that method is only available on x86.
354    // TODO: upstream an arch-independent version to mshv-ioctls.
355    fn post_message_direct(&self, vp: u32, sint: u8, message: &HvMessage) -> Result<(), MshvError> {
356        use mshv_bindings::mshv_root_hvcall;
357
358        let post_message = hvdef::hypercall::PostMessageDirect {
359            partition_id: 0,
360            vp_index: vp,
361            vtl: Vtl::Vtl0 as u8,
362            padding0: [0; 3],
363            sint,
364            padding1: [0; 3],
365            message: zerocopy::Unalign::new(*message),
366            padding2: 0,
367        };
368
369        let mut args = mshv_root_hvcall {
370            code: hvdef::HypercallCode::HvCallPostMessageDirect.0,
371            in_sz: size_of::<hvdef::hypercall::PostMessageDirect>() as u16,
372            in_ptr: std::ptr::addr_of!(post_message) as u64,
373            ..Default::default()
374        };
375        self.vmfd.hvcall(&mut args)
376    }
377
378    /// Signals a SynIC event directly on a VP.
379    ///
380    /// This wraps the HvCallSignalEventDirect hypercall via the raw hvcall
381    /// interface. This is used instead of the `mshv-ioctls` method because
382    /// that method is only available on x86.
383    // TODO: upstream an arch-independent version to mshv-ioctls.
384    fn signal_event_direct(&self, vp: u32, sint: u8, flag: u16) -> Result<(), MshvError> {
385        use mshv_bindings::mshv_root_hvcall;
386        use zerocopy::FromZeros;
387
388        let input = hvdef::hypercall::SignalEventDirect {
389            target_partition: 0,
390            target_vp: vp,
391            target_vtl: 0,
392            target_sint: sint,
393            flag_number: flag,
394        };
395        let mut output = hvdef::hypercall::SignalEventDirectOutput::new_zeroed();
396
397        let mut args = mshv_root_hvcall {
398            code: hvdef::HypercallCode::HvCallSignalEventDirect.0,
399            in_sz: size_of::<hvdef::hypercall::SignalEventDirect>() as u16,
400            out_sz: size_of::<hvdef::hypercall::SignalEventDirectOutput>() as u16,
401            in_ptr: std::ptr::addr_of!(input) as u64,
402            out_ptr: std::ptr::addr_of_mut!(output) as u64,
403            ..Default::default()
404        };
405        self.vmfd.hvcall(&mut args)
406    }
407}
408
/// Binds a virtual processor to the current thread.
pub struct MshvProcessorBinder {
    partition: Arc<MshvPartitionInner>,
    // `None` until the vcpu fd is created/taken — the populating code is
    // not visible in this chunk; confirm against the bind implementation.
    vcpufd: Option<VcpuFd>,
    vpindex: VpIndex,
}
415
/// Wraps a VcpuFd for running a VP. On x86_64, also provides access to the
/// register page for fast register reads/writes.
struct MshvVpRunner<'a> {
    vcpufd: &'a VcpuFd,
    // Raw pointer to the kernel-mmapped register page; dereferenced only
    // while the VP is stopped (see `reg_page`).
    #[cfg(guest_arch = "x86_64")]
    reg_page: *mut hvdef::HvX64RegisterPage,
}
423
impl MshvVpRunner<'_> {
    /// Runs the VP until the next intercept, returning the exit message.
    fn run(&mut self) -> Result<HvMessage, MshvError> {
        self.vcpufd.run().map(|msg| {
            // SAFETY: hv_message and HvMessage have the same size
            // (256 bytes) and compatible layout (header + 240-byte
            // payload).
            unsafe { std::mem::transmute::<mshv_bindings::hv_message, HvMessage>(msg) }
        })
    }

    /// Returns a mutable reference to the VP's register page.
    #[cfg(guest_arch = "x86_64")]
    fn reg_page(&mut self) -> &mut hvdef::HvX64RegisterPage {
        // SAFETY: VP is stopped (returned from run()), so we have exclusive
        // access. The raw pointer was obtained from the kernel's mmap of
        // the register page and remains valid for the VP's lifetime.
        unsafe { &mut *self.reg_page }
    }
}
442
/// A bound virtual processor for the /dev/mshv hypervisor.
#[derive(InspectMut)]
pub struct MshvProcessor<'a> {
    #[inspect(skip)]
    partition: &'a MshvPartitionInner,
    /// This VP's shared bookkeeping state.
    #[inspect(skip)]
    inner: &'a MshvVpInner,
    #[inspect(skip)]
    vpindex: VpIndex,
    /// Wrapper for running the VP and accessing its register page.
    #[inspect(skip)]
    runner: MshvVpRunner<'a>,
    /// The deliverability notification state currently registered with the
    /// hypervisor.
    #[inspect(skip)]
    deliverability_notifications: HvDeliverabilityNotificationsRegister,
}
459
impl MshvProcessor<'_> {
    /// Posts any queued messages for the given sints, and requests
    /// deliverability notifications for any sints that still have pending
    /// messages.
    fn flush_messages(&mut self, deliverable_sints: u16) {
        // Attempt delivery; sints whose queues remain nonempty (e.g. the
        // direct post failed) are returned so notifications can be
        // requested for them.
        let nonempty_sints =
            self.inner
                .message_queues
                .post_pending_messages(deliverable_sints, |sint, message| {
                    match self
                        .partition
                        .post_message_direct(self.vpindex.index(), sint, message)
                    {
                        Ok(()) => {
                            tracing::trace!(sint, "sint message posted successfully");
                            Ok(())
                        }
                        Err(e) => {
                            tracelimit::warn_ratelimited!(
                                error = &e as &dyn std::error::Error,
                                "dropping sint message"
                            );
                            Err(HvError::ObjectInUse)
                        }
                    }
                });

        // Only re-register with the hypervisor when the set of sints with
        // pending messages actually changed.
        if self.deliverability_notifications.sints() != nonempty_sints {
            let notifications = self.deliverability_notifications.with_sints(nonempty_sints);
            tracing::trace!(?notifications, "setting deliverability notifications");
            self.partition
                .vmfd
                // (sic) -- the misspelling is the upstream mshv-ioctls name.
                .register_deliverabilty_notifications(
                    self.vpindex.index(),
                    u64::from(notifications),
                )
                .expect("requesting deliverability is not a fallible operation");
            self.deliverability_notifications = notifications;
        }
    }

    /// Handles a synic sint deliverable exit. The deliverable sints bitmap
    /// is architecture-specific (different message types for x86_64 and
    /// aarch64), so the caller extracts it and passes it here.
    fn handle_sint_deliverable(&mut self, deliverable_sints: u16) {
        // Clear the delivered sints from both the current and next state.
        self.deliverability_notifications
            .set_sints(self.deliverability_notifications.sints() & !deliverable_sints);

        self.flush_messages(deliverable_sints);
    }

    /// Resets the VP's message queue and deliverability notification state.
    fn reset_synic_state(&mut self) {
        self.inner.message_queues.clear();
        self.inner
            .message_queues_pending
            .store(false, Ordering::Relaxed);
        self.deliverability_notifications = HvDeliverabilityNotificationsRegister::new();
    }
}
521
impl virt::Processor for MshvProcessor<'_> {
    type StateAccess<'a>
        = &'a mut Self
    where
        Self: 'a;

    /// Hardware debug state is not supported by this backend.
    fn set_debug_state(
        &mut self,
        _vtl: Vtl,
        _state: Option<&virt::x86::DebugState>,
    ) -> Result<(), <&mut Self as virt::vp::AccessVpState>::Error> {
        Err(ErrorInner::NotSupported.into())
    }

    /// Runs the VP until `stop` is signaled, dispatching exits to `dev`.
    ///
    /// The `Infallible` success type makes the loop's endlessness explicit:
    /// the only way out is a `VpHaltReason` error (including stop).
    async fn run_vp(
        &mut self,
        stop: StopVp<'_>,
        dev: &impl CpuIo,
    ) -> Result<Infallible, VpHaltReason> {
        let vpinner = self.inner;
        // Clears the recorded thread handle on any exit path.
        let _cleaner = MshvVpInnerCleaner { vpinner };

        // Record this thread for signaling. A VP may only be run on one
        // thread at a time.
        assert!(vpinner.thread.write().replace(Pthread::current()).is_none());

        self.partition
            .thaw_time()
            .expect("failed to thaw partition time");

        // Ensure any messages present from a state restore are flushed on
        // the first loop iteration.
        if vpinner.message_queues.pending_sints() != 0 {
            vpinner
                .message_queues_pending
                .store(true, Ordering::Relaxed);
        }

        let mut last_waker: Option<Waker> = None;

        loop {
            vpinner.needs_yield.maybe_yield().await;
            stop.check()?;

            // Ensure the waker is set so device threads can wake us.
            // Only update the shared slot when the task's waker changed.
            poll_fn(|cx| {
                if !last_waker.as_ref().is_some_and(|w| cx.waker().will_wake(w)) {
                    last_waker = Some(cx.waker().clone());
                    *vpinner.waker.write() = last_waker.clone();
                }
                std::task::Poll::Ready(())
            })
            .await;

            // Flush any messages enqueued by device threads.
            if vpinner.message_queues_pending.load(Ordering::Relaxed) {
                vpinner
                    .message_queues_pending
                    .store(false, Ordering::SeqCst);
                let pending_sints = vpinner.message_queues.pending_sints();
                if pending_sints != 0 {
                    self.flush_messages(pending_sints);
                }
            }

            match self.runner.run() {
                Ok(exit) => {
                    self.handle_exit(&exit, dev).await?;
                }
                Err(e) => match e.errno() {
                    // Interrupted (e.g. by the wake signal); loop back
                    // around to re-check `stop` and the message queues.
                    libc::EAGAIN | libc::EINTR => {}
                    _ => tracing::error!(
                        error = &e as &dyn std::error::Error,
                        "vcpufd.run returned error"
                    ),
                },
            }
        }
    }

    /// No asynchronous requests are batched by this backend.
    fn flush_async_requests(&mut self) {}

    fn access_state(&mut self, vtl: Vtl) -> Self::StateAccess<'_> {
        // Only VTL0 is supported by this backend.
        assert_eq!(vtl, Vtl::Vtl0);
        self
    }

    /// Resets the VP's architectural state and local synic bookkeeping.
    fn reset(&mut self) -> Result<(), impl std::error::Error + Send + Sync + 'static> {
        use virt::vp::AccessVpState;

        let vp_info = self.inner.vp_info;
        self.access_state(Vtl::Vtl0)
            .reset_all(&vp_info)
            .map_err(|e| ErrorInner::ResetState(Box::new(e)))?;

        self.reset_synic_state();

        Ok::<(), Error>(())
    }
}
620
621impl hv1_hypercall::PostMessage for arch::MshvHypercallHandler<'_> {
622    fn post_message(&mut self, connection_id: u32, message: &[u8]) -> hvdef::HvResult<()> {
623        self.partition
624            .synic_ports
625            .handle_post_message(Vtl::Vtl0, connection_id, false, message)
626    }
627}
628
629impl hv1_hypercall::SignalEvent for arch::MshvHypercallHandler<'_> {
630    fn signal_event(&mut self, connection_id: u32, flag: u16) -> hvdef::HvResult<()> {
631        self.partition
632            .synic_ports
633            .handle_signal_event(Vtl::Vtl0, connection_id, flag)
634    }
635}
636
/// Error type for /dev/mshv operations.
///
/// Wraps the private [`ErrorInner`] so the variant set stays internal.
#[derive(Error, Debug)]
#[error(transparent)]
pub struct Error(ErrorInner);
641
642impl<T: Into<ErrorInner>> From<T> for Error {
643    fn from(err: T) -> Self {
644        Error(err.into())
645    }
646}
647
// TODO: Chunk this up into smaller types.
//
// Internal error variants; several are gated on x86_64 because the
// corresponding operations only exist on that architecture.
#[derive(Error, Debug)]
enum ErrorInner {
    #[error("operation not supported")]
    NotSupported,
    #[error("create_vm failed")]
    CreateVMFailed,
    #[error("failed to initialize VM")]
    CreateVMInitFailed(#[source] anyhow::Error),
    #[error("failed to create VCPU")]
    CreateVcpu(#[source] KernelError),
    #[error("vtl2 not supported")]
    Vtl2NotSupported,
    #[error("isolation not supported")]
    IsolationNotSupported,
    #[error("failed to stat /dev/mshv")]
    AvailableCheck(#[source] io::Error),
    #[cfg(guest_arch = "x86_64")]
    #[error("failed to get partition property")]
    GetPartitionProperty(#[source] KernelError),
    #[error("failed to set partition property")]
    SetPartitionProperty(#[source] KernelError),
    #[error("register access error")]
    Register(#[source] KernelError),
    #[cfg(guest_arch = "x86_64")]
    #[error("failed to get VP state {ty}")]
    GetVpState {
        #[source]
        error: KernelError,
        ty: u8,
    },
    #[cfg(guest_arch = "x86_64")]
    #[error("failed to set VP state {ty}")]
    SetVpState {
        #[source]
        error: KernelError,
        ty: u8,
    },
    #[error("failed to reset state")]
    ResetState(#[source] Box<virt::state::StateError<Error>>),
    #[error("install intercept failed")]
    InstallIntercept(#[source] KernelError),
    #[cfg(guest_arch = "x86_64")]
    #[error("failed to register cpuid override")]
    RegisterCpuid(#[source] KernelError),
    #[cfg(guest_arch = "x86_64")]
    #[error("host does not support required cpu capabilities")]
    Capabilities(#[source] virt::PartitionCapabilitiesError),
    #[error("too many virtual processors: {0}")]
    TooManyVps(u32),
    #[cfg(guest_arch = "x86_64")]
    #[error("unsupported processor vendor: {0:?}")]
    UnsupportedProcessorVendor(hvdef::HvProcessorVendor),
    #[cfg(guest_arch = "x86_64")]
    #[error("failed to create virtual device")]
    NewDevice(#[source] virt::x86::apic_software_device::DeviceIdInUse),
}
705
/// Equivalent to [`MshvError`] but has a much better error message.
#[derive(Error, Debug)]
enum KernelError {
    /// The ioctl itself failed with an errno.
    #[error("kernel error")]
    Kernel(#[source] io::Error),
    /// The ioctl succeeded but the hypercall it issued failed.
    #[error("hypercall {code:#x?} error")]
    Hypercall {
        code: hvdef::HypercallCode,
        #[source]
        error: HvError,
    },
}
718
719impl From<MshvError> for KernelError {
720    fn from(err: MshvError) -> Self {
721        match err {
722            MshvError::Errno(e) => KernelError::Kernel(e.into()),
723            MshvError::Hypercall {
724                code,
725                status_raw,
726                status: _,
727            } => KernelError::Hypercall {
728                code: hvdef::HypercallCode(code),
729                error: HvError::from(
730                    std::num::NonZeroU16::new(status_raw)
731                        .expect("not an error, hypercall returned success"),
732                ),
733            },
734        }
735    }
736}
737
738/// Creates a VM with retry on EINTR.
739fn create_vm_with_retry(
740    mshv: &Mshv,
741    args: &mshv_bindings::mshv_create_partition_v2,
742) -> Result<VmFd, Error> {
743    loop {
744        match mshv.create_vm_with_args(args) {
745            Ok(fd) => return Ok(fd),
746            Err(e) => {
747                if e.errno() == libc::EINTR {
748                    continue;
749                } else {
750                    return Err(ErrorInner::CreateVMFailed.into());
751                }
752            }
753        }
754    }
755}
756
/// Returns the base set of synthetic processor features shared by both
/// architectures. Each architecture may add extra features before passing
/// the result to `set_partition_property`.
///
/// These bits advertise the Hyper-V enlightenment surface (synic, synthetic
/// timers, hypercall extensions, etc.) to the guest.
fn common_synthetic_features() -> hvdef::HvPartitionSyntheticProcessorFeatures {
    hvdef::HvPartitionSyntheticProcessorFeatures::new()
        .with_hypervisor_present(true)
        .with_hv1(true)
        .with_access_vp_run_time_reg(true)
        .with_access_partition_reference_counter(true)
        .with_access_synic_regs(true)
        .with_access_synthetic_timer_regs(true)
        .with_access_intr_ctrl_regs(true)
        .with_access_hypercall_regs(true)
        .with_access_vp_index(true)
        .with_fast_hypercall_output(true)
        .with_direct_synthetic_timers(true)
        .with_extended_processor_masks(true)
        .with_tb_flush_hypercalls(true)
        .with_synthetic_cluster_ipi(true)
        .with_notify_long_spin_wait(true)
        .with_query_numa_distance(true)
        .with_signal_events(true)
        .with_retarget_device_interrupt(true)
}
781
impl PartitionAccessState for MshvPartition {
    type StateAccess<'a> = &'a MshvPartition;

    fn access_state(&self, vtl: Vtl) -> Self::StateAccess<'_> {
        // Only VTL0 is supported by this backend (see `Vtl2NotSupported`).
        assert_eq!(vtl, Vtl::Vtl0);
        self
    }
}
790
/// Tracks the user memory regions currently mapped into the partition.
///
/// Entries are `None` for free slots; `map_range` reuses freed slots since
/// kernel memory slots can be moved but the slot list only grows.
#[derive(Debug, Default)]
struct MshvMemoryRangeState {
    ranges: Vec<Option<mshv_user_mem_region>>,
}
795
impl virt::PartitionMemoryMapper for MshvPartition {
    fn memory_mapper(&self, vtl: Vtl) -> Arc<dyn virt::PartitionMemoryMap> {
        // Only VTL0 is supported by this backend.
        assert_eq!(vtl, Vtl::Vtl0);
        self.inner.clone()
    }
}
802
// TODO: figure out a better abstraction that also works for KVM and WHP.
impl virt::PartitionMemoryMap for MshvPartitionInner {
    /// Maps `size` bytes of host memory at `data` into the guest at `addr`.
    ///
    /// Reuses the kernel memory slot already associated with `data` when
    /// one exists (slots can be moved but not resized); otherwise claims a
    /// free slot or appends a new one.
    unsafe fn map_range(
        &self,
        data: *mut u8,
        size: usize,
        addr: u64,
        writable: bool,
        exec: bool,
    ) -> anyhow::Result<()> {
        let mut state = self.memory.lock();

        // Memory slots cannot be resized but can be moved within the guest
        // address space. Find the existing slot if there is one.
        let mut slot_to_use = None;
        for (slot, range) in state.ranges.iter_mut().enumerate() {
            match range {
                Some(range) if range.userspace_addr == data as u64 => {
                    slot_to_use = Some(slot);
                    break;
                }
                Some(_) => (),
                // Remember a free slot, but keep scanning in case a later
                // slot already maps this userspace address.
                None => slot_to_use = Some(slot),
            }
        }
        if slot_to_use.is_none() {
            slot_to_use = Some(state.ranges.len());
            state.ranges.push(None);
        }
        let slot_to_use = slot_to_use.unwrap();

        // Translate the access flags into the kernel's bit encoding.
        let mut flags = 0;
        if writable {
            flags |= set_bits!(u8, MSHV_SET_MEM_BIT_WRITABLE);
        }
        if exec {
            flags |= set_bits!(u8, MSHV_SET_MEM_BIT_EXECUTABLE);
        }
        let mem_region = mshv_user_mem_region {
            size: size as u64,
            guest_pfn: addr >> HV_PAGE_SHIFT,
            userspace_addr: data as u64,
            flags,
            rsvd: [0; 7],
        };

        let _span = tracing::info_span!(
            "mshv map user memory",
            guest_pfn = mem_region.guest_pfn,
            size = mem_region.size,
            writable,
            exec,
        )
        .entered();
        self.vmfd.map_user_memory(mem_region)?;
        // Record the mapping only after the kernel accepted it.
        state.ranges[slot_to_use] = Some(mem_region);
        Ok(())
    }

    /// Unmaps every mapped region fully contained in `[addr, addr + size)`.
    ///
    /// Panics (via `assert!`) if the range partially overlaps a mapped
    /// region, since whole slots are the only unmappable unit.
    fn unmap_range(&self, addr: u64, size: u64) -> anyhow::Result<()> {
        let unmap_start = addr >> HV_PAGE_SHIFT;
        let unmap_end = (addr + size) >> HV_PAGE_SHIFT;
        let mut state = self.memory.lock();
        for entry in &mut state.ranges {
            let Some(region) = entry.as_ref() else {
                continue;
            };
            let region_start = region.guest_pfn;
            let region_end = region.guest_pfn + (region.size >> HV_PAGE_SHIFT);
            if unmap_start <= region_start && region_end <= unmap_end {
                // Region is fully contained in the unmap range.
                let _span = tracing::info_span!(
                    "mshv unmap user memory",
                    guest_pfn = region.guest_pfn,
                    size = region.size,
                )
                .entered();
                self.vmfd.unmap_user_memory(*region)?;
                // Free the slot for reuse by a later map_range.
                *entry = None;
            } else {
                assert!(
                    region_end <= unmap_start || unmap_end <= region_start,
                    "unmap range partially overlaps a mapped region"
                );
            }
        }
        Ok(())
    }
}
892
/// Holds the state needed to deassign an MSHV ioeventfd on drop.
///
/// The kernel's `mshv_deassign_ioeventfd` matches entries by (eventfd,
/// addr, len, datamatch/wildcard), so we must keep all of these alive
/// for the deassign ioctl.
struct MshvDoorbellEntry {
    // Weak: a live doorbell must not keep the partition alive; if the
    // partition is already gone at drop time, deassign is skipped.
    partition: Weak<MshvPartitionInner>,
    // Clone of the registered eventfd; its raw fd must match the one used
    // in the assign ioctl.
    event: Event,
    // Guest physical address being monitored.
    guest_address: u64,
    // Value to match on guest writes (0 when registered as wildcard).
    datamatch: u64,
    // Access length to match (0 when unspecified).
    len: u32,
    // Flags passed at assign time (DATAMATCH bit when a value was given).
    flags: u32,
}
906
907impl MshvDoorbellEntry {
908    fn new(
909        partition: &Arc<MshvPartitionInner>,
910        guest_address: u64,
911        value: Option<u64>,
912        length: Option<u32>,
913        fd: &Event,
914    ) -> io::Result<MshvDoorbellEntry> {
915        let flags = if value.is_some() {
916            1 << mshv_bindings::MSHV_IOEVENTFD_BIT_DATAMATCH
917        } else {
918            0
919        };
920        let datamatch = value.unwrap_or(0);
921        let len = length.unwrap_or(0);
922        let event = fd.clone();
923
924        let ioeventfd = mshv_bindings::mshv_user_ioeventfd {
925            datamatch,
926            addr: guest_address,
927            len,
928            fd: event.as_fd().as_raw_fd(),
929            flags,
930            ..Default::default()
931        };
932        // SAFETY: `partition.vmfd` is valid because it is owned by
933        // `MshvPartitionInner`. The `ioeventfd` struct is properly
934        // initialized on the stack.
935        let ret = unsafe {
936            libc::ioctl(
937                partition.vmfd.as_raw_fd(),
938                mshv_ioctls::MSHV_IOEVENTFD() as _,
939                std::ptr::from_ref(&ioeventfd),
940            )
941        };
942        if ret < 0 {
943            return Err(io::Error::last_os_error());
944        }
945
946        Ok(Self {
947            partition: Arc::downgrade(partition),
948            event,
949            guest_address,
950            datamatch,
951            len,
952            flags,
953        })
954    }
955}
956
957impl Drop for MshvDoorbellEntry {
958    fn drop(&mut self) {
959        if let Some(partition) = self.partition.upgrade() {
960            let ioeventfd = mshv_bindings::mshv_user_ioeventfd {
961                datamatch: self.datamatch,
962                addr: self.guest_address,
963                len: self.len,
964                fd: self.event.as_fd().as_raw_fd(),
965                flags: self.flags | (1 << mshv_bindings::MSHV_IOEVENTFD_BIT_DEASSIGN),
966                ..Default::default()
967            };
968            // SAFETY: `partition.vmfd` is valid because we successfully
969            // upgraded the weak reference. The `ioeventfd` struct is
970            // properly initialized on the stack.
971            let ret = unsafe {
972                libc::ioctl(
973                    partition.vmfd.as_raw_fd(),
974                    mshv_ioctls::MSHV_IOEVENTFD() as _,
975                    std::ptr::from_ref(&ioeventfd),
976                )
977            };
978            assert!(
979                ret >= 0,
980                "failed to unregister doorbell at {:#x}: {}",
981                self.guest_address,
982                io::Error::last_os_error()
983            );
984        }
985    }
986}
987
988impl DoorbellRegistration for MshvPartition {
989    fn register_doorbell(
990        &self,
991        guest_address: u64,
992        value: Option<u64>,
993        length: Option<u32>,
994        fd: &Event,
995    ) -> io::Result<Box<dyn Send + Sync>> {
996        Ok(Box::new(MshvDoorbellEntry::new(
997            &self.inner,
998            guest_address,
999            value,
1000            length,
1001            fd,
1002        )?))
1003    }
1004}
1005
1006impl virt::synic::Synic for MshvPartitionInner {
1007    fn port_map(&self) -> &virt::synic::SynicPortMap {
1008        &self.synic_ports
1009    }
1010
1011    fn post_message(&self, _vtl: Vtl, vp: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
1012        self.post_message(vp, sint, &HvMessage::new(HvMessageType(typ), 0, payload));
1013    }
1014
1015    fn new_guest_event_port(
1016        self: Arc<Self>,
1017        _vtl: Vtl,
1018        vp: u32,
1019        sint: u8,
1020        flag: u16,
1021    ) -> Box<dyn GuestEventPort> {
1022        Box::new(MshvGuestEventPort {
1023            partition: Arc::downgrade(&self),
1024            params: Arc::new(Mutex::new(MshvEventPortParams {
1025                vp: VpIndex::new(vp),
1026                sint,
1027                flag,
1028            })),
1029        })
1030    }
1031
1032    fn prefer_os_events(&self) -> bool {
1033        false
1034    }
1035}
1036
/// `GuestEventPort` implementation for MSHV partitions.
#[derive(Debug, Clone)]
struct MshvGuestEventPort {
    // Weak so outstanding event ports do not keep the partition alive;
    // signals after partition teardown are silently dropped.
    partition: Weak<MshvPartitionInner>,
    // Shared and mutable so `set_target_vp` retargets interrupts that were
    // already handed out via `interrupt()`.
    params: Arc<Mutex<MshvEventPortParams>>,
}
1043
/// Delivery parameters for a guest event port signal.
#[derive(Debug, Copy, Clone)]
struct MshvEventPortParams {
    // Target virtual processor.
    vp: VpIndex,
    // Target synthetic interrupt (SINT) number.
    sint: u8,
    // Event flag number to set for the SINT.
    flag: u16,
}
1050
1051impl GuestEventPort for MshvGuestEventPort {
1052    fn interrupt(&self) -> Interrupt {
1053        let partition = self.partition.clone();
1054        let params = self.params.clone();
1055        Interrupt::from_fn(move || {
1056            let MshvEventPortParams { vp, sint, flag } = *params.lock();
1057            if let Some(partition) = partition.upgrade() {
1058                partition
1059                    .signal_event_direct(vp.index(), sint, flag)
1060                    .unwrap_or_else(|_| {
1061                        panic!(
1062                            "Failed signal synic sint {} on vp {:?} with flag {}",
1063                            sint, vp, flag
1064                        )
1065                    });
1066            }
1067        })
1068    }
1069
1070    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
1071        self.params.lock().vp = VpIndex::new(vp);
1072        Ok(())
1073    }
1074}