kvm/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#![expect(missing_docs)]
#![cfg(target_os = "linux")]
// UNSAFETY: Calling KVM APIs and IOCTLs and dealing with the raw pointers
// necessary for doing so.
#![expect(unsafe_code)]

pub use kvm_bindings::kvm_ioeventfd_flag_nr_datamatch;
pub use kvm_bindings::kvm_ioeventfd_flag_nr_deassign;
pub use kvm_bindings::*;
use pal::unix::pthread::*;
use parking_lot::RwLock;
use std::fs::File;
use std::io;
use std::marker::PhantomData;
use std::os::unix::prelude::*;
use std::sync::Once;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use thiserror::Error;

mod ioctl {
    use kvm_bindings::*;
    use nix::ioctl_read;
    use nix::ioctl_readwrite;
    use nix::ioctl_write_int_bad;
    use nix::ioctl_write_ptr;
    use nix::request_code_none;
    const KVMIO: u8 = 0xae;
    ioctl_write_int_bad!(kvm_create_vm, request_code_none!(KVMIO, 0x1));
    ioctl_write_int_bad!(kvm_check_extension, request_code_none!(KVMIO, 0x03));
    ioctl_write_int_bad!(kvm_get_vcpu_mmap_size, request_code_none!(KVMIO, 0x04));
    #[cfg(target_arch = "x86_64")]
    ioctl_readwrite!(kvm_get_supported_cpuid, KVMIO, 0x05, kvm_cpuid2);
    ioctl_write_int_bad!(kvm_create_vcpu, request_code_none!(KVMIO, 0x41));
    ioctl_write_ptr!(
        kvm_set_user_memory_region,
        KVMIO,
        0x46,
        kvm_userspace_memory_region
    );
    ioctl_write_ptr!(kvm_irq_line, KVMIO, 0x61, kvm_irq_level);
    ioctl_write_ptr!(kvm_set_gsi_routing, KVMIO, 0x6a, kvm_irq_routing);
    ioctl_write_ptr!(kvm_irqfd, KVMIO, 0x76, kvm_irqfd);
    ioctl_write_int_bad!(kvm_set_boot_cpu_id, request_code_none!(KVMIO, 0x78));
    ioctl_read!(kvm_get_clock, KVMIO, 0x7c, kvm_clock_data);
    ioctl_write_int_bad!(kvm_run, request_code_none!(KVMIO, 0x80));
    // Is *NOT* defined for arm64
    #[cfg(not(target_arch = "aarch64"))]
    ioctl_read!(kvm_get_regs, KVMIO, 0x81, kvm_regs);
    // Is *NOT* defined for arm64
    #[cfg(not(target_arch = "aarch64"))]
    ioctl_write_ptr!(kvm_set_regs, KVMIO, 0x82, kvm_regs);
    ioctl_read!(kvm_get_sregs, KVMIO, 0x83, kvm_sregs);
    ioctl_write_ptr!(kvm_set_sregs, KVMIO, 0x84, kvm_sregs);
    ioctl_readwrite!(kvm_translation, KVMIO, 0x85, kvm_translation);
    ioctl_write_ptr!(kvm_interrupt, KVMIO, 0x86, kvm_interrupt);
    #[cfg(target_arch = "x86_64")]
    ioctl_readwrite!(kvm_get_msrs, KVMIO, 0x88, kvm_msrs);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_msrs, KVMIO, 0x89, kvm_msrs);
    ioctl_write_ptr!(kvm_set_signal_mask, KVMIO, 0x8b, kvm_signal_mask);
    ioctl_read!(kvm_get_fpu, KVMIO, 0x8c, kvm_fpu);
    ioctl_write_ptr!(kvm_set_fpu, KVMIO, 0x8d, kvm_fpu);
    #[cfg(target_arch = "x86_64")]
    ioctl_read!(kvm_get_lapic, KVMIO, 0x8e, kvm_lapic_state);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_lapic, KVMIO, 0x8f, kvm_lapic_state);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_cpuid2, KVMIO, 0x90, kvm_cpuid2);
    ioctl_read!(kvm_get_mp_state, KVMIO, 0x98, kvm_mp_state);
    ioctl_write_ptr!(kvm_set_mp_state, KVMIO, 0x99, kvm_mp_state);
    ioctl_read!(kvm_get_vcpu_events, KVMIO, 0x9f, kvm_vcpu_events);
    ioctl_write_ptr!(kvm_set_vcpu_events, KVMIO, 0xa0, kvm_vcpu_events);
    #[cfg(target_arch = "x86_64")]
    ioctl_read!(kvm_get_debugregs, KVMIO, 0xa1, kvm_debugregs);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_debugregs, KVMIO, 0xa2, kvm_debugregs);
    ioctl_write_ptr!(kvm_enable_cap, KVMIO, 0xa3, kvm_enable_cap);
    #[cfg(target_arch = "x86_64")]
    ioctl_read!(kvm_get_xsave, KVMIO, 0xa4, kvm_xsave);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_xsave, KVMIO, 0xa5, kvm_xsave);
    ioctl_write_ptr!(kvm_signal_msi, KVMIO, 0xa5, kvm_msi);
    #[cfg(target_arch = "x86_64")]
    ioctl_read!(kvm_get_xcrs, KVMIO, 0xa6, kvm_xcrs);
    #[cfg(target_arch = "x86_64")]
    ioctl_write_ptr!(kvm_set_xcrs, KVMIO, 0xa7, kvm_xcrs);
    ioctl_write_ptr!(kvm_get_reg, KVMIO, 0xab, kvm_one_reg);
    ioctl_write_ptr!(kvm_set_reg, KVMIO, 0xac, kvm_one_reg);
    #[cfg(target_arch = "aarch64")]
    ioctl_write_ptr!(kvm_arm_vcpu_init, KVMIO, 0xae, kvm_vcpu_init);
    #[cfg(target_arch = "aarch64")]
    ioctl_read!(kvm_arm_preferred_target, KVMIO, 0xaf, kvm_vcpu_init);
    ioctl_write_ptr!(kvm_ioeventfd, KVMIO, 0x79, kvm_ioeventfd);
    ioctl_write_ptr!(kvm_set_guest_debug, KVMIO, 0x9b, kvm_guest_debug);
    ioctl_readwrite!(kvm_create_device, KVMIO, 0xe0, kvm_create_device);
    ioctl_write_ptr!(kvm_set_device_attr, KVMIO, 0xe1, kvm_device_attr);
}

#[derive(Error, Debug)]
pub enum Error {
    #[error("failed to open /dev/kvm")]
    OpenKvm(#[source] io::Error),
    #[error("SignalMsi")]
    SignalMsi(#[source] nix::Error),
    #[error("SetMemoryRegion")]
    SetMemoryRegion(#[source] nix::Error),
    #[error("CreateVm")]
    CreateVm(#[source] nix::Error),
    #[error("EnableCap({0})")]
    EnableCap(&'static str, #[source] nix::Error),
    #[error("CreateVCpu")]
    CreateVCpu(#[source] nix::Error),
    #[error("GetRegs")]
    GetRegs(#[source] nix::Error),
    #[error("GetSRegs")]
    GetSRegs(#[source] nix::Error),
    #[error("SetRegs")]
    SetRegs(#[source] nix::Error),
    #[error("SetSRegs")]
    SetSRegs(#[source] nix::Error),
    #[error("Run")]
    Run(#[source] nix::Error),
    #[error("GetVCpuMmapSize")]
    GetVCpuMmapSize(#[source] nix::Error),
    #[error("MmapVCpu")]
    MmapVCpu(#[source] io::Error),
    #[error("SetFpu")]
    SetFpu(#[source] nix::Error),
    #[error("GetSupportedCpuid")]
    GetSupportedCpuid(#[source] nix::Error),
    #[error("SetCpuid")]
    SetCpuid(#[source] nix::Error),
    #[error("Interrupt")]
    Interrupt(#[source] nix::Error),
    #[error("GetLApic")]
    GetLApic(#[source] nix::Error),
    #[error("SetLApic")]
    SetLApic(#[source] nix::Error),
    #[error("GetXsave")]
    GetXsave(#[source] nix::Error),
    #[error("SetXsave")]
    SetXsave(#[source] nix::Error),
    #[error("GetDebugRegs")]
    GetDebugRegs(#[source] nix::Error),
    #[error("SetDebugRegs")]
    SetDebugRegs(#[source] nix::Error),
    #[error("GetXcrs")]
    GetXcrs(#[source] nix::Error),
    #[error("SetXcrs")]
    SetXcrs(#[source] nix::Error),
    #[error("xsave is not enabled")]
    XsaveNotEnabled,
    #[error("SetGsiRouting")]
    SetGsiRouting(#[source] nix::Error),
    #[error("IrqLine")]
    IrqLine(#[source] nix::Error),
    #[error("GetMsrs")]
    GetMsrs(#[source] nix::Error),
    #[error("SetMsrs")]
    SetMsrs(#[source] nix::Error),
    #[error("GetMpState")]
    GetMpState(#[source] nix::Error),
    #[error("SetMpState")]
    SetMpState(#[source] nix::Error),
    #[error("GetVcpuEvents")]
    GetVcpuEvents(#[source] nix::Error),
    #[error("SetVcpuEvents")]
    SetVcpuEvents(#[source] nix::Error),
    #[error("TranslateGva")]
    TranslateGva(#[source] nix::Error),
    #[error("unknown exit {0:#x}")]
    UnknownExit(u32),
    #[error("unknown Hyper-V exit {0:#x}")]
    UnknownHvExit(u32),
    #[error("ioeventfd")]
    IoEventFd(#[source] nix::Error),
    #[error("irqfd")]
    IrqFd(#[source] nix::Error),
    #[error("failed to set BSP")]
    SetBsp(#[source] nix::Error),
    #[error("CreateDevice")]
    CreateDevice(#[source] nix::Error),
    #[error("SetDeviceAttr")]
    SetDeviceAttr(#[source] nix::Error),
    #[error("GetClock")]
    GetClock(#[source] nix::Error),
    #[error("SetGuestDebug")]
    SetGuestDebug(#[source] nix::Error),
}

type Result<T, E = Error> = std::result::Result<T, E>;

#[derive(Debug)]
struct Vp {
    vcpu: File,
    run_data: VpPtr,
    thread: RwLock<Option<Pthread>>,
    _phantom: PhantomData<kvm_run>,
}

/// Send+Sync wrapper around the mapped kvm_run pointer.
#[derive(Debug)]
struct VpPtr {
    ptr: *mut kvm_run,
    len: usize,
}

// SAFETY: this type contains a pointer to mapped data. By itself this is
// Send+Sync since it's just a raw pointer value with no methods, but in context
// it must be carefully accessed only by one thread at a time. This is mediated
// by `Vp`.
unsafe impl Send for VpPtr {}
// SAFETY: see above comment
unsafe impl Sync for VpPtr {}

/// An open file to `/dev/kvm`.
#[derive(Debug)]
pub struct Kvm(File);

impl Kvm {
    /// Opens `/dev/kvm`.
    pub fn new() -> Result<Self> {
        let kvm = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .open("/dev/kvm")
            .map_err(Error::OpenKvm)?;

        Ok(Self(kvm))
    }

    /// Returns the CPUID values that are supported by the hypervisor.
    #[cfg(target_arch = "x86_64")]
    pub fn supported_cpuid(&self) -> Result<Vec<kvm_cpuid_entry2>> {
        const MAX_CPUID_ENTRIES: usize = 256;
        let mut supported_cpuid = Cpuid {
            cpuid: kvm_cpuid2 {
                nent: MAX_CPUID_ENTRIES as u32,
                ..Default::default()
            },
            entries: [Default::default(); MAX_CPUID_ENTRIES],
        };

        // TODO: We are not checking for KVM_CAP_EXT_CPUID first.
        // SAFETY: We have allocated an array for the ioctl to write to and correctly specified its size in nent.
        unsafe {
            ioctl::kvm_get_supported_cpuid(self.as_fd().as_raw_fd(), &mut supported_cpuid.cpuid)
                .map_err(Error::GetSupportedCpuid)?;
        }

        Ok(supported_cpuid.entries[..supported_cpuid.cpuid.nent as usize].to_vec())
    }

    pub fn check_extension(&self, extension: u32) -> nix::Result<libc::c_int> {
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe { ioctl::kvm_check_extension(self.as_fd().as_raw_fd(), extension as i32) }
    }

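    /// Creates a new VM.
    ///
    /// # Example
    ///
    /// A minimal sketch of bringing up a VM with one VP (ignored: requires
    /// `/dev/kvm` and a fuller setup than shown here):
    ///
    /// ```ignore
    /// let kvm = Kvm::new()?;
    /// let mut partition = kvm.new_vm()?;
    /// partition.add_vp(0)?;
    /// ```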
    pub fn new_vm(&self) -> Result<Partition> {
        // On ARM64, the machine type encodes the IPA size, so request the
        // maximum the host supports. (Memory isolation could be requested with
        // the `KVM_VM_TYPE_ARM_PROTECTED` flag, but it is not used here.) The
        // fallback machine type 0 implies a 40-bit IPA on ARM64 and is the
        // only option on x86_64.
        let vm_type = self.check_extension(KVM_CAP_ARM_VM_IPA_SIZE).unwrap_or(0);

        // SAFETY: Calling IOCTL as documented, with no special requirements.
        let vm = unsafe {
            let fd =
                ioctl::kvm_create_vm(self.as_fd().as_raw_fd(), vm_type).map_err(Error::CreateVm)?;
            File::from_raw_fd(fd)
        };

        // TODO: We are not checking KVM_CAP_ENABLE_CAP_VM first.
        // TODO: We are not calling KVM_CHECK_EXTENSION first.
        // SAFETY: Calling IOCTLs as documented, with no special requirements.
        #[cfg(target_arch = "x86_64")]
        unsafe {
            // Disable quirks to make KVM behave in a more architecturally
            // correct way.
            // TODO: Investigate using KVM_CAP_DISABLE_QUIRKS2 instead.
            ioctl::kvm_enable_cap(
                vm.as_raw_fd(),
                &kvm_enable_cap {
                    cap: KVM_CAP_DISABLE_QUIRKS,
                    args: [KVM_X86_QUIRK_LINT0_REENABLED.into(), 0, 0, 0],
                    ..Default::default()
                },
            )
            .map_err(|err| Error::EnableCap("disable_quirks", err))?;
        }

        // SAFETY: Calling IOCTL as documented, with no special requirements.
        let mmap_size = unsafe {
            ioctl::kvm_get_vcpu_mmap_size(self.as_fd().as_raw_fd(), 0)
                .map_err(Error::GetVCpuMmapSize)? as usize
        };

        Ok(Partition {
            vm,
            vps: Vec::new(),
            mmap_size,
        })
    }
}

impl AsFd for Kvm {
    fn as_fd(&self) -> BorrowedFd<'_> {
        self.0.as_fd()
    }
}

#[repr(C)]
#[cfg(target_arch = "x86_64")]
struct Cpuid {
    cpuid: kvm_cpuid2,
    entries: [kvm_cpuid_entry2; 256],
}

#[derive(Debug)]
pub struct Partition {
    vm: File,
    vps: Vec<Option<Vp>>,
    mmap_size: usize,
}

impl Partition {
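    /// Enables the split irqchip: the local APIC is emulated in the kernel
    /// while the IOAPIC and PIC are left to user space, with `lines` GSI
    /// routes reserved for IOAPIC pins.
    ///
    /// # Example
    ///
    /// A sketch (ignored): reserve the conventional 24 IOAPIC lines.
    ///
    /// ```ignore
    /// partition.enable_split_irqchip(24)?;
    /// ```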
    pub fn enable_split_irqchip(&self, lines: u32) -> Result<()> {
        // TODO: We are not checking KVM_CAP_ENABLE_CAP_VM first.
        // TODO: We are not calling KVM_CHECK_EXTENSION first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_enable_cap(
                self.vm.as_raw_fd(),
                &kvm_enable_cap {
                    cap: KVM_CAP_SPLIT_IRQCHIP,
                    args: [lines.into(), 0, 0, 0],
                    ..Default::default()
                },
            )
            .map_err(|err| Error::EnableCap("split_irqchip", err))?;
        }
        Ok(())
    }

    /// Enable X2APIC IDs in interrupt and LAPIC APIs.
    #[cfg(target_arch = "x86_64")]
    pub fn enable_x2apic_api(&self) -> Result<()> {
        let flags = KVM_X2APIC_API_USE_32BIT_IDS;
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_enable_cap(
                self.vm.as_raw_fd(),
                &kvm_enable_cap {
                    cap: KVM_CAP_X2APIC_API,
                    args: [flags.into(), 0, 0, 0],
                    ..Default::default()
                },
            )
            .map_err(|err| Error::EnableCap("x2apic_api", err))?;
        }
        Ok(())
    }

    pub fn enable_unknown_msr_exits(&self) -> Result<()> {
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        // TODO: We are not checking KVM_CAP_ENABLE_CAP_VM first.
        unsafe {
            ioctl::kvm_enable_cap(
                self.vm.as_raw_fd(),
                &kvm_enable_cap {
                    cap: KVM_CAP_X86_USER_SPACE_MSR,
                    args: [KVM_MSR_EXIT_REASON_UNKNOWN.into(), 0, 0, 0],
                    ..Default::default()
                },
            )
            .map_err(|err| Error::EnableCap("user_space_msr", err))?;
        }
        Ok(())
    }

    /// Set the VCPU index of the BSP. This must be called before any VCPUs are
    /// created.
    #[cfg(target_arch = "x86_64")]
    pub fn set_bsp(&mut self, vcpu_idx: u32) -> Result<()> {
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_boot_cpu_id(self.vm.as_raw_fd(), vcpu_idx as i32)
                .map_err(Error::SetBsp)?;
        }

        Ok(())
    }

    pub fn add_vp(&mut self, vcpu_idx: u32) -> Result<()> {
        // TODO: We are not checking KVM_CAP_NR_VCPUS or KVM_CAP_MAX_VCPUS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        let vcpu = unsafe {
            let fd = ioctl::kvm_create_vcpu(self.vm.as_raw_fd(), vcpu_idx as i32)
                .map_err(Error::CreateVCpu)?;
            File::from_raw_fd(fd)
        };

        // SAFETY: Calling mmap with a null pointer is valid, and vcpu is guaranteed to have a valid fd.
        let ptr = unsafe {
            let ptr = libc::mmap(
                std::ptr::null_mut(),
                self.mmap_size,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_SHARED,
                vcpu.as_raw_fd(),
                0,
            );
            if ptr == libc::MAP_FAILED {
                return Err(Error::MmapVCpu(io::Error::last_os_error()));
            }
            ptr
        };

        #[cfg(target_arch = "aarch64")]
        {
            // Can request additional features like so:
            let mut kvi = kvm_vcpu_init::default();
            kvi.features[0] |= 1 << KVM_ARM_VCPU_PSCI_0_2;

            if vcpu_idx > 0 {
                kvi.features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF;
            }

            let mut pref_target = kvm_vcpu_init::default();
            // SAFETY: Calling IOCTL as documented, with no special requirements.
            unsafe {
                ioctl::kvm_arm_preferred_target(self.vm.as_raw_fd(), &mut pref_target)
                    .map_err(Error::CreateVCpu)?
            };

            kvi.target = pref_target.target;
            // SAFETY: Calling IOCTL as documented, with no special requirements.
            unsafe { ioctl::kvm_arm_vcpu_init(vcpu.as_raw_fd(), &kvi).map_err(Error::CreateVCpu)? };
        }

        let vp = Vp {
            vcpu,
            run_data: VpPtr {
                ptr: ptr.cast(),
                len: self.mmap_size,
            },
            thread: RwLock::new(None),
            _phantom: PhantomData,
        };
        if self.vps.len() <= vcpu_idx as usize {
            self.vps.resize_with(vcpu_idx as usize + 1, || None);
        }
        assert!(self.vps[vcpu_idx as usize].replace(vp).is_none());

        Ok(())
    }

    pub fn vp(&self, index: u32) -> Processor<'_> {
        Processor(self, index)
    }

    pub fn request_msi(&self, msi: &kvm_msi) -> Result<()> {
        // TODO: We are not checking KVM_CAP_SIGNAL_MSI first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_signal_msi(self.vm.as_raw_fd(), msi).map_err(Error::SignalMsi)?;
        }
        Ok(())
    }

    /// Sets a user memory region, mapping `data` into the guest's physical
    /// address space at `addr`.
    ///
    /// # Safety
    ///
    /// `data` must point to a mapped region of at least `size` bytes that
    /// remains valid for as long as the slot is registered with the VM.
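    ///
    /// # Example
    ///
    /// A sketch (ignored): `mapping` is a hypothetical caller-owned,
    /// page-aligned allocation of `size` bytes, mapped as guest RAM at guest
    /// physical address 0.
    ///
    /// ```ignore
    /// // SAFETY: `mapping` stays mapped until the slot is removed (assumption).
    /// unsafe { partition.set_user_memory_region(0, mapping, size, 0, false)? };
    /// ```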
    pub unsafe fn set_user_memory_region(
        &self,
        slot: u32,
        data: *mut u8,
        size: usize,
        addr: u64,
        readonly: bool,
    ) -> Result<()> {
        let region = kvm_userspace_memory_region {
            slot,
            flags: if readonly { KVM_MEM_READONLY } else { 0 },
            guest_phys_addr: addr,
            memory_size: size as u64,
            userspace_addr: data as usize as u64,
        };
        // SAFETY: the caller guarantees the memory region is valid.
        unsafe {
            ioctl::kvm_set_user_memory_region(self.vm.as_raw_fd(), &region)
                .map_err(Error::SetMemoryRegion)?;
        }
        Ok(())
    }

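    /// Replaces the partition's GSI routing table with `routes`.
    ///
    /// # Example
    ///
    /// A sketch (ignored, hypothetical values): route GSI 5 to an MSI
    /// address/data pair.
    ///
    /// ```ignore
    /// partition.set_gsi_routes(&[(
    ///     5,
    ///     RoutingEntry::Msi {
    ///         address_lo: 0xfee0_0000,
    ///         address_hi: 0,
    ///         data: 0x4041,
    ///     },
    /// )])?;
    /// ```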
    pub fn set_gsi_routes(&self, routes: &[(u32, RoutingEntry)]) -> Result<()> {
        const MAX_ROUTES: usize = 2048;
        assert!(routes.len() <= MAX_ROUTES);

        #[repr(C)]
        struct Routes {
            header: kvm_irq_routing,
            entries: [kvm_irq_routing_entry; MAX_ROUTES],
        }

        let mut kvm_routes = Routes {
            header: Default::default(),
            entries: [Default::default(); MAX_ROUTES],
        };
        for (i, route) in routes.iter().enumerate() {
            let (type_, u) = match route.1 {
                RoutingEntry::Msi {
                    address_lo,
                    address_hi,
                    data,
                } => (
                    KVM_IRQ_ROUTING_MSI,
                    kvm_irq_routing_entry__bindgen_ty_1 {
                        msi: kvm_irq_routing_msi {
                            address_lo,
                            address_hi,
                            data,
                            __bindgen_anon_1: Default::default(),
                        },
                    },
                ),
                RoutingEntry::HvSint { vp, sint } => (
                    KVM_IRQ_ROUTING_HV_SINT,
                    kvm_irq_routing_entry__bindgen_ty_1 {
                        hv_sint: kvm_irq_routing_hv_sint {
                            vcpu: vp,
                            sint: sint.into(),
                        },
                    },
                ),
                RoutingEntry::Irqchip { pin } => (
                    KVM_IRQ_ROUTING_IRQCHIP,
                    kvm_irq_routing_entry__bindgen_ty_1 {
                        irqchip: kvm_irq_routing_irqchip { pin, irqchip: 0 },
                    },
                ),
            };
            kvm_routes.entries[i] = kvm_irq_routing_entry {
                gsi: route.0,
                type_,
                flags: 0,
                pad: 0,
                u,
            };
            kvm_routes.header.nr += 1;
        }

        // TODO: We are not checking KVM_CAP_IRQ_ROUTING first.
        // SAFETY: Our Routes type puts the entries array immediately after the header in memory, as required.
        unsafe {
            ioctl::kvm_set_gsi_routing(self.vm.as_raw_fd(), &kvm_routes.header)
                .map_err(Error::SetGsiRouting)?;
        }
        Ok(())
    }

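    /// Assigns (or deassigns) an eventfd whose signal injects `gsi` into the
    /// guest.
    ///
    /// # Example
    ///
    /// A sketch (ignored): `event` is assumed to be an eventfd owned by the
    /// caller.
    ///
    /// ```ignore
    /// partition.irqfd(5, event.as_raw_fd(), true)?;
    /// ```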
    pub fn irqfd(&self, gsi: u32, event: RawFd, assign: bool) -> Result<()> {
        // TODO: We are not checking KVM_CAP_IRQFD first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_irqfd(
                self.vm.as_raw_fd(),
                &kvm_irqfd {
                    fd: event as u32,
                    gsi,
                    flags: if assign { 0 } else { KVM_IRQFD_FLAG_DEASSIGN },
                    resamplefd: 0,
                    pad: [0; 16],
                },
            )
            .map_err(Error::IrqFd)
            .map(drop)
        }
    }

    pub fn irq_line(&self, gsi: u32, level: bool) -> Result<()> {
        // TODO: We are not checking KVM_CAP_IRQCHIP first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_irq_line(
                self.vm.as_raw_fd(),
                &kvm_irq_level {
                    __bindgen_anon_1: kvm_irq_level__bindgen_ty_1 { irq: gsi },
                    level: level.into(),
                },
            )
            .map_err(Error::IrqLine)?;
        }
        Ok(())
    }

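    /// Registers an eventfd to be signaled on guest writes to `addr`,
    /// optionally matching `datamatch`.
    ///
    /// # Example
    ///
    /// A sketch (ignored, hypothetical values): signal `fd` on 4-byte MMIO
    /// writes of 1 to `0xfe00_0000`.
    ///
    /// ```ignore
    /// partition.ioeventfd(1, 0xfe00_0000, 4, fd, 1 << kvm_ioeventfd_flag_nr_datamatch)?;
    /// ```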
    pub fn ioeventfd(
        &self,
        datamatch: u64,
        addr: u64,
        len: u32,
        fd: i32,
        flags: u32,
    ) -> Result<()> {
        // TODO: We are not checking KVM_CAP_IOEVENTFD first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_ioeventfd(
                self.vm.as_raw_fd(),
                &kvm_ioeventfd {
                    datamatch,
                    addr,
                    len,
                    fd,
                    flags,
                    ..Default::default()
                },
            )
            .map_err(Error::IoEventFd)?;
        };
        Ok(())
    }

    pub fn create_device(&self, ty: u32, flags: u32) -> nix::Result<Device> {
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        // The reference: https://www.kernel.org/doc/html/latest/virt/kvm/api.html#kvm-create-device.
        // The kernel validates the input parameters and returns the appropriate
        // error code.
        unsafe {
            let mut device = kvm_create_device {
                type_: ty,
                fd: 0,
                flags,
            };
            ioctl::kvm_create_device(self.vm.as_raw_fd(), &mut device)?;
            Ok(Device(File::from_raw_fd(device.fd as i32)))
        }
    }

    /// Gets the current kvmclock value.
    pub fn get_clock_ns(&self) -> Result<kvm_clock_data> {
        let mut clock = kvm_clock_data::default();
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_clock(self.vm.as_raw_fd(), &mut clock).map_err(Error::GetClock)?;
        }
        Ok(clock)
    }
}

/// An in-kernel emulated device.
pub struct Device(File);

impl Device {
    /// # Safety
    ///
    /// `addr` must point to the appropriate input for the attribute being
    /// set.
    pub unsafe fn set_device_attr<T>(
        &self,
        group: u32,
        attr: u32,
        addr: &T,
        flags: u32,
    ) -> nix::Result<()> {
        // SAFETY: caller guaranteed.
        unsafe {
            ioctl::kvm_set_device_attr(
                self.0.as_raw_fd(),
                &kvm_device_attr {
                    group,
                    attr: attr as u64,
                    addr: std::ptr::from_ref(addr) as u64,
                    flags,
                },
            )?;
        }
        Ok(())
    }
}

#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum RoutingEntry {
    Irqchip {
        pin: u32,
    },
    Msi {
        address_lo: u32,
        address_hi: u32,
        data: u32,
    },
    HvSint {
        vp: u32,
        sint: u8,
    },
}

pub struct Processor<'a>(&'a Partition, u32);

impl<'a> Processor<'a> {
    pub fn enable_synic(&self) -> Result<()> {
        // TODO: We are not checking KVM_CAP_ENABLE_CAP_VM first.
        // TODO: We are not calling KVM_CHECK_EXTENSION first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_enable_cap(
                self.get().vcpu.as_raw_fd(),
                &kvm_enable_cap {
                    cap: KVM_CAP_HYPERV_SYNIC2,
                    ..Default::default()
                },
            )
            .map_err(|err| Error::EnableCap("hyperv_synic2", err))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
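    /// Sets the vCPU's CPUID entries.
    ///
    /// # Example
    ///
    /// A sketch (ignored): seed the vCPU with the hypervisor-supported leaves.
    ///
    /// ```ignore
    /// let entries = kvm.supported_cpuid()?;
    /// partition.vp(0).set_cpuid(&entries)?;
    /// ```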
    pub fn set_cpuid(&self, entries: &[kvm_cpuid_entry2]) -> Result<()> {
        const MAX_CPUID_ENTRIES: usize = 256;
        assert!(entries.len() <= MAX_CPUID_ENTRIES);

        let mut cpuid: Cpuid = Cpuid {
            cpuid: Default::default(),
            entries: [Default::default(); MAX_CPUID_ENTRIES],
        };
        for (i, e) in entries.iter().enumerate() {
            cpuid.entries[i] = *e;
            cpuid.cpuid.nent += 1;
        }

        // SAFETY: Our Cpuid type puts the entries array immediately after the header in memory, as required.
        unsafe {
            ioctl::kvm_set_cpuid2(self.get().vcpu.as_raw_fd(), &cpuid.cpuid)
                .map_err(Error::SetCpuid)?;
        }
        Ok(())
    }

    fn get(&self) -> &'a Vp {
        self.0.vps[self.1 as usize].as_ref().expect("vp exists")
    }

    /// Forces an exit to be returned from the next call to [`VpRunner::run`].
    ///
    /// Note that this does nothing if a [`VpRunner`] does not currently exist
    /// for this VP, or if this is called from the same thread as the runner.
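    ///
    /// # Example
    ///
    /// A sketch (ignored): cancel a run loop from another thread. `partition`
    /// and the running thread are assumed to be set up elsewhere.
    ///
    /// ```ignore
    /// partition.vp(0).force_exit(); // the runner observes Exit::Interrupted
    /// ```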
    pub fn force_exit(&self) {
        let vp = self.get();
        let thread = vp.thread.read();
        if let Some(thread) = *thread {
            if thread != Pthread::current() {
                thread
                    .signal(libc::SIGRTMIN())
                    .expect("thread cancel signal failed");
            }
        }
    }

    pub fn interrupt(&self, vector: u32) -> Result<()> {
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_interrupt(self.get().vcpu.as_raw_fd(), &kvm_interrupt { irq: vector })
                .map_err(Error::Interrupt)?;
        };
        Ok(())
    }

    /// Sets a register by its raw KVM register ID. Unstructured; this could
    /// be improved with an enum of register IDs.
    pub fn set_reg64(&self, reg_id: u64, value: u64) -> Result<()> {
        let reg = kvm_one_reg {
            id: reg_id,
            addr: std::ptr::from_ref(&value) as u64,
        };
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_reg(self.get().vcpu.as_raw_fd(), &reg).map_err(Error::SetRegs)?;
        }
        Ok(())
    }

    #[cfg(not(target_arch = "aarch64"))]
    pub fn set_regs(&self, regs: &kvm_regs) -> Result<()> {
        // This IOCTL does not work on arm64.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_regs(self.get().vcpu.as_raw_fd(), regs).map_err(Error::SetRegs)?;
        }
        Ok(())
    }

    #[cfg(not(target_arch = "aarch64"))]
    pub fn set_sregs(&self, sregs: &kvm_sregs) -> Result<()> {
        // This IOCTL does not work on arm64.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_sregs(self.get().vcpu.as_raw_fd(), sregs).map_err(Error::SetSRegs)?;
        }
        Ok(())
    }

    /// Gets a register by its raw KVM register ID. Unstructured; this could
    /// be improved with an enum of register IDs.
    pub fn get_reg64(&self, reg_id: u64) -> Result<u64> {
        let mut value: u64 = 0;
        let reg = kvm_one_reg {
            id: reg_id,
            addr: std::ptr::from_mut(&mut value) as u64,
        };
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_reg(self.get().vcpu.as_raw_fd(), &reg).map_err(Error::GetRegs)?;
        }

        Ok(value)
    }

    #[cfg(not(target_arch = "aarch64"))]
    pub fn get_regs(&self) -> Result<kvm_regs> {
        let mut regs = Default::default();
        // This IOCTL does not work on arm64.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_regs(self.get().vcpu.as_raw_fd(), &mut regs).map_err(Error::GetRegs)?;
        }
        Ok(regs)
    }

    #[cfg(not(target_arch = "aarch64"))]
    pub fn get_sregs(&self) -> Result<kvm_sregs> {
        let mut sregs = Default::default();
        // This IOCTL does not work on arm64.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_sregs(self.get().vcpu.as_raw_fd(), &mut sregs)
                .map_err(Error::GetSRegs)?;
        }
        Ok(sregs)
    }

    #[cfg(target_arch = "x86_64")]
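    /// Reads the MSRs in `msrs` into `values`.
    ///
    /// # Example
    ///
    /// A sketch (ignored): read IA32_TIME_STAMP_COUNTER (MSR 0x10).
    ///
    /// ```ignore
    /// let mut tsc = [0u64; 1];
    /// partition.vp(0).get_msrs(&[0x10], &mut tsc)?;
    /// ```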
    pub fn get_msrs(&self, msrs: &[u32], values: &mut [u64]) -> Result<()> {
        const MAX_MSR_ENTRIES: usize = 256;
        assert_eq!(msrs.len(), values.len());
        assert!(msrs.len() <= MAX_MSR_ENTRIES);

        #[repr(C)]
        struct Msrs {
            header: kvm_msrs,
            entries: [kvm_msr_entry; MAX_MSR_ENTRIES],
        }
        let mut input = Msrs {
            header: kvm_msrs {
                nmsrs: msrs.len() as u32,
                ..Default::default()
            },
            entries: [Default::default(); MAX_MSR_ENTRIES],
        };
        for (i, msr) in msrs.iter().enumerate() {
            input.entries[i] = kvm_msr_entry {
                index: *msr,
                reserved: 0,
                data: 0,
            };
        }

        // SAFETY: Our Msrs type puts the entries array immediately after the header in memory, as required.
        unsafe {
            ioctl::kvm_get_msrs(self.get().vcpu.as_raw_fd(), &mut input.header)
                .map_err(Error::GetMsrs)?;
        }
        for (v, e) in values.iter_mut().zip(&input.entries) {
            *v = e.data;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn set_msrs(&self, msrs: &[(u32, u64)]) -> Result<()> {
        const MAX_MSR_ENTRIES: usize = 256;
        assert!(msrs.len() <= MAX_MSR_ENTRIES);

        #[repr(C)]
        struct Msrs {
            header: kvm_msrs,
            entries: [kvm_msr_entry; MAX_MSR_ENTRIES],
        }
        let mut input = Msrs {
            header: kvm_msrs {
                nmsrs: msrs.len() as u32,
                ..Default::default()
            },
            entries: [Default::default(); MAX_MSR_ENTRIES],
        };
        for (i, msr) in msrs.iter().enumerate() {
            input.entries[i] = kvm_msr_entry {
                index: msr.0,
                reserved: 0,
                data: msr.1,
            };
        }

        // SAFETY: Our Msrs type puts the entries array immediately after the header in memory, as required.
        unsafe {
            ioctl::kvm_set_msrs(self.get().vcpu.as_raw_fd(), &input.header)
                .map_err(Error::SetMsrs)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn get_lapic(&self, state: &mut [u8; 1024]) -> Result<()> {
        assert_eq!(size_of_val(state), size_of::<kvm_lapic_state>());

        // TODO: We are not checking KVM_CAP_IRQCHIP first.
        // SAFETY: We have verified that our output buffer is the correct size.
        unsafe {
            ioctl::kvm_get_lapic(self.get().vcpu.as_raw_fd(), state.as_mut_ptr().cast())
                .map_err(Error::GetLApic)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn set_lapic(&self, state: &[u8; 1024]) -> Result<()> {
        assert_eq!(size_of_val(state), size_of::<kvm_lapic_state>());

        // TODO: We are not checking KVM_CAP_IRQCHIP first.
        // SAFETY: We have verified that our input buffer is the correct size.
        unsafe {
            ioctl::kvm_set_lapic(self.get().vcpu.as_raw_fd(), state.as_ptr().cast())
                .map_err(Error::SetLApic)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn get_xsave(&self, state: &mut [u8; 4096]) -> Result<()> {
        assert_eq!(size_of_val(state), size_of::<kvm_xsave>());

        // TODO: We are not checking KVM_CAP_XSAVE2 first.
        // SAFETY: We have verified that our output buffer is the correct size.
        unsafe {
            ioctl::kvm_get_xsave(self.get().vcpu.as_raw_fd(), state.as_mut_ptr().cast())
                .map_err(Error::GetXsave)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn set_xsave(&self, state: &[u8; 4096]) -> Result<()> {
        assert_eq!(size_of_val(state), size_of::<kvm_xsave>());

        // TODO: We are not checking KVM_CAP_XSAVE2 first.
        // SAFETY: We have verified that our input buffer is the correct size.
        unsafe {
            ioctl::kvm_set_xsave(self.get().vcpu.as_raw_fd(), state.as_ptr().cast())
                .map_err(Error::SetXsave)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn set_debug_regs(&self, regs: &DebugRegisters) -> Result<()> {
        let data = kvm_debugregs {
            db: regs.db,
            dr6: regs.dr6,
            dr7: regs.dr7,
            flags: 0,
            reserved: [0; 9],
        };

        // TODO: We are not checking KVM_CAP_DEBUGREGS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_debugregs(self.get().vcpu.as_raw_fd(), &data)
                .map_err(Error::SetDebugRegs)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn get_debug_regs(&self) -> Result<DebugRegisters> {
        let mut data = Default::default();

        // TODO: We are not checking KVM_CAP_DEBUGREGS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_debugregs(self.get().vcpu.as_raw_fd(), &mut data)
                .map_err(Error::GetDebugRegs)?;
        }

        Ok(DebugRegisters {
            db: data.db,
            dr6: data.dr6,
            dr7: data.dr7,
        })
    }

    #[cfg(target_arch = "x86_64")]
    pub fn set_xcr0(&self, value: u64) -> Result<()> {
        let mut data = kvm_xcrs {
            nr_xcrs: 1,
            ..Default::default()
        };
        data.xcrs[0] = kvm_xcr {
            xcr: 0,
            reserved: 0,
            value,
        };

        // TODO: We are not checking KVM_CAP_XCRS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_xcrs(self.get().vcpu.as_raw_fd(), &data).map_err(Error::SetXcrs)?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn get_xcr0(&self) -> Result<u64> {
        let mut data = Default::default();

        // TODO: We are not checking KVM_CAP_XCRS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_xcrs(self.get().vcpu.as_raw_fd(), &mut data).map_err(Error::GetXcrs)?;
        }

        if data.nr_xcrs < 1 {
            return Err(Error::XsaveNotEnabled);
        }
        assert_eq!(data.nr_xcrs, 1);
        assert_eq!(data.xcrs[0].xcr, 0);
        Ok(data.xcrs[0].value)
    }

    pub fn set_mp_state(&self, state: u32) -> Result<()> {
        let state = kvm_mp_state { mp_state: state };
        // TODO: We are not checking KVM_CAP_MP_STATE first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_mp_state(self.get().vcpu.as_raw_fd(), &state)
                .map_err(Error::SetMpState)?;
        }
        Ok(())
    }

    pub fn get_mp_state(&self) -> Result<u32> {
        let mut state = Default::default();
        // TODO: We are not checking KVM_CAP_MP_STATE first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_mp_state(self.get().vcpu.as_raw_fd(), &mut state)
                .map_err(Error::GetMpState)?;
        }
        Ok(state.mp_state)
    }

    pub fn set_vcpu_events(&self, events: &kvm_vcpu_events) -> Result<()> {
        // TODO: We are not checking KVM_CAP_VCPU_EVENTS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_vcpu_events(self.get().vcpu.as_raw_fd(), events)
                .map_err(Error::SetVcpuEvents)?;
        }
        Ok(())
    }

    pub fn get_vcpu_events(&self) -> Result<kvm_vcpu_events> {
        let mut events = Default::default();
        // TODO: We are not checking KVM_CAP_VCPU_EVENTS first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_get_vcpu_events(self.get().vcpu.as_raw_fd(), &mut events)
                .map_err(Error::GetVcpuEvents)?;
        }
        Ok(events)
    }

    pub fn translate_gva(&self, gva: u64) -> Result<kvm_translation> {
        let mut translation = kvm_translation {
            linear_address: gva,
            ..Default::default()
        };

        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_translation(self.get().vcpu.as_raw_fd(), &mut translation)
                .map_err(Error::TranslateGva)?;
        }

        Ok(translation)
    }

    /// Sets the guest debugging state: `control` bits `KVM_GUESTDBG_*`, `db`
    /// containing DR0 through DR3, and `dr7`.
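    ///
    /// # Example
    ///
    /// A sketch (ignored): enable single-stepping with no hardware
    /// breakpoints, where `vp` is a [`Processor`] obtained elsewhere.
    ///
    /// ```ignore
    /// vp.set_guest_debug(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP, [0; 4], 0)?;
    /// ```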
    #[cfg(target_arch = "x86_64")]
    pub fn set_guest_debug(&self, control: u32, db: [u64; 4], dr7: u64) -> Result<()> {
        // N.B. Debug registers 4 through 6 are not used by KVM in this path.
        let debug = kvm_guest_debug {
            control,
            pad: 0,
            arch: kvm_guest_debug_arch {
                debugreg: [db[0], db[1], db[2], db[3], 0, 0, 0, dr7],
            },
        };

        // TODO: We are not checking KVM_CAP_SET_GUEST_DEBUG first.
        // SAFETY: Calling IOCTL as documented, with no special requirements.
        unsafe {
            ioctl::kvm_set_guest_debug(self.get().vcpu.as_raw_fd(), &debug)
                .map_err(Error::SetGuestDebug)?;
        }
        Ok(())
    }

    /// # Safety
    ///
    /// `addr` must point to the appropriate input for the attribute being
    /// set.
    pub unsafe fn set_device_attr<T>(
        &self,
        group: u32,
        attr: u32,
        addr: &T,
        flags: u32,
    ) -> nix::Result<libc::c_int> {
        // SAFETY: caller guaranteed.
        unsafe {
            ioctl::kvm_set_device_attr(
                self.get().vcpu.as_raw_fd(),
                &kvm_device_attr {
                    group,
                    attr: attr as u64,
                    addr: std::ptr::from_ref(addr) as u64,
                    flags,
                },
            )
        }
    }

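    /// Creates a [`VpRunner`] for this VP, binding it to the current thread.
    ///
    /// # Example
    ///
    /// A sketch of a dispatch loop (ignored: requires a configured VM; the
    /// handling shown is hypothetical):
    ///
    /// ```ignore
    /// let mut runner = partition.vp(0).runner();
    /// loop {
    ///     match runner.run()? {
    ///         Exit::Interrupted => {} // interrupted by a signal; poll for work
    ///         Exit::MmioRead { data, .. } => data.fill(0xff),
    ///         Exit::Shutdown => break,
    ///         exit => todo!("handle {exit:?}"),
    ///     }
    /// }
    /// ```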
    pub fn runner(&self) -> VpRunner<'a> {
        // Ensure this thread is uniquely running the VP, and store the thread
        // ID to support cancellation.
        assert!(
            self.get()
                .thread
                .write()
                .replace(Pthread::current())
                .is_none()
        );

        VpRunner {
            partition: self.0,
            idx: self.1,
            _not_send_sync: PhantomData,
        }
    }
}

pub struct VpRunner<'a> {
    partition: &'a Partition,
    idx: u32,
    // This type stores the current thread in `partition` and removes it in
    // `drop`, so don't allow sending or sharing this.
    _not_send_sync: PhantomData<*const u8>,
}

impl Drop for VpRunner<'_> {
    fn drop(&mut self) {
        // The thread is no longer in use.
        let thread = self.get().thread.write().take();
        assert_eq!(thread, Some(Pthread::current()));
    }
}

impl<'a> VpRunner<'a> {
    fn get(&self) -> &'a Vp {
        self.partition.vp(self.idx).get()
    }

    fn run_data(&mut self) -> &mut kvm_run {
        let vp = self.get();
        // SAFETY: there are no other references to this data right
        // now since this thread is uniquely processing the VP, and
        // the VP is not running (so the kernel is not mutating the
        // structure either).
        unsafe { &mut *vp.run_data.ptr }
    }

    fn run_data_slice(&mut self) -> &mut [u8] {
        let vp = self.get();
        // SAFETY: there are no other references to this data right
        // now since this thread is uniquely processing the VP, and
        // the VP is not running (so the kernel is not mutating the
        // structure either).
        unsafe { std::slice::from_raw_parts_mut(vp.run_data.ptr.cast::<u8>(), vp.run_data.len) }
    }

    /// Issues an IOCTL to run the VP.
    fn run_vp_once(&mut self) -> Result<bool> {
        CURRENT_KVM_RUN.with(|r| {
            let vp = self.get();

            // Clear immediate_exit before giving up exclusive ownership of the
            // kvm_run structure.
            self.run_data().immediate_exit = 0;

            // Swap the kvm_run structure pointer in so the signal handler can set
            // immediate_exit if the signal arrives just before the kvm_run ioctl.
            match r.swap(vp.run_data.ptr as usize, Ordering::Relaxed) {
                NO_KVM_RUN => {}
                CANCEL_KVM_RUN => {
                    // A cancel request signal arrived before the swap. Set
                    // immediate_exit so that any pending exit gets completed,
                    // and then the IOCTL returns before actually running the
                    // VP.
                    //
                    // The kvm_run structure is now aliased, so don't call
                    // `run_data()` to get it.
                    //
                    // SAFETY: the signal thread that might access the structure
                    // will also use `set_immediate_exit`.
                    unsafe { set_immediate_exit(vp.run_data.ptr) };
                }
                state => unreachable!("unexpected state {:#x}", state),
            }

            // SAFETY: Calling IOCTL as documented, with no special requirements.
            let result = unsafe { ioctl::kvm_run(vp.vcpu.as_raw_fd(), 0) };
            r.store(NO_KVM_RUN, Ordering::Relaxed);
            match result {
                Ok(_) => Ok(true),
                Err(err) => match err {
                    nix::errno::Errno::EINTR | nix::errno::Errno::EAGAIN => Ok(false),
                    _ => Err(Error::Run(err)),
                },
            }
        })
    }

    /// Completes the current exit without running the VP further.
    ///
    /// This may generate more exits.
    pub fn complete_exit(&mut self) -> Result<Exit<'_>, Error> {
        CURRENT_KVM_RUN.with(|run| run.store(CANCEL_KVM_RUN, Ordering::Relaxed));
        self.run()
    }

    /// Continues running the VP.
    ///
    /// Runs until an exit occurs, or until the run is interrupted by a signal
    /// or a call to [`Processor::force_exit`].
    pub fn run(&mut self) -> Result<Exit<'_>, Error> {
        if !self.run_vp_once()? {
            return Ok(Exit::Interrupted);
        }

        let exit = match self.run_data().exit_reason {
            KVM_EXIT_DEBUG => {
                // SAFETY: this is the active union field.
                let debug = unsafe { &self.run_data().__bindgen_anon_1.debug };

                #[cfg(not(target_arch = "x86_64"))]
                {
                    _ = debug;
                    todo!("debug exit on non-x86_64")
                }

                #[cfg(target_arch = "x86_64")]
                {
                    Exit::Debug {
                        exception: debug.arch.exception,
                        pc: debug.arch.pc,
                        dr6: debug.arch.dr6,
                        dr7: debug.arch.dr7,
                    }
                }
            }
            KVM_EXIT_IO => {
                // SAFETY: this is the active union field.
                let io = unsafe { self.run_data().__bindgen_anon_1.io };

                let offset = io.data_offset as usize;
                let data = &mut self.run_data_slice()
                    [offset..offset + io.size as usize * io.count as usize];
                if io.direction == KVM_EXIT_IO_IN as u8 {
                    Exit::IoIn {
                        port: io.port,
                        size: io.size,
                        data,
                    }
                } else {
                    Exit::IoOut {
                        port: io.port,
                        size: io.size,
                        data,
                    }
                }
            }
            KVM_EXIT_IRQ_WINDOW_OPEN => {
                let rdata = self.run_data();
                assert!(rdata.ready_for_interrupt_injection != 0);
                rdata.request_interrupt_window = 0;
                Exit::InterruptWindow
            }
            KVM_EXIT_MMIO => {
                // SAFETY: this is the active union field.
                let mmio = unsafe { &mut self.run_data().__bindgen_anon_1.mmio };
                if mmio.is_write != 0 {
                    Exit::MmioWrite {
                        address: mmio.phys_addr,
                        data: &mmio.data[0..mmio.len as usize],
                    }
                } else {
                    mmio.data = [0; 8];
                    Exit::MmioRead {
                        address: mmio.phys_addr,
                        data: &mut mmio.data[0..mmio.len as usize],
                    }
                }
            }
            KVM_EXIT_SHUTDOWN => Exit::Shutdown,
            KVM_EXIT_HYPERV => {
                // SAFETY: this is the active union field.
                let hyperv = unsafe { &mut self.run_data().__bindgen_anon_1.hyperv };
                match hyperv.type_ {
                    KVM_EXIT_HYPERV_HCALL => {
                        // SAFETY: this is the active union field.
                        let hcall = unsafe { &mut hyperv.u.hcall };
                        Exit::HvHypercall {
                            input: hcall.input,
                            result: &mut hcall.result,
                            params: hcall.params,
                        }
                    }
                    KVM_EXIT_HYPERV_SYNIC => {
                        // SAFETY: this is the active union field.
                        let synic = unsafe { &hyperv.u.synic };
                        Exit::SynicUpdate {
                            msr: synic.msr,
                            control: synic.control,
                            siefp: synic.evt_page,
                            simp: synic.msg_page,
                        }
                    }
                    _ => return Err(Error::UnknownHvExit(hyperv.type_)),
                }
            }
            KVM_EXIT_IOAPIC_EOI => {
                // SAFETY: this is the active union field.
                let eoi = unsafe { &mut self.run_data().__bindgen_anon_1.eoi };

                Exit::Eoi { irq: eoi.vector }
            }
            KVM_EXIT_FAIL_ENTRY => {
                // SAFETY: this is the active union field.
                let fail_entry = unsafe { &self.run_data().__bindgen_anon_1.fail_entry };
                Exit::FailEntry {
                    hardware_entry_failure_reason: fail_entry.hardware_entry_failure_reason,
                }
            }
            KVM_EXIT_INTERNAL_ERROR => {
                // SAFETY: this is the active union field.
                let internal = unsafe { &self.run_data().__bindgen_anon_1.internal };
                if internal.suberror == KVM_INTERNAL_ERROR_EMULATION {
                    // FUTURE: update bindings and get the instruction bytes when they are present.
                    Exit::EmulationFailure {
                        instruction_bytes: &[],
                    }
                } else {
                    Exit::InternalError {
                        error: internal.suberror,
                        data: &internal.data[..internal.ndata as usize],
                    }
                }
            }
            KVM_EXIT_X86_WRMSR => {
                // SAFETY: this is the active union field.
                let msr = unsafe { &mut self.run_data().__bindgen_anon_1.msr };
                msr.error = 0;
                Exit::MsrWrite {
                    index: msr.index,
                    data: msr.data,
                    error: &mut msr.error,
                }
            }
            KVM_EXIT_X86_RDMSR => {
                // SAFETY: this is the active union field.
                let msr = unsafe { &mut self.run_data().__bindgen_anon_1.msr };
                msr.data = 0;
                msr.error = 0;
                Exit::MsrRead {
                    index: msr.index,
                    data: &mut msr.data,
                    error: &mut msr.error,
                }
            }
            exit_reason => return Err(Error::UnknownExit(exit_reason)),
        };
        Ok(exit)
    }

    /// Request an exit when the interrupt window opens.
    ///
    /// Returns true if the window is already open (in which case the request is
    /// not registered).
    #[must_use]
    pub fn check_or_request_interrupt_window(&mut self) -> bool {
        let rdata = self.run_data();
        if rdata.ready_for_interrupt_injection != 0 {
            true
        } else {
            rdata.request_interrupt_window = 1;
            false
        }
    }

    /// Injects an extint interrupt.
    ///
    /// The caller must ensure either that it has received an
    /// [`Exit::InterruptWindow`] exit, or that
    /// [`Self::check_or_request_interrupt_window`] has returned `true`.
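    ///
    /// # Example
    ///
    /// A sketch (ignored) of the expected pattern, with `vector` assumed to
    /// come from the caller's interrupt controller:
    ///
    /// ```ignore
    /// if runner.check_or_request_interrupt_window() {
    ///     runner.inject_extint_interrupt(vector)?;
    /// }
    /// ```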
    pub fn inject_extint_interrupt(&mut self, vector: u8) -> Result<()> {
        self.partition.vp(self.idx).interrupt(vector.into())?;
        // Remember that there is a pending extint interrupt. KVM will update
        // this field again after the VP runs.
        self.run_data().ready_for_interrupt_injection = 0;
        Ok(())
    }
}

#[derive(Debug)]
pub enum Exit<'a> {
    Interrupted,
    InterruptWindow,
    IoIn {
        port: u16,
        size: u8,
        data: &'a mut [u8],
    },
    IoOut {
        port: u16,
        size: u8,
        data: &'a [u8],
    },
    MmioRead {
        address: u64,
        data: &'a mut [u8],
    },
    MmioWrite {
        address: u64,
        data: &'a [u8],
    },
    MsrRead {
        index: u32,
        data: &'a mut u64,
        error: &'a mut u8,
    },
    MsrWrite {
        index: u32,
        data: u64,
        error: &'a mut u8,
    },
    Shutdown,
    FailEntry {
        hardware_entry_failure_reason: u64,
    },
    InternalError {
        error: u32,
        data: &'a [u64],
    },
    EmulationFailure {
        instruction_bytes: &'a [u8],
    },
    SynicUpdate {
        msr: u32,
        control: u64,
        siefp: u64,
        simp: u64,
    },
    HvHypercall {
        input: u64,
        result: &'a mut u64,
        params: [u64; 2],
    },
    Debug {
        exception: u32,
        pc: u64,
        dr6: u64,
        dr7: u64,
    },
    Eoi {
        irq: u8,
    },
}

/// Sets up the signal handler used to cause [`VpRunner::run`] to return.
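///
/// # Example
///
/// A sketch (ignored): call once per process before running VPs, assuming this
/// crate is named `kvm`.
///
/// ```ignore
/// kvm::init();
/// ```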
pub fn init() {
    static SIGNAL_HANDLER_INIT: Once = Once::new();
    SIGNAL_HANDLER_INIT.call_once(|| {
        let handler = || {
            CURRENT_KVM_RUN.with(|run| {
                // This interrupts the other code that accesses CURRENT_KVM_RUN, so a
                // compare_exchange is not necessary.
                let rdata = run.load(Ordering::Relaxed);
                match rdata {
                    NO_KVM_RUN => run.store(CANCEL_KVM_RUN, Ordering::Relaxed),
                    CANCEL_KVM_RUN => {}
                    _ => {
                        // SAFETY: other concurrent accesses to the structure are via
                        // `set_immediate_exit` or via atomic accesses in the kernel.
                        unsafe { set_immediate_exit(rdata as *mut kvm_run) };
                    }
                }
            })
        };
        // Ensure the thread local is initialized.
        CURRENT_KVM_RUN.with(|value| {
            std::hint::black_box(value);
        });
        // SAFETY: The signal handler does not perform any actions that are forbidden
        // for signal handlers to perform, as it only performs thread-local and atomic
        // reads and writes. We are guaranteed to not interrupt thread local initialization
        // as we have ensured it is initialized above.
        unsafe {
            signal_hook::low_level::register(libc::SIGRTMIN(), handler).unwrap();
        }
    });
}

const NO_KVM_RUN: usize = 0;
const CANCEL_KVM_RUN: usize = 1;

thread_local! {
    static CURRENT_KVM_RUN: AtomicUsize = const { AtomicUsize::new(NO_KVM_RUN) };
}

/// Sets `rdata.immediate_exit` to 1 without constructing a mutable reference.
///
/// This can be used when the kvm_run is aliased by the kernel or by other
/// threads that might call this function.
///
/// # Safety
///
/// `rdata` must point to a mapped `kvm_run` structure that remains valid for
/// the duration of the call.
unsafe fn set_immediate_exit(rdata: *mut kvm_run) {
    // SAFETY: rdata may be aliased by the kernel right now, so it's
    // not safe to construct a mutable reference to it. Use an
    // atomic store to carefully write without requiring a mutable
    // reference.
    unsafe {
        (*(std::ptr::addr_of!((*rdata).immediate_exit).cast::<AtomicU8>()))
            .store(1, Ordering::Relaxed);
    }
}

pub struct DebugRegisters {
    /// DR0-3.
    pub db: [u64; 4],
    pub dr6: u64,
    pub dr7: u64,
}