virt_kvm/arch/x86_64/
mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! This module implements support for KVM on x86_64.
5
6#![cfg(all(target_os = "linux", guest_is_native, guest_arch = "x86_64"))]
7
8mod regs;
9mod vm_state;
10mod vp_state;
11
12use crate::KvmError;
13use crate::KvmPartition;
14use crate::KvmPartitionInner;
15use crate::KvmProcessorBinder;
16use crate::KvmRunVpError;
17use crate::gsi;
18use crate::gsi::GsiRouting;
19use guestmem::DoorbellRegistration;
20use guestmem::GuestMemory;
21use guestmem::GuestMemoryError;
22use hv1_emulator::message_queues::MessageQueues;
23use hvdef::HV_PAGE_SIZE;
24use hvdef::HvError;
25use hvdef::HvMessage;
26use hvdef::HvMessageType;
27use hvdef::HvSynicScontrol;
28use hvdef::HvSynicSimpSiefp;
29use hvdef::HypercallCode;
30use hvdef::Vtl;
31use hvdef::hypercall::Control;
32use inspect::Inspect;
33use inspect::InspectMut;
34use kvm::KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
35use kvm::kvm_ioeventfd_flag_nr_datamatch;
36use kvm::kvm_ioeventfd_flag_nr_deassign;
37use pal_event::Event;
38use parking_lot::Mutex;
39use parking_lot::RwLock;
40use pci_core::msi::MsiControl;
41use pci_core::msi::MsiInterruptTarget;
42use std::convert::Infallible;
43use std::future::poll_fn;
44use std::io;
45use std::os::unix::prelude::*;
46use std::sync::Arc;
47use std::sync::Weak;
48use std::sync::atomic::AtomicBool;
49use std::sync::atomic::Ordering;
50use std::task::Poll;
51use std::time::Duration;
52use thiserror::Error;
53use virt::CpuidLeaf;
54use virt::CpuidLeafSet;
55use virt::Hv1;
56use virt::NeedsYield;
57use virt::Partition;
58use virt::PartitionAccessState;
59use virt::PartitionConfig;
60use virt::Processor;
61use virt::ProtoPartition;
62use virt::ProtoPartitionConfig;
63use virt::ResetPartition;
64use virt::StopVp;
65use virt::VpHaltReason;
66use virt::VpIndex;
67use virt::io::CpuIo;
68use virt::irqcon::DeliveryMode;
69use virt::irqcon::IoApicRouting;
70use virt::irqcon::MsiRequest;
71use virt::state::StateElement;
72use virt::vm::AccessVmState;
73use virt::x86::HardwareBreakpoint;
74use virt::x86::max_physical_address_size_from_cpuid;
75use virt::x86::vp::AccessVpState;
76use vm_topology::processor::x86::ApicMode;
77use vm_topology::processor::x86::X86VpInfo;
78use vmcore::interrupt::Interrupt;
79use vmcore::reference_time::GetReferenceTime;
80use vmcore::reference_time::ReferenceTimeResult;
81use vmcore::reference_time::ReferenceTimeSource;
82use vmcore::synic::GuestEventPort;
83use vmcore::vmtime::VmTime;
84use vmcore::vmtime::VmTimeAccess;
85use vp_state::KvmVpStateAccess;
86use x86defs::cpuid::CpuidFunction;
87use x86defs::msi::MsiAddress;
88use x86defs::msi::MsiData;
89use zerocopy::IntoBytes;
90
// HACK: on certain machines, pcat spams these MSRs during boot.
//
// As a workaround, avoid injecting a #GP on these mystery MSRs until we can
// get to the bottom of what's going on here.
const MYSTERY_MSRS: &[u32] = &[0x88, 0x89, 0x8a, 0x116, 0x118, 0x119, 0x11a, 0x11b, 0x11e];
96
/// Marker type implementing [`virt::Hypervisor`] for KVM on x86-64.
#[derive(Debug)]
pub struct Kvm;
99
/// CPUID leaf and flag for GB page support.
const GB_PAGE_LEAF: u32 = 0x80000001;
/// 1-GB page support bit: bit 26 of EDX in extended leaf 0x80000001.
const GB_PAGE_FLAG: u32 = 1 << 26;
103
104/// Returns whether the host supports GB pages in the page table.
105fn gb_pages_supported() -> bool {
106    safe_intrinsics::cpuid(0x80000000, 0).eax >= GB_PAGE_LEAF
107        && safe_intrinsics::cpuid(GB_PAGE_LEAF, 0).edx & GB_PAGE_FLAG != 0
108}
109
impl virt::Hypervisor for Kvm {
    type ProtoPartition<'a> = KvmProtoPartition<'a>;
    type Partition = KvmPartition;
    type Error = KvmError;

    /// Opens KVM, computes the guest's CPUID leaf set, and creates the
    /// underlying KVM VM with a split irqchip (the IOAPIC/PIC are emulated
    /// in user space).
    ///
    /// Returns a [`KvmProtoPartition`]; the partition is completed by
    /// [`ProtoPartition::build`].
    fn new_partition<'a>(
        &mut self,
        config: ProtoPartitionConfig<'a>,
    ) -> Result<Self::ProtoPartition<'a>, Self::Error> {
        // This backend has no support for isolated guests.
        if config.isolation.is_isolated() {
            return Err(KvmError::IsolationNotSupported);
        }

        // Start from the CPUID leaves KVM reports as supported.
        let kvm = kvm::Kvm::new()?;
        let mut cpuid_entries = kvm
            .supported_cpuid()?
            .into_iter()
            .filter_map(|entry| {
                // Filter out KVM CPUID entries.
                // (0x4000_00xx is the hypervisor leaf range; any Hyper-V
                // leaves are added explicitly below instead.)
                if entry.function & 0xf0000000 == 0x40000000 {
                    return None;
                }
                let mut leaf =
                    CpuidLeaf::new(entry.function, [entry.eax, entry.ebx, entry.ecx, entry.edx]);
                if entry.flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX != 0 {
                    leaf = leaf.indexed(entry.index);
                }
                Some(leaf)
            })
            .collect::<Vec<_>>();

        // Add in GB page support based on the host's capabilities. This bit
        // is incorrectly stripped by some versions of KVM (but is important
        // to have for our UEFI implementation).
        if gb_pages_supported()
            && cpuid_entries
                .iter()
                .any(|x| x.function == CpuidFunction::ExtendedVersionAndFeatures.0)
        {
            cpuid_entries.push(
                CpuidLeaf::new(
                    CpuidFunction::ExtendedVersionAndFeatures.0,
                    [0, 0, 0, GB_PAGE_FLAG],
                )
                .masked([0, 0, 0, GB_PAGE_FLAG]),
            );
        }

        match config.processor_topology.apic_mode() {
            ApicMode::XApic => {
                // Disable X2APIC.
                // (Bit 21 of leaf 1's ECX is the x2APIC feature bit; mask it
                // to zero so the guest does not try to enable x2APIC.)
                cpuid_entries.push(
                    CpuidLeaf::new(CpuidFunction::VersionAndFeatures.0, [0, 0, 0, 0]).masked([
                        0,
                        0,
                        1 << 21,
                        0,
                    ]),
                );
            }
            ApicMode::X2ApicSupported | ApicMode::X2ApicEnabled => {}
        }

        // SGX is not supported on KVM.
        cpuid_entries.push(
            CpuidLeaf::new(CpuidFunction::SgxEnumeration.0, [0; 4]).indexed(2), // SGX enumeration is subleaf 2
        );

        // When Hyper-V emulation is requested, advertise the Hyper-V
        // hypervisor interface via the 0x4000_00xx CPUID leaves.
        if let Some(hv_config) = &config.hv_config {
            if hv_config.vtl2.is_some() {
                return Err(KvmError::Vtl2NotSupported);
            }

            // Splits a 128-bit value into four little-endian 32-bit CPUID
            // registers (eax, ebx, ecx, edx).
            let split_u128 = |x: u128| -> [u32; 4] {
                let bytes = x.to_le_bytes();
                [
                    u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
                    u32::from_le_bytes(bytes[4..8].try_into().unwrap()),
                    u32::from_le_bytes(bytes[8..12].try_into().unwrap()),
                    u32::from_le_bytes(bytes[12..16].try_into().unwrap()),
                ]
            };

            use hvdef::*;
            // Only the privileges actually backed by this implementation are
            // exposed to the guest.
            let privileges = HvPartitionPrivilege::new()
                .with_access_partition_reference_counter(true)
                .with_access_hypercall_msrs(true)
                .with_access_vp_index(true)
                .with_access_frequency_msrs(true)
                .with_access_synic_msrs(true)
                .with_access_synthetic_timer_msrs(true)
                .with_access_vp_runtime_msr(true)
                .with_access_apic_msrs(true);

            let hv_cpuid = &[
                // Vendor leaf: maximum hypervisor leaf plus "Microsoft Hv".
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION,
                    [
                        HV_CPUID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS,
                        u32::from_le_bytes(*b"Micr"),
                        u32::from_le_bytes(*b"osof"),
                        u32::from_le_bytes(*b"t Hv"),
                    ],
                ),
                // Interface identification: "Hv#1".
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_HV_INTERFACE,
                    [u32::from_le_bytes(*b"Hv#1"), 0, 0, 0],
                ),
                CpuidLeaf::new(HV_CPUID_FUNCTION_MS_HV_VERSION, [0, 0, 0, 0]),
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_MS_HV_FEATURES,
                    split_u128(u128::from(
                        HvFeatures::new()
                            .with_privileges(privileges)
                            .with_frequency_regs_available(true),
                    )),
                ),
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION,
                    split_u128(
                        HvEnlightenmentInformation::new()
                            .with_deprecate_auto_eoi(true)
                            .with_long_spin_wait_count(0xffffffff) // no spin wait notifications
                            .into(),
                    ),
                ),
            ];

            cpuid_entries.extend(hv_cpuid);
        }

        let cpuid_entries = CpuidLeafSet::new(cpuid_entries);

        let vm = kvm.new_vm()?;
        vm.enable_split_irqchip(virt::irqcon::IRQ_LINES as u32)?;
        vm.enable_x2apic_api()?;
        // Route unhandled MSR accesses to user space instead of injecting #GP
        // in the kernel; see the MSR handling in the run loop.
        vm.enable_unknown_msr_exits()?;

        Ok(KvmProtoPartition {
            vm,
            config,
            cpuid: cpuid_entries,
        })
    }

    /// Reports whether KVM appears to be available by probing for /dev/kvm.
    fn is_available(&self) -> Result<bool, Self::Error> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(KvmError::AvailableCheck(err)),
        }
    }
}
263
/// A prototype partition.
pub struct KvmProtoPartition<'a> {
    /// The underlying KVM VM, created but not yet fully configured.
    vm: kvm::Partition,
    /// The configuration captured at [`virt::Hypervisor::new_partition`] time.
    config: ProtoPartitionConfig<'a>,
    /// The CPUID leaves to apply to each VP at build time.
    cpuid: CpuidLeafSet,
}
270
impl ProtoPartition for KvmProtoPartition<'_> {
    type Partition = KvmPartition;
    type Error = KvmError;
    type ProcessorBinder = KvmProcessorBinder;

    /// Evaluates the guest-visible CPUID result for `eax`/`ecx` against the
    /// prototype's leaf set (zeros when no leaf matches).
    fn cpuid(&self, eax: u32, ecx: u32) -> [u32; 4] {
        self.cpuid.result(eax, ecx, &[0; 4])
    }

    /// Derives the guest's maximum physical address width from CPUID.
    fn max_physical_address_size(&self) -> u8 {
        max_physical_address_size_from_cpuid(&|eax, ecx| self.cpuid(eax, ecx))
    }

    /// Completes partition construction: creates and configures the VCPUs,
    /// establishes GSI routing, and produces the partition plus one
    /// processor binder per VP.
    fn build(
        mut self,
        config: PartitionConfig<'_>,
    ) -> Result<(Self::Partition, Vec<Self::ProcessorBinder>), Self::Error> {
        // Merge caller-provided CPUID leaves on top of the prototype's.
        let mut cpuid = self.cpuid.into_leaves();
        cpuid.extend(config.cpuid);
        let cpuid = CpuidLeafSet::new(cpuid);

        // Tell KVM which APIC ID is the BSP if it is not the default of 0.
        let bsp_apic_id = self.config.processor_topology.vp_arch(VpIndex::BSP).apic_id;
        if bsp_apic_id != 0 {
            self.vm.set_bsp(bsp_apic_id)?;
        }

        let mut caps = virt::PartitionCapabilities::from_cpuid(
            self.config.processor_topology,
            &mut |function, index| cpuid.result(function, index, &[0; 4]),
        );

        // This backend does not support freezing guest time.
        caps.can_freeze_time = false;

        for vp_info in self.config.processor_topology.vps_arch() {
            self.vm.add_vp(vp_info.apic_id)?;
            let vp = self.vm.vp(vp_info.apic_id);
            if self.config.hv_config.is_some() {
                vp.enable_synic()?;

                // Set the VP index. Also, KVM incorrectly initializes SCONTROL
                // to 0. Set it to 1 on each processor.
                vp.set_msrs(&[
                    (
                        hvdef::HV_X64_MSR_VP_INDEX,
                        vp_info.base.vp_index.index().into(),
                    ),
                    (hvdef::HV_X64_MSR_SCONTROL, 1),
                ])?;
            }

            // Unlike the Microsoft hypervisor, KVM allows this MSR to be set and
            // defaults it to zero. Hard code the value here to the same as the
            // Microsoft hypervisor.
            vp.set_msrs(&[(
                x86defs::X86X_IA32_MSR_MISC_ENABLE,
                hv1_emulator::x86::MISC_ENABLE.into(),
            )])?;

            // Convert the CPUID entries and update the APIC ID in CPUID for
            // this VCPU.
            let cpuid_entries = cpuid
                .leaves()
                .iter()
                .map(|leaf| {
                    let mut entry = kvm::kvm_cpuid_entry2 {
                        function: leaf.function,
                        index: leaf.index.unwrap_or(0),
                        flags: if leaf.index.is_some() {
                            KVM_CPUID_FLAG_SIGNIFCANT_INDEX
                        } else {
                            0
                        },
                        eax: leaf.result[0],
                        ebx: leaf.result[1],
                        ecx: leaf.result[2],
                        edx: leaf.result[3],
                        padding: [0; 3],
                    };
                    match CpuidFunction(leaf.function) {
                        CpuidFunction::VersionAndFeatures => {
                            // Leaf 1 EBX bits 31:24 carry the initial APIC ID.
                            entry.ebx &= 0x00ffffff;
                            entry.ebx |= vp_info.apic_id << 24;
                        }
                        CpuidFunction::ExtendedTopologyEnumeration => {
                            entry.edx = vp_info.apic_id;
                        }
                        CpuidFunction::V2ExtendedTopologyEnumeration => {
                            entry.edx = vp_info.apic_id;
                        }
                        _ => (),
                    }
                    entry
                })
                .collect::<Vec<_>>();

            vp.set_cpuid(&cpuid_entries)?;
        }

        let mut gsi_routing = GsiRouting::new();

        // Claim the IOAPIC routes.
        for gsi in 0..virt::irqcon::IRQ_LINES as u32 {
            gsi_routing.claim(gsi);
        }

        if self.config.hv_config.is_some() {
            // Setup GSI routes for signaling the synic.
            // TODO: set this up on every SINT, not just the VMBus one.
            for vp in self.config.processor_topology.vps() {
                let index = vp.vp_index.index();
                let gsi = VMBUS_BASE_GSI + index;
                gsi_routing.claim(gsi);
                gsi_routing.set(gsi, Some(kvm::RoutingEntry::HvSint { vp: index, sint: 2 }));
            }
        }

        kvm::init();

        gsi_routing.update_routes(&self.vm);

        let partition = KvmPartitionInner {
            kvm: self.vm,
            memory: Default::default(),
            hv1_enabled: self.config.hv_config.is_some(),
            gm: config.guest_memory.clone(),
            vps: self
                .config
                .processor_topology
                .vps_arch()
                .map(|vp_info| KvmVpInner {
                    needs_yield: NeedsYield::new(),
                    request_interrupt_window: false.into(),
                    eval: false.into(),
                    vp_info,
                    synic_message_queue: MessageQueues::new(),
                    siefp: Default::default(),
                })
                .collect(),
            gsi_routing: Mutex::new(gsi_routing),
            caps,
            cpuid,
        };

        let partition = KvmPartition {
            inner: Arc::new(partition),
        };

        // One binder per VP, each with its own named VmTime accessor.
        let vps = self
            .config
            .processor_topology
            .vps()
            .map(|vp| KvmProcessorBinder {
                partition: partition.inner.clone(),
                vpindex: vp.vp_index,
                vmtime: self
                    .config
                    .vmtime
                    .access(format!("vp-{}", vp.vp_index.index())),
            })
            .collect::<Vec<_>>();

        // Extra validation of partition reset state in debug builds.
        if cfg!(debug_assertions) {
            (&partition).check_reset_all(&partition.inner.vp(VpIndex::BSP).vp_info);
        }

        Ok((partition, vps))
    }
}
439
/// First GSI used for signaling synic SINTs (one per VP), located directly
/// above the IOAPIC GSI range.
const VMBUS_BASE_GSI: u32 = virt::irqcon::IRQ_LINES as u32;
441
/// Per-VP state shared between the partition object and the VP's run loop.
#[derive(Debug, Inspect)]
pub struct KvmVpInner {
    // Tracks requests for this VP to yield the CPU to other threads.
    #[inspect(skip)]
    needs_yield: NeedsYield,
    // When set, the VP should open an interrupt window to deliver a pending
    // PIC (lint0) interrupt; see `pulse_lint`.
    request_interrupt_window: AtomicBool,
    // When set, the VP run loop should break out and reevaluate its state.
    eval: AtomicBool,
    // Topology/identity information for this VP.
    vp_info: X86VpInfo,
    // Synic messages queued for delivery to this VP.
    synic_message_queue: MessageQueues,
    // The synic event flags page (SIEFP) register value.
    #[inspect(hex, with = "|x| u64::from(*x.read())")]
    siefp: RwLock<HvSynicSimpSiefp>,
}
453
impl KvmVpInner {
    /// Sets the `eval` flag, which (when `true`) requests that the VP run
    /// loop stop and reevaluate its state.
    pub fn set_eval(&self, value: bool, ordering: Ordering) {
        self.eval.store(value, ordering);
    }

    /// Returns the topology information for this VP.
    pub fn vp_info(&self) -> &X86VpInfo {
        &self.vp_info
    }
}
463
impl ResetPartition for KvmPartition {
    type Error = KvmError;

    /// Resets each VP's state, then the partition-wide state.
    fn reset(&self) -> Result<(), Self::Error> {
        for vp in self.inner.vps() {
            self.inner
                .vp_state_access(vp.vp_info.base.vp_index)
                .reset_all(&vp.vp_info)
                .map_err(Box::new)?;
        }
        // `&KvmPartition` itself is the partition state accessor; the
        // mutable binding is needed because `reset_all` takes `&mut self`
        // on the accessor (the shared reference).
        let mut this = self;
        this.reset_all(&self.inner.vp(VpIndex::BSP).vp_info)
            .map_err(Box::new)?;
        Ok(())
    }
}
480
impl Partition for KvmPartition {
    /// KVM partitions support reset (see [`ResetPartition`] above).
    fn supports_reset(&self) -> Option<&dyn ResetPartition<Error = Self::Error>> {
        Some(self)
    }

    /// Doorbells are backed by KVM ioeventfds. VTLs are not supported by
    /// this backend, so the minimum VTL is ignored.
    fn doorbell_registration(
        self: &Arc<Self>,
        _minimum_vtl: Vtl,
    ) -> Option<Arc<dyn DoorbellRegistration>> {
        Some(self.clone())
    }

    fn msi_interrupt_target(self: &Arc<Self>, _vtl: Vtl) -> Option<Arc<dyn MsiInterruptTarget>> {
        Some(Arc::new(KvmMsiTarget(self.inner.clone())))
    }

    fn caps(&self) -> &virt::PartitionCapabilities {
        &self.inner.caps
    }

    /// Requests that a VP yield the CPU, kicking it out of guest mode so it
    /// observes the request.
    fn request_yield(&self, vp_index: VpIndex) {
        tracing::trace!(vp_index = vp_index.index(), "request yield");
        if self.inner.vp(vp_index).needs_yield.request_yield() {
            self.inner.evaluate_vp(vp_index);
        }
    }

    fn request_msi(&self, _vtl: Vtl, request: MsiRequest) {
        self.inner.request_msi(request);
    }
}
512
impl virt::X86Partition for KvmPartition {
    fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
        self.inner.clone()
    }

    /// Pulses a local interrupt line on a VP. Only lint0 (the PIC line) is
    /// currently supported; lint1 (NMI) pulses are logged and dropped.
    fn pulse_lint(&self, vp_index: VpIndex, _vtl: Vtl, lint: u8) {
        if lint == 0 {
            tracing::trace!(vp_index = vp_index.index(), "request interrupt window");
            // Ask the VP to open an interrupt window; the pending PIC vector
            // is injected once the window opens.
            self.inner
                .vp(vp_index)
                .request_interrupt_window
                .store(true, Ordering::Relaxed);
            self.inner.evaluate_vp(vp_index);
        } else {
            // TODO
            tracing::warn!("ignored lint1 pulse");
        }
    }
}
532
impl PartitionAccessState for KvmPartition {
    type StateAccess<'a> = &'a KvmPartition;

    fn access_state(&self, vtl: Vtl) -> Self::StateAccess<'_> {
        // Only VTL0 exists on KVM; higher VTLs were rejected at partition
        // creation time.
        assert_eq!(vtl, Vtl::Vtl0);

        self
    }
}
542
impl Hv1 for KvmPartition {
    type Error = KvmError;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    /// Returns a reference time source (backed by the KVM clock; see
    /// [`GetReferenceTime`] below), but only when Hyper-V emulation is
    /// enabled for this partition.
    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        self.inner
            .hv1_enabled
            .then(|| ReferenceTimeSource::from(self.inner.clone() as Arc<dyn GetReferenceTime>))
    }

    /// Software-device builders are not supported by this backend.
    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        None
    }
}
559
impl GetReferenceTime for KvmPartitionInner {
    fn now(&self) -> ReferenceTimeResult {
        // Although we can query the reference time MSR for a VP, we are not
        // running in the context of a VP, and so such a query will hang if the
        // VP is running. Instead, query the KVM clock, which is the backing
        // clock for the reference time counter within KVM.
        //
        // This also gives us the system time, in some configurations.
        let clock = self.kvm.get_clock_ns().unwrap();
        ReferenceTimeResult {
            // Reference time is in 100ns units; the KVM clock is nanoseconds.
            ref_time: clock.clock / 100,
            // The realtime field is only valid when KVM sets the
            // KVM_CLOCK_REALTIME flag.
            system_time: (clock.flags & kvm::KVM_CLOCK_REALTIME != 0)
                .then(|| jiff::Timestamp::from_nanosecond(clock.realtime as i128).unwrap()),
        }
    }
}
576
impl virt::BindProcessor for KvmProcessorBinder {
    type Processor<'a> = KvmProcessor<'a>;
    type Error = KvmError;

    /// Constructs the VP object and puts it into its architectural reset
    /// state.
    fn bind(&mut self) -> Result<Self::Processor<'_>, Self::Error> {
        // FUTURE: create the vcpu here to get better NUMA affinity.

        let inner = &self.partition.vps[self.vpindex.index() as usize];
        let kvm = self.partition.kvm.vp(inner.vp_info.apic_id);
        let mut vp = KvmProcessor {
            partition: &self.partition,
            inner,
            runner: kvm.runner(),
            kvm,
            vpindex: self.vpindex,
            guest_debug_db: [0; 4],
            // SCONTROL was already set to 1 (enabled) at partition build
            // time; mirror that in the shadow value.
            scontrol: HvSynicScontrol::new().with_enabled(true),
            siefp: 0.into(),
            simp: 0.into(),
            vmtime: &mut self.vmtime,
        };

        // 1. Reset the APIC state to clear the directed EOI bit, which is
        //    set by KVM by default but our IO-APIC does not support.
        // 2. Enable x2apic if the partition needs it.
        // 3. Reset register state since KVM does not have the right
        //    architectural values.
        let vp_info = inner.vp_info;
        let mut state = vp.access_state(Vtl::Vtl0);
        state.set_registers(&virt::x86::vp::Registers::at_reset(
            &self.partition.caps,
            &vp_info,
        ))?;
        state.set_apic(&virt::x86::vp::Apic::at_reset(
            &self.partition.caps,
            &vp_info,
        ))?;

        // Extra validation of VP reset state in debug builds.
        if cfg!(debug_assertions) {
            vp.access_state(Vtl::Vtl0).check_reset_all(&vp_info);
        }

        Ok(vp)
    }
}
622
/// A bound KVM virtual processor.
#[derive(InspectMut)]
pub struct KvmProcessor<'a> {
    #[inspect(skip)]
    partition: &'a KvmPartitionInner,
    #[inspect(flatten)]
    inner: &'a KvmVpInner,
    // Used to enter the guest and inject interrupts on this VP.
    #[inspect(skip)]
    runner: kvm::VpRunner<'a>,
    #[inspect(skip)]
    kvm: kvm::Processor<'a>,
    vpindex: VpIndex,
    vmtime: &'a mut VmTimeAccess,
    // Hardware breakpoint addresses — presumably shadows of DR0-DR3; see
    // the debug register handling in `regs`/`vp_state`.
    #[inspect(iter_by_index)]
    guest_debug_db: [u64; 4],
    // Shadow of the HV_X64_MSR_SCONTROL register.
    #[inspect(hex, with = "|&x| u64::from(x)")]
    scontrol: HvSynicScontrol,
    // Shadow of the HV_X64_MSR_SIEFP register.
    #[inspect(hex, with = "|&x| u64::from(x)")]
    siefp: HvSynicSimpSiefp,
    // Shadow of the HV_X64_MSR_SIMP (message page) register.
    #[inspect(hex, with = "|&x| u64::from(x)")]
    simp: HvSynicSimpSiefp,
}
644
645impl KvmProcessor<'_> {
646    /// Delivers any pending PIC interrupt.
647    ///
648    /// The VP must be known to be stopped and must have an open interrupt
649    /// window.
650    fn deliver_pic_interrupt(&mut self, dev: &impl CpuIo) -> Result<(), KvmRunVpError> {
651        if let Some(vector) = dev.acknowledge_pic_interrupt() {
652            self.runner
653                .inject_extint_interrupt(vector)
654                .map_err(KvmRunVpError::ExtintInterrupt)?;
655        }
656        Ok(())
657    }
658
659    /// Tries to deliver any pending synic messages for a VP.
660    fn try_deliver_synic_messages(&mut self) -> Option<VmTime> {
661        if !self.scontrol.enabled() && self.simp.enabled() {
662            return None;
663        }
664        self.inner
665            .synic_message_queue
666            .post_pending_messages(!0, |sint, message| {
667                match self.write_sint_message(sint, message) {
668                    Ok(true) => {
669                        self.partition
670                            .kvm
671                            .irq_line(VMBUS_BASE_GSI + self.vpindex.index(), true)
672                            .unwrap();
673                        Ok(())
674                    }
675                    Ok(false) => Err(HvError::ObjectInUse),
676                    Err(err) => {
677                        tracelimit::error_ratelimited!(
678                            error = &err as &dyn std::error::Error,
679                            sint,
680                            "failed to write message"
681                        );
682                        Err(HvError::OperationFailed)
683                    }
684                }
685            });
686
687        (self.inner.synic_message_queue.pending_sints() != 0).then(|| {
688            // FUTURE: instead, poll on the resample eventfd for the
689            // relevant SINTs, or get KVM to add a proper EOM exit
690            self.vmtime.now().wrapping_add(Duration::from_millis(1))
691        })
692    }
693
694    /// Writes a message to a synic message page. It is assumed there are no
695    /// competing writers to the page (the VP should be stopped, so neither
696    /// the guest nor KVM should be writing to the page), so no special
697    /// synchronization is required.
698    fn write_sint_message(&mut self, sint: u8, msg: &HvMessage) -> Result<bool, GuestMemoryError> {
699        let simp = self.simp.base_gpn() * HV_PAGE_SIZE + sint as u64 * 256;
700        let typ: u32 = self.partition.gm.read_plain(simp)?;
701        if typ != 0 {
702            self.partition.gm.write_at(simp + 5, &[1u8])?;
703            let typ: u32 = self.partition.gm.read_plain(simp)?;
704            if typ != 0 {
705                return Ok(false);
706            }
707        }
708        self.partition.gm.write_at(simp + 4, &msg.as_bytes()[4..])?;
709        self.partition.gm.write_plain(simp, &msg.header.typ)?;
710        Ok(true)
711    }
712}
713
/// An MSI address/data pair in the form KVM consumes.
struct KvmMsi {
    // Low 32 bits of the MSI address.
    address_lo: u32,
    // High 32 bits of the MSI address (extended destination bits).
    address_hi: u32,
    // The MSI data payload.
    data: u32,
}
719
impl KvmMsi {
    /// Converts a generic [`MsiRequest`] into the address/data layout that
    /// KVM expects.
    fn new(request: MsiRequest) -> Self {
        let request_address = MsiAddress::from(request.address as u32);
        let request_data = MsiData::from(request.data);

        // Although architecturally the destination mode bit is only supposed to
        // be considered when the redirection hint bit is set, KVM always gets
        // the destination mode from this bit instead of from the MSI data.
        let address_lo = MsiAddress::new()
            .with_address(x86defs::msi::MSI_ADDRESS)
            .with_destination(request_address.destination())
            .with_destination_mode_logical(request_address.destination_mode_logical())
            .with_redirection_hint(request_data.delivery_mode() == DeliveryMode::LOWEST_PRIORITY.0)
            .into();

        // High bits of the destination go into the high bits of the address.
        let address_hi = (request_address.virt_destination() & !0xff).into();
        let data = MsiData::new()
            .with_delivery_mode(request_data.delivery_mode())
            .with_assert(request_data.assert())
            .with_destination_mode_logical(request_data.destination_mode_logical())
            .with_trigger_mode_level(request_data.trigger_mode_level())
            .with_vector(request_data.vector())
            .into();

        Self {
            address_lo,
            address_hi,
            data,
        }
    }
}
752
753impl KvmPartitionInner {
754    fn request_msi(&self, request: MsiRequest) {
755        let KvmMsi {
756            address_lo,
757            address_hi,
758            data,
759        } = KvmMsi::new(request);
760        if let Err(err) = self.kvm.request_msi(&kvm::kvm_msi {
761            address_lo,
762            address_hi,
763            data,
764            flags: 0,
765            devid: 0,
766            pad: [0; 12],
767        }) {
768            tracelimit::warn_ratelimited!(
769                address = request.address,
770                data = request.data,
771                error = &err as &dyn std::error::Error,
772                "failed to request MSI"
773            );
774        }
775    }
776}
777
impl IoApicRouting for KvmPartitionInner {
    /// Sets (or clears, when `request` is `None`) the MSI route for an
    /// IOAPIC interrupt line, pushing the new routing table to KVM only
    /// when the entry changed.
    fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
        let entry = request.map(|request| {
            let KvmMsi {
                address_lo,
                address_hi,
                data,
            } = KvmMsi::new(request);
            kvm::RoutingEntry::Msi {
                address_lo,
                address_hi,
                data,
            }
        });
        let mut gsi_routing = self.gsi_routing.lock();
        if gsi_routing.set(irq as u32, entry) {
            gsi_routing.update_routes(&self.kvm);
        }
    }

    /// Asserts an IOAPIC interrupt line. Failures are logged and otherwise
    /// ignored.
    fn assert_irq(&self, irq: u8) {
        if let Err(err) = self.kvm.irq_line(irq as u32, true) {
            tracing::error!(
                irq,
                error = &err as &dyn std::error::Error,
                "failed to assert irq"
            );
        }
    }
}
808
/// A registered doorbell (KVM ioeventfd); unregisters itself when dropped.
struct KvmDoorbellEntry {
    // Weak reference back to the partition so a leaked entry does not keep
    // the partition alive; used to deassign the ioeventfd at drop time.
    partition: Weak<KvmPartitionInner>,
    // Duplicate of the registered event, needed to deassign the ioeventfd.
    event: Event,
    // Registration parameters, kept so the deassign call can repeat them.
    guest_address: u64,
    value: u64,
    length: u32,
    flags: u32,
}
817
818impl KvmDoorbellEntry {
819    pub fn new(
820        partition: &Arc<KvmPartitionInner>,
821        guest_address: u64,
822        value: Option<u64>,
823        length: Option<u32>,
824        fd: &Event,
825    ) -> io::Result<KvmDoorbellEntry> {
826        let flags = if value.is_some() {
827            1 << kvm_ioeventfd_flag_nr_datamatch
828        } else {
829            0
830        };
831        let value = value.unwrap_or(0);
832        let length = length.unwrap_or(0);
833
834        // Dup the fd since it's needed to deassign the ioeventfd later.
835        let event = fd.clone();
836
837        if let Err(err) = partition.kvm.ioeventfd(
838            value,
839            guest_address,
840            length,
841            event.as_fd().as_raw_fd(),
842            flags,
843        ) {
844            tracing::warn!(
845                guest_address,
846                error = &err as &dyn std::error::Error,
847                "Failed to register doorbell",
848            );
849            return Err(io::Error::new(
850                io::ErrorKind::InvalidInput,
851                "Failed to register doorbell",
852            ));
853        }
854
855        Ok(Self {
856            partition: Arc::downgrade(partition),
857            guest_address,
858            value,
859            length,
860            flags,
861            event,
862        })
863    }
864}
865
impl Drop for KvmDoorbellEntry {
    fn drop(&mut self) {
        // If the partition is already gone, the ioeventfd went with it and
        // there is nothing to deassign.
        if let Some(partition) = self.partition.upgrade() {
            // Same parameters as registration, plus the deassign flag.
            let flags: u32 = self.flags | (1 << kvm_ioeventfd_flag_nr_deassign);
            if let Err(err) = partition.kvm.ioeventfd(
                self.value,
                self.guest_address,
                self.length,
                self.event.as_fd().as_raw_fd(),
                flags,
            ) {
                tracing::warn!(
                    guest_address = self.guest_address,
                    error = &err as &dyn std::error::Error,
                    "Failed to unregister doorbell",
                );
            }
        }
    }
}
886
impl DoorbellRegistration for KvmPartition {
    /// Registers a doorbell via a KVM ioeventfd. The returned boxed entry
    /// unregisters the doorbell when dropped.
    fn register_doorbell(
        &self,
        guest_address: u64,
        value: Option<u64>,
        length: Option<u32>,
        fd: &Event,
    ) -> io::Result<Box<dyn Send + Sync>> {
        Ok(Box::new(KvmDoorbellEntry::new(
            &self.inner,
            guest_address,
            value,
            length,
            fd,
        )?))
    }
}
904
/// Handler context for a hypercall exit reported by KVM, pairing the
/// device/bus access object with the guest's hypercall register state.
struct KvmHypercallExit<'a, T> {
    bus: &'a T,
    registers: KvmHypercallRegisters,
}
909
/// Register state for a KVM-reported hypercall exit: the input (control)
/// value, the two parameter registers, and the result value to report back
/// to the guest.
struct KvmHypercallRegisters {
    input: u64,
    params: [u64; 2],
    result: u64,
}
915
impl<T: CpuIo> KvmHypercallExit<'_, T> {
    /// Dispatcher for the hypercalls KVM forwards to user space; only
    /// `HvPostMessage` and `HvSignalEvent` are routed here.
    const DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
        Self,
        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
    );
}
922
// Allows a `&mut KvmHypercallExit` to be passed directly to the hypercall
// dispatcher as its handler.
impl<'a, T: CpuIo> hv1_hypercall::AsHandler<KvmHypercallExit<'a, T>>
    for &mut KvmHypercallExit<'a, T>
{
    fn as_handler(&mut self) -> &mut KvmHypercallExit<'a, T> {
        self
    }
}
930
931impl<T> hv1_hypercall::HypercallIo for KvmHypercallExit<'_, T> {
932    fn advance_ip(&mut self) {
933        // KVM automatically does this.
934    }
935
936    fn retry(&mut self, _control: u64) {
937        unimplemented!("KVM cannot retry hypercalls");
938    }
939
940    fn control(&mut self) -> u64 {
941        // KVM automatically converts HvSignalEvent to a fast hypercall,
942        // but it does not update the control register accordingly.
943        let mut control = Control::from(self.registers.input);
944        if control.code() == HypercallCode::HvCallSignalEvent.0 {
945            control.set_fast(true);
946        }
947        control.into()
948    }
949
950    fn input_gpa(&mut self) -> u64 {
951        self.registers.params[0]
952    }
953
954    fn output_gpa(&mut self) -> u64 {
955        self.registers.params[1]
956    }
957
958    fn fast_register_pair_count(&mut self) -> usize {
959        1
960    }
961
962    fn extended_fast_hypercalls_ok(&mut self) -> bool {
963        false
964    }
965
966    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
967        self.fast_regs(0, buf);
968        0
969    }
970
971    fn fast_output(&mut self, _starting_pair_index: usize, _buf: &[[u64; 2]]) {}
972
973    fn vtl_input(&mut self) -> u64 {
974        unimplemented!()
975    }
976
977    fn set_result(&mut self, n: u64) {
978        self.registers.result = n;
979    }
980
981    fn fast_regs(&mut self, _starting_pair_index: usize, buf: &mut [[u64; 2]]) {
982        if let [b, ..] = buf {
983            *b = self.registers.params;
984        }
985    }
986}
987
impl<T: CpuIo> hv1_hypercall::PostMessage for KvmHypercallExit<'_, T> {
    /// Forwards an HvPostMessage hypercall to the device bus's synic for
    /// VTL0.
    fn post_message(&mut self, connection_id: u32, message: &[u8]) -> hvdef::HvResult<()> {
        self.bus
            .post_synic_message(Vtl::Vtl0, connection_id, false, message)
    }
}
994
impl<T: CpuIo> hv1_hypercall::SignalEvent for KvmHypercallExit<'_, T> {
    /// Forwards an HvSignalEvent hypercall to the device bus's synic for
    /// VTL0.
    fn signal_event(&mut self, connection_id: u32, flag: u16) -> hvdef::HvResult<()> {
        self.bus.signal_synic_event(Vtl::Vtl0, connection_id, flag)
    }
}
1000
impl Processor for KvmProcessor<'_> {
    type Error = KvmError;
    type RunVpError = KvmRunVpError;
    type StateAccess<'a>
        = KvmVpStateAccess<'a>
    where
        Self: 'a;

    /// Sets (or clears, when `state` is `None`) the VP's debug state:
    /// single-stepping and up to four hardware breakpoints.
    fn set_debug_state(
        &mut self,
        _vtl: Vtl,
        state: Option<&virt::x86::DebugState>,
    ) -> Result<(), Self::Error> {
        let mut control = 0;
        let mut db = [0; 4];
        let mut dr7 = 0;
        if let Some(state) = state {
            control |= kvm::KVM_GUESTDBG_ENABLE;
            if state.single_step {
                control |= kvm::KVM_GUESTDBG_SINGLESTEP;
            }
            // Program each requested hardware breakpoint into a debug
            // register slot and accumulate the matching DR7 control bits.
            for (i, bp) in state.breakpoints.iter().enumerate() {
                if let Some(bp) = bp {
                    control |= kvm::KVM_GUESTDBG_USE_HW_BP;
                    db[i] = bp.address;
                    dr7 |= bp.dr7_bits(i);
                }
            }
        }
        self.kvm.set_guest_debug(control, db, dr7)?;
        // Remember the debug registers to retrieve the address later.
        self.guest_debug_db = db;
        Ok(())
    }

    /// Runs the VP, dispatching KVM exits to `dev`, until `stop` is
    /// requested or the VP halts (triple fault, breakpoint, error, ...).
    async fn run_vp(
        &mut self,
        stop: StopVp<'_>,
        dev: &impl CpuIo,
    ) -> Result<Infallible, VpHaltReason<KvmRunVpError>> {
        loop {
            self.inner.needs_yield.maybe_yield().await;
            stop.check()?;

            if self.partition.hv1_enabled {
                // Deliver pending synic messages now, while KVM is not
                // accessing the message page.
                if let Some(next) = self.try_deliver_synic_messages() {
                    self.vmtime.set_timeout_if_before(next)
                } else {
                    self.vmtime.cancel_timeout();
                }
            }

            // Check for pending PIC interrupts.
            //
            // Check and clear this with a relaxed ordering since `evaluate_vp`
            // (called when this is set) will force the VP to exit, causing us
            // to re-check.
            if self.inner.request_interrupt_window.load(Ordering::Relaxed) {
                self.inner
                    .request_interrupt_window
                    .store(false, Ordering::Relaxed);
                if self.runner.check_or_request_interrupt_window() {
                    self.deliver_pic_interrupt(dev)
                        .map_err(VpHaltReason::Hypervisor)?;
                }
            }

            // Arm the timer. If it has expired, then loop around to scan for
            // synic messages again.
            if poll_fn(|cx| Poll::Ready(self.vmtime.poll_timeout(cx).is_ready())).await {
                continue;
            }

            // Run the VP and handle exits until `evaluate_vp` is called or the
            // thread is otherwise interrupted.
            //
            // Don't break out of the loop while there is a pending exit so that
            // the register state is up-to-date for save.
            let mut pending_exit = false;
            loop {
                let exit = if self.inner.eval.load(Ordering::Relaxed) || stop.check().is_err() {
                    // Break out of the loop as soon as there is no pending exit.
                    if !pending_exit {
                        self.inner.eval.store(false, Ordering::Relaxed);
                        break;
                    }
                    // Complete the current exit.
                    self.runner.complete_exit()
                } else {
                    // Run the VP.
                    self.runner.run()
                };

                let exit = exit.map_err(|err| VpHaltReason::Hypervisor(KvmRunVpError::Run(err)))?;
                pending_exit = true;
                match exit {
                    kvm::Exit::Interrupted => {
                        tracing::trace!("interrupted");
                        pending_exit = false;
                    }
                    kvm::Exit::InterruptWindow => {
                        self.deliver_pic_interrupt(dev)
                            .map_err(VpHaltReason::Hypervisor)?;
                    }
                    // Port I/O: KVM may batch multiple accesses of `size`
                    // bytes each into one exit; handle each chunk.
                    kvm::Exit::IoIn { port, data, size } => {
                        for data in data.chunks_mut(size as usize) {
                            dev.read_io(self.vpindex, port, data).await;
                        }
                    }
                    kvm::Exit::IoOut { port, data, size } => {
                        for data in data.chunks(size as usize) {
                            dev.write_io(self.vpindex, port, data).await;
                        }
                    }
                    kvm::Exit::MmioWrite { address, data } => {
                        dev.write_mmio(self.vpindex, address, data).await
                    }
                    kvm::Exit::MmioRead { address, data } => {
                        dev.read_mmio(self.vpindex, address, data).await
                    }
                    // MSRs KVM doesn't handle: stub out known-benign ones,
                    // inject #GP (error = 1) for the rest.
                    kvm::Exit::MsrRead { index, data, error } => {
                        if MYSTERY_MSRS.contains(&index) {
                            tracelimit::warn_ratelimited!(index, "stubbed out mystery MSR read");
                            *data = 0;
                        } else {
                            tracelimit::error_ratelimited!(index, "unrecognized msr read");
                            *error = 1;
                        }
                    }
                    kvm::Exit::MsrWrite { index, data, error } => {
                        if MYSTERY_MSRS.contains(&index) {
                            tracelimit::warn_ratelimited!(index, "stubbed out mystery MSR write");
                        } else {
                            tracelimit::error_ratelimited!(index, data, "unrecognized msr write");
                            *error = 1;
                        }
                    }
                    // KVM reports a guest triple fault as a shutdown exit.
                    kvm::Exit::Shutdown => {
                        return Err(VpHaltReason::TripleFault { vtl: Vtl::Vtl0 });
                    }
                    // Guest updated synic MSRs; mirror the new state and
                    // publish the SIEF page location (cleared if the synic
                    // is disabled) for event port signaling.
                    kvm::Exit::SynicUpdate {
                        msr: _msr,
                        control,
                        siefp,
                        simp,
                    } => {
                        self.scontrol = control.into();
                        self.siefp = siefp.into();
                        self.simp = simp.into();
                        *self.inner.siefp.write() = if self.scontrol.enabled() {
                            siefp.into()
                        } else {
                            0.into()
                        };
                    }
                    kvm::Exit::HvHypercall {
                        input,
                        result,
                        params,
                    } => {
                        // N.B. this can only be SIGNAL_EVENT or POST_MESSAGE.
                        let mut handler = KvmHypercallExit {
                            bus: dev,
                            registers: KvmHypercallRegisters {
                                input,
                                params,
                                result: 0,
                            },
                        };
                        KvmHypercallExit::DISPATCHER.dispatch(&self.partition.gm, &mut handler);
                        *result = handler.registers.result;
                    }
                    // Debug exit: decode DR6 to distinguish a hardware
                    // breakpoint hit from a single-step trap.
                    kvm::Exit::Debug {
                        exception: _,
                        pc: _,
                        dr6,
                        dr7,
                    } => {
                        if dr6 & x86defs::DR6_BREAKPOINT_MASK != 0 {
                            let i = dr6.trailing_zeros() as usize;
                            let bp = HardwareBreakpoint::from_dr7(dr7, self.guest_debug_db[i], i);
                            return Err(VpHaltReason::HwBreak(bp));
                        } else if dr6 & x86defs::DR6_SINGLE_STEP != 0 {
                            return Err(VpHaltReason::SingleStep);
                        } else {
                            tracing::warn!(dr6, "debug exit with unknown dr6 condition");
                        }
                    }
                    kvm::Exit::Eoi { irq } => {
                        dev.handle_eoi(irq.into());
                    }
                    kvm::Exit::InternalError { error, .. } => {
                        return Err(VpHaltReason::Hypervisor(KvmRunVpError::InternalError(
                            error,
                        )));
                    }
                    kvm::Exit::EmulationFailure { instruction_bytes } => {
                        return Err(VpHaltReason::EmulationFailure(
                            EmulationError {
                                instruction_bytes: instruction_bytes.to_vec(),
                            }
                            .into(),
                        ));
                    }
                    kvm::Exit::FailEntry {
                        hardware_entry_failure_reason,
                    } => {
                        tracing::error!(hardware_entry_failure_reason, "VP entry failed");
                        return Err(VpHaltReason::InvalidVmState(KvmRunVpError::InvalidVpState));
                    }
                }
            }
        }
    }

    /// No asynchronous VP requests are buffered for KVM; nothing to flush.
    fn flush_async_requests(&mut self) -> Result<(), Self::RunVpError> {
        Ok(())
    }

    /// Returns the state accessor for this VP. Only VTL0 is supported.
    fn access_state(&mut self, vtl: Vtl) -> Self::StateAccess<'_> {
        assert_eq!(vtl, Vtl::Vtl0);
        self.partition.vp_state_access(self.vpindex)
    }
}
1227
1228impl virt::Synic for KvmPartition {
1229    fn post_message(&self, _vtl: Vtl, vp: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
1230        let wake = self
1231            .inner
1232            .vp(vp)
1233            .synic_message_queue
1234            .enqueue_message(sint, &HvMessage::new(HvMessageType(typ), 0, payload));
1235
1236        if wake {
1237            self.inner.evaluate_vp(vp);
1238        }
1239    }
1240
1241    fn new_guest_event_port(
1242        &self,
1243        _vtl: Vtl,
1244        vp: u32,
1245        sint: u8,
1246        flag: u16,
1247    ) -> Box<dyn GuestEventPort> {
1248        Box::new(KvmGuestEventPort {
1249            partition: Arc::downgrade(&self.inner),
1250            gm: self.inner.gm.clone(),
1251            params: Arc::new(Mutex::new(KvmEventPortParams {
1252                vp: VpIndex::new(vp),
1253                sint,
1254                flag,
1255            })),
1256        })
1257    }
1258
1259    fn prefer_os_events(&self) -> bool {
1260        false
1261    }
1262}
1263
/// Error produced when KVM reports an instruction emulation failure;
/// carries the raw bytes of the faulting instruction for diagnostics.
#[derive(Debug, Error)]
#[error("KVM emulation failure: instruction {instruction_bytes:02x?}")]
struct EmulationError {
    // Raw instruction bytes reported by KVM at the point of failure.
    instruction_bytes: Vec<u8>,
}
1269
/// `GuestEventPort` implementation for KVM partitions.
///
/// Cloneable handle; the targeting parameters are shared via `params` so
/// that `set_target_vp` affects already-created interrupts.
#[derive(Debug, Clone)]
struct KvmGuestEventPort {
    // Weak so an outstanding event port does not keep the partition alive.
    partition: Weak<KvmPartitionInner>,
    gm: GuestMemory,
    params: Arc<Mutex<KvmEventPortParams>>,
}
1277
/// Targeting parameters for a KVM guest event port: which VP, SINT, and
/// event flag to signal.
#[derive(Debug, Copy, Clone)]
struct KvmEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
}
1284
impl GuestEventPort for KvmGuestEventPort {
    /// Returns an `Interrupt` that, when signaled, sets this port's event
    /// flag in the guest's SIEF page and raises the vmbus GSI for the
    /// target VP.
    fn interrupt(&self) -> Interrupt {
        let this = self.clone();
        Interrupt::from_fn(move || {
            let KvmEventPortParams { vp, sint, flag } = *this.params.lock();
            // Silently drop the signal if the partition is gone or the
            // SIEF page is not enabled.
            let Some(partition) = this.partition.upgrade() else {
                return;
            };
            let siefp = partition.vp(vp).siefp.read();
            if !siefp.enabled() {
                return;
            }
            // Locate the byte holding this flag: each SINT owns a 256-byte
            // region of event flags within the SIEF page.
            let byte_gpa = siefp.base_gpn() * HV_PAGE_SIZE + sint as u64 * 256 + flag as u64 / 8;
            let mut byte = 0;
            let mask = 1 << (flag % 8);
            // Atomically set the flag bit via compare-exchange; only the
            // writer that actually flips the bit raises the interrupt.
            // The loop ends early if the flag is observed already set.
            while byte & mask == 0 {
                match this.gm.compare_exchange(byte_gpa, byte, byte | mask) {
                    Ok(Ok(_)) => {
                        drop(siefp);
                        partition
                            .kvm
                            .irq_line(VMBUS_BASE_GSI + vp.index(), true)
                            .unwrap();

                        break;
                    }
                    // Lost the race; retry with the freshly observed byte.
                    Ok(Err(b)) => byte = b,
                    Err(err) => {
                        tracelimit::warn_ratelimited!(
                            error = &err as &dyn std::error::Error,
                            "failed to write event flag to guest memory"
                        );
                        break;
                    }
                }
            }
        })
    }

    /// Retargets subsequent signals to a different virtual processor.
    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}
1329
/// An MSI interrupt backed by a KVM GSI route.
#[derive(Debug)]
struct GsiMsi {
    gsi: gsi::GsiRoute,
}
1334
1335struct KvmMsiTarget(Arc<KvmPartitionInner>);
1336
1337impl MsiInterruptTarget for KvmMsiTarget {
1338    fn new_interrupt(&self) -> Box<dyn MsiControl> {
1339        let event = Event::new();
1340        let interrupt = self.0.new_route(Some(event)).expect("BUGBUG");
1341        Box::new(GsiMsi { gsi: interrupt })
1342    }
1343}
1344
1345impl MsiControl for GsiMsi {
1346    fn enable(&mut self, address: u64, data: u32) {
1347        let request = MsiRequest { address, data };
1348        let KvmMsi {
1349            address_lo,
1350            address_hi,
1351            data,
1352        } = KvmMsi::new(request);
1353
1354        self.gsi.enable(kvm::RoutingEntry::Msi {
1355            address_lo,
1356            address_hi,
1357            data,
1358        });
1359    }
1360
1361    fn disable(&mut self) {
1362        self.gsi.disable();
1363    }
1364
1365    fn signal(&mut self, _address: u64, _data: u32) {
1366        self.gsi.irqfd_event().unwrap().signal()
1367    }
1368}