1#![cfg(all(target_os = "linux", guest_is_native, guest_arch = "x86_64"))]
7
8mod regs;
9mod vm_state;
10mod vp_state;
11
12use crate::KvmError;
13use crate::KvmPartition;
14use crate::KvmPartitionInner;
15use crate::KvmProcessorBinder;
16use crate::KvmRunVpError;
17use crate::gsi;
18use crate::gsi::GsiRouting;
19use guestmem::DoorbellRegistration;
20use guestmem::GuestMemory;
21use guestmem::GuestMemoryError;
22use hv1_emulator::message_queues::MessageQueues;
23use hvdef::HV_PAGE_SIZE;
24use hvdef::HvError;
25use hvdef::HvMessage;
26use hvdef::HvMessageType;
27use hvdef::HvSynicScontrol;
28use hvdef::HvSynicSimpSiefp;
29use hvdef::HypercallCode;
30use hvdef::Vtl;
31use hvdef::hypercall::Control;
32use inspect::Inspect;
33use inspect::InspectMut;
34use kvm::KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
35use kvm::kvm_ioeventfd_flag_nr_datamatch;
36use kvm::kvm_ioeventfd_flag_nr_deassign;
37use pal_event::Event;
38use parking_lot::Mutex;
39use parking_lot::RwLock;
40use pci_core::msi::MsiControl;
41use pci_core::msi::MsiInterruptTarget;
42use std::convert::Infallible;
43use std::future::poll_fn;
44use std::io;
45use std::os::unix::prelude::*;
46use std::sync::Arc;
47use std::sync::Weak;
48use std::sync::atomic::AtomicBool;
49use std::sync::atomic::Ordering;
50use std::task::Poll;
51use std::time::Duration;
52use thiserror::Error;
53use virt::CpuidLeaf;
54use virt::CpuidLeafSet;
55use virt::Hv1;
56use virt::NeedsYield;
57use virt::Partition;
58use virt::PartitionAccessState;
59use virt::PartitionConfig;
60use virt::Processor;
61use virt::ProtoPartition;
62use virt::ProtoPartitionConfig;
63use virt::ResetPartition;
64use virt::StopVp;
65use virt::VpHaltReason;
66use virt::VpIndex;
67use virt::io::CpuIo;
68use virt::irqcon::DeliveryMode;
69use virt::irqcon::IoApicRouting;
70use virt::irqcon::MsiRequest;
71use virt::state::StateElement;
72use virt::vm::AccessVmState;
73use virt::x86::HardwareBreakpoint;
74use virt::x86::max_physical_address_size_from_cpuid;
75use virt::x86::vp::AccessVpState;
76use vm_topology::processor::x86::ApicMode;
77use vm_topology::processor::x86::X86VpInfo;
78use vmcore::interrupt::Interrupt;
79use vmcore::reference_time::GetReferenceTime;
80use vmcore::reference_time::ReferenceTimeResult;
81use vmcore::reference_time::ReferenceTimeSource;
82use vmcore::synic::GuestEventPort;
83use vmcore::vmtime::VmTime;
84use vmcore::vmtime::VmTimeAccess;
85use vp_state::KvmVpStateAccess;
86use x86defs::cpuid::CpuidFunction;
87use x86defs::msi::MsiAddress;
88use x86defs::msi::MsiData;
89use zerocopy::IntoBytes;
90
// MSRs with no published semantics that some guests touch anyway; reads are
// stubbed to 0 and writes ignored (see the MsrRead/MsrWrite exit handling).
const MYSTERY_MSRS: &[u32] = &[0x88, 0x89, 0x8a, 0x116, 0x118, 0x119, 0x11a, 0x11b, 0x11e];
96
/// Marker type implementing [`virt::Hypervisor`] on top of Linux KVM.
#[derive(Debug)]
pub struct Kvm;
99
// CPUID extended leaf reporting 1GB page support (EDX bit 26).
const GB_PAGE_LEAF: u32 = 0x80000001;
const GB_PAGE_FLAG: u32 = 1 << 26;
103
104fn gb_pages_supported() -> bool {
106 safe_intrinsics::cpuid(0x80000000, 0).eax >= GB_PAGE_LEAF
107 && safe_intrinsics::cpuid(GB_PAGE_LEAF, 0).edx & GB_PAGE_FLAG != 0
108}
109
impl virt::Hypervisor for Kvm {
    type ProtoPartition<'a> = KvmProtoPartition<'a>;
    type Partition = KvmPartition;
    type Error = KvmError;

    /// Opens /dev/kvm, builds the partition-wide cpuid leaf set, and creates
    /// the KVM VM with a split irqchip and (optionally) Hyper-V enlightenment
    /// cpuid leaves.
    fn new_partition<'a>(
        &mut self,
        config: ProtoPartitionConfig<'a>,
    ) -> Result<Self::ProtoPartition<'a>, Self::Error> {
        // KVM (as used here) has no support for isolated/confidential guests.
        if config.isolation.is_isolated() {
            return Err(KvmError::IsolationNotSupported);
        }

        let kvm = kvm::Kvm::new()?;
        let mut cpuid_entries = kvm
            .supported_cpuid()?
            .into_iter()
            .filter_map(|entry| {
                // Drop KVM's own hypervisor leaves (0x4xxxxxxx); the Hyper-V
                // leaves are added explicitly below when hv_config is set.
                if entry.function & 0xf0000000 == 0x40000000 {
                    return None;
                }
                let mut leaf =
                    CpuidLeaf::new(entry.function, [entry.eax, entry.ebx, entry.ecx, entry.edx]);
                if entry.flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX != 0 {
                    leaf = leaf.indexed(entry.index);
                }
                Some(leaf)
            })
            .collect::<Vec<_>>();

        // Advertise 1GB pages to the guest when the host supports them and
        // the extended features leaf is present (mask in just the GB bit).
        if gb_pages_supported()
            && cpuid_entries
                .iter()
                .any(|x| x.function == CpuidFunction::ExtendedVersionAndFeatures.0)
        {
            cpuid_entries.push(
                CpuidLeaf::new(
                    CpuidFunction::ExtendedVersionAndFeatures.0,
                    [0, 0, 0, GB_PAGE_FLAG],
                )
                .masked([0, 0, 0, GB_PAGE_FLAG]),
            );
        }

        match config.processor_topology.apic_mode() {
            ApicMode::XApic => {
                // Hide x2APIC support (leaf 1, ECX bit 21) when the topology
                // is xAPIC-only.
                cpuid_entries.push(
                    CpuidLeaf::new(CpuidFunction::VersionAndFeatures.0, [0, 0, 0, 0]).masked([
                        0,
                        0,
                        1 << 21,
                        0,
                    ]),
                );
            }
            ApicMode::X2ApicSupported | ApicMode::X2ApicEnabled => {}
        }

        // Zero SGX enumeration subleaf 2.
        cpuid_entries.push(
            CpuidLeaf::new(CpuidFunction::SgxEnumeration.0, [0; 4]).indexed(2), );

        if let Some(hv_config) = &config.hv_config {
            // VTL2 requires hypervisor support KVM does not provide.
            if hv_config.vtl2.is_some() {
                return Err(KvmError::Vtl2NotSupported);
            }

            // Splits a 128-bit value into the four 32-bit cpuid registers
            // (little-endian order: eax, ebx, ecx, edx).
            let split_u128 = |x: u128| -> [u32; 4] {
                let bytes = x.to_le_bytes();
                [
                    u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
                    u32::from_le_bytes(bytes[4..8].try_into().unwrap()),
                    u32::from_le_bytes(bytes[8..12].try_into().unwrap()),
                    u32::from_le_bytes(bytes[12..16].try_into().unwrap()),
                ]
            };

            use hvdef::*;
            // Privileges advertised to the guest; only features this backend
            // actually emulates (synic, timers, frequency/vp index MSRs).
            let privileges = HvPartitionPrivilege::new()
                .with_access_partition_reference_counter(true)
                .with_access_hypercall_msrs(true)
                .with_access_vp_index(true)
                .with_access_frequency_msrs(true)
                .with_access_synic_msrs(true)
                .with_access_synthetic_timer_msrs(true)
                .with_access_vp_runtime_msr(true)
                .with_access_apic_msrs(true);

            // Standard Hyper-V identification and feature cpuid leaves.
            let hv_cpuid = &[
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_HV_VENDOR_AND_MAX_FUNCTION,
                    [
                        HV_CPUID_FUNCTION_MS_HV_IMPLEMENTATION_LIMITS,
                        u32::from_le_bytes(*b"Micr"),
                        u32::from_le_bytes(*b"osof"),
                        u32::from_le_bytes(*b"t Hv"),
                    ],
                ),
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_HV_INTERFACE,
                    [u32::from_le_bytes(*b"Hv#1"), 0, 0, 0],
                ),
                CpuidLeaf::new(HV_CPUID_FUNCTION_MS_HV_VERSION, [0, 0, 0, 0]),
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_MS_HV_FEATURES,
                    split_u128(u128::from(
                        HvFeatures::new()
                            .with_privileges(privileges)
                            .with_frequency_regs_available(true),
                    )),
                ),
                CpuidLeaf::new(
                    HV_CPUID_FUNCTION_MS_HV_ENLIGHTENMENT_INFORMATION,
                    split_u128(
                        HvEnlightenmentInformation::new()
                            .with_deprecate_auto_eoi(true)
                            .with_long_spin_wait_count(0xffffffff) .into(),
                    ),
                ),
            ];

            cpuid_entries.extend(hv_cpuid);
        }

        let cpuid_entries = CpuidLeafSet::new(cpuid_entries);

        // Create the VM: user-space PIC/IOAPIC (split irqchip), x2apic ids
        // via the x2apic API, and MSR exits for unknown MSRs.
        let vm = kvm.new_vm()?;
        vm.enable_split_irqchip(virt::irqcon::IRQ_LINES as u32)?;
        vm.enable_x2apic_api()?;
        vm.enable_unknown_msr_exits()?;

        Ok(KvmProtoPartition {
            vm,
            config,
            cpuid: cpuid_entries,
        })
    }

    /// Reports whether KVM is usable by probing for /dev/kvm.
    fn is_available(&self) -> Result<bool, Self::Error> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(KvmError::AvailableCheck(err)),
        }
    }
}
263
/// A partially-constructed KVM partition: the VM exists but VPs have not yet
/// been added. Finalized by [`ProtoPartition::build`].
pub struct KvmProtoPartition<'a> {
    vm: kvm::Partition,
    config: ProtoPartitionConfig<'a>,
    // Partition-wide cpuid results computed in `new_partition`.
    cpuid: CpuidLeafSet,
}
270
impl ProtoPartition for KvmProtoPartition<'_> {
    type Partition = KvmPartition;
    type Error = KvmError;
    type ProcessorBinder = KvmProcessorBinder;

    /// Evaluates a cpuid leaf against the partition-wide leaf set, with a
    /// default result of all zeros.
    fn cpuid(&self, eax: u32, ecx: u32) -> [u32; 4] {
        self.cpuid.result(eax, ecx, &[0; 4])
    }

    /// Derives the physical address width from the cpuid leaves.
    fn max_physical_address_size(&self) -> u8 {
        max_physical_address_size_from_cpuid(&|eax, ecx| self.cpuid(eax, ecx))
    }

    /// Creates the VPs, programs per-VP MSRs/cpuid, sets up GSI routing, and
    /// produces the finished partition plus one binder per VP.
    fn build(
        mut self,
        config: PartitionConfig<'_>,
    ) -> Result<(Self::Partition, Vec<Self::ProcessorBinder>), Self::Error> {
        // Caller-provided leaves override/extend the computed set.
        let mut cpuid = self.cpuid.into_leaves();
        cpuid.extend(config.cpuid);
        let cpuid = CpuidLeafSet::new(cpuid);

        // KVM assumes APIC ID 0 is the BSP; tell it otherwise if needed.
        let bsp_apic_id = self.config.processor_topology.vp_arch(VpIndex::BSP).apic_id;
        if bsp_apic_id != 0 {
            self.vm.set_bsp(bsp_apic_id)?;
        }

        let mut caps = virt::PartitionCapabilities::from_cpuid(
            self.config.processor_topology,
            &mut |function, index| cpuid.result(function, index, &[0; 4]),
        );

        // KVM offers no mechanism to freeze guest time across all VPs.
        caps.can_freeze_time = false;

        for vp_info in self.config.processor_topology.vps_arch() {
            self.vm.add_vp(vp_info.apic_id)?;
            let vp = self.vm.vp(vp_info.apic_id);
            if self.config.hv_config.is_some() {
                vp.enable_synic()?;

                // Seed the VP index MSR and enable SCONTROL so the in-kernel
                // synic is usable immediately.
                vp.set_msrs(&[
                    (
                        hvdef::HV_X64_MSR_VP_INDEX,
                        vp_info.base.vp_index.index().into(),
                    ),
                    (hvdef::HV_X64_MSR_SCONTROL, 1),
                ])?;
            }

            // Apply the architectural reset value of MISC_ENABLE.
            vp.set_msrs(&[(
                x86defs::X86X_IA32_MSR_MISC_ENABLE,
                hv1_emulator::x86::MISC_ENABLE.into(),
            )])?;

            // Convert the shared leaf set into KVM entries, patching in the
            // per-VP APIC ID where the architecture requires it.
            let cpuid_entries = cpuid
                .leaves()
                .iter()
                .map(|leaf| {
                    let mut entry = kvm::kvm_cpuid_entry2 {
                        function: leaf.function,
                        index: leaf.index.unwrap_or(0),
                        flags: if leaf.index.is_some() {
                            KVM_CPUID_FLAG_SIGNIFCANT_INDEX
                        } else {
                            0
                        },
                        eax: leaf.result[0],
                        ebx: leaf.result[1],
                        ecx: leaf.result[2],
                        edx: leaf.result[3],
                        padding: [0; 3],
                    };
                    match CpuidFunction(leaf.function) {
                        CpuidFunction::VersionAndFeatures => {
                            // Leaf 1 EBX[31:24] carries the initial APIC ID.
                            entry.ebx &= 0x00ffffff;
                            entry.ebx |= vp_info.apic_id << 24;
                        }
                        CpuidFunction::ExtendedTopologyEnumeration => {
                            entry.edx = vp_info.apic_id;
                        }
                        CpuidFunction::V2ExtendedTopologyEnumeration => {
                            entry.edx = vp_info.apic_id;
                        }
                        _ => (),
                    }
                    entry
                })
                .collect::<Vec<_>>();

            vp.set_cpuid(&cpuid_entries)?;
        }

        let mut gsi_routing = GsiRouting::new();

        // Reserve the low GSIs for the IO-APIC lines.
        for gsi in 0..virt::irqcon::IRQ_LINES as u32 {
            gsi_routing.claim(gsi);
        }

        if self.config.hv_config.is_some() {
            // One GSI per VP above the IO-APIC range, routed to SINT2, used
            // to signal synic messages/events (vmbus).
            for vp in self.config.processor_topology.vps() {
                let index = vp.vp_index.index();
                let gsi = VMBUS_BASE_GSI + index;
                gsi_routing.claim(gsi);
                gsi_routing.set(gsi, Some(kvm::RoutingEntry::HvSint { vp: index, sint: 2 }));
            }
        }

        // One-time process-wide kvm crate initialization.
        kvm::init();

        gsi_routing.update_routes(&self.vm);

        let partition = KvmPartitionInner {
            kvm: self.vm,
            memory: Default::default(),
            hv1_enabled: self.config.hv_config.is_some(),
            gm: config.guest_memory.clone(),
            vps: self
                .config
                .processor_topology
                .vps_arch()
                .map(|vp_info| KvmVpInner {
                    needs_yield: NeedsYield::new(),
                    request_interrupt_window: false.into(),
                    eval: false.into(),
                    vp_info,
                    synic_message_queue: MessageQueues::new(),
                    siefp: Default::default(),
                })
                .collect(),
            gsi_routing: Mutex::new(gsi_routing),
            caps,
            cpuid,
        };

        let partition = KvmPartition {
            inner: Arc::new(partition),
        };

        // One binder per VP; each gets its own vmtime access handle.
        let vps = self
            .config
            .processor_topology
            .vps()
            .map(|vp| KvmProcessorBinder {
                partition: partition.inner.clone(),
                vpindex: vp.vp_index,
                vmtime: self
                    .config
                    .vmtime
                    .access(format!("vp-{}", vp.vp_index.index())),
            })
            .collect::<Vec<_>>();

        // Debug builds verify that the fresh partition state matches the
        // architectural reset state.
        if cfg!(debug_assertions) {
            (&partition).check_reset_all(&partition.inner.vp(VpIndex::BSP).vp_info);
        }

        Ok((partition, vps))
    }
}
439
// First GSI above the IO-APIC range; GSI (VMBUS_BASE_GSI + vp_index) signals
// SINT2 on that VP.
const VMBUS_BASE_GSI: u32 = virt::irqcon::IRQ_LINES as u32;
441
/// Per-VP state shared between the partition and the VP's run loop.
#[derive(Debug, Inspect)]
pub struct KvmVpInner {
    #[inspect(skip)]
    needs_yield: NeedsYield,
    // Set to ask the run loop to open an interrupt window for the PIC.
    request_interrupt_window: AtomicBool,
    // Set to force the run loop to break out of KVM_RUN and re-evaluate.
    eval: AtomicBool,
    vp_info: X86VpInfo,
    // Messages queued by `virt::Synic::post_message`, drained by the run loop.
    synic_message_queue: MessageQueues,
    // Cached SIEFP MSR value, read by event ports to find the event flags page.
    #[inspect(hex, with = "|x| u64::from(*x.read())")]
    siefp: RwLock<HvSynicSimpSiefp>,
}
453
impl KvmVpInner {
    /// Sets the eval flag with the given memory ordering; used to request
    /// that the VP's run loop break out and re-evaluate its state.
    pub fn set_eval(&self, value: bool, ordering: Ordering) {
        self.eval.store(value, ordering);
    }

    /// Returns the VP's topology information.
    pub fn vp_info(&self) -> &X86VpInfo {
        &self.vp_info
    }
}
463
impl ResetPartition for KvmPartition {
    type Error = KvmError;

    /// Resets every VP's state to the architectural reset values, then resets
    /// the partition-wide state.
    fn reset(&self) -> Result<(), Self::Error> {
        for vp in self.inner.vps() {
            self.inner
                .vp_state_access(vp.vp_info.base.vp_index)
                .reset_all(&vp.vp_info)
                .map_err(Box::new)?;
        }
        // `reset_all` needs a mutable binding of the state accessor
        // (`&KvmPartition` itself implements AccessVmState).
        let mut this = self;
        this.reset_all(&self.inner.vp(VpIndex::BSP).vp_info)
            .map_err(Box::new)?;
        Ok(())
    }
}
480
impl Partition for KvmPartition {
    fn supports_reset(&self) -> Option<&dyn ResetPartition<Error = Self::Error>> {
        Some(self)
    }

    /// Doorbells are supported via KVM ioeventfds; the minimum VTL is ignored
    /// since this backend only models VTL0.
    fn doorbell_registration(
        self: &Arc<Self>,
        _minimum_vtl: Vtl,
    ) -> Option<Arc<dyn DoorbellRegistration>> {
        Some(self.clone())
    }

    /// MSIs are delivered through dedicated GSI routes (see `KvmMsiTarget`).
    fn msi_interrupt_target(self: &Arc<Self>, _vtl: Vtl) -> Option<Arc<dyn MsiInterruptTarget>> {
        Some(Arc::new(KvmMsiTarget(self.inner.clone())))
    }

    fn caps(&self) -> &virt::PartitionCapabilities {
        &self.inner.caps
    }

    /// Asks the given VP to yield; kicks it out of KVM_RUN if the request was
    /// newly made.
    fn request_yield(&self, vp_index: VpIndex) {
        tracing::trace!(vp_index = vp_index.index(), "request yield");
        if self.inner.vp(vp_index).needs_yield.request_yield() {
            self.inner.evaluate_vp(vp_index);
        }
    }

    fn request_msi(&self, _vtl: Vtl, request: MsiRequest) {
        self.inner.request_msi(request);
    }
}
512
513impl virt::X86Partition for KvmPartition {
514 fn ioapic_routing(&self) -> Arc<dyn IoApicRouting> {
515 self.inner.clone()
516 }
517
518 fn pulse_lint(&self, vp_index: VpIndex, _vtl: Vtl, lint: u8) {
519 if lint == 0 {
520 tracing::trace!(vp_index = vp_index.index(), "request interrupt window");
521 self.inner
522 .vp(vp_index)
523 .request_interrupt_window
524 .store(true, Ordering::Relaxed);
525 self.inner.evaluate_vp(vp_index);
526 } else {
527 tracing::warn!("ignored lint1 pulse");
529 }
530 }
531}
532
impl PartitionAccessState for KvmPartition {
    type StateAccess<'a> = &'a KvmPartition;

    /// Returns the VM-state accessor; only VTL0 exists on this backend.
    fn access_state(&self, vtl: Vtl) -> Self::StateAccess<'_> {
        assert_eq!(vtl, Vtl::Vtl0);

        self
    }
}
542
impl Hv1 for KvmPartition {
    type Error = KvmError;
    type Device = virt::x86::apic_software_device::ApicSoftwareDevice;

    /// The KVM clock backs the Hyper-V reference time when enlightenments are
    /// enabled; otherwise no reference time is offered.
    fn reference_time_source(&self) -> Option<ReferenceTimeSource> {
        self.inner
            .hv1_enabled
            .then(|| ReferenceTimeSource::from(self.inner.clone() as Arc<dyn GetReferenceTime>))
    }

    /// Software-device (device domain) support is not provided by this backend.
    fn new_virtual_device(
        &self,
    ) -> Option<&dyn virt::DeviceBuilder<Device = Self::Device, Error = Self::Error>> {
        None
    }
}
559
impl GetReferenceTime for KvmPartitionInner {
    /// Reads the KVM clock and converts it to Hyper-V reference time.
    fn now(&self) -> ReferenceTimeResult {
        let clock = self.kvm.get_clock_ns().unwrap();
        ReferenceTimeResult {
            // KVM reports nanoseconds; reference time is in 100ns units.
            ref_time: clock.clock / 100,
            // Only report wall-clock time when KVM paired the reading with a
            // realtime timestamp.
            system_time: (clock.flags & kvm::KVM_CLOCK_REALTIME != 0)
                .then(|| jiff::Timestamp::from_nanosecond(clock.realtime as i128).unwrap()),
        }
    }
}
576
impl virt::BindProcessor for KvmProcessorBinder {
    type Processor<'a> = KvmProcessor<'a>;
    type Error = KvmError;

    /// Binds the VP to the current thread and applies its architectural
    /// reset state (registers and APIC).
    fn bind(&mut self) -> Result<Self::Processor<'_>, Self::Error> {
        let inner = &self.partition.vps[self.vpindex.index() as usize];
        let kvm = self.partition.kvm.vp(inner.vp_info.apic_id);
        let mut vp = KvmProcessor {
            partition: &self.partition,
            inner,
            runner: kvm.runner(),
            kvm,
            vpindex: self.vpindex,
            guest_debug_db: [0; 4],
            // SCONTROL was enabled at build time via HV_X64_MSR_SCONTROL.
            scontrol: HvSynicScontrol::new().with_enabled(true),
            siefp: 0.into(),
            simp: 0.into(),
            vmtime: &mut self.vmtime,
        };

        // Apply the reset state now that the vcpu thread is bound.
        let vp_info = inner.vp_info;
        let mut state = vp.access_state(Vtl::Vtl0);
        state.set_registers(&virt::x86::vp::Registers::at_reset(
            &self.partition.caps,
            &vp_info,
        ))?;
        state.set_apic(&virt::x86::vp::Apic::at_reset(
            &self.partition.caps,
            &vp_info,
        ))?;

        // Debug builds verify that the applied state round-trips as the
        // architectural reset state.
        if cfg!(debug_assertions) {
            vp.access_state(Vtl::Vtl0).check_reset_all(&vp_info);
        }

        Ok(vp)
    }
}
622
/// A bound KVM VP, holding the run-loop state for one virtual processor.
#[derive(InspectMut)]
pub struct KvmProcessor<'a> {
    #[inspect(skip)]
    partition: &'a KvmPartitionInner,
    #[inspect(flatten)]
    inner: &'a KvmVpInner,
    #[inspect(skip)]
    runner: kvm::VpRunner<'a>,
    #[inspect(skip)]
    kvm: kvm::Processor<'a>,
    vpindex: VpIndex,
    vmtime: &'a mut VmTimeAccess,
    // Last hardware breakpoint addresses programmed via set_debug_state,
    // used to reconstruct the breakpoint on a debug exit.
    #[inspect(iter_by_index)]
    guest_debug_db: [u64; 4],
    // Shadow copies of the synic MSRs, updated on SynicUpdate exits.
    #[inspect(hex, with = "|&x| u64::from(x)")]
    scontrol: HvSynicScontrol,
    #[inspect(hex, with = "|&x| u64::from(x)")]
    siefp: HvSynicSimpSiefp,
    #[inspect(hex, with = "|&x| u64::from(x)")]
    simp: HvSynicSimpSiefp,
}
644
645impl KvmProcessor<'_> {
646 fn deliver_pic_interrupt(&mut self, dev: &impl CpuIo) -> Result<(), KvmRunVpError> {
651 if let Some(vector) = dev.acknowledge_pic_interrupt() {
652 self.runner
653 .inject_extint_interrupt(vector)
654 .map_err(KvmRunVpError::ExtintInterrupt)?;
655 }
656 Ok(())
657 }
658
659 fn try_deliver_synic_messages(&mut self) -> Option<VmTime> {
661 if !self.scontrol.enabled() && self.simp.enabled() {
662 return None;
663 }
664 self.inner
665 .synic_message_queue
666 .post_pending_messages(!0, |sint, message| {
667 match self.write_sint_message(sint, message) {
668 Ok(true) => {
669 self.partition
670 .kvm
671 .irq_line(VMBUS_BASE_GSI + self.vpindex.index(), true)
672 .unwrap();
673 Ok(())
674 }
675 Ok(false) => Err(HvError::ObjectInUse),
676 Err(err) => {
677 tracelimit::error_ratelimited!(
678 error = &err as &dyn std::error::Error,
679 sint,
680 "failed to write message"
681 );
682 Err(HvError::OperationFailed)
683 }
684 }
685 });
686
687 (self.inner.synic_message_queue.pending_sints() != 0).then(|| {
688 self.vmtime.now().wrapping_add(Duration::from_millis(1))
691 })
692 }
693
694 fn write_sint_message(&mut self, sint: u8, msg: &HvMessage) -> Result<bool, GuestMemoryError> {
699 let simp = self.simp.base_gpn() * HV_PAGE_SIZE + sint as u64 * 256;
700 let typ: u32 = self.partition.gm.read_plain(simp)?;
701 if typ != 0 {
702 self.partition.gm.write_at(simp + 5, &[1u8])?;
703 let typ: u32 = self.partition.gm.read_plain(simp)?;
704 if typ != 0 {
705 return Ok(false);
706 }
707 }
708 self.partition.gm.write_at(simp + 4, &msg.as_bytes()[4..])?;
709 self.partition.gm.write_plain(simp, &msg.header.typ)?;
710 Ok(true)
711 }
712}
713
/// An MSI request translated into the address/data layout KVM expects.
struct KvmMsi {
    address_lo: u32,
    address_hi: u32,
    data: u32,
}
719
impl KvmMsi {
    /// Rebuilds the caller's MSI address/data into KVM's canonical form.
    fn new(request: MsiRequest) -> Self {
        let request_address = MsiAddress::from(request.address as u32);
        let request_data = MsiData::from(request.data);

        // Reconstruct the low address word from its parsed fields, setting
        // the redirection hint for lowest-priority delivery.
        let address_lo = MsiAddress::new()
            .with_address(x86defs::msi::MSI_ADDRESS)
            .with_destination(request_address.destination())
            .with_destination_mode_logical(request_address.destination_mode_logical())
            .with_redirection_hint(request_data.delivery_mode() == DeliveryMode::LOWEST_PRIORITY.0)
            .into();

        // Upper destination bits (beyond the 8 in address_lo) go in the high
        // address word -- presumably KVM's extended destination ID encoding;
        // verify against the KVM MSI routing documentation.
        let address_hi = (request_address.virt_destination() & !0xff).into();
        let data = MsiData::new()
            .with_delivery_mode(request_data.delivery_mode())
            .with_assert(request_data.assert())
            .with_destination_mode_logical(request_data.destination_mode_logical())
            .with_trigger_mode_level(request_data.trigger_mode_level())
            .with_vector(request_data.vector())
            .into();

        Self {
            address_lo,
            address_hi,
            data,
        }
    }
}
752
753impl KvmPartitionInner {
754 fn request_msi(&self, request: MsiRequest) {
755 let KvmMsi {
756 address_lo,
757 address_hi,
758 data,
759 } = KvmMsi::new(request);
760 if let Err(err) = self.kvm.request_msi(&kvm::kvm_msi {
761 address_lo,
762 address_hi,
763 data,
764 flags: 0,
765 devid: 0,
766 pad: [0; 12],
767 }) {
768 tracelimit::warn_ratelimited!(
769 address = request.address,
770 data = request.data,
771 error = &err as &dyn std::error::Error,
772 "failed to request MSI"
773 );
774 }
775 }
776}
777
778impl IoApicRouting for KvmPartitionInner {
779 fn set_irq_route(&self, irq: u8, request: Option<MsiRequest>) {
780 let entry = request.map(|request| {
781 let KvmMsi {
782 address_lo,
783 address_hi,
784 data,
785 } = KvmMsi::new(request);
786 kvm::RoutingEntry::Msi {
787 address_lo,
788 address_hi,
789 data,
790 }
791 });
792 let mut gsi_routing = self.gsi_routing.lock();
793 if gsi_routing.set(irq as u32, entry) {
794 gsi_routing.update_routes(&self.kvm);
795 }
796 }
797
798 fn assert_irq(&self, irq: u8) {
799 if let Err(err) = self.kvm.irq_line(irq as u32, true) {
800 tracing::error!(
801 irq,
802 error = &err as &dyn std::error::Error,
803 "failed to assert irq"
804 );
805 }
806 }
807}
808
/// A registered doorbell (KVM ioeventfd); deregisters itself on drop.
struct KvmDoorbellEntry {
    // Weak so a leaked doorbell does not keep the partition alive.
    partition: Weak<KvmPartitionInner>,
    event: Event,
    guest_address: u64,
    value: u64,
    length: u32,
    // The flags used at registration; reused (plus deassign) on drop.
    flags: u32,
}
817
impl KvmDoorbellEntry {
    /// Registers a KVM ioeventfd at `guest_address`, optionally matching a
    /// specific written `value` (and `length`), signaling `fd` on writes.
    pub fn new(
        partition: &Arc<KvmPartitionInner>,
        guest_address: u64,
        value: Option<u64>,
        length: Option<u32>,
        fd: &Event,
    ) -> io::Result<KvmDoorbellEntry> {
        // Only request datamatch when a specific value was asked for.
        let flags = if value.is_some() {
            1 << kvm_ioeventfd_flag_nr_datamatch
        } else {
            0
        };
        let value = value.unwrap_or(0);
        let length = length.unwrap_or(0);

        // Clone the event so the entry can deregister with the same fd later.
        let event = fd.clone();

        if let Err(err) = partition.kvm.ioeventfd(
            value,
            guest_address,
            length,
            event.as_fd().as_raw_fd(),
            flags,
        ) {
            tracing::warn!(
                guest_address,
                error = &err as &dyn std::error::Error,
                "Failed to register doorbell",
            );
            return Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                "Failed to register doorbell",
            ));
        }

        Ok(Self {
            partition: Arc::downgrade(partition),
            guest_address,
            value,
            length,
            flags,
            event,
        })
    }
}
865
impl Drop for KvmDoorbellEntry {
    /// Deregisters the ioeventfd, if the partition still exists; must pass
    /// the same parameters as registration plus the deassign flag.
    fn drop(&mut self) {
        if let Some(partition) = self.partition.upgrade() {
            let flags: u32 = self.flags | (1 << kvm_ioeventfd_flag_nr_deassign);
            if let Err(err) = partition.kvm.ioeventfd(
                self.value,
                self.guest_address,
                self.length,
                self.event.as_fd().as_raw_fd(),
                flags,
            ) {
                // Best effort: nothing to do on failure but log it.
                tracing::warn!(
                    guest_address = self.guest_address,
                    error = &err as &dyn std::error::Error,
                    "Failed to unregister doorbell",
                );
            }
        }
    }
}
886
impl DoorbellRegistration for KvmPartition {
    /// Registers a doorbell; the returned handle deregisters it when dropped.
    fn register_doorbell(
        &self,
        guest_address: u64,
        value: Option<u64>,
        length: Option<u32>,
        fd: &Event,
    ) -> io::Result<Box<dyn Send + Sync>> {
        Ok(Box::new(KvmDoorbellEntry::new(
            &self.inner,
            guest_address,
            value,
            length,
            fd,
        )?))
    }
}
904
/// Handler context for a KVM Hyper-V hypercall exit.
struct KvmHypercallExit<'a, T> {
    bus: &'a T,
    registers: KvmHypercallRegisters,
}

/// The hypercall registers as reported by the KVM exit.
struct KvmHypercallRegisters {
    input: u64,
    params: [u64; 2],
    // Filled in by the dispatcher; written back to the exit on completion.
    result: u64,
}
915
impl<T: CpuIo> KvmHypercallExit<'_, T> {
    // Only HvPostMessage and HvSignalEvent are forwarded by KVM's Hyper-V
    // synic support; all other hypercalls are handled in the kernel.
    const DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
        Self,
        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
    );
}
922
// Adapter allowing a `&mut KvmHypercallExit` to be passed to the dispatcher.
impl<'a, T: CpuIo> hv1_hypercall::AsHandler<KvmHypercallExit<'a, T>>
    for &mut KvmHypercallExit<'a, T>
{
    fn as_handler(&mut self) -> &mut KvmHypercallExit<'a, T> {
        self
    }
}
930
impl<T> hv1_hypercall::HypercallIo for KvmHypercallExit<'_, T> {
    fn advance_ip(&mut self) {
        // KVM advances the instruction pointer before reporting the hypercall
        // exit, so there is nothing to do here.
    }

    fn retry(&mut self, _control: u64) {
        // There is no way to rewind the instruction pointer (see above), so
        // retrying is not possible.
        unimplemented!("KVM cannot retry hypercalls");
    }

    fn control(&mut self) -> u64 {
        let mut control = Control::from(self.registers.input);
        // KVM reports HvCallSignalEvent with register parameters; force the
        // fast bit so the dispatcher reads them from registers, not memory.
        if control.code() == HypercallCode::HvCallSignalEvent.0 {
            control.set_fast(true);
        }
        control.into()
    }

    fn input_gpa(&mut self) -> u64 {
        self.registers.params[0]
    }

    fn output_gpa(&mut self) -> u64 {
        self.registers.params[1]
    }

    // Exactly one register pair is available (the two params).
    fn fast_register_pair_count(&mut self) -> usize {
        1
    }

    fn extended_fast_hypercalls_ok(&mut self) -> bool {
        false
    }

    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
        self.fast_regs(0, buf);
        0
    }

    fn fast_output(&mut self, _starting_pair_index: usize, _buf: &[[u64; 2]]) {}

    fn vtl_input(&mut self) -> u64 {
        unimplemented!()
    }

    fn set_result(&mut self, n: u64) {
        self.registers.result = n;
    }

    fn fast_regs(&mut self, _starting_pair_index: usize, buf: &mut [[u64; 2]]) {
        // Copy the single available pair into the caller's buffer, if room.
        if let [b, ..] = buf {
            *b = self.registers.params;
        }
    }
}
987
impl<T: CpuIo> hv1_hypercall::PostMessage for KvmHypercallExit<'_, T> {
    /// Forwards a guest HvPostMessage hypercall to the bus (VTL0 only).
    fn post_message(&mut self, connection_id: u32, message: &[u8]) -> hvdef::HvResult<()> {
        self.bus
            .post_synic_message(Vtl::Vtl0, connection_id, false, message)
    }
}

impl<T: CpuIo> hv1_hypercall::SignalEvent for KvmHypercallExit<'_, T> {
    /// Forwards a guest HvSignalEvent hypercall to the bus (VTL0 only).
    fn signal_event(&mut self, connection_id: u32, flag: u16) -> hvdef::HvResult<()> {
        self.bus.signal_synic_event(Vtl::Vtl0, connection_id, flag)
    }
}
1000
impl Processor for KvmProcessor<'_> {
    type Error = KvmError;
    type RunVpError = KvmRunVpError;
    type StateAccess<'a>
        = KvmVpStateAccess<'a>
    where
        Self: 'a;

    /// Programs hardware breakpoints and single-step state into KVM's guest
    /// debug facility. Passing `None` clears all debug state.
    fn set_debug_state(
        &mut self,
        _vtl: Vtl,
        state: Option<&virt::x86::DebugState>,
    ) -> Result<(), Self::Error> {
        let mut control = 0;
        let mut db = [0; 4];
        let mut dr7 = 0;
        if let Some(state) = state {
            control |= kvm::KVM_GUESTDBG_ENABLE;
            if state.single_step {
                control |= kvm::KVM_GUESTDBG_SINGLESTEP;
            }
            for (i, bp) in state.breakpoints.iter().enumerate() {
                if let Some(bp) = bp {
                    control |= kvm::KVM_GUESTDBG_USE_HW_BP;
                    db[i] = bp.address;
                    dr7 |= bp.dr7_bits(i);
                }
            }
        }
        self.kvm.set_guest_debug(control, db, dr7)?;
        // Remember the addresses so debug exits can be mapped back to their
        // breakpoints.
        self.guest_debug_db = db;
        Ok(())
    }

    /// Runs the VP until it halts or is stopped, dispatching KVM exits to
    /// the device bus.
    async fn run_vp(
        &mut self,
        stop: StopVp<'_>,
        dev: &impl CpuIo,
    ) -> Result<Infallible, VpHaltReason<KvmRunVpError>> {
        loop {
            // Honor any pending yield request before re-entering the guest.
            self.inner.needs_yield.maybe_yield().await;
            stop.check()?;

            // Drain queued synic messages; schedule a retry timeout if any
            // remain queued.
            if self.partition.hv1_enabled {
                if let Some(next) = self.try_deliver_synic_messages() {
                    self.vmtime.set_timeout_if_before(next)
                } else {
                    self.vmtime.cancel_timeout();
                }
            }

            // If a PIC interrupt was requested, deliver it now if the window
            // is already open, or ask KVM to exit when it opens.
            if self.inner.request_interrupt_window.load(Ordering::Relaxed) {
                self.inner
                    .request_interrupt_window
                    .store(false, Ordering::Relaxed);
                if self.runner.check_or_request_interrupt_window() {
                    self.deliver_pic_interrupt(dev)
                        .map_err(VpHaltReason::Hypervisor)?;
                }
            }

            // If the vmtime timeout already fired, loop back to re-run the
            // synic delivery pass before entering the guest.
            if poll_fn(|cx| Poll::Ready(self.vmtime.poll_timeout(cx).is_ready())).await {
                continue;
            }

            // Inner run loop: keeps running the VP until an eval/stop request
            // arrives, tracking whether an exit still needs completion.
            let mut pending_exit = false;
            loop {
                let exit = if self.inner.eval.load(Ordering::Relaxed) || stop.check().is_err() {
                    if !pending_exit {
                        // Nothing outstanding; consume the eval request and
                        // go back to the outer loop.
                        self.inner.eval.store(false, Ordering::Relaxed);
                        break;
                    }
                    // Finish the in-progress exit before breaking out.
                    self.runner.complete_exit()
                } else {
                    self.runner.run()
                };

                let exit = exit.map_err(|err| VpHaltReason::Hypervisor(KvmRunVpError::Run(err)))?;
                pending_exit = true;
                match exit {
                    kvm::Exit::Interrupted => {
                        tracing::trace!("interrupted");
                        pending_exit = false;
                    }
                    kvm::Exit::InterruptWindow => {
                        self.deliver_pic_interrupt(dev)
                            .map_err(VpHaltReason::Hypervisor)?;
                    }
                    // Port I/O exits may batch multiple accesses; handle each
                    // `size`-byte chunk separately.
                    kvm::Exit::IoIn { port, data, size } => {
                        for data in data.chunks_mut(size as usize) {
                            dev.read_io(self.vpindex, port, data).await;
                        }
                    }
                    kvm::Exit::IoOut { port, data, size } => {
                        for data in data.chunks(size as usize) {
                            dev.write_io(self.vpindex, port, data).await;
                        }
                    }
                    kvm::Exit::MmioWrite { address, data } => {
                        dev.write_mmio(self.vpindex, address, data).await
                    }
                    kvm::Exit::MmioRead { address, data } => {
                        dev.read_mmio(self.vpindex, address, data).await
                    }
                    // Unknown MSR accesses: stub the known-harmless ones,
                    // inject #GP (error = 1) for the rest.
                    kvm::Exit::MsrRead { index, data, error } => {
                        if MYSTERY_MSRS.contains(&index) {
                            tracelimit::warn_ratelimited!(index, "stubbed out mystery MSR read");
                            *data = 0;
                        } else {
                            tracelimit::error_ratelimited!(index, "unrecognized msr read");
                            *error = 1;
                        }
                    }
                    kvm::Exit::MsrWrite { index, data, error } => {
                        if MYSTERY_MSRS.contains(&index) {
                            tracelimit::warn_ratelimited!(index, "stubbed out mystery MSR write");
                        } else {
                            tracelimit::error_ratelimited!(index, data, "unrecognized msr write");
                            *error = 1;
                        }
                    }
                    kvm::Exit::Shutdown => {
                        // KVM reports a triple fault as a shutdown exit.
                        return Err(VpHaltReason::TripleFault { vtl: Vtl::Vtl0 });
                    }
                    // The guest changed its synic MSRs; update the shadow
                    // copies and publish the SIEFP for event-port senders.
                    kvm::Exit::SynicUpdate {
                        msr: _msr,
                        control,
                        siefp,
                        simp,
                    } => {
                        self.scontrol = control.into();
                        self.siefp = siefp.into();
                        self.simp = simp.into();
                        *self.inner.siefp.write() = if self.scontrol.enabled() {
                            siefp.into()
                        } else {
                            0.into()
                        };
                    }
                    kvm::Exit::HvHypercall {
                        input,
                        result,
                        params,
                    } => {
                        let mut handler = KvmHypercallExit {
                            bus: dev,
                            registers: KvmHypercallRegisters {
                                input,
                                params,
                                result: 0,
                            },
                        };
                        KvmHypercallExit::DISPATCHER.dispatch(&self.partition.gm, &mut handler);
                        *result = handler.registers.result;
                    }
                    kvm::Exit::Debug {
                        exception: _,
                        pc: _,
                        dr6,
                        dr7,
                    } => {
                        // Map the DR6 condition back to the breakpoint that
                        // fired, or report a single step.
                        if dr6 & x86defs::DR6_BREAKPOINT_MASK != 0 {
                            let i = dr6.trailing_zeros() as usize;
                            let bp = HardwareBreakpoint::from_dr7(dr7, self.guest_debug_db[i], i);
                            return Err(VpHaltReason::HwBreak(bp));
                        } else if dr6 & x86defs::DR6_SINGLE_STEP != 0 {
                            return Err(VpHaltReason::SingleStep);
                        } else {
                            tracing::warn!(dr6, "debug exit with unknown dr6 condition");
                        }
                    }
                    kvm::Exit::Eoi { irq } => {
                        dev.handle_eoi(irq.into());
                    }
                    kvm::Exit::InternalError { error, .. } => {
                        return Err(VpHaltReason::Hypervisor(KvmRunVpError::InternalError(
                            error,
                        )));
                    }
                    kvm::Exit::EmulationFailure { instruction_bytes } => {
                        return Err(VpHaltReason::EmulationFailure(
                            EmulationError {
                                instruction_bytes: instruction_bytes.to_vec(),
                            }
                            .into(),
                        ));
                    }
                    kvm::Exit::FailEntry {
                        hardware_entry_failure_reason,
                    } => {
                        tracing::error!(hardware_entry_failure_reason, "VP entry failed");
                        return Err(VpHaltReason::InvalidVmState(KvmRunVpError::InvalidVpState));
                    }
                }
            }
        }
    }

    // Nothing is deferred by this backend, so there is nothing to flush.
    fn flush_async_requests(&mut self) -> Result<(), Self::RunVpError> {
        Ok(())
    }

    fn access_state(&mut self, vtl: Vtl) -> Self::StateAccess<'_> {
        assert_eq!(vtl, Vtl::Vtl0);
        self.partition.vp_state_access(self.vpindex)
    }
}
1227
impl virt::Synic for KvmPartition {
    /// Queues a synic message for the target VP; kicks the VP's run loop if
    /// the queue transitioned to non-empty so it delivers the message.
    fn post_message(&self, _vtl: Vtl, vp: VpIndex, sint: u8, typ: u32, payload: &[u8]) {
        let wake = self
            .inner
            .vp(vp)
            .synic_message_queue
            .enqueue_message(sint, &HvMessage::new(HvMessageType(typ), 0, payload));

        if wake {
            self.inner.evaluate_vp(vp);
        }
    }

    /// Creates an event port that sets an event flag in the target VP's
    /// SIEFP page and raises its vmbus GSI.
    fn new_guest_event_port(
        &self,
        _vtl: Vtl,
        vp: u32,
        sint: u8,
        flag: u16,
    ) -> Box<dyn GuestEventPort> {
        Box::new(KvmGuestEventPort {
            partition: Arc::downgrade(&self.inner),
            gm: self.inner.gm.clone(),
            params: Arc::new(Mutex::new(KvmEventPortParams {
                vp: VpIndex::new(vp),
                sint,
                flag,
            })),
        })
    }

    fn prefer_os_events(&self) -> bool {
        false
    }
}
1263
/// Error payload for a KVM instruction-emulation failure exit.
#[derive(Debug, Error)]
#[error("KVM emulation failure: instruction {instruction_bytes:02x?}")]
struct EmulationError {
    instruction_bytes: Vec<u8>,
}
1269
/// A synic event port backed by direct writes to the guest's SIEFP page.
#[derive(Debug, Clone)]
struct KvmGuestEventPort {
    // Weak so an outstanding port does not keep the partition alive.
    partition: Weak<KvmPartitionInner>,
    gm: GuestMemory,
    // Shared and lock-protected so `set_target_vp` can retarget a port whose
    // interrupt closure is already outstanding.
    params: Arc<Mutex<KvmEventPortParams>>,
}

/// Target coordinates for an event port: VP, SINT, and event flag number.
#[derive(Debug, Copy, Clone)]
struct KvmEventPortParams {
    vp: VpIndex,
    sint: u8,
    flag: u16,
}
1284
impl GuestEventPort for KvmGuestEventPort {
    /// Returns an interrupt that, when signaled, sets the event flag bit in
    /// the target VP's SIEFP page and raises that VP's vmbus GSI.
    fn interrupt(&self) -> Interrupt {
        let this = self.clone();
        Interrupt::from_fn(move || {
            let KvmEventPortParams { vp, sint, flag } = *this.params.lock();
            let Some(partition) = this.partition.upgrade() else {
                return;
            };
            // The SIEFP shadow is zeroed when the synic is disabled, so an
            // enabled value here means the page address is valid.
            let siefp = partition.vp(vp).siefp.read();
            if !siefp.enabled() {
                return;
            }
            // Each SINT owns a 256-byte flags region; `flag` indexes a bit
            // within it.
            let byte_gpa = siefp.base_gpn() * HV_PAGE_SIZE + sint as u64 * 256 + flag as u64 / 8;
            let mut byte = 0;
            let mask = 1 << (flag % 8);
            // Set the bit atomically; only raise the GSI if this caller was
            // the one to transition it from 0 to 1.
            while byte & mask == 0 {
                match this.gm.compare_exchange(byte_gpa, byte, byte | mask) {
                    Ok(Ok(_)) => {
                        drop(siefp);
                        partition
                            .kvm
                            .irq_line(VMBUS_BASE_GSI + vp.index(), true)
                            .unwrap();

                        break;
                    }
                    // Lost the race: retry with the observed byte value (the
                    // loop exits if the bit is now set).
                    Ok(Err(b)) => byte = b,
                    Err(err) => {
                        tracelimit::warn_ratelimited!(
                            error = &err as &dyn std::error::Error,
                            "failed to write event flag to guest memory"
                        );
                        break;
                    }
                }
            }
        })
    }

    /// Retargets the port at a different VP; subsequent signals use it.
    fn set_target_vp(&mut self, vp: u32) -> Result<(), vmcore::synic::HypervisorError> {
        self.params.lock().vp = VpIndex::new(vp);
        Ok(())
    }
}
1329
/// An MSI interrupt backed by a dedicated KVM GSI route with an irqfd.
#[derive(Debug)]
struct GsiMsi {
    gsi: gsi::GsiRoute,
}

/// Factory for [`GsiMsi`] interrupts on a given partition.
struct KvmMsiTarget(Arc<KvmPartitionInner>);
1336
1337impl MsiInterruptTarget for KvmMsiTarget {
1338 fn new_interrupt(&self) -> Box<dyn MsiControl> {
1339 let event = Event::new();
1340 let interrupt = self.0.new_route(Some(event)).expect("BUGBUG");
1341 Box::new(GsiMsi { gsi: interrupt })
1342 }
1343}
1344
1345impl MsiControl for GsiMsi {
1346 fn enable(&mut self, address: u64, data: u32) {
1347 let request = MsiRequest { address, data };
1348 let KvmMsi {
1349 address_lo,
1350 address_hi,
1351 data,
1352 } = KvmMsi::new(request);
1353
1354 self.gsi.enable(kvm::RoutingEntry::Msi {
1355 address_lo,
1356 address_hi,
1357 data,
1358 });
1359 }
1360
1361 fn disable(&mut self) {
1362 self.gsi.disable();
1363 }
1364
1365 fn signal(&mut self, _address: u64, _data: u32) {
1366 self.gsi.irqfd_event().unwrap().signal()
1367 }
1368}