Skip to main content

openvmm_core\worker/
dispatch.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use crate::emuplat;
5use crate::partition::BindHvliteVp;
6use crate::partition::HvlitePartition;
7use crate::vmgs_non_volatile_store::HvLiteVmgsNonVolatileStore;
8use crate::worker::rom::RomBuilder;
9use acpi::dsdt;
10use anyhow::Context;
11use cfg_if::cfg_if;
12use chipset_device_resources::IRQ_LINE_SET;
13use chipset_resources::LEGACY_CHIPSET_PCI_BUS_NAME;
14use debug_ptr::DebugPtr;
15use disk_backend::Disk;
16use disk_backend::resolve::ResolveDiskParameters;
17use firmware_uefi::LogLevel;
18use firmware_uefi::UefiCommandSet;
19use floppy_resources::FloppyDiskConfig;
20use futures::FutureExt;
21use futures::StreamExt;
22use futures::executor::block_on;
23use futures::future::try_join_all;
24use futures_concurrency::prelude::*;
25use guestmem::GuestMemory;
26use hvdef::HV_PAGE_SIZE;
27use hvdef::Vtl;
28use hypervisor_resources::HypervisorKind;
29use ide_resources::GuestMedia;
30use ide_resources::IdeDeviceConfig;
31use igvm::IgvmFile;
32use input_core::InputData;
33use input_core::MultiplexedInputHandle;
34use inspect::Inspect;
35use local_clock::LocalClockDelta;
36use membacking::GuestMemoryBuilder;
37use membacking::GuestMemoryManager;
38use membacking::SharedMemoryBacking;
39use memory_range::MemoryRange;
40use mesh::MeshPayload;
41use mesh::error::RemoteError;
42use mesh::payload::Protobuf;
43use mesh::payload::message::ProtobufMessage;
44use mesh_worker::Worker;
45use mesh_worker::WorkerId;
46use mesh_worker::WorkerRpc;
47use missing_dev::MissingDevManifest;
48use openvmm_defs::config::Aarch64TopologyConfig;
49use openvmm_defs::config::ArchTopologyConfig;
50use openvmm_defs::config::Config;
51use openvmm_defs::config::DeviceVtl;
52use openvmm_defs::config::EfiDiagnosticsLogLevelType;
53use openvmm_defs::config::GicConfig;
54use openvmm_defs::config::HypervisorConfig;
55use openvmm_defs::config::LoadMode;
56use openvmm_defs::config::MemoryConfig;
57use openvmm_defs::config::PcieDeviceConfig;
58use openvmm_defs::config::PcieRootComplexConfig;
59use openvmm_defs::config::PcieSwitchConfig;
60use openvmm_defs::config::PmuGsivConfig;
61use openvmm_defs::config::ProcessorTopologyConfig;
62use openvmm_defs::config::VirtioBus;
63use openvmm_defs::config::VmbusConfig;
64use openvmm_defs::config::VpciDeviceConfig;
65use openvmm_defs::config::Vtl2BaseAddressType;
66use openvmm_defs::config::Vtl2Config;
67use openvmm_defs::config::X2ApicConfig;
68use openvmm_defs::config::X86TopologyConfig;
69use openvmm_defs::rpc::PulseSaveRestoreError;
70use openvmm_defs::rpc::VmRpc;
71use openvmm_defs::worker::VM_WORKER;
72use openvmm_defs::worker::VmWorkerParameters;
73use openvmm_pcat_locator::RomFileLocation;
74use pal_async::DefaultDriver;
75use pal_async::DefaultPool;
76use pal_async::local::block_with_io;
77use pal_async::task::Spawn;
78use pal_async::task::Task;
79use pci_core::PciInterruptPin;
80use pcie::root::GenericPcieRootComplex;
81use pcie::root::GenericPcieRootPortDefinition;
82use pcie::switch::GenericPcieSwitch;
83use scsi_core::ResolveScsiDeviceHandleParams;
84use scsidisk::SimpleScsiDisk;
85use scsidisk::atapi_scsi::AtapiScsiDisk;
86use serial_16550_resources::ComPort;
87use state_unit::SavedStateUnit;
88use state_unit::SpawnedUnit;
89use state_unit::StateUnits;
90use std::fs::File;
91use std::sync::Arc;
92use std::thread;
93use std::thread::JoinHandle;
94use storvsp::ScsiControllerDisk;
95use virt::ProtoPartition;
96use virt::VpIndex;
97use virtio::PciInterruptModel;
98use virtio::VirtioMmioDevice;
99use virtio::VirtioPciDevice;
100use virtio::resolve::VirtioResolveInput;
101use vm_loader::initial_regs::initial_regs;
102use vm_resource::Resource;
103use vm_resource::ResourceResolver;
104use vm_resource::kind::DiskHandleKind;
105use vm_resource::kind::KeyboardInputHandleKind;
106use vm_resource::kind::MouseInputHandleKind;
107use vm_resource::kind::VirtioDeviceHandle;
108use vm_resource::kind::VmbusDeviceHandleKind;
109use vm_topology::memory::MemoryLayout;
110use vm_topology::pcie::PcieHostBridge;
111use vm_topology::processor::ArchTopology;
112use vm_topology::processor::ProcessorTopology;
113use vm_topology::processor::TopologyBuilder;
114use vm_topology::processor::aarch64::Aarch64Topology;
115use vm_topology::processor::aarch64::GicVersion;
116use vm_topology::processor::x86::X86Topology;
117use vmbus_channel::channel::VmbusDevice;
118use vmbus_server::HvsockRelayChannel;
119use vmbus_server::VmbusServer;
120use vmbus_server::hvsock::HvsockRelay;
121use vmcore::save_restore::SavedStateRoot;
122use vmcore::vm_task::VmTaskDriverSource;
123use vmcore::vm_task::thread::ThreadDriverBackend;
124use vmcore::vmtime::VmTime;
125use vmcore::vmtime::VmTimeKeeper;
126use vmcore::vmtime::VmTimeSource;
127use vmgs_resources::GuestStateEncryptionPolicy;
128use vmgs_resources::VmgsResource;
129use vmm_core::acpi_builder::AcpiTablesBuilder;
130use vmm_core::input_distributor::InputDistributor;
131use vmm_core::partition_unit::Halt;
132use vmm_core::partition_unit::PartitionUnit;
133use vmm_core::partition_unit::PartitionUnitParams;
134use vmm_core::partition_unit::block_on_vp;
135use vmm_core::vmbus_unit::ChannelUnit;
136use vmm_core::vmbus_unit::VmbusServerHandle;
137use vmm_core::vmbus_unit::offer_channel_unit;
138use vmm_core::vmbus_unit::offer_vmbus_device_handle_unit;
139use vmm_core_defs::HaltReason;
140use vmotherboard::BaseChipsetBuilder;
141use vmotherboard::BaseChipsetBuilderOutput;
142use vmotherboard::ChipsetDeviceHandle;
143use vmotherboard::ChipsetDevices;
144use vmotherboard::LegacyPciChipsetDeviceHandle;
145use vmotherboard::options::BaseChipsetDevices;
146use vmotherboard::options::BaseChipsetFoundation;
147use vmotherboard::options::BaseChipsetManifest;
148use vmotherboard::options::VmChipsetCapabilities;
149#[cfg(all(windows, feature = "virt_whp"))]
150use vpci::bus::VpciBus;
151use watchdog_core::platform::BaseWatchdogPlatform;
152use watchdog_core::platform::WatchdogCallback;
153use watchdog_core::platform::WatchdogPlatform;
154
/// Base I/O port of the ACPI power-management (PM) register block.
const PM_BASE: u16 = 0x400;
/// Interrupt line used for ACPI events (IRQ 9 is the conventional SCI line).
const SYSTEM_IRQ_ACPI: u32 = 9;

/// I/O port for the watchdog device described by the ACPI WDAT table.
const WDAT_PORT: u16 = 0x30;
159
160/// Creates a thread to run low-performance devices on.
161pub fn new_device_thread() -> (JoinHandle<()>, DefaultDriver) {
162    DefaultPool::spawn_on_thread("basic_device_thread")
163}
164
165impl Manifest {
166    fn from_config(config: Config) -> Self {
167        Self {
168            load_mode: config.load_mode,
169            floppy_disks: config.floppy_disks,
170            ide_disks: config.ide_disks,
171            pcie_root_complexes: config.pcie_root_complexes,
172            pcie_devices: config.pcie_devices,
173            pcie_switches: config.pcie_switches,
174            vpci_devices: config.vpci_devices,
175            hypervisor: config.hypervisor,
176            memory: config.memory,
177            processor_topology: config.processor_topology,
178            chipset: config.chipset,
179            #[cfg(windows)]
180            kernel_vmnics: config.kernel_vmnics,
181            input: config.input,
182            framebuffer: config.framebuffer,
183            vga_firmware: config.vga_firmware,
184            vtl2_gfx: config.vtl2_gfx,
185            virtio_devices: config.virtio_devices,
186            vmbus: config.vmbus,
187            vtl2_vmbus: config.vtl2_vmbus,
188            #[cfg(all(windows, feature = "virt_whp"))]
189            vpci_resources: config.vpci_resources,
190            vmgs: config.vmgs,
191            secure_boot_enabled: config.secure_boot_enabled,
192            custom_uefi_vars: config.custom_uefi_vars,
193            firmware_event_send: config.firmware_event_send,
194            debugger_rpc: config.debugger_rpc,
195            vmbus_devices: config.vmbus_devices,
196            chipset_devices: config.chipset_devices,
197            pci_chipset_devices: config.pci_chipset_devices,
198            chipset_capabilities: config.chipset_capabilities,
199            generation_id_recv: config.generation_id_recv,
200            rtc_delta_milliseconds: config.rtc_delta_milliseconds,
201            automatic_guest_reset: config.automatic_guest_reset,
202            efi_diagnostics_log_level: match config.efi_diagnostics_log_level {
203                EfiDiagnosticsLogLevelType::Default => LogLevel::make_default(),
204                EfiDiagnosticsLogLevelType::Info => LogLevel::make_info(),
205                EfiDiagnosticsLogLevelType::Full => LogLevel::make_full(),
206            },
207        }
208    }
209}
210
/// This is the manifest of devices with resolved resources (handles, channels).
///
/// Currently this is identical to `Config`, but that will change in future
/// updates.
#[derive(MeshPayload)]
pub struct Manifest {
    /// How to load guest firmware/kernel (e.g. IGVM; see [`LoadMode`]).
    load_mode: LoadMode,
    floppy_disks: Vec<FloppyDiskConfig>,
    ide_disks: Vec<IdeDeviceConfig>,
    // PCI Express topology: root complexes, endpoint devices, and switches.
    pcie_root_complexes: Vec<PcieRootComplexConfig>,
    pcie_devices: Vec<PcieDeviceConfig>,
    pcie_switches: Vec<PcieSwitchConfig>,
    vpci_devices: Vec<VpciDeviceConfig>,
    memory: MemoryConfig,
    processor_topology: ProcessorTopologyConfig,
    hypervisor: HypervisorConfig,
    chipset: BaseChipsetManifest,
    #[cfg(windows)]
    kernel_vmnics: Vec<openvmm_defs::config::KernelVmNicConfig>,
    /// Channel delivering input events for the VM.
    input: mesh::Receiver<InputData>,
    framebuffer: Option<framebuffer::Framebuffer>,
    vga_firmware: Option<RomFileLocation>,
    vtl2_gfx: bool,
    virtio_devices: Vec<(VirtioBus, Resource<VirtioDeviceHandle>)>,
    // Vmbus configuration for VTL0 and, optionally, VTL2.
    vmbus: Option<VmbusConfig>,
    vtl2_vmbus: Option<VmbusConfig>,
    #[cfg(all(windows, feature = "virt_whp"))]
    vpci_resources: Vec<virt_whp::device::DeviceHandle>,
    vmgs: Option<VmgsResource>,
    secure_boot_enabled: bool,
    custom_uefi_vars: firmware_uefi_custom_vars::CustomVars,
    firmware_event_send: Option<mesh::Sender<get_resources::ged::FirmwareEvent>>,
    debugger_rpc: Option<mesh::Receiver<vmm_core_defs::debug_rpc::DebugRequest>>,
    vmbus_devices: Vec<(DeviceVtl, Resource<VmbusDeviceHandleKind>)>,
    chipset_devices: Vec<ChipsetDeviceHandle>,
    pci_chipset_devices: Vec<LegacyPciChipsetDeviceHandle>,
    chipset_capabilities: VmChipsetCapabilities,
    /// Receives 16-byte generation ID updates.
    generation_id_recv: Option<mesh::Receiver<[u8; 16]>>,
    /// RTC offset, in milliseconds.
    rtc_delta_milliseconds: i64,
    /// If true, the guest may reset without notifying the client.
    automatic_guest_reset: bool,
    efi_diagnostics_log_level: LogLevel,
}
253
/// Top-level saved state for the VM worker, carried across save/restore and
/// worker restart.
#[derive(Protobuf, SavedStateRoot)]
#[mesh(package = "openvmm")]
pub struct SavedState {
    /// Saved state for each state unit.
    #[mesh(1)]
    pub units: Vec<SavedStateUnit>,
}
260
261async fn open_simple_disk(
262    resolver: &ResourceResolver,
263    disk_type: Resource<DiskHandleKind>,
264    read_only: bool,
265    driver_source: &VmTaskDriverSource,
266) -> anyhow::Result<Disk> {
267    let disk = resolver
268        .resolve(
269            disk_type,
270            ResolveDiskParameters {
271                read_only,
272                driver_source,
273            },
274        )
275        .await?;
276    Ok(disk.0)
277}
278
/// State handed from an old worker to a new one across a worker restart.
#[derive(MeshPayload)]
pub struct RestartState {
    /// Hypervisor backend resource, re-resolved by the new worker.
    hypervisor: Resource<HypervisorKind>,
    manifest: Manifest,
    /// Whether the VM was running; if so, the new worker resumes it.
    running: bool,
    saved_state: SavedState,
    shared_memory: Option<SharedMemoryBacking>,
    /// RPC channel for VM control requests.
    rpc: mesh::Receiver<VmRpc>,
    /// Channel used to notify the client of guest halts.
    notify: mesh::Sender<HaltReason>,
}
289
/// Used for locating VM information in a debugger.
///
/// Do not use during program execution — this pointer exists solely so a
/// debugger can find the [`LoadedVm`] state.
static LOADED_VM: DebugPtr<LoadedVm> = DebugPtr::new();
293
/// The VM worker, used to create and run a VM partition.
pub struct VmWorker {
    /// The fully loaded VM.
    vm: LoadedVm,
    /// Channel of VM control RPCs, serviced while the VM runs.
    rpc: mesh::Receiver<VmRpc>,
    /// Join handle for the low-performance device thread; joined when the
    /// worker's `run` returns.
    device_thread: JoinHandle<()>,
}
300
impl Worker for VmWorker {
    type Parameters = VmWorkerParameters;
    type State = RestartState;
    const ID: WorkerId<Self::Parameters> = VM_WORKER;

    /// Creates a fresh VM worker: resolves the hypervisor backend, initializes
    /// the partition, then loads it (optionally from provided saved state).
    fn new(parameters: Self::Parameters) -> anyhow::Result<Self> {
        // Spawn the device thread first so its driver can back the VM's task
        // driver source below.
        let (device_thread, device_driver) = new_device_thread();

        let manifest = Manifest::from_config(parameters.cfg);

        let hypervisor = block_on(ResourceResolver::new().resolve(parameters.hypervisor, ()))
            .context("failed to resolve hypervisor backend")?;

        // Reconstitute the shared memory backing from the inherited mappable
        // handle, if one was passed in.
        let shared_memory = parameters
            .shared_memory
            .map(|fd| SharedMemoryBacking::from_mappable(fd.into()));

        let vm = block_on(InitializedVm::new(
            VmTaskDriverSource::new(ThreadDriverBackend::new(device_driver)),
            hypervisor.0,
            manifest,
            shared_memory,
        ))?;
        // Saved state arrives as an opaque message; parse it only if present.
        let saved_state = parameters
            .saved_state
            .map(|m| m.parse())
            .transpose()
            .context("failed to decode saved state")?;

        // Loading may perform I/O, so use the I/O-capable blocking executor.
        let vm = block_with_io(|_| vm.load(saved_state, parameters.notify))?;

        // Record the VM pointer for debugger access only.
        LOADED_VM.store(&vm);

        Ok(Self {
            vm,
            rpc: parameters.rpc,
            device_thread,
        })
    }

    /// Rebuilds the worker from the state saved by a previous worker instance,
    /// resuming the VM if it was running before the restart.
    fn restart(state: Self::State) -> anyhow::Result<Self> {
        let RestartState {
            hypervisor,
            manifest,
            running,
            saved_state,
            shared_memory,
            rpc,
            notify,
        } = state;
        let (device_thread, device_driver) = new_device_thread();

        let hypervisor = block_on(ResourceResolver::new().resolve(hypervisor, ()))
            .context("failed to resolve hypervisor backend")?;

        let vm = block_on(InitializedVm::new(
            VmTaskDriverSource::new(ThreadDriverBackend::new(device_driver)),
            hypervisor.0,
            manifest,
            shared_memory,
        ))?;
        pal_async::local::block_on(async {
            let mut vm = vm.load(Some(saved_state), notify).await?;

            // Record the VM pointer for debugger access only.
            LOADED_VM.store(&vm);

            // Restore the pre-restart run state.
            if running {
                vm.resume().await;
            }
            Ok(Self {
                vm,
                rpc,
                device_thread,
            })
        })
    }

    /// Runs the VM until it terminates, then joins the device thread.
    fn run(self, worker_rpc: mesh::Receiver<WorkerRpc<Self::State>>) -> anyhow::Result<()> {
        DefaultPool::run_with(async |driver| {
            let driver = driver;
            self.vm.run(&driver, self.rpc, worker_rpc).await
        });
        // The VM has stopped; the device thread's pool can now be joined.
        self.device_thread.join().unwrap();
        Ok(())
    }
}
387
/// A VM that has been initialized but not yet loaded (i.e. the saved state is
/// not yet available).
pub(crate) struct InitializedVm {
    /// The partition backing the VM.
    partition: Arc<dyn HvlitePartition>,
    /// Binders for the virtual processors — presumably one per VP; confirm
    /// against the backend construction path.
    vps: Vec<Box<dyn BindHvliteVp>>,
    vmtime_keeper: VmTimeKeeper,
    vmtime_source: VmTimeSource,
    memory_manager: GuestMemoryManager,
    /// Guest memory accessor.
    gm: GuestMemory,
    /// The resolved device manifest this VM was built from.
    cfg: Manifest,
    mem_layout: MemoryLayout,
    processor_topology: ProcessorTopology,
    /// Pre-parsed IGVM file, present when `cfg.load_mode` is `LoadMode::Igvm`.
    igvm_file: Option<IgvmFile>,
    driver_source: VmTaskDriverSource,
}
403
/// Builds a [`ProcessorTopology`] for architecture `T` from a topology config.
trait BuildTopology<T: ArchTopology + Inspect> {
    /// Constructs the topology, consulting `platform_info` for
    /// platform-provided values.
    fn to_topology(
        &self,
        platform_info: &virt::PlatformInfo,
    ) -> anyhow::Result<ProcessorTopology<T>>;
}
410
/// Converts a built processor topology back into an equivalent
/// [`ProcessorTopologyConfig`].
trait ExtractTopologyConfig {
    fn to_config(&self) -> ProcessorTopologyConfig;
}
414
415impl ExtractTopologyConfig for ProcessorTopology<X86Topology> {
416    fn to_config(&self) -> ProcessorTopologyConfig {
417        ProcessorTopologyConfig {
418            proc_count: self.vp_count(),
419            vps_per_socket: Some(self.reserved_vps_per_socket()),
420            enable_smt: Some(self.smt_enabled()),
421            arch: Some(ArchTopologyConfig::X86(X86TopologyConfig {
422                apic_id_offset: self.vp_arch(VpIndex::BSP).apic_id,
423                x2apic: match self.apic_mode() {
424                    vm_topology::processor::x86::ApicMode::XApic => X2ApicConfig::Unsupported,
425                    vm_topology::processor::x86::ApicMode::X2ApicSupported => {
426                        X2ApicConfig::Supported
427                    }
428                    vm_topology::processor::x86::ApicMode::X2ApicEnabled => X2ApicConfig::Enabled,
429                },
430            })),
431        }
432    }
433}
434
#[cfg(guest_arch = "x86_64")]
impl BuildTopology<X86Topology> for ProcessorTopologyConfig {
    /// Builds an x86 topology from this config, starting from the host's
    /// topology and applying SMT, vps-per-socket, APIC offset, and x2APIC
    /// overrides.
    fn to_topology(
        &self,
        _platform_info: &virt::PlatformInfo,
    ) -> anyhow::Result<ProcessorTopology<X86Topology>> {
        use vm_topology::processor::x86::X2ApicState;

        // An absent arch config means defaults; a non-x86 arch config is an
        // error.
        let arch = match &self.arch {
            Some(ArchTopologyConfig::X86(arch)) => arch.clone(),
            None => Default::default(),
            _ => anyhow::bail!("invalid architecture config"),
        };

        let mut builder = TopologyBuilder::from_host_topology()?;
        builder.apic_id_offset(arch.apic_id_offset);
        if let Some(smt) = self.enable_smt {
            builder.smt_enabled(smt);
        }
        if let Some(count) = self.vps_per_socket {
            builder.vps_per_socket(count);
        }
        builder.x2apic(match arch.x2apic {
            // FUTURE: query the hypervisor for a recommendation.
            X2ApicConfig::Auto => X2ApicState::Supported,
            X2ApicConfig::Supported => X2ApicState::Supported,
            X2ApicConfig::Unsupported => X2ApicState::Unsupported,
            X2ApicConfig::Enabled => X2ApicState::Enabled,
        });
        Ok(builder.build(self.proc_count)?)
    }
}
469
470impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
471    fn to_config(&self) -> ProcessorTopologyConfig {
472        ProcessorTopologyConfig {
473            proc_count: self.vp_count(),
474            vps_per_socket: Some(self.reserved_vps_per_socket()),
475            enable_smt: Some(self.smt_enabled()),
476            arch: Some(ArchTopologyConfig::Aarch64(Aarch64TopologyConfig {
477                gic_config: Some(match self.gic_version() {
478                    GicVersion::V3 {
479                        redistributors_base,
480                    } => GicConfig::V3(Some(openvmm_defs::config::GicV3Config {
481                        gic_distributor_base: self.gic_distributor_base(),
482                        gic_redistributors_base: redistributors_base,
483                    })),
484                    GicVersion::V2 { cpu_interface_base } => {
485                        GicConfig::V2(Some(openvmm_defs::config::GicV2Config {
486                            gic_distributor_base: self.gic_distributor_base(),
487                            cpu_interface_base,
488                        }))
489                    }
490                }),
491                pmu_gsiv: match self.pmu_gsiv() {
492                    Some(gsiv) => PmuGsivConfig::Gsiv(gsiv),
493                    None => PmuGsivConfig::Disabled,
494                },
495            })),
496        }
497    }
498}
499
#[cfg(guest_arch = "aarch64")]
impl BuildTopology<Aarch64Topology> for ProcessorTopologyConfig {
    /// Builds an aarch64 topology from this config, resolving the GIC layout
    /// (explicit config, or defaults chosen by the hypervisor's detected GIC
    /// version) and the PMU GSIV.
    fn to_topology(
        &self,
        platform_info: &virt::PlatformInfo,
    ) -> anyhow::Result<ProcessorTopology<Aarch64Topology>> {
        use vm_topology::processor::aarch64::Aarch64PlatformConfig;
        use vm_topology::processor::aarch64::GicV2mInfo;

        // An absent arch config means defaults; a non-aarch64 arch config is
        // an error.
        let arch = match &self.arch {
            None => Default::default(),
            Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(),
            _ => anyhow::bail!("invalid architecture config"),
        };
        // GICv2m MSI frame is always configured with the default addresses.
        let gic_v2m = Some(GicV2mInfo {
            frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
            spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE,
            spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
        });
        let pmu_gsiv = match arch.pmu_gsiv {
            PmuGsivConfig::Disabled => None,
            PmuGsivConfig::Gsiv(gsiv) => Some(gsiv),
            // `Platform` defers to whatever GSIV the platform reports (may be
            // None).
            PmuGsivConfig::Platform => platform_info.platform_gsiv,
        };

        // TODO: When this value is supported on all platforms, we should change
        // the arch config to not be an option. For now, warn since the ARM VBSA
        // expects this to be available.
        if pmu_gsiv.is_none() {
            tracing::warn!("PMU GSIV is not set");
        }

        // Pick the distributor base and version-specific second address,
        // falling back to defaults for any value not explicitly configured.
        let (gic_distributor_base, gic_version) = match &arch.gic_config {
            Some(GicConfig::V3(config)) => {
                let dist = config
                    .as_ref()
                    .map(|c| c.gic_distributor_base)
                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
                let redist = config
                    .as_ref()
                    .map(|c| c.gic_redistributors_base)
                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
                (
                    dist,
                    GicVersion::V3 {
                        redistributors_base: redist,
                    },
                )
            }
            Some(GicConfig::V2(config)) => {
                let dist = config
                    .as_ref()
                    .map(|c| c.gic_distributor_base)
                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE);
                // NOTE(review): the CPU interface default reuses the
                // redistributor default address (consistent with the `None`
                // arm below) — confirm this is intentional and not a stand-in
                // for a missing DEFAULT_GIC_CPU_INTERFACE_BASE constant.
                let cpu_if = config
                    .as_ref()
                    .map(|c| c.cpu_interface_base)
                    .unwrap_or(openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE);
                (
                    dist,
                    GicVersion::V2 {
                        cpu_interface_base: cpu_if,
                    },
                )
            }
            None => {
                // No explicit GIC config — use the hypervisor's detected version
                // with default addresses.
                let dist = openvmm_defs::config::DEFAULT_GIC_DISTRIBUTOR_BASE;
                let second = openvmm_defs::config::DEFAULT_GIC_REDISTRIBUTORS_BASE;
                if platform_info.supports_gic_v3 {
                    (
                        dist,
                        GicVersion::V3 {
                            redistributors_base: second,
                        },
                    )
                } else {
                    (
                        dist,
                        GicVersion::V2 {
                            cpu_interface_base: second,
                        },
                    )
                }
            }
        };

        let platform = Aarch64PlatformConfig {
            gic_distributor_base,
            gic_version,
            gic_v2m,
            pmu_gsiv,
            virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI,
            gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS,
        };

        let mut builder = TopologyBuilder::new_aarch64(platform);
        if let Some(smt) = self.enable_smt {
            builder.smt_enabled(smt);
        }
        // Unlike x86, default vps-per-socket to the full VP count when not
        // specified.
        if let Some(count) = self.vps_per_socket {
            builder.vps_per_socket(count);
        } else {
            builder.vps_per_socket(self.proc_count);
        }
        Ok(builder.build(self.proc_count)?)
    }
}
609
/// A VM that has been loaded and can be run.
///
/// Most new state should be added to [`LoadedVmInner`].
pub(crate) struct LoadedVm {
    /// State units driving save/restore and run-state transitions.
    state_units: StateUnits,
    inner: LoadedVmInner,
    /// Whether the VM is currently running.
    running: bool,
}
618
/// Most of the VM state for [`LoadedVm`], excluding things that are necessary
/// for state machine transitions.
struct LoadedVmInner {
    driver_source: VmTaskDriverSource,
    resolver: ResourceResolver,
    partition_unit: PartitionUnit,
    /// The partition backing the VM.
    partition: Arc<dyn HvlitePartition>,
    chipset_devices: ChipsetDevices,
    // Held only to keep these units alive for the VM's lifetime.
    _vmtime: SpawnedUnit<VmTimeKeeper>,
    _scsi_devices: Vec<SpawnedUnit<ChannelUnit<storvsp::StorageDevice>>>,
    memory_manager: GuestMemoryManager,
    /// Guest memory accessor.
    gm: GuestMemory,
    // Hvsock relays and vmbus servers, per VTL.
    vtl0_hvsock_relay: Option<HvsockRelay>,
    vtl2_hvsock_relay: Option<HvsockRelay>,
    vmbus_server: Option<VmbusServerHandle>,
    vtl2_vmbus_server: Option<VmbusServerHandle>,
    #[cfg(windows)]
    _vmbus_proxy: Option<vmbus_server::ProxyIntegration>,
    #[cfg(windows)]
    _kernel_vmnics: Vec<vmswitch::kernel::KernelVmNic>,
    memory_cfg: MemoryConfig,
    mem_layout: MemoryLayout,
    processor_topology: ProcessorTopology,
    hypervisor_cfg: HypervisorConfig,
    vmbus_redirect: bool,
    vmbus_devices: Vec<SpawnedUnit<ChannelUnit<dyn VmbusDevice>>>,

    input_distributor: SpawnedUnit<InputDistributor>,
    vtl2_framebuffer_gpa_base: Option<u64>,

    chipset_cfg: BaseChipsetManifest,
    chipset_capabilities: VmChipsetCapabilities,
    // Virtio-mmio bookkeeping, only consumed on x86_64 builds.
    #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
    virtio_mmio_count: usize,
    #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
    virtio_mmio_irq: u32,
    /// ((device, function), interrupt)
    #[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
    pci_legacy_interrupts: Vec<((u8, Option<u8>), u32)>,
    firmware_event_send: Option<mesh::Sender<get_resources::ged::FirmwareEvent>>,

    load_mode: LoadMode,
    // Current IGVM file and its pending replacement, if any.
    igvm_file: Option<IgvmFile>,
    next_igvm_file: Option<IgvmFile>,
    _vmgs_task: Option<Task<()>>,
    vmgs_client_inspect_handle: Option<vmgs_broker::VmgsClient>,

    /// VFIO container manager inspect handle (Linux only).
    #[cfg(target_os = "linux")]
    vfio_inspect: Option<vfio_assigned_device::manager::VfioManagerClient>,

    // relay halt messages, intercepting reset if configured.
    halt_recv: mesh::Receiver<HaltReason>,
    client_notify_send: mesh::Sender<HaltReason>,
    /// allow the guest to reset without notifying the client
    automatic_guest_reset: bool,
    pcie_host_bridges: Vec<PcieHostBridge>,
    pcie_root_complexes: Vec<Arc<closeable_mutex::CloseableMutex<GenericPcieRootComplex>>>,
    /// Hot-pluggable PCIe devices: (name, device unit, erased device).
    pcie_hotplug_devices: Vec<(
        String,
        vmotherboard::DynamicDeviceUnit,
        Arc<closeable_mutex::CloseableMutex<chipset_device_resources::ErasedChipsetDevice>>,
    )>,
}
683
/// Translates the worker's optional [`Vtl2Config`] into the `virt` crate's
/// VTL2 configuration.
///
/// Returns `Ok(None)` when VTL2 is not configured. When late-map VTL0 memory
/// is requested, the VM must be loading from an IGVM file (so the VTL2 memory
/// range can be determined); otherwise this fails with an error.
fn convert_vtl2_config(
    vtl2_cfg: Option<&Vtl2Config>,
    load_mode: &LoadMode,
    igvm_file: Option<&IgvmFile>,
) -> anyhow::Result<Option<virt::Vtl2Config>> {
    let vtl2_cfg = match vtl2_cfg {
        Some(cfg) => cfg,
        None => return Ok(None),
    };

    let late_map_vtl0_memory = match vtl2_cfg.late_map_vtl0_memory {
        Some(policy) => {
            use super::vm_loaders::igvm::vtl2_memory_info;
            use virt::LateMapVtl0AllowedRanges;
            let igvm_file = igvm_file.context("vtl2 configured but not loading from igvm")?;

            // Determine which ranges VTL0 is allowed to access before the
            // late map, based on where VTL2 will be placed.
            let allowed_ranges = if let LoadMode::Igvm {
                vtl2_base_address, ..
            } = load_mode
            {
                let range = vtl2_memory_info(igvm_file).context("invalid igvm file")?;
                match vtl2_base_address {
                    Vtl2BaseAddressType::File => {
                        // Allowed range is the file range as-is.
                        LateMapVtl0AllowedRanges::Ranges(vec![range])
                    }
                    Vtl2BaseAddressType::Absolute(base) => {
                        // This file must support relocations.
                        if !crate::worker::vm_loaders::igvm::supports_relocations(igvm_file) {
                            anyhow::bail!(
                                "vtl2 base address is absolute but igvm file does not support relocations"
                            );
                        }

                        // Use the size, but the base is the requested load
                        // base.
                        LateMapVtl0AllowedRanges::Ranges(vec![MemoryRange::new(
                            *base..(*base + range.len()),
                        )])
                    }
                    Vtl2BaseAddressType::MemoryLayout { .. } => {
                        LateMapVtl0AllowedRanges::MemoryLayout
                    }
                    Vtl2BaseAddressType::Vtl2Allocate { .. } => {
                        // When VTL2 is doing allocation, we do not know which
                        // ranges we should disallow late map access of.
                        anyhow::bail!(
                            "late map vtl0 memory is not supported when VTL2 is doing self allocation of ram"
                        );
                    }
                }
            } else {
                anyhow::bail!("vtl2 configured but not loading from igvm");
            };

            Some(virt::LateMapVtl0MemoryConfig {
                allowed_ranges,
                policy: policy.into(),
            })
        }
        None => None,
    };

    let config = virt::Vtl2Config {
        late_map_vtl0_memory,
    };

    Ok(Some(config))
}
753
754impl InitializedVm {
    /// Creates and initializes a VM using the given backend.
    ///
    /// Thin wrapper that forwards all arguments to the backend-provided
    /// `create_vm` factory function.
    async fn new(
        driver_source: VmTaskDriverSource,
        create_vm: crate::hypervisor_backend::CreateVmFn,
        cfg: Manifest,
        shared_memory: Option<SharedMemoryBacking>,
    ) -> anyhow::Result<Self> {
        create_vm(driver_source, cfg, shared_memory).await
    }
764
765    /// Creates and initializes a VM with the given hypervisor backend.
766    ///
767    /// This is the main monomorphization point — callers provide a concrete
768    /// `virt::Hypervisor` implementation. Called from the blanket impl of
769    /// [`HypervisorBackend`](crate::hypervisor_backend::HypervisorBackend).
770    pub(crate) async fn new_with_hypervisor<P, H>(
771        driver_source: VmTaskDriverSource,
772        hypervisor: &mut H,
773        platform_info: virt::PlatformInfo,
774        cfg: Manifest,
775        shared_memory: Option<SharedMemoryBacking>,
776    ) -> anyhow::Result<Self>
777    where
778        H: virt::Hypervisor<Partition = P>,
779        P: 'static + HvlitePartition,
780    {
781        tracing::info!(mem_size = cfg.memory.mem_size, "guest RAM config");
782
783        let vmtime_keeper = VmTimeKeeper::new(&driver_source.simple(), VmTime::from_100ns(0));
784        let vmtime_source = vmtime_keeper
785            .builder()
786            .build(&driver_source.simple())
787            .await
788            .unwrap();
789
790        // Pre-parse the igvm file early.
791        let igvm_file = if let LoadMode::Igvm { file, .. } = &cfg.load_mode {
792            let igvm_file = super::vm_loaders::igvm::read_igvm_file(file)
793                .context("reading igvm file failed")?;
794            Some(igvm_file)
795        } else {
796            None
797        };
798
799        let hv_config = if cfg.hypervisor.with_hv {
800            cfg_if::cfg_if! {
801                if #[cfg(all(windows, feature = "virt_whp"))] {
802                    let allow_device_assignment = !cfg.vpci_resources.is_empty();
803                } else {
804                    let allow_device_assignment = false;
805                }
806            }
807
808            Some(virt::HvConfig {
809                allow_device_assignment,
810                vtl2: convert_vtl2_config(
811                    cfg.hypervisor.with_vtl2.as_ref(),
812                    &cfg.load_mode,
813                    igvm_file.as_ref(),
814                )?,
815            })
816        } else {
817            None
818        };
819
820        let processor_topology = cfg.processor_topology.to_topology(&platform_info)?;
821
822        let proto = hypervisor
823            .new_partition(virt::ProtoPartitionConfig {
824                processor_topology: &processor_topology,
825                hv_config,
826                vmtime: &vmtime_source,
827                isolation: cfg
828                    .hypervisor
829                    .with_isolation
830                    .map(|typ| typ.into())
831                    .unwrap_or(virt::IsolationType::None),
832            })
833            .context("failed to create the prototype partition")?;
834
835        let physical_address_size = proto.max_physical_address_size();
836
837        // Determine if a special vtl2 memory allocation should be used.
838        let vtl2_range = if let LoadMode::Igvm {
839            vtl2_base_address, ..
840        } = &cfg.load_mode
841        {
842            match vtl2_base_address {
843                Vtl2BaseAddressType::File
844                | Vtl2BaseAddressType::Absolute(_)
845                | Vtl2BaseAddressType::Vtl2Allocate { .. } => None,
846                Vtl2BaseAddressType::MemoryLayout { size } => {
847                    let vtl2_range = super::vm_loaders::igvm::vtl2_memory_range(
848                        physical_address_size,
849                        cfg.memory.mem_size,
850                        &cfg.memory.mmio_gaps,
851                        &cfg.memory.pci_ecam_gaps,
852                        &cfg.memory.pci_mmio_gaps,
853                        igvm_file
854                            .as_ref()
855                            .expect("igvm file should be already parsed"),
856                        *size,
857                    )
858                    .context("unable to determine vtl2 memory range")?;
859                    tracing::info!(?vtl2_range, "vtl2 memory range selected");
860
861                    Some(vtl2_range)
862                }
863            }
864        } else {
865            None
866        };
867
868        // Choose the memory layout of the VM.
869        let mem_layout = if let Some(ref sizes) = cfg.memory.numa_mem_sizes {
870            // When numa_mem_sizes is set, distribute guest RAM across vNUMA nodes
871            // for ACPI SRAT / FDT reporting.
872            //
873            // TODO: The vNUMA nodes reported are meant for test usage only, as they
874            // are not aligned to any physical NUMA node. There is more work to do
875            // to support useful vNUMA reporting.
876            let total: u64 = sizes
877                .iter()
878                .copied()
879                .try_fold(0u64, |acc, s| acc.checked_add(s))
880                .context("numa memory sizes overflow")?;
881            anyhow::ensure!(
882                total == cfg.memory.mem_size,
883                "numa_mem_sizes total ({total:#x}) does not match mem_size ({:#x})",
884                cfg.memory.mem_size
885            );
886
887            MemoryLayout::new_with_numa(
888                sizes,
889                &cfg.memory.mmio_gaps,
890                &cfg.memory.pci_ecam_gaps,
891                &cfg.memory.pci_mmio_gaps,
892                vtl2_range,
893            )
894        } else {
895            MemoryLayout::new(
896                cfg.memory.mem_size,
897                &cfg.memory.mmio_gaps,
898                &cfg.memory.pci_ecam_gaps,
899                &cfg.memory.pci_mmio_gaps,
900                vtl2_range,
901            )
902        }
903        .context("invalid memory configuration")?;
904
905        if mem_layout.end_of_layout() > 1 << physical_address_size {
906            anyhow::bail!(
907                "memory layout ends at {:#x}, which exceeds the address with of {} bits",
908                mem_layout.end_of_layout(),
909                physical_address_size
910            );
911        }
912
913        // Place the alias map at the end of the address space. Newer versions
914        // of OpenHCL support receiving this offset via devicetree (especially
915        // important on ARM64 where the physical address width used here is not
916        // reported to the guest), but older ones depend on it being hardcoded.
917        let vtl0_alias_map = cfg.hypervisor.with_vtl2.as_ref().and_then(|cfg| {
918            cfg.vtl0_alias_map
919                .then_some(1 << (physical_address_size - 1))
920        });
921
922        if let Some(size) = cfg.memory.hugepage_size
923            && !cfg.memory.hugepages
924        {
925            anyhow::bail!("hugepage_size={size} requires hugepages=on");
926        }
927
928        let mut memory_builder = GuestMemoryBuilder::new();
929        memory_builder = memory_builder
930            .existing_backing(shared_memory)
931            .vtl0_alias_map(vtl0_alias_map)
932            .prefetch_ram(cfg.memory.prefetch_memory)
933            .private_memory(cfg.memory.private_memory)
934            .transparent_hugepages(cfg.memory.transparent_hugepages)
935            .x86_legacy_support(
936                matches!(cfg.load_mode, LoadMode::Pcat { .. }) || cfg.chipset.with_hyperv_vga,
937            );
938        if cfg.memory.hugepages {
939            memory_builder = memory_builder.hugepages(cfg.memory.hugepage_size);
940        }
941
942        #[cfg(all(windows, feature = "virt_whp"))]
943        if !cfg.vpci_resources.is_empty() {
944            memory_builder = memory_builder.pin_mappings(true);
945        }
946
947        cfg_if! {
948            if #[cfg(windows)] {
949                let vtl2_memory_process = if cfg.hypervisor.with_vtl2.is_some() {
950                    // VTL2 needs a separate memory hosting process.
951                    let process = pal::windows::process::empty_process()
952                        .context("could not launch a memory process for VTL2")?;
953                    Some(Box::new(process) as _)
954                } else {
955                    None
956                };
957            } else {
958                let vtl2_memory_process = None;
959            }
960        }
961
962        let mut memory_manager = memory_builder
963            .build(&mem_layout)
964            .await
965            .context("failed to build guest memory")?;
966
967        let gm = memory_manager
968            .client()
969            .guest_memory()
970            .await
971            .context("failed to get guest memory")?;
972        let mut cpuid = Vec::new();
973
974        // Add in Hyper-V VMM CPUID leaves.
975        if cfg.hypervisor.with_hv {
976            let confidential_vmbus = false;
977            // Only advertise extended IOAPIC on non-PCAT systems.
978            let extended_ioapic_rte = !matches!(cfg.load_mode, LoadMode::Pcat { .. });
979            cpuid.extend(vmm_core::cpuid::hyperv_cpuid_leaves(
980                extended_ioapic_rte,
981                confidential_vmbus,
982            ));
983        }
984
985        let (partition, vps) = proto
986            .build(virt::PartitionConfig {
987                mem_layout: &mem_layout,
988                guest_memory: &gm,
989                cpuid: &cpuid,
990                vtl0_alias_map,
991            })
992            .context("failed to create the partition")?;
993
994        let vps = vps.into_iter().map(|vp| Box::new(vp) as _).collect();
995
996        let partition = Arc::new(partition);
997
998        memory_manager
999            .attach_partition(Vtl::Vtl0, &partition.memory_mapper(Vtl::Vtl0), None)
1000            .await
1001            .context("failed to attach memory to the partition")?;
1002
1003        if cfg.hypervisor.with_vtl2.is_some() {
1004            memory_manager
1005                .attach_partition(
1006                    Vtl::Vtl2,
1007                    &partition.memory_mapper(Vtl::Vtl2),
1008                    vtl2_memory_process,
1009                )
1010                .await
1011                .context("failed to attach memory to VTL2")?;
1012        }
1013
1014        Ok(Self {
1015            partition,
1016            vps,
1017            vmtime_keeper,
1018            vmtime_source,
1019            memory_manager,
1020            gm,
1021            cfg,
1022            mem_layout,
1023            processor_topology,
1024            igvm_file,
1025            driver_source,
1026        })
1027    }
1028
1029    /// Loads the state for an initialized VM.
1030    ///
1031    // FUTURE: move more of this logic into new() so that more can be done
1032    //         outside the VM-PHU/live migration blackout window.
1033    async fn load(
1034        self,
1035        saved_state: Option<SavedState>,
1036        client_notify_send: mesh::Sender<HaltReason>,
1037    ) -> Result<LoadedVm, anyhow::Error> {
1038        use vmotherboard::options::dev;
1039
1040        let Self {
1041            partition,
1042            vps,
1043            vmtime_keeper,
1044            vmtime_source,
1045            memory_manager,
1046            gm,
1047            cfg,
1048            mem_layout,
1049            processor_topology,
1050            igvm_file,
1051            driver_source,
1052        } = self;
1053
1054        let mut resolver = ResourceResolver::new();
1055
1056        resolver.add_async_resolver(
1057            chipset_device_worker::resolver::RemoteChipsetDeviceResolver(
1058                OpenVmmRemoteDynamicResolvers {},
1059            ),
1060        );
1061
1062        // Expose the partition reference time source, if available.
1063        if cfg.hypervisor.with_hv {
1064            if let Some(ref_time) = partition.reference_time_source() {
1065                resolver.add_resolver(ref_time);
1066            }
1067        }
1068
1069        if cfg
1070            .vmgs
1071            .as_ref()
1072            .is_some_and(|x| !matches!(x.encryption_policy(), GuestStateEncryptionPolicy::None(_)))
1073        {
1074            unimplemented!("guest state encryption not supported on openvmm");
1075        }
1076
1077        let vmgs = match cfg.vmgs {
1078            Some(VmgsResource::Disk(disk)) => Some(
1079                vmgs::Vmgs::try_open(
1080                    open_simple_disk(&resolver, disk.disk, false, &driver_source).await?,
1081                    None,
1082                    true,
1083                    false,
1084                )
1085                .await
1086                .context("failed to open vmgs file")?,
1087            ),
1088            Some(VmgsResource::ReprovisionOnFailure(disk)) => Some(
1089                vmgs::Vmgs::try_open(
1090                    open_simple_disk(&resolver, disk.disk, false, &driver_source).await?,
1091                    None,
1092                    true,
1093                    true,
1094                )
1095                .await
1096                .context("failed to open vmgs file")?,
1097            ),
1098            Some(VmgsResource::Reprovision(disk)) => Some(
1099                vmgs::Vmgs::request_format(
1100                    open_simple_disk(&resolver, disk.disk, false, &driver_source).await?,
1101                    None,
1102                )
1103                .await
1104                .context("failed to format vmgs file")?,
1105            ),
1106            Some(VmgsResource::Ephemeral) => None,
1107            // TODO: make sure we don't need a VMGS
1108            None => None,
1109        };
1110
1111        let (vmgs_client, vmgs_task) = if let Some(vmgs) = vmgs {
1112            let (vmgs_client, vmgs_task) =
1113                vmgs_broker::spawn_vmgs_broker(driver_source.builder().build("vmgs_broker"), vmgs);
1114            resolver.add_resolver(vmgs_client.clone());
1115            (Some(vmgs_client), Some(vmgs_task))
1116        } else {
1117            (None, None)
1118        };
1119
1120        // For sanity: we immediately restrict `vmgs_client` to the
1121        // `HvLiteVmgsNonVolatileStore` API, since we don't want code past this
1122        // point to interact with VMGS as anything but an opaque
1123        // `NonVolatileStore`
1124        //
1125        // ...but we keep a reference to the original untyped client, since we need
1126        // to pass it to LoadedVm so that we can `inspect` VMGS at runtime.
1127        let vmgs_client_inspect_handle = vmgs_client.clone();
1128        let vmgs_client: Option<&dyn HvLiteVmgsNonVolatileStore> =
1129            vmgs_client.as_ref().map(|x| x as _);
1130
1131        let (halt_vps, halt_request_recv) = Halt::new();
1132        let halt_vps = Arc::new(halt_vps);
1133
1134        resolver.add_resolver(vmm_core::platform_resolvers::HaltResolver(halt_vps.clone()));
1135
1136        let generation_id_recv = cfg.generation_id_recv.unwrap_or_else(|| mesh::channel().1);
1137
1138        let logger = Box::new(emuplat::firmware::MeshLogger::new(
1139            cfg.firmware_event_send.clone(),
1140        ));
1141
1142        let mapper = memory_manager.device_memory_mapper();
1143
1144        #[cfg_attr(not(guest_arch = "x86_64"), expect(unused_mut))]
1145        let mut deps_hyperv_firmware_pcat = None;
1146        let mut deps_hyperv_firmware_uefi = None;
1147        match &cfg.load_mode {
1148            LoadMode::Uefi { .. } => {
1149                let (watchdog_send, watchdog_recv) = mesh::channel();
1150                deps_hyperv_firmware_uefi = Some(dev::HyperVFirmwareUefi {
1151                    config: firmware_uefi::UefiConfig {
1152                        custom_uefi_vars: cfg.custom_uefi_vars,
1153                        secure_boot: cfg.secure_boot_enabled,
1154                        initial_generation_id: {
1155                            let mut generation_id = [0; 16];
1156                            getrandom::fill(&mut generation_id).expect("rng failure");
1157                            generation_id
1158                        },
1159                        use_mmio: cfg!(not(guest_arch = "x86_64")),
1160                        command_set: if cfg!(guest_arch = "x86_64") {
1161                            UefiCommandSet::X64
1162                        } else {
1163                            UefiCommandSet::Aarch64
1164                        },
1165                        diagnostics_log_level: cfg.efi_diagnostics_log_level,
1166                    },
1167                    logger,
1168                    nvram_storage: {
1169                        use hcl_compat_uefi_nvram_storage::HclCompatNvram;
1170                        use uefi_nvram_storage::in_memory::InMemoryNvram;
1171                        use vmm_core::emuplat::hcl_compat_uefi_nvram_storage::VmgsStorageBackendAdapter;
1172
1173                        match vmgs_client {
1174                            Some(vmgs) => Box::new(HclCompatNvram::new(
1175                                VmgsStorageBackendAdapter(
1176                                    vmgs.as_non_volatile_store(vmgs::FileId::BIOS_NVRAM, true)
1177                                        .context("failed to instantiate UEFI NVRAM store")?,
1178                                ),
1179                                None,
1180                            )),
1181                            None => Box::new(InMemoryNvram::new()),
1182                        }
1183                    },
1184                    generation_id_recv,
1185                    watchdog_platform: {
1186                        use vmcore::non_volatile_store::EphemeralNonVolatileStore;
1187
1188                        // UEFI watchdog doesn't persist to VMGS at this time
1189                        let store = EphemeralNonVolatileStore::new_boxed();
1190
1191                        // Create the base watchdog platform
1192                        let mut base_watchdog_platform = BaseWatchdogPlatform::new(store).await?;
1193
1194                        // Inject NMI on watchdog timeout
1195                        #[cfg(guest_arch = "x86_64")]
1196                        let watchdog_callback = WatchdogTimeoutNmi {
1197                            partition: partition.clone(),
1198                            watchdog_send: Some(watchdog_send),
1199                        };
1200
1201                        // ARM64 does not have NMI support yet, so halt instead
1202                        #[cfg(guest_arch = "aarch64")]
1203                        let watchdog_callback = WatchdogTimeoutReset {
1204                            halt_vps: halt_vps.clone(),
1205                            watchdog_send: Some(watchdog_send),
1206                        };
1207
1208                        // Add callbacks
1209                        base_watchdog_platform.add_callback(Box::new(watchdog_callback));
1210
1211                        Box::new(base_watchdog_platform)
1212                    },
1213                    watchdog_recv,
1214                    vsm_config: None,
1215                    // TODO: persist SystemTimeClock time across reboots.
1216                    time_source: Box::new(local_clock::SystemTimeClock::new(
1217                        LocalClockDelta::from_millis(cfg.rtc_delta_milliseconds),
1218                    )),
1219                })
1220            }
1221            #[cfg(guest_arch = "x86_64")]
1222            LoadMode::Pcat {
1223                firmware,
1224                boot_order,
1225            } => {
1226                tracing::debug!(?firmware, "Loading BIOS firmware.");
1227                let rom_builder = RomBuilder::new("bios".into(), Box::new(mapper.clone()));
1228                let rom = rom_builder.build_from_file_location(firmware)?;
1229                // TODO: move mtrr replay to a resource.
1230                let halt_vps = halt_vps.clone();
1231                deps_hyperv_firmware_pcat = Some(dev::HyperVFirmwarePcat {
1232                    logger,
1233                    generation_id_recv,
1234                    rom: Some(Box::new(rom)),
1235                    replay_mtrrs: Box::new(move || halt_vps.replay_mtrrs()),
1236                    config: {
1237                        let acpi_tables_builder = AcpiTablesBuilder {
1238                            processor_topology: &processor_topology,
1239                            mem_layout: &mem_layout,
1240                            cache_topology: None,
1241                            pcie_host_bridges: &Vec::new(),
1242                            arch: vmm_core::acpi_builder::AcpiArchConfig::X86 {
1243                                with_ioapic: cfg.chipset.with_generic_ioapic,
1244                                with_pic: cfg.chipset_capabilities.with_pic,
1245                                with_pit: cfg.chipset_capabilities.with_pit,
1246                                with_psp: cfg.chipset.with_generic_psp,
1247                                pm_base: PM_BASE,
1248                                acpi_irq: SYSTEM_IRQ_ACPI,
1249                            },
1250                        };
1251                        let srat = acpi_tables_builder.build_srat();
1252                        firmware_pcat::config::PcatBiosConfig {
1253                            processor_topology: processor_topology.clone(),
1254                            mem_layout: mem_layout.clone(),
1255                            srat,
1256
1257                            hibernation_enabled: false,
1258                            initial_generation_id: {
1259                                let mut generation_id = [0; 16];
1260                                getrandom::fill(&mut generation_id).expect("rng failure");
1261                                generation_id
1262                            },
1263                            boot_order: {
1264                                use firmware_pcat::config::BootDevice;
1265                                use firmware_pcat::config::BootDeviceStatus;
1266                                use openvmm_defs::config::PcatBootDevice;
1267                                boot_order.map(|dev| BootDeviceStatus {
1268                                    kind: match dev {
1269                                        PcatBootDevice::Floppy => BootDevice::Floppy,
1270                                        PcatBootDevice::HardDrive => BootDevice::HardDrive,
1271                                        PcatBootDevice::Optical => BootDevice::Optical,
1272                                        PcatBootDevice::Network => BootDevice::Network,
1273                                    },
1274                                    // TODO: accurately model this?
1275                                    attached: true,
1276                                })
1277                            },
1278                            num_lock_enabled: false,
1279                            // TODO: these are all very bogus values, and need to be swapped out with something better
1280                            smbios: firmware_pcat::config::SmbiosConstants {
1281                                bios_guid: guid::Guid {
1282                                    data1: 0xC4066C45,
1283                                    data2: 0x503D,
1284                                    data3: 0x40E8,
1285                                    data4: [0xB1, 0x5C, 0x31, 0x26, 0x4E, 0x5F, 0xE1, 0xD9],
1286                                },
1287                                system_serial_number: "9583-9572-9874-4843-7295-1653-92".into(),
1288                                base_board_serial_number: "9583-9572-9874-4843-7295-1653-92".into(),
1289                                chassis_serial_number: "9583-9572-9874-4843-7295-1653-92".into(),
1290                                chassis_asset_tag: "9583-9572-9874-4843-7295-1653-92".into(),
1291                                bios_lock_string: "00000000000000000000000000000000".into(),
1292                                processor_manufacturer: b"\0".to_vec(),
1293                                processor_version: b"\0".to_vec(),
1294                                cpu_info_bundle: None,
1295                            },
1296                        }
1297                    },
1298                })
1299            }
1300            _ => {}
1301        };
1302
1303        let vtl2_framebuffer_gpa_base = if cfg.vtl2_gfx {
1304            // calculate a safe place to put the framebuffer mapping in GPA space
1305            // this places it after the end of ram at the first place it won't overlap with MMIO
1306            let len = cfg
1307                .framebuffer
1308                .as_ref()
1309                .context("no framebuffer configured")?
1310                .len();
1311            let mut gpa = mem_layout.end_of_ram();
1312            for mmio in mem_layout.mmio() {
1313                if gpa < mmio.end() && mmio.start() < gpa + len as u64 {
1314                    gpa = mmio.end();
1315                }
1316            }
1317            tracing::debug!("Vtl2 framebuffer gpa base: {:#x}", gpa);
1318            Some(gpa)
1319        } else {
1320            None
1321        };
1322
1323        let state_units = StateUnits::new();
1324
1325        let vmtime = state_units
1326            .add("vmtime")
1327            .spawn(driver_source.simple(), {
1328                |recv| {
1329                    let mut vmtime = vmtime_keeper;
1330                    async move {
1331                        vmm_core::vmtime_unit::run_vmtime(&mut vmtime, recv).await;
1332                        vmtime
1333                    }
1334                }
1335            })
1336            .unwrap();
1337
1338        let mut input_distributor = InputDistributor::new(cfg.input);
1339        resolver.add_async_resolver::<KeyboardInputHandleKind, _, MultiplexedInputHandle, _>(
1340            input_distributor.client().clone(),
1341        );
1342        resolver.add_async_resolver::<MouseInputHandleKind, _, MultiplexedInputHandle, _>(
1343            input_distributor.client().clone(),
1344        );
1345
1346        let input_distributor = state_units
1347            .add("input")
1348            .spawn(driver_source.simple(), async |mut recv| {
1349                input_distributor.run(&mut recv).await;
1350                input_distributor
1351            })
1352            .unwrap();
1353
1354        let mut pci_legacy_interrupts = Vec::new();
1355
1356        let mut ide_drives = [[None, None], [None, None]];
1357        let mut storvsp_ide_disks = Vec::new();
1358        if cfg.chipset.with_hyperv_ide {
1359            pci_legacy_interrupts.push(((7, None), 14));
1360            pci_legacy_interrupts.push(((7, None), 15));
1361
1362            for disk_cfg in cfg.ide_disks {
1363                let path = disk_cfg.path;
1364                let media = match disk_cfg.guest_media {
1365                    GuestMedia::Dvd(disk_type) => {
1366                        let dvd = resolver
1367                            .resolve(
1368                                disk_type,
1369                                ResolveScsiDeviceHandleParams {
1370                                    driver_source: &driver_source,
1371                                },
1372                            )
1373                            .await
1374                            .context("failed to open IDE DVD")?;
1375
1376                        let scsi_disk = Arc::new(AtapiScsiDisk::new(dvd.0));
1377                        ide::DriveMedia::optical_disk(scsi_disk.clone())
1378                    }
1379                    GuestMedia::Disk {
1380                        disk_type,
1381                        read_only,
1382                        disk_parameters,
1383                    } => {
1384                        let disk =
1385                            open_simple_disk(&resolver, disk_type, read_only, &driver_source)
1386                                .await
1387                                .context("failed to open IDE disk")?;
1388
1389                        // Only disks get accelerator channels. DVDs dont.
1390                        let scsi_disk = ScsiControllerDisk::new(Arc::new(SimpleScsiDisk::new(
1391                            disk.clone(),
1392                            disk_parameters.unwrap_or_default(),
1393                        )));
1394                        storvsp_ide_disks.push((path, scsi_disk));
1395                        ide::DriveMedia::hard_disk(disk.clone())
1396                    }
1397                };
1398
1399                let old_media = ide_drives
1400                    .get_mut(path.channel as usize)
1401                    .context("invalid ide channel")?
1402                    .get_mut(path.drive as usize)
1403                    .context("invalid ide device")?
1404                    .replace(media);
1405
1406                if old_media.is_some() {
1407                    anyhow::bail!(
1408                        "ide drive {}:{} is already in use",
1409                        path.channel,
1410                        path.drive
1411                    );
1412                }
1413            }
1414        }
1415
1416        let deps_hyperv_guest_watchdog = if cfg.chipset.with_hyperv_guest_watchdog {
1417            Some(dev::HyperVGuestWatchdogDeps {
1418                port_base: WDAT_PORT,
1419                watchdog_platform: {
1420                    use vmcore::non_volatile_store::EphemeralNonVolatileStore;
1421
1422                    let store = match vmgs_client {
1423                        Some(vmgs) => vmgs
1424                            .as_non_volatile_store(vmgs::FileId::GUEST_WATCHDOG, false)
1425                            .context("failed to instantiate guest watchdog store")?,
1426                        None => EphemeralNonVolatileStore::new_boxed(),
1427                    };
1428
1429                    // Create the base watchdog platform
1430                    let mut base_watchdog_platform = BaseWatchdogPlatform::new(store).await?;
1431
1432                    // Create callback to reset on watchdog timeout
1433                    let watchdog_callback = WatchdogTimeoutReset {
1434                        halt_vps: halt_vps.clone(),
1435                        watchdog_send: None, // This is not the UEFI watchdog, so no need to send
1436                                             // watchdog notifications
1437                    };
1438
1439                    // Add callbacks
1440                    base_watchdog_platform.add_callback(Box::new(watchdog_callback));
1441
1442                    Box::new(base_watchdog_platform)
1443                },
1444            })
1445        } else {
1446            None
1447        };
1448
1449        let initial_rtc_cmos = if matches!(cfg.load_mode, LoadMode::Pcat { .. }) {
1450            Some(firmware_pcat::default_cmos_values(&mem_layout))
1451        } else {
1452            None
1453        };
1454
1455        let deps_generic_cmos_rtc = (cfg.chipset.with_generic_cmos_rtc).then(|| {
1456            // TODO: persist SystemTimeClock time across reboots.
1457            // TODO: move to instantiate via a resource.
1458            let time_source = Box::new(local_clock::SystemTimeClock::new(
1459                LocalClockDelta::from_millis(cfg.rtc_delta_milliseconds),
1460            ));
1461            dev::GenericCmosRtcDeps {
1462                irq: 8,
1463                time_source,
1464                century_reg_idx: 0x32, // TODO: automatically sync with FADT
1465                initial_cmos: initial_rtc_cmos,
1466            }
1467        });
1468
1469        #[cfg(guest_arch = "x86_64")]
1470        let deps_generic_ioapic =
1471            (cfg.chipset.with_generic_ioapic).then(|| dev::GenericIoApicDeps {
1472                num_entries: virt::irqcon::IRQ_LINES as u8,
1473                routing: Box::new(vmm_core::emuplat::ioapic::IoApicRouting(
1474                    partition.clone().ioapic_routing(),
1475                )),
1476            });
1477
1478        #[cfg(guest_arch = "aarch64")]
1479        let deps_generic_ioapic = if cfg.chipset.with_generic_ioapic {
1480            anyhow::bail!("ioapic not supported on this architecture");
1481        } else {
1482            None
1483        };
1484
1485        let deps_generic_isa_dma =
1486            (cfg.chipset.with_generic_isa_dma).then_some(dev::GenericIsaDmaDeps {});
1487
        // Build the floppy drive ribbons (primary/secondary controller) from
        // the configured floppy disks.
        //
        // NOTE(review): disks are only opened when
        // `with_winbond_super_io_and_floppy_full` is set, so a config that
        // selects the generic ISA floppy appears to always end up with empty
        // ribbons — confirm this is intended.
        let mut primary_disk_drive = floppy::DriveRibbon::None;
        let mut secondary_disk_drive = floppy::DriveRibbon::None;
        if cfg.chipset.with_winbond_super_io_and_floppy_full {
            let mut pri_drives = Vec::new();
            let mut sec_drives = Vec::new();
            for (index, disk_cfg) in cfg.floppy_disks.into_iter().enumerate() {
                let FloppyDiskConfig {
                    disk_type,
                    read_only,
                } = disk_cfg;

                let disk = open_simple_disk(&resolver, disk_type, read_only, &driver_source)
                    .await
                    .context("failed to open floppy disk")?;
                tracing::trace!("floppy opened based on config into DriveRibbon");

                // Disk 0 feeds the primary controller and disk 1 the
                // secondary; anything beyond that is logged and dropped,
                // since only two controllers are modeled.
                if index == 0 {
                    pri_drives.push(disk);
                } else if index == 1 {
                    sec_drives.push(disk)
                } else {
                    tracing::error!("more than 2 floppy controllers are not supported");
                    break;
                }
            }

            primary_disk_drive = floppy::DriveRibbon::from_vec(pri_drives)?;
            secondary_disk_drive = floppy::DriveRibbon::from_vec(sec_drives)?;
        }
1517
        // must enforce exclusivity here due to how the
        // `{primary,secondary}_disk_drive` vars get "claimed" by each device.
        //
        // (Both device dep structs take the ribbons by value, so only one of
        // the two floppy devices can ever receive them.)
        let (deps_generic_isa_floppy, deps_winbond_super_io_and_floppy_full) = match (
            cfg.chipset.with_generic_isa_floppy,
            cfg.chipset.with_winbond_super_io_and_floppy_full,
        ) {
            (true, true) => anyhow::bail!("cannot have both generic and winbond floppy"),
            (true, false) => {
                // The generic device models a single controller only.
                if !matches!(secondary_disk_drive, floppy::DriveRibbon::None) {
                    anyhow::bail!("more than 1 generic floppy controller is not supported")
                }

                (
                    // Use "standard" ISA constants for IRQ, DMA, and IO Port
                    // assignment
                    Some(dev::GenericIsaFloppyDeps {
                        irq: 6,
                        dma_channel: 2,
                        pio_base: 0x3f0,
                        drives: primary_disk_drive,
                    }),
                    None,
                )
            }
            (false, true) => (
                None,
                Some(dev::WinbondSuperIoAndFloppyFullDeps {
                    primary_disk_drive,
                    secondary_disk_drive,
                }),
            ),
            (false, false) => (None, None),
        };
1551
        // Two possible PCI root buses: a bare "generic" bus and the legacy
        // PIIX4 chipset bus. Devices below attach to one or the other.
        let pci_bus_id_generic = vmotherboard::BusId::new("generic");
        let pci_bus_id_piix4 = vmotherboard::BusId::new(LEGACY_CHIPSET_PCI_BUS_NAME);

        let deps_generic_pci_bus =
            (cfg.chipset.with_generic_pci_bus).then_some(dev::GenericPciBusDeps {
                bus_id: pci_bus_id_generic.clone(),
                // Standard x86 PCI config-space access IO ports.
                pio_addr: pci_bus::standard_x86_io_ports::ADDR_START,
                pio_data: pci_bus::standard_x86_io_ports::DATA_START,
            });

        let deps_generic_psp = (cfg.chipset.with_generic_psp).then_some(dev::GenericPspDeps {});
1563
        let deps_hyperv_framebuffer =
            (cfg.chipset.with_hyperv_framebuffer).then(|| dev::HyperVFramebufferDeps {
                fb_mapper: Box::new(mapper.clone()),
                // NOTE(review): assumes `cfg.framebuffer` is always Some when
                // `with_hyperv_framebuffer` is set — will panic otherwise.
                fb: cfg.framebuffer.unwrap(),
                vtl2_framebuffer_gpa_base,
            });

        let deps_hyperv_power_management =
            (cfg.chipset.with_hyperv_power_management).then_some(dev::HyperVPowerManagementDeps {
                acpi_irq: SYSTEM_IRQ_ACPI,
                pio_base: PM_BASE,
                pm_timer_assist: None,
            });

        // The VGA device needs its BIOS ROM loaded from a host file and
        // mapped into the guest.
        let deps_hyperv_vga = if cfg.chipset.with_hyperv_vga {
            let vga_firmware = cfg.vga_firmware.as_ref().context("no VGA BIOS file")?;
            let rom_builder = RomBuilder::new("vga".into(), Box::new(mapper.clone()));
            let rom = rom_builder.build_from_file_location(vga_firmware)?;

            Some(dev::HyperVVgaDeps {
                attached_to: pci_bus_id_piix4.clone(),
                rom: Some(Box::new(rom)),
            })
        } else {
            None
        };
1590
        // The i440BX bridge can remap RAM visibility (e.g. for the legacy
        // PAM/SMRAM ranges) via the memory manager.
        let deps_i440bx_host_pci_bridge =
            (cfg.chipset.with_i440bx_host_pci_bridge).then(|| dev::I440BxHostPciBridgeDeps {
                attached_to: pci_bus_id_piix4.clone(),
                adjust_gpa_range: Box::new(
                    emuplat::i440bx_host_pci_bridge::ManageRamGpaRange::new(
                        memory_manager.ram_visibility_control(),
                    ),
                ),
            });

        let deps_piix4_pci_bus = (cfg.chipset.with_piix4_pci_bus).then(|| dev::Piix4PciBusDeps {
            bus_id: pci_bus_id_piix4.clone(),
        });

        let deps_piix4_cmos_rtc = (cfg.chipset.with_piix4_cmos_rtc).then(|| {
            // TODO: persist SystemTimeClock time across reboots.
            // TODO: move to instantiate via a resource.
            let time_source = Box::new(local_clock::SystemTimeClock::new(
                LocalClockDelta::from_millis(cfg.rtc_delta_milliseconds),
            ));
            dev::Piix4CmosRtcDeps {
                time_source,
                initial_cmos: initial_rtc_cmos,
                enlightened_interrupts: true, // As advertised by the PCAT BIOS.
            }
        });

        // `ide_drives` is a fixed two-channel array; destructure it so each
        // channel's drives can be moved into the IDE device deps.
        let [primary_channel_drives, secondary_channel_drives] = ide_drives;
        let deps_hyperv_ide = (cfg.chipset.with_hyperv_ide).then_some(dev::HyperVIdeDeps {
            attached_to: pci_bus_id_piix4.clone(),
            primary_channel_drives,
            secondary_channel_drives,
        });

        let deps_piix4_power_management =
            (cfg.chipset.with_piix4_power_management).then_some(dev::Piix4PowerManagementDeps {
                attached_to: pci_bus_id_piix4.clone(),
                pm_timer_assist: None,
            });
1630
        // Aggregate all the optional device dependencies computed above into
        // the single struct the chipset builder consumes. `None` fields here
        // mean "device not configured".
        let base_chipset_devices = {
            BaseChipsetDevices {
                deps_generic_cmos_rtc,
                deps_generic_ioapic,
                deps_generic_isa_dma,
                deps_generic_isa_floppy,
                deps_generic_pci_bus,
                deps_generic_psp,
                deps_hyperv_firmware_pcat,
                deps_hyperv_firmware_uefi,
                deps_hyperv_framebuffer,
                deps_hyperv_guest_watchdog,
                deps_hyperv_ide,
                deps_hyperv_power_management,
                deps_hyperv_vga,
                deps_i440bx_host_pci_bridge,
                deps_piix4_cmos_rtc,
                deps_piix4_pci_bus,
                deps_piix4_power_management,
                // These two are never instantiated by this worker.
                deps_underhill_vga_proxy: None,
                deps_winbond_super_io_and_floppy_stub: None,
                deps_winbond_super_io_and_floppy_full,
            }
        };
1655
        // Build the base chipset, yielding the builder (used below to attach
        // more devices) and the interfaces exposed by the base devices.
        let BaseChipsetBuilderOutput {
            chipset_builder,
            device_interfaces: base_chipset_device_interfaces,
        } = BaseChipsetBuilder::new(
            BaseChipsetFoundation {
                is_restoring: false,
                untrusted_dma_memory: gm.clone(),
                // There is no access to encrypted memory on the host, so this
                // may be misleading. Presumably in any confidential VM
                // scenario, devices using this will not be present or will be
                // implemented by a paravisor. But it still must be set for
                // non-confidential scenarios.
                trusted_vtl0_dma_memory: gm.clone(),
                power_event_handler: halt_vps.clone(),
                debug_event_handler: halt_vps.clone(),
                vmtime: &vmtime_source,
                vmtime_unit: vmtime.handle(),
                doorbell_registration: partition.clone().into_doorbell_registration(Vtl::Vtl0),
            },
            base_chipset_devices,
        )
        .with_expected_manifest(cfg.chipset.clone())
        .with_device_handles(cfg.chipset_devices)
        .with_pci_device_handles(cfg.pci_chipset_devices)
        .with_trace_unknown_pio(true) // todo: add CLI param?
        .build(&driver_source, &state_units, &resolver)
        .await?;

        if cfg.chipset.with_generic_pci_bus {
            // HACK: We don't currently have an appropriate generic bus root to
            // put on the PCI bus, so we just fake one.
            //
            // This seems to appease Linux just fine
            chipset_builder
                .arc_mutex_device("fake-bus-root")
                .on_pci_bus(pci_bus_id_generic.clone())
                .add(|services| {
                    missing_dev::MissingDev::from_manifest(
                        // Claims PCI 0:0.0 with vendor/device 0x8086:0x7111
                        // (presumably chosen to mimic the legacy Intel
                        // chipset).
                        MissingDevManifest::new().claim_pci((0, 0, 0), 0x8086, 0x7111),
                        &mut services.register_mmio(),
                        &mut services.register_pio(),
                    )
                })?;
        }
1700
        // Add the GIC.
        //
        // Maps the IRQ line set onto the GIC's SPI range so chipset devices
        // can raise interrupts through the partition's interrupt controller.
        #[cfg(guest_arch = "aarch64")]
        chipset_builder.add_external_line_target(
            IRQ_LINE_SET,
            0..=vmm_core::emuplat::gic::SPI_RANGE.end() - vmm_core::emuplat::gic::SPI_RANGE.start(),
            *vmm_core::emuplat::gic::SPI_RANGE.start(),
            "gic",
            Arc::new(vmm_core::emuplat::gic::GicInterruptTarget::new(
                partition.clone().control_gic(Vtl::Vtl0),
            )),
        );

        // Add the x86 BSP's LINTs for the PIC to use.
        #[cfg(guest_arch = "x86_64")]
        chipset_builder.add_external_line_target(
            chipset_device_resources::BSP_LINT_LINE_SET,
            0..=1,
            0,
            "bsp",
            partition.clone().into_lint_target(Vtl::Vtl0),
        );

        // Expose the framebuffer control interface (if the base chipset
        // created one) to downstream resource resolution.
        if let Some(framebuffer) = base_chipset_device_interfaces.framebuffer_local_control {
            resolver.add_resolver(framebuffer);
        }
1726
1727        let pci_inta_line = {
1728            const PCI_LEGACY_INTA_IRQ: u32 = 11;
1729            const PCI_INTA_IRQ: u32 = 16;
1730            if cfg.chipset.with_i440bx_host_pci_bridge {
1731                // Hyper-V hard-wires this to 11.
1732                Some(PCI_LEGACY_INTA_IRQ)
1733            } else if cfg.chipset.with_generic_pci_bus {
1734                // Avoid an ISA interrupt to avoid conflicts and to avoid needing to
1735                // configure the line as level-triggered in the MADT (necessary for
1736                // Linux when the PIC is missing).
1737                if cfg.chipset_capabilities.with_pic {
1738                    Some(PCI_LEGACY_INTA_IRQ)
1739                } else {
1740                    Some(PCI_INTA_IRQ)
1741                }
1742            } else {
1743                None
1744            }
1745        };
1746
        // Mutable state filled in by the device-construction sections below;
        // collected here so it can be carried into the running VM state.
        let mut scsi_devices = Vec::new();
        let mut vtl0_hvsock_relay = None;
        // The kernel-mode vmbus proxy and kernel vmnics only exist on Windows
        // hosts.
        #[cfg(windows)]
        let mut vmbus_proxy = None;
        #[cfg(windows)]
        let mut kernel_vmnics = Vec::new();
        let mut vmbus_server = None;
        let mut vtl2_vmbus_server = None;
        let mut vtl2_hvsock_relay = None;
        let mut vmbus_redirect = false;
1757
        // PCI Express topology
        //
        // For each configured root complex: create the device, wire its MSI
        // connection to the partition, record the host-bridge description
        // (used later, e.g. for ACPI tables), and register it as a PCIe
        // enumerator so downstream devices can attach to its ports.

        let (pcie_host_bridges, pcie_root_complexes) = {
            let mut pcie_host_bridges = Vec::new();
            let mut pcie_root_complexes = Vec::new();

            for rc in cfg.pcie_root_complexes {
                let device_name = format!("pcie-root:{}", rc.name);
                let msi_conn = pci_core::msi::MsiConnection::new();
                let root_complex =
                    chipset_builder
                        .arc_mutex_device(device_name)
                        .add(|services| {
                            let root_port_definitions = rc
                                .ports
                                .into_iter()
                                .map(|rp_cfg| GenericPcieRootPortDefinition {
                                    name: rp_cfg.name.into(),
                                    hotplug: rp_cfg.hotplug,
                                })
                                .collect();

                            GenericPcieRootComplex::new(
                                &mut services.register_mmio(),
                                rc.start_bus,
                                rc.end_bus,
                                rc.ecam_range,
                                root_port_definitions,
                                msi_conn.target(),
                            )
                        })?;

                // Only connect MSI delivery if the partition supports
                // signaling MSIs for VTL0.
                if let Some(signal_msi) = partition.as_signal_msi(Vtl::Vtl0) {
                    msi_conn.connect(signal_msi);
                }

                pcie_host_bridges.push(PcieHostBridge {
                    index: rc.index,
                    segment: rc.segment,
                    start_bus: rc.start_bus,
                    end_bus: rc.end_bus,
                    ecam_range: rc.ecam_range,
                    low_mmio: rc.low_mmio,
                    high_mmio: rc.high_mmio,
                });

                pcie_root_complexes.push(root_complex.clone());

                let bus_id = vmotherboard::BusId::new(&rc.name);
                chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(root_complex));
            }

            (pcie_host_bridges, pcie_root_complexes)
        };
1812
        // Create each configured PCIe switch under its parent port and
        // register it as an enumerator so further devices can attach to its
        // downstream ports.
        for switch in cfg.pcie_switches {
            let device_name = format!("pcie-switch:{}", switch.name);
            let switch_device = chipset_builder
                .arc_mutex_device(device_name)
                .on_pcie_port(vmotherboard::BusId::new(&switch.parent_port))
                .add(|_services| {
                    let definition = pcie::switch::GenericPcieSwitchDefinition {
                        name: switch.name.clone().into(),
                        downstream_port_count: switch.num_downstream_ports,
                        hotplug: switch.hotplug,
                    };
                    GenericPcieSwitch::new(definition)
                })?;

            let bus_id = vmotherboard::BusId::new(&switch.name);
            chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(switch_device));
        }
1830
        // Register the VFIO resolver, which spawns a container manager task
        // internally to share containers across assigned devices.
        //
        // The returned inspect handle is kept so VFIO state remains
        // observable after the resolver itself is moved into the resolver
        // registry.
        #[cfg(target_os = "linux")]
        let vfio_inspect = {
            let vfio_resolver = vfio_assigned_device::resolver::VfioDeviceResolver::new(
                driver_source.builder().build("vfio-container-mgr"),
                memory_manager.dma_mapper_client(),
            );
            let handle = vfio_resolver.inspect_handle();
            resolver.add_async_resolver::<
                vm_resource::kind::PciDeviceHandleKind,
                _,
                vfio_assigned_device_resources::VfioDeviceHandle,
                _,
            >(vfio_resolver);
            Some(handle)
        };
1848
        // Resolve PCIe devices concurrently.
        //
        // The shadowing `let x = &x;` bindings ensure the `async move` block
        // captures references rather than moving the originals, which are
        // still needed after this loop.
        try_join_all(cfg.pcie_devices.into_iter().map(|dev_cfg| {
            let chipset_builder = &chipset_builder;
            let driver_source = &driver_source;
            let resolver = &resolver;
            let gm = &gm;
            let partition = &partition;
            let mapper = &mapper;
            async move {
                vmm_core::device_builder::build_pcie_device(
                    chipset_builder,
                    dev_cfg.port_name.into(),
                    driver_source,
                    resolver,
                    gm,
                    dev_cfg.resource,
                    partition.clone().into_doorbell_registration(Vtl::Vtl0),
                    Some(mapper),
                    partition.as_signal_msi(Vtl::Vtl0),
                    partition.irqfd(),
                )
                .await
            }
        }))
        .await?;
1874
1875        if let Some(vmbus_cfg) = cfg.vmbus {
1876            if !cfg.hypervisor.with_hv {
1877                anyhow::bail!("vmbus required hypervisor enlightements");
1878            }
1879
            let synic = partition.synic();

            vmbus_redirect = vmbus_cfg.vtl2_redirect;
            // Channel pair linking the vmbus server's hvsock notifications to
            // the relay created further down.
            let hvsock_channel = HvsockRelayChannel::new();

            // Optionally stand up a second vmbus server for VTL2, along with
            // its own hvsock relay. The returned request channel lets the
            // VTL0 server forward external requests to the VTL2 server.
            let (vtl2_vmbus, vtl2_request_send) = if let Some(vtl2_vmbus_cfg) = cfg.vtl2_vmbus {
                let (server_request_send, server_request_recv) = mesh::channel();
                let vtl2_hvsock_channel = HvsockRelayChannel::new();

                let vmbus_driver = driver_source.simple();
                let vtl2_vmbus =
                    VmbusServer::builder(vmbus_driver.clone(), synic.clone(), gm.clone())
                        .vtl(Vtl::Vtl2)
                        .max_version(
                            vtl2_vmbus_cfg
                                .vmbus_max_version
                                .map(vmbus_core::MaxVersionInfo::new),
                        )
                        .hvsock_notify(Some(vtl2_hvsock_channel.server_half))
                        .external_requests(Some(server_request_recv))
                        .enable_mnf(true)
                        .build()
                        .context("failed to create VTL2 vmbus server")?;

                let vtl2_vmbus = VmbusServerHandle::new(
                    &vmbus_driver,
                    state_units.add("vtl2_vmbus"),
                    vtl2_vmbus,
                )
                .context("failed to add vmbus state unit")?;

                let relay = HvsockRelay::new(
                    vmbus_driver,
                    vtl2_vmbus.control().clone(),
                    vtl2_hvsock_channel.relay_half,
                    vtl2_vmbus_cfg.vsock_path.map(Into::into),
                    vtl2_vmbus_cfg.vsock_listener,
                )
                .context("failed to create vtl2 hvsock relay")?;

                vtl2_hvsock_relay = Some(relay);

                (Some(vtl2_vmbus), Some(server_request_send))
            } else {
                (None, None)
            };
1926
            // The main (VTL0) vmbus server.
            let vmbus_driver = driver_source.simple();
            let vmbus = VmbusServer::builder(vmbus_driver.clone(), synic.clone(), gm.clone())
                .hvsock_notify(Some(hvsock_channel.server_half))
                .external_server(vtl2_request_send)
                .use_message_redirect(vmbus_cfg.vtl2_redirect)
                .max_version(
                    vmbus_cfg
                        .vmbus_max_version
                        .map(vmbus_core::MaxVersionInfo::new),
                )
                // For UEFI boots, hold back version negotiation (see
                // `delay_max_version` docs on the builder).
                .delay_max_version(matches!(cfg.load_mode, LoadMode::Uefi { .. }))
                .enable_mnf(true)
                .build()
                .context("failed to create vmbus server")?;

            // Start the vmbus kernel proxy if it's in use.
            #[cfg(windows)]
            if let Some(proxy_handle) = vmbus_cfg.vmbusproxy_handle {
                vmbus_proxy =
                    Some(
                        vmbus_server::ProxyIntegration::builder(
                            &vmbus_driver,
                            proxy_handle,
                            vmbus_server::ProxyServerInfo::new(vmbus.control()),
                        )
                        .vtl2_server(vtl2_vmbus.as_ref().map(|server| {
                            vmbus_server::ProxyServerInfo::new(server.control().clone())
                        }))
                        .memory(Some(&gm))
                        .build()
                        .await
                        .context("failed to start the vmbus proxy")?,
                    )
            }

            let vmbus = VmbusServerHandle::new(&vmbus_driver, state_units.add("vmbus"), vmbus)
                .context("failed to add vmbus state unit")?;

            // Relay guest hvsock connections to/from host vsock endpoints.
            let relay = HvsockRelay::new(
                vmbus_driver,
                vmbus.control().clone(),
                hvsock_channel.relay_half,
                vmbus_cfg.vsock_path.map(Into::into),
                vmbus_cfg.vsock_listener,
            )
            .context("failed to create hvsock relay")?;

            vtl0_hvsock_relay = Some(relay);
            vmbus_server = Some(vmbus);
            vtl2_vmbus_server = vtl2_vmbus;
        }
1978
1979        #[cfg(all(windows, feature = "virt_whp"))]
1980        fn make_ids(
1981            name: &str,
1982            instance_id: Option<guid::Guid>,
1983        ) -> (String, String, guid::Guid, u64) {
1984            let guid = instance_id.unwrap_or_else(guid::Guid::new_random);
1985            // TODO: clarify how the device ID is constructed
1986            let device_id = (guid.data2 as u64) << 16 | (guid.data3 as u64 & 0xfff8);
1987            let vpci_device_name = format!("vpci:{guid}");
1988            let device_name = format!("{name}:vpci-{guid}");
1989            (vpci_device_name, device_name, guid, device_id)
1990        }
1991
        // Synthetic devices
        {
            // Arbitrary default
            const DEFAULT_IO_QUEUE_DEPTH: u32 = 256;
            // Offer each IDE-attached disk as an accelerated storvsp channel
            // over vmbus (only possible when vmbus is enabled).
            if let Some(vmbus) = &vmbus_server {
                for (path, scsi_disk) in storvsp_ide_disks {
                    scsi_devices.push(
                        offer_channel_unit(
                            &driver_source.simple(),
                            &state_units,
                            vmbus,
                            storvsp::StorageDevice::build_ide(
                                &driver_source,
                                path.channel,
                                path.drive,
                                scsi_disk,
                                DEFAULT_IO_QUEUE_DEPTH,
                            ),
                        )
                        .await?,
                    );
                }
            }
2015
            // Kernel-mode vmnics go through the vmbus proxy, so they are only
            // available on Windows hosts with the proxy enabled.
            #[cfg(windows)]
            for nic_config in cfg.kernel_vmnics {
                let mut nic = vmswitch::kernel::KernelVmNic::new(
                    &guid::Guid::new_random(),
                    "nic",
                    "nic",
                    nic_config.mac_address.into(),
                    &nic_config.instance_id,
                    vmbus_proxy
                        .as_ref()
                        .context("missing vmbusproxy handle")?
                        .handle(),
                )
                .context("failed to create a kernel vmnic")?;

                // Attach the NIC to its configured vmswitch port, then start
                // it.
                nic.connect(&vmswitch::kernel::SwitchPortId {
                    switch: nic_config.switch_port_id.switch,
                    port: nic_config.switch_port_id.port,
                })
                .context("failed to connect kernel vmnic")?;

                nic.resume().context("failed to resume the kernel vmnic")?;
                kernel_vmnics.push(nic);
            }
2040
            // Virtual PCI (vpci) devices, offered over the vmbus server for
            // the VTL each device is configured for.
            if partition.supports_virtual_devices() {
                for dev_cfg in cfg.vpci_devices {
                    let vmbus = match dev_cfg.vtl {
                        DeviceVtl::Vtl0 => vmbus_server.as_ref().context("vmbus not enabled")?,
                        DeviceVtl::Vtl1 => anyhow::bail!("not supported"),
                        DeviceVtl::Vtl2 => vtl2_vmbus_server
                            .as_ref()
                            .context("VTL2 vmbus not enabled")?,
                    };

                    let vtl = match dev_cfg.vtl {
                        DeviceVtl::Vtl0 => Vtl::Vtl0,
                        DeviceVtl::Vtl1 => Vtl::Vtl1,
                        DeviceVtl::Vtl2 => Vtl::Vtl2,
                    };

                    vmm_core::device_builder::build_vpci_device(
                        &driver_source,
                        &resolver,
                        &gm,
                        vmbus.control(),
                        dev_cfg.instance_id,
                        dev_cfg.resource,
                        &chipset_builder,
                        partition.clone().into_doorbell_registration(vtl),
                        Some(&mapper),
                        |device_id| {
                            let hv_device = partition.new_virtual_device(
                                // NOTE(review): duplicates the `vtl` mapping
                                // computed above — could likely reuse `vtl`
                                // here; confirm and simplify.
                                match dev_cfg.vtl {
                                    DeviceVtl::Vtl0 => Vtl::Vtl0,
                                    DeviceVtl::Vtl1 => Vtl::Vtl1,
                                    DeviceVtl::Vtl2 => Vtl::Vtl2,
                                },
                                device_id,
                            )?;
                            Ok((
                                hv_device.clone().target(),
                                hv_device.clone().interrupt_mapper(),
                            ))
                        },
                        None,
                    )
                    .await?;
                }
2085
                // Hardware devices assigned through WHP: wrap each physical
                // device in an AssignedPciDevice and expose it to the guest
                // behind its own vpci bus.
                #[cfg(all(windows, feature = "virt_whp"))]
                for resource in cfg.vpci_resources {
                    let vmbus = vmbus_server
                        .as_ref()
                        .context("vmbus must be enabled to assign devices")?
                        .control()
                        .as_ref();

                    // TODO: abstract this behind the trait object properly.
                    let pd = partition.as_any();
                    let p = pd.downcast_ref::<virt_whp::WhpPartition>().unwrap();
                    let (vpci_bus_name, device_name, instance_id, device_id) =
                        make_ids("assigned-device", None);

                    let hv_device = Arc::new(
                        p.new_physical_device(Vtl::Vtl0, device_id, resource.0)
                            .context("failed to get physical device for assignment")?,
                    );

                    let device = chipset_builder
                        .arc_mutex_device(device_name)
                        .with_external_pci()
                        .try_add(|services| {
                            virt_whp::device::AssignedPciDevice::new(
                                &mut services.register_mmio(),
                                hv_device.clone(),
                            )
                        })
                        .context("failed to assign device")?;

                    chipset_builder
                        .arc_mutex_device(vpci_bus_name)
                        .try_add_async(async |services| {
                            VpciBus::new(
                                &driver_source,
                                instance_id,
                                device,
                                &mut services.register_mmio(),
                                vmbus,
                                crate::partition::VpciDevice::interrupt_mapper(hv_device),
                                None,
                            )
                            .await
                        })
                        .await?;
                }
            }
        }
2134
        // Add vmbus devices.
        //
        // Each configured device handle is offered on the vmbus server that
        // matches its target VTL; VTL1 is rejected.
        let mut vmbus_devices = Vec::new();
        for (vtl, resource) in cfg.vmbus_devices {
            let vmbus = match vtl {
                DeviceVtl::Vtl0 => vmbus_server
                    .as_ref()
                    .context("failed to find vmbus for vtl0"),
                DeviceVtl::Vtl1 => anyhow::bail!("vtl1 scsi controllers unsupported"),
                DeviceVtl::Vtl2 => vtl2_vmbus_server
                    .as_ref()
                    .context("failed to find vmbus for vtl2"),
            }
            .with_context(|| format!("failed to resolve vmbus resource {}", resource.id()))?;
            vmbus_devices.push(
                offer_vmbus_device_handle_unit(
                    &driver_source,
                    &state_units,
                    vmbus,
                    &resolver,
                    resource,
                )
                .await?,
            );
        }
2159
        // add virtio devices

        // Construct virtio devices.
        //
        // TODO: allocate PCI and MMIO space better.
        //
        // virtio-pci devices get sequential PCI device numbers starting at
        // 10; virtio-mmio devices carve 4 KiB windows downward from the top
        // of the second MMIO region.
        let mut pci_device_number = 10;
        if mem_layout.mmio().len() < 2 {
            anyhow::bail!("at least two mmio regions are required");
        }
        let mut virtio_mmio_start = mem_layout.mmio()[1].end();
        let mut virtio_mmio_count = 0;

        // Avoid an ISA interrupt to avoid conflicts and to avoid needing to
        // configure the line as level-triggered in the MADT (necessary for
        // Linux when the PIC is missing).
        let virtio_mmio_irq = {
            const VIRTIO_MMIO_IOAPIC_IRQ: u32 = 17;
            const VIRTIO_MMIO_PIC_IRQ: u32 = 5;
            if cfg.chipset_capabilities.with_pic {
                VIRTIO_MMIO_PIC_IRQ
            } else {
                VIRTIO_MMIO_IOAPIC_IRQ
            }
        };
        // Resolve and attach each configured virtio device on its chosen
        // transport (MMIO window or PCI function).
        for (bus, device) in cfg.virtio_devices.into_iter() {
            let id = device.id().to_string();
            let device = resolver
                .resolve(
                    device,
                    VirtioResolveInput {
                        driver_source: &driver_source,
                    },
                )
                .await?;
            match bus {
                VirtioBus::Mmio => {
                    // Allocate the next 4 KiB window downward from the top of
                    // the second MMIO region.
                    let mmio_start = virtio_mmio_start - 0x1000;
                    virtio_mmio_start -= 0x1000;
                    let id = format!("{id}-{mmio_start}");
                    let gm = gm.clone();
                    chipset_builder.arc_mutex_device(id).try_add(|services| {
                        VirtioMmioDevice::new(
                            device.0,
                            &driver_source.simple(),
                            gm,
                            services.new_line(IRQ_LINE_SET, "interrupt", virtio_mmio_irq),
                            partition.clone().into_doorbell_registration(Vtl::Vtl0),
                            mmio_start,
                            0x1000,
                        )
                    })?;
                    virtio_mmio_count += 1;
                }
                VirtioBus::Pci => {
                    // Legacy INT#A line is required for virtio-pci, so a PCI
                    // bus must have been configured (see `pci_inta_line`).
                    let pci_inta_line = pci_inta_line.context("missing PCI INT#A line")?;

                    let device_number = pci_device_number;
                    pci_device_number += 1;
                    pci_legacy_interrupts.push(((device_number, None), pci_inta_line));

                    let bus = if cfg.chipset.with_piix4_pci_bus {
                        pci_bus_id_piix4.clone()
                    } else {
                        pci_bus_id_generic.clone()
                    };

                    chipset_builder
                        .arc_mutex_device(format!("{id}-pci"))
                        .with_pci_addr(0, device_number, 0)
                        .on_pci_bus(bus)
                        .try_add(|services| {
                            VirtioPciDevice::new(
                                device.0,
                                &driver_source.simple(),
                                gm.clone(),
                                PciInterruptModel::IntX(
                                    PciInterruptPin::IntA,
                                    services.new_line(IRQ_LINE_SET, "interrupt", pci_inta_line),
                                ),
                                partition.clone().into_doorbell_registration(Vtl::Vtl0),
                                &mut services.register_mmio(),
                                Some(&mapper),
                            )
                        })?;
                }
            }
        }
2247
2248        assert!(virtio_mmio_start >= mem_layout.mmio()[1].start());
2249
2250        let (chipset, devices) = chipset_builder.build()?;
2251        let (fatal_error_send, _fatal_error_recv) = mesh::channel();
2252        let chipset = vmm_core::vmotherboard_adapter::AdaptedChipset::new(
2253            chipset,
2254            // TODO: Support this being a cmd line option
2255            vmm_core::vmotherboard_adapter::FatalErrorPolicy::DebugBreak(fatal_error_send),
2256        );
2257
2258        // create a new channel to intercept guest resets
2259        let (halt_send, halt_recv) = mesh::channel();
2260
2261        let (partition_unit, vp_runners) = PartitionUnit::new(
2262            driver_source.simple(),
2263            state_units
2264                .add("partition")
2265                .depends_on(devices.chipset_unit())
2266                .depends_on(vmtime.handle()),
2267            partition.clone().into_vm_partition(),
2268            PartitionUnitParams {
2269                processor_topology: &processor_topology,
2270                halt_vps,
2271                halt_request_recv,
2272                client_notify_send: halt_send,
2273                vtl_guest_memory: [
2274                    Some(&gm),
2275                    None,
2276                    cfg.hypervisor.with_vtl2.is_some().then_some(&gm),
2277                ],
2278                debugger_rpc: cfg.debugger_rpc,
2279            },
2280        )
2281        .context("failed to create partition unit")?;
2282
2283        // Start the VP backing threads.
2284        try_join_all(vps.into_iter().zip(vp_runners).enumerate().map(
2285            |(vp_index, (mut vp, runner))| {
2286                let partition = partition.clone();
2287                let chipset = chipset.clone();
2288                let (send, recv) = mesh::oneshot();
2289                thread::Builder::new()
2290                    .name(format!("vp-{}", vp_index))
2291                    .spawn(move || match vp.bind() {
2292                        Ok(mut vp) => {
2293                            send.send(Ok(()));
2294                            block_on_vp(
2295                                partition,
2296                                VpIndex::new(vp_index as u32),
2297                                vp.run(runner, &chipset),
2298                            )
2299                        }
2300                        Err(err) => {
2301                            send.send(Err(err));
2302                        }
2303                    })
2304                    .unwrap();
2305
2306                async move {
2307                    recv.await
2308                        .unwrap()
2309                        .with_context(|| format!("failed to bind vp {vp_index}"))
2310                }
2311            },
2312        ))
2313        .await?;
2314
2315        let mut this = LoadedVm {
2316            state_units,
2317            running: false,
2318            inner: LoadedVmInner {
2319                driver_source,
2320                resolver,
2321                partition_unit,
2322                partition,
2323                chipset_devices: devices,
2324                _vmtime: vmtime,
2325                _scsi_devices: scsi_devices,
2326                memory_manager,
2327                gm,
2328                vtl0_hvsock_relay,
2329                vtl2_hvsock_relay,
2330                vmbus_server,
2331                vtl2_vmbus_server,
2332                hypervisor_cfg: cfg.hypervisor,
2333                memory_cfg: cfg.memory,
2334                mem_layout,
2335                processor_topology,
2336                vmbus_redirect,
2337                input_distributor,
2338                vtl2_framebuffer_gpa_base,
2339                #[cfg(windows)]
2340                _vmbus_proxy: vmbus_proxy,
2341                #[cfg(windows)]
2342                _kernel_vmnics: kernel_vmnics,
2343                vmbus_devices,
2344                chipset_cfg: cfg.chipset,
2345                chipset_capabilities: cfg.chipset_capabilities,
2346                firmware_event_send: cfg.firmware_event_send,
2347                load_mode: cfg.load_mode,
2348                virtio_mmio_count,
2349                virtio_mmio_irq,
2350                pci_legacy_interrupts,
2351                igvm_file,
2352                next_igvm_file: None,
2353                _vmgs_task: vmgs_task,
2354                vmgs_client_inspect_handle,
2355                #[cfg(target_os = "linux")]
2356                vfio_inspect,
2357                halt_recv,
2358                client_notify_send,
2359                automatic_guest_reset: cfg.automatic_guest_reset,
2360                pcie_host_bridges,
2361                pcie_root_complexes,
2362                pcie_hotplug_devices: Vec::new(),
2363            },
2364        };
2365
2366        if let Some(saved_state) = saved_state {
2367            this.restore(saved_state)
2368                .await
2369                .context("loadedvm restore failed")?;
2370        } else {
2371            this.inner.load_firmware(false).await?;
2372        }
2373
2374        Ok(this)
2375    }
2376}
2377
impl LoadedVmInner {
    /// Builds the guest ACPI tables, loads the configured firmware or kernel
    /// image into guest memory, and programs the initial register state for
    /// the BSP.
    ///
    /// `vtl2_only` restricts the load to the VTL2 image; it is only
    /// meaningful (and asserted) for `LoadMode::Igvm`.
    ///
    /// Returns early with `Ok(())` for `LoadMode::None`.
    async fn load_firmware(&mut self, vtl2_only: bool) -> anyhow::Result<()> {
        // Host cache topology is only queried for aarch64 guests; it is fed
        // to the ACPI builder (used for the PPTT in the UEFI path below).
        let cache_topology = if cfg!(guest_arch = "aarch64") {
            Some(
                cache_topology::CacheTopology::from_host()
                    .context("failed to get cache topology")?,
            )
        } else {
            None
        };
        // The `arch` field is selected at compile time for the guest
        // architecture this binary targets.
        let acpi_builder = AcpiTablesBuilder {
            processor_topology: &self.processor_topology,
            mem_layout: &self.mem_layout,
            cache_topology: cache_topology.as_ref(),
            pcie_host_bridges: &self.pcie_host_bridges,
            #[cfg(guest_arch = "x86_64")]
            arch: vmm_core::acpi_builder::AcpiArchConfig::X86 {
                with_ioapic: self.chipset_cfg.with_generic_ioapic,
                with_psp: self.chipset_cfg.with_generic_psp,
                with_pic: self.chipset_capabilities.with_pic,
                with_pit: self.chipset_capabilities.with_pit,
                pm_base: PM_BASE,
                acpi_irq: SYSTEM_IRQ_ACPI,
            },
            #[cfg(guest_arch = "aarch64")]
            arch: vmm_core::acpi_builder::AcpiArchConfig::Aarch64 {
                // Report the "MsHyperV" vendor identity only when hypervisor
                // enlightenments are enabled; otherwise report zero.
                hypervisor_vendor_identity: if self.hypervisor_cfg.with_hv {
                    u64::from_le_bytes(*b"MsHyperV")
                } else {
                    0
                },
                virt_timer_ppi: self.processor_topology.virt_timer_ppi(),
            },
        };

        // Reloading only VTL2 is supported exclusively for IGVM-based loads.
        if vtl2_only {
            assert!(matches!(self.load_mode, LoadMode::Igvm { .. }));
        }

        // `regs` is only mutated below on x86_64 (to append MTRR state), so
        // suppress the unused_mut lint elsewhere.
        #[cfg_attr(not(guest_arch = "x86_64"), expect(unused_mut))]
        let (mut regs, initial_page_vis) = match &self.load_mode {
            LoadMode::None => return Ok(()),
            #[cfg(guest_arch = "x86_64")]
            &LoadMode::Linux {
                ref kernel,
                ref initrd,
                ref cmdline,
                enable_serial,
                ref custom_dsdt,
                boot_mode,
            } => {
                match boot_mode {
                    openvmm_defs::config::LinuxDirectBootMode::DeviceTree => {
                        anyhow::bail!("device tree boot mode is not supported on x86_64");
                    }
                    openvmm_defs::config::LinuxDirectBootMode::Acpi => {}
                }
                let kernel_config = super::vm_loaders::linux::KernelConfig {
                    kernel,
                    initrd,
                    cmdline,
                    mem_layout: &self.mem_layout,
                };
                // Building a DSDT here requires at least two mmio regions; a
                // caller-supplied custom DSDT lifts that requirement.
                if custom_dsdt.is_none() && self.mem_layout.mmio().len() < 2 {
                    anyhow::bail!("at least two mmio regions are required");
                }
                let regs =
                    super::vm_loaders::linux::load_linux_x86(&kernel_config, &self.gm, |gpa| {
                        // The loader calls back with the GPA where the ACPI
                        // tables should be placed.
                        let tables = if let Some(dsdt) = custom_dsdt {
                            acpi_builder.build_acpi_tables_custom_dsdt(gpa, dsdt)
                        } else {
                            acpi_builder.build_acpi_tables(gpa, |mem_layout, dsdt| {
                                add_devices_to_dsdt_x64(
                                    mem_layout,
                                    dsdt,
                                    &self.chipset_cfg,
                                    enable_serial,
                                    self.virtio_mmio_count,
                                    self.virtio_mmio_irq,
                                    &self.pci_legacy_interrupts,
                                )
                            })
                        };

                        super::vm_loaders::linux::AcpiTables {
                            rdsp: tables.rdsp,
                            tables: tables.tables,
                        }
                    })?;

                (regs, Vec::new())
            }
            #[cfg(guest_arch = "aarch64")]
            &LoadMode::Linux {
                ref kernel,
                ref initrd,
                ref cmdline,
                enable_serial,
                custom_dsdt: _,
                boot_mode,
            } => {
                use openvmm_defs::config::LinuxDirectBootMode;

                let kernel_config = super::vm_loaders::linux::KernelConfig {
                    kernel,
                    initrd,
                    cmdline,
                    mem_layout: &self.mem_layout,
                };

                // Copy out by value so the closure below doesn't borrow self.
                let with_hv = self.hypervisor_cfg.with_hv;
                // ACPI tables are only built for ACPI boot mode; other boot
                // modes pass `None` to the loader.
                let build_acpi = if boot_mode == LinuxDirectBootMode::Acpi {
                    Some(|rsdp_gpa: u64| {
                        acpi_builder.build_acpi_tables(rsdp_gpa, |mem_layout, dsdt| {
                            add_devices_to_dsdt_arm64(mem_layout, dsdt, enable_serial, with_hv)
                        })
                    })
                } else {
                    None
                };

                let regs = super::vm_loaders::linux::load_linux_arm64(
                    &kernel_config,
                    &self.gm,
                    enable_serial,
                    &self.processor_topology,
                    &self.pcie_host_bridges,
                    build_acpi,
                )?;

                (regs, Vec::new())
            }
            &LoadMode::Uefi {
                ref firmware,
                enable_debugging,
                enable_memory_protections,
                disable_frontpage,
                enable_tpm,
                enable_battery,
                enable_serial,
                enable_vpci_boot,
                uefi_console_mode,
                default_boot_always_attempt,
                bios_guid,
            } => {
                let madt = acpi_builder.build_madt();
                let srat = acpi_builder.build_srat();
                // MCFG only when PCIe host bridges exist; PPTT only when the
                // host cache topology was captured (aarch64, per above).
                let mcfg = (!self.pcie_host_bridges.is_empty()).then(|| acpi_builder.build_mcfg());
                let pptt = cache_topology.is_some().then(|| acpi_builder.build_pptt());
                let load_settings = super::vm_loaders::uefi::UefiLoadSettings {
                    debugging: enable_debugging,
                    memory_protections: enable_memory_protections,
                    // Config flag is "disable"; loader setting is "enable".
                    frontpage: !disable_frontpage,
                    tpm: enable_tpm,
                    battery: enable_battery,
                    guest_watchdog: self.chipset_cfg.with_hyperv_guest_watchdog,
                    vpci_boot: enable_vpci_boot,
                    serial: enable_serial,
                    uefi_console_mode,
                    default_boot_always_attempt,
                    bios_guid,
                };
                let regs = super::vm_loaders::uefi::load_uefi(
                    firmware,
                    &self.gm,
                    &self.processor_topology,
                    &self.mem_layout,
                    &self.pcie_host_bridges,
                    load_settings,
                    &madt,
                    &srat,
                    mcfg.as_deref(),
                    pptt.as_deref(),
                )?;

                (regs, Vec::new())
            }
            #[cfg(guest_arch = "x86_64")]
            LoadMode::Pcat { .. } => {
                // Legacy PCAT BIOS load; the ACPI tables built above are not
                // passed to this loader.
                let regs = super::vm_loaders::pcat::load_pcat(&self.gm, &self.mem_layout)?;

                (regs, Vec::new())
            }
            &LoadMode::Igvm {
                file: _,
                ref cmdline,
                vtl2_base_address,
                com_serial,
            } => {
                let madt = acpi_builder.build_madt();
                let srat = acpi_builder.build_srat();
                // Fresh boot-time entropy handed to the IGVM guest.
                const ENTROPY_SIZE: usize = 64;
                let mut entropy = [0u8; ENTROPY_SIZE];
                getrandom::fill(&mut entropy).unwrap();

                let params = crate::worker::vm_loaders::igvm::LoadIgvmParams {
                    // The parsed IGVM file is cached on self; the `file`
                    // field of the load mode is not re-read here.
                    igvm_file: self.igvm_file.as_ref().expect("should be already read"),
                    gm: &self.gm,
                    processor_topology: &self.processor_topology,
                    mem_layout: &self.mem_layout,
                    cmdline,
                    acpi_tables: super::vm_loaders::igvm::AcpiTables {
                        madt: &madt,
                        srat: &srat,
                        slit: None,
                        pptt: None,
                    },
                    vtl2_base_address,
                    vtl2_framebuffer_gpa_base: self.vtl2_framebuffer_gpa_base,
                    vtl2_only,
                    with_vmbus_redirect: self.vmbus_redirect,
                    com_serial,
                    entropy: Some(&entropy),
                };
                // Unlike the other arms (which all return an empty page
                // visibility list), load_igvm returns both values directly.
                super::vm_loaders::igvm::load_igvm(params)?
            }

            // Depending on guest_arch, only a subset of the variants above is
            // compiled in, so this arm may or may not be reachable.
            #[expect(clippy::allow_attributes)]
            #[allow(unreachable_patterns)]
            _ => anyhow::bail!("load mode not supported on this platform"),
        };

        // Don't setup variable MTRRs if VTL2 is present. It's expected that
        // VTL2 will setup MTRRs for VTL0 if needed.
        #[cfg(guest_arch = "x86_64")]
        if self.hypervisor_cfg.with_vtl2.is_none() {
            regs.extend(
                loader::common::compute_variable_mtrrs(
                    &self.mem_layout,
                    self.partition.caps().physical_address_width,
                )
                .context("failed to compute variable mtrrs")?,
            );
        }

        // Only set initial page visibility on isolated partitions.
        if self.hypervisor_cfg.with_isolation.is_some() {
            tracing::debug!(?initial_page_vis, "initial_page_vis");
            self.partition_unit
                .set_initial_page_visibility(initial_page_vis)
                .await
                .context("failed to set initial page visibility")?;
        }

        // Translate the loader's register list into the concrete initial
        // register state for the BSP.
        let initial_regs = initial_regs(
            &regs,
            self.partition.caps(),
            &self.processor_topology.vp_arch(VpIndex::BSP),
        );

        tracing::debug!(?initial_regs, "initial_registers");
        // The initial registers apply to VTL2 when it is enabled, else VTL0.
        self.partition_unit
            .set_initial_regs(
                if self.hypervisor_cfg.with_vtl2.is_some() {
                    Vtl::Vtl2
                } else {
                    Vtl::Vtl0
                },
                initial_regs,
            )
            .await
            .context("failed to set initial register state")?;

        Ok(())
    }
}
2644
2645impl LoadedVm {
2646    async fn resume(&mut self) -> bool {
2647        if self.running {
2648            return false;
2649        }
2650        self.state_units.start().await;
2651        self.running = true;
2652        true
2653    }
2654
2655    async fn pause(&mut self) -> bool {
2656        if !self.running {
2657            return false;
2658        }
2659        self.state_units.stop().await;
2660        self.running = false;
2661        true
2662    }
2663
2664    pub async fn run(
2665        mut self,
2666        driver: &impl Spawn,
2667        mut rpc_recv: mesh::Receiver<VmRpc>,
2668        mut worker_rpc: mesh::Receiver<WorkerRpc<RestartState>>,
2669    ) {
2670        enum Event {
2671            WorkerRpc(Result<WorkerRpc<RestartState>, mesh::RecvError>),
2672            VmRpc(Result<VmRpc, mesh::RecvError>),
2673            Halt(Result<HaltReason, mesh::RecvError>),
2674        }
2675
2676        // Start a task to handle state unit inspections by filtering the worker
2677        // RPC requests. This is done so that inspect on state units works even
2678        // during state transitions.
2679        let (worker_rpc_send, worker_rpc_recv) = mesh::channel();
2680        let _filter_rpc_task = driver.spawn("loaded-vm-worker-rpc-filter", {
2681            let state_units = self.state_units.inspector();
2682            async move {
2683                while let Some(rpc) = worker_rpc.next().await {
2684                    match rpc {
2685                        WorkerRpc::Inspect(req) => req.respond(|resp| {
2686                            resp.merge(&state_units)
2687                                .merge(inspect::send(&worker_rpc_send, WorkerRpc::Inspect));
2688                        }),
2689                        rpc => worker_rpc_send.send(rpc),
2690                    }
2691                }
2692            }
2693        });
2694        let mut worker_rpc = worker_rpc_recv;
2695
2696        loop {
2697            let event: Event = {
2698                let a = rpc_recv.recv().map(Event::VmRpc);
2699                let b = worker_rpc.recv().map(Event::WorkerRpc);
2700                let c = self.inner.halt_recv.recv().map(Event::Halt);
2701                (a, b, c).race().await
2702            };
2703
2704            match event {
2705                Event::WorkerRpc(Err(_)) => break,
2706                Event::WorkerRpc(Ok(message)) => match message {
2707                    WorkerRpc::Stop => break,
2708                    WorkerRpc::Restart(rpc) => {
2709                        let mut stopped = false;
2710                        // First run the non-destructive operations.
2711                        let r = async {
2712                            let shared_memory = self.inner.memory_manager.shared_memory_backing();
2713                            if shared_memory.is_none() {
2714                                anyhow::bail!("restart is not supported with --private-memory");
2715                            }
2716                            if self.running {
2717                                self.state_units.stop().await;
2718                                stopped = true;
2719                            }
2720                            let saved_state = self.save().await?;
2721                            anyhow::Ok((shared_memory, saved_state))
2722                        }
2723                        .await;
2724                        match r {
2725                            Ok((shared_memory, saved_state)) => {
2726                                rpc.complete(Ok(self
2727                                    .serialize(rpc_recv, shared_memory, saved_state)
2728                                    .await));
2729
2730                                return;
2731                            }
2732                            Err(err) => {
2733                                if stopped {
2734                                    self.state_units.start().await;
2735                                }
2736                                rpc.complete(Err(RemoteError::new(err)));
2737                            }
2738                        }
2739                    }
2740                    WorkerRpc::Inspect(deferred) => deferred.respond(|resp| {
2741                        resp.field("memory", &self.inner.memory_manager)
2742                            .field("memory_layout", &self.inner.mem_layout)
2743                            .field("resolver", &self.inner.resolver)
2744                            .field("vmgs", &self.inner.vmgs_client_inspect_handle);
2745                        #[cfg(target_os = "linux")]
2746                        resp.field("vfio", &self.inner.vfio_inspect);
2747                    }),
2748                },
2749                Event::VmRpc(Err(_)) => break,
2750                Event::VmRpc(Ok(message)) => match message {
2751                    VmRpc::Reset(rpc) => {
2752                        rpc.handle_failable(async |()| self.reset(true).await).await
2753                    }
2754                    VmRpc::ClearHalt(rpc) => {
2755                        rpc.handle(async |()| self.inner.partition_unit.clear_halt().await)
2756                            .await
2757                    }
2758                    VmRpc::Resume(rpc) => rpc.handle(async |()| self.resume().await).await,
2759                    VmRpc::Pause(rpc) => rpc.handle(async |()| self.pause().await).await,
2760                    VmRpc::Save(rpc) => {
2761                        rpc.handle_failable(async |()| self.save().await.map(ProtobufMessage::new))
2762                            .await
2763                    }
2764                    VmRpc::Nmi(rpc) => rpc.handle_sync(|vpindex| {
2765                        if vpindex < self.inner.processor_topology.vp_count() {
2766                            // Send an NMI MSI to the processor. We could raise
2767                            // LINT1 instead, which would allow the guest to
2768                            // reconfigure the LINT to do something other than
2769                            // an NMI. Since this is for diagnostics, that
2770                            // doesn't seem like what we want.
2771                            //
2772                            // AARCH64-TODO: is there an equivalent?
2773                            #[cfg(guest_arch = "x86_64")]
2774                            self.inner.partition.request_msi(
2775                                Vtl::Vtl0,
2776                                virt::irqcon::MsiRequest::new_x86(
2777                                    virt::irqcon::DeliveryMode::NMI,
2778                                    self.inner
2779                                        .processor_topology
2780                                        .vp_arch(VpIndex::new(vpindex))
2781                                        .apic_id,
2782                                    false,
2783                                    0,
2784                                    false,
2785                                ),
2786                            );
2787                        }
2788                    }),
2789                    VmRpc::AddVmbusDevice(rpc) => {
2790                        rpc.handle_failable(async |(vtl, resource)| {
2791                            let vmbus = match vtl {
2792                                DeviceVtl::Vtl0 => self.inner.vmbus_server.as_ref(),
2793                                DeviceVtl::Vtl1 => None,
2794                                DeviceVtl::Vtl2 => self.inner.vtl2_vmbus_server.as_ref(),
2795                            }
2796                            .context("no vmbus available")?;
2797                            let device = offer_vmbus_device_handle_unit(
2798                                &self.inner.driver_source,
2799                                &self.state_units,
2800                                vmbus,
2801                                &self.inner.resolver,
2802                                resource,
2803                            )
2804                            .await?;
2805                            self.inner.vmbus_devices.push(device);
2806                            self.state_units.start_stopped_units().await;
2807                            anyhow::Ok(())
2808                        })
2809                        .await
2810                    }
2811                    VmRpc::ConnectHvsock(rpc) => {
2812                        let ((mut ctx, service_id, vtl), response) = rpc.split();
2813                        if let Some(relay) = self.hvsock_relay(vtl) {
2814                            let fut = relay.connect(&mut ctx, service_id);
2815                            driver
2816                                .spawn("vmrpc-hvsock-connect", async move {
2817                                    response.complete(fut.await.map_err(RemoteError::new))
2818                                })
2819                                .detach();
2820                        } else {
2821                            response.complete(Err(RemoteError::new(anyhow::anyhow!(
2822                                "hvsock is not available"
2823                            ))));
2824                        }
2825                    }
2826                    VmRpc::PulseSaveRestore(rpc) => {
2827                        rpc.handle(async |()| {
2828                            if !self.inner.partition.supports_reset() {
2829                                return Err(PulseSaveRestoreError::ResetNotSupported);
2830                            }
2831                            let paused = self.pause().await;
2832                            self.save_reset_restore().await?;
2833
2834                            if paused {
2835                                self.resume().await;
2836                            }
2837                            Ok(())
2838                        })
2839                        .await
2840                    }
2841                    VmRpc::StartReloadIgvm(rpc) => {
2842                        rpc.handle_failable_sync(|file| self.start_reload_igvm(&file))
2843                    }
2844                    VmRpc::CompleteReloadIgvm(rpc) => {
2845                        rpc.handle_failable(async |complete| {
2846                            self.complete_reload_igvm(complete).await
2847                        })
2848                        .await
2849                    }
2850                    VmRpc::ReadMemory(rpc) => {
2851                        rpc.handle_failable_sync(|(gpa, size)| {
2852                            let mut bytes = vec![0u8; size];
2853                            self.inner
2854                                .gm
2855                                .read_at(gpa, bytes.as_mut_slice())
2856                                .map(|_| bytes)
2857                        });
2858                    }
2859                    VmRpc::WriteMemory(rpc) => rpc.handle_failable_sync(|(gpa, bytes)| {
2860                        self.inner.gm.write_at(gpa, bytes.as_slice())
2861                    }),
2862                    VmRpc::UpdateCliParams(rpc) => {
2863                        rpc.handle_failable_sync(|params| match &mut self.inner.load_mode {
2864                            LoadMode::Igvm { cmdline, .. } => {
2865                                *cmdline = params;
2866                                Ok(())
2867                            }
2868                            _ => anyhow::bail!(
2869                                "Updating command line parameters is only supported for Igvm load mode"
2870                            ),
2871                        })
2872                    }
2873                    VmRpc::AddPcieDevice(rpc) => {
2874                        rpc.handle_failable(async |(port_name, resource)| {
2875                            // Validate the port exists before creating the device
2876                            // to avoid leaking a DynamicDeviceUnit on error.
2877                            let rc = self.inner.pcie_root_complexes.iter()
2878                                .find(|rc| {
2879                                    rc.lock().downstream_ports().iter().any(|(_, name)| name.as_ref() == port_name.as_str())
2880                                })
2881                                .ok_or_else(|| anyhow::anyhow!("port '{}' not found in any root complex", port_name))?;
2882
2883                            let msi_conn = pci_core::msi::MsiConnection::new();
2884                            let signal_msi = self.inner.partition.as_signal_msi(Vtl::Vtl0);
2885
2886                            let (unit, device) = self.inner.chipset_devices.add_dyn_device(
2887                                &self.inner.driver_source,
2888                                &self.state_units,
2889                                format!("pcie-hotplug:{}", port_name),
2890                                async |register_mmio| {
2891                                    self.inner.resolver
2892                                        .resolve(
2893                                            resource,
2894                                            pci_resources::ResolvePciDeviceHandleParams {
2895                                                msi_target: msi_conn.target(),
2896                                                register_mmio,
2897                                                driver_source: &self.inner.driver_source,
2898                                                guest_memory: &self.inner.gm,
2899                                                doorbell_registration: self.inner.partition.clone().into_doorbell_registration(Vtl::Vtl0),
2900                                                shared_mem_mapper: None,
2901                                                irqfd: self.inner.partition.irqfd(),
2902                                            },
2903                                        )
2904                                        .await
2905                                        .map(|r| r.0)
2906                                        .map_err(|e| anyhow::anyhow!(e))
2907                                },
2908                            ).await?;
2909
2910                            if let Some(target) = signal_msi {
2911                                msi_conn.connect(target);
2912                            }
2913
2914                            // Wrap the device as a GenericPciBusDevice for the port.
2915                            // Keep a strong Arc to the device so the Weak stays valid.
2916                            let weak_dev: std::sync::Weak<closeable_mutex::CloseableMutex<dyn chipset_device::ChipsetDevice>> = Arc::downgrade(&(device.clone() as Arc<closeable_mutex::CloseableMutex<dyn chipset_device::ChipsetDevice>>));
2917                            let bus_device = Box::new(WeakMutexPciBusDevice(weak_dev));
2918
2919                            self.inner.pcie_hotplug_devices.push((port_name.clone(), unit, device));
2920
2921                            // Start the device unit before firing the hotplug
2922                            // MSI. The guest may begin probing config space
2923                            // immediately after receiving the interrupt, so
2924                            // the device must be ready first.
2925                            self.state_units.start_stopped_units().await;
2926
2927                            // Now attach the device and notify the guest.
2928                            if let Err(e) = rc.lock().hotplug_add_device(
2929                                &port_name,
2930                                "hotplug-device",
2931                                bus_device,
2932                            ) {
2933                                // Clean up the device unit on failure
2934                                let (_, unit, _) = self.inner.pcie_hotplug_devices.pop().unwrap();
2935                                unit.remove().await;
2936                                return Err(e);
2937                            }
2938                            anyhow::Ok(())
2939                        })
2940                        .await
2941                    }
2942                    VmRpc::RemovePcieDevice(rpc) => {
2943                        rpc.handle_failable(async |port_name: String| {
2944                            // Only allow removing dynamically hot-added devices.
2945                            // Statically-attached devices don't have a tracked unit
2946                            // and removing them would leave their state unit/MMIO
2947                            // registrations running.
2948                            let idx = self.inner.pcie_hotplug_devices.iter()
2949                                .position(|(name, _, _)| name == &port_name)
2950                                .ok_or_else(|| anyhow::anyhow!(
2951                                    "no hot-added device on port '{}' (only dynamically added devices can be hot-removed)",
2952                                    port_name
2953                                ))?;
2954
2955                            // Find the root complex containing the target port
2956                            let rc = self.inner.pcie_root_complexes.iter()
2957                                .find(|rc| {
2958                                    rc.lock().downstream_ports().iter().any(|(_, name)| name.as_ref() == port_name.as_str())
2959                                })
2960                                .ok_or_else(|| anyhow::anyhow!("port '{}' not found in any root complex", port_name))?;
2961
2962                            rc.lock().hotplug_remove_device(&port_name)?;
2963
2964                            // Remove and stop the device unit
2965                            let (_, unit, _device) = self.inner.pcie_hotplug_devices.remove(idx);
2966                            unit.remove().await;
2967
2968                            anyhow::Ok(())
2969                        })
2970                        .await
2971                    }
2972                },
2973                Event::Halt(Err(_)) => break,
2974                Event::Halt(Ok(reason)) => {
2975                    if matches!(reason, HaltReason::Reset) && self.inner.automatic_guest_reset {
2976                        tracing::info!("guest-initiated reset");
2977                        if let Err(err) = self.reset(true).await {
2978                            tracing::error!(?err, "failed to reset VM");
2979                            break;
2980                        }
2981                    } else {
2982                        self.inner.client_notify_send.send(reason);
2983                    }
2984                }
2985            }
2986        }
2987
2988        self.inner.partition_unit.teardown().await;
2989        if let Some(vmbus) = self.inner.vmbus_server {
2990            vmbus.remove().await.shutdown().await;
2991        }
2992    }
2993
2994    fn start_reload_igvm(&mut self, file: &File) -> anyhow::Result<()> {
2995        // Clear any previously staged IGVM file.
2996        self.inner.next_igvm_file = None;
2997
2998        // Load the new IGVM file into memory.
2999        let igvm_file =
3000            super::vm_loaders::igvm::read_igvm_file(file).context("reading igvm file failed")?;
3001
3002        self.inner.next_igvm_file = Some(igvm_file);
3003        Ok(())
3004    }
3005
    /// Completes (or cancels) an IGVM reload previously staged by
    /// `start_reload_igvm`.
    ///
    /// When `complete` is false, the staged file is discarded and nothing
    /// else happens. When true, the VPs are stopped, VTL2's vmbus is
    /// force-reset, the staged file replaces the active one, and the VTL2
    /// firmware is reloaded before the VPs are resumed. The ordering of
    /// these steps is load-bearing; see the inline comments.
    async fn complete_reload_igvm(&mut self, complete: bool) -> anyhow::Result<()> {
        if !complete {
            // Caller is abandoning the reload; drop the staged file.
            self.inner.next_igvm_file = None;
            return Ok(());
        }

        // Grab the staged IGVM file. Fails if `start_reload_igvm` was not
        // called (or was already consumed/cancelled).
        let next_igvm_file = self
            .inner
            .next_igvm_file
            .take()
            .context("no staged igvm file")?;

        // Locate the VTL2 vmbus server; reloading IGVM only makes sense for
        // a VM configured with VTL2.
        let vtl2_vmbus = self
            .inner
            .vtl2_vmbus_server
            .as_ref()
            .context("missing vtl2 vmbus")?;

        // Stop the VPs so that VTL2 state can be replaced. They will be
        // resumed when `stop_vps` is dropped at the end of this function.
        let stop_vps = self.inner.partition_unit.temporarily_stop_vps().await;

        // Reset vmbus VTL2 state so that all DMA transactions to VTL2
        // memory stop. We don't need to reset the individual devices, since
        // resetting vmbus will close all the channels.
        //
        // This must be done after the VPs have been stopped to avoid
        // confusing VTL2 and to ensure that VTL2 does not send any
        // additional vmbus messages.
        vtl2_vmbus
            .control()
            .force_reset()
            .await
            .context("failed to reset vtl2 vmbus")?;

        // Reload the VTL2 firmware.
        //
        // When the initial registers are set, this will implicitly reset VTL2
        // state as well.
        let _old_igvm_file = self.inner.igvm_file.replace(next_igvm_file);
        self.inner
            .load_firmware(true)
            .await
            .context("failed to reload VTL2 firmware")?;

        // OK to resume the VPs now.
        drop(stop_vps);
        Ok(())
    }
3059
3060    /// Get the associated hvsock relay for a given vtl, if any.
3061    fn hvsock_relay(&self, vtl: DeviceVtl) -> Option<&HvsockRelay> {
3062        match vtl {
3063            DeviceVtl::Vtl0 => self.inner.vtl0_hvsock_relay.as_ref(),
3064            DeviceVtl::Vtl1 => None,
3065            DeviceVtl::Vtl2 => self.inner.vtl2_hvsock_relay.as_ref(),
3066        }
3067    }
3068
3069    /// Saves the VM's processor, partition, and device state.
3070    ///
3071    /// TODO: virtio & vmbus unsupported.
3072    async fn save(&mut self) -> anyhow::Result<SavedState> {
3073        Ok(SavedState {
3074            units: self.state_units.save().await?,
3075        })
3076    }
3077
3078    /// Restore state on the VM.
3079    async fn restore(&mut self, state: SavedState) -> anyhow::Result<()> {
3080        self.state_units.restore(state.units).await?;
3081        Ok(())
3082    }
3083
3084    /// Do a save, reset, restore.
3085    async fn save_reset_restore(&mut self) -> anyhow::Result<()> {
3086        let state = self.save().await?;
3087        self.reset(false).await?;
3088        self.restore(state).await?;
3089        Ok(())
3090    }
3091
    /// Prepares for restart, serializing the worker's state.
    ///
    /// Tears down the partition and vmbus, rebuilds a [`Manifest`] from the
    /// pieces of configuration still tracked by `self.inner` (many fields
    /// are placeholders — see the `// TODO` markers), and packages
    /// everything into a [`RestartState`].
    ///
    /// NOTE(review): this path is known-broken — it always panics at the
    /// trailing `todo!()`, as acknowledged by the
    /// `#[expect(unreachable_code)]` attribute.
    async fn serialize(
        mut self,
        rpc: mesh::Receiver<VmRpc>,
        shared_memory: Option<SharedMemoryBacking>,
        saved_state: SavedState,
    ) -> RestartState {
        // Tear down the partition first; `notify` carries the client
        // notification channel forward into the restart state.
        let notify = self.inner.partition_unit.teardown().await;
        let input = self.inner.input_distributor.remove().await.into_inner();

        if let Some(vmbus_server) = self.inner.vmbus_server.take() {
            vmbus_server.remove().await.shutdown().await;
        }

        let manifest = Manifest {
            load_mode: self.inner.load_mode,
            floppy_disks: vec![],        // TODO
            ide_disks: vec![],           // TODO
            pcie_root_complexes: vec![], // TODO
            pcie_devices: vec![],        // TODO
            pcie_switches: vec![],       // TODO
            vpci_devices: vec![],        // TODO
            memory: self.inner.memory_cfg,
            processor_topology: self.inner.processor_topology.to_config(),
            chipset: self.inner.chipset_cfg,
            vmbus: None,      // TODO
            vtl2_vmbus: None, // TODO
            hypervisor: self.inner.hypervisor_cfg,
            #[cfg(windows)]
            kernel_vmnics: vec![], // TODO
            input,
            framebuffer: None,      // TODO
            vga_firmware: None,     // TODO
            vtl2_gfx: false,        // TODO
            virtio_devices: vec![], // TODO
            #[cfg(all(windows, feature = "virt_whp"))]
            vpci_resources: vec![], // TODO
            vmgs: None,             // TODO
            secure_boot_enabled: false, // TODO
            custom_uefi_vars: Default::default(), // TODO
            firmware_event_send: self.inner.firmware_event_send,
            debugger_rpc: None,          // TODO
            vmbus_devices: vec![],       // TODO
            chipset_devices: vec![],     // TODO
            pci_chipset_devices: vec![], // TODO
            chipset_capabilities: self.inner.chipset_capabilities,
            generation_id_recv: None,  // TODO
            rtc_delta_milliseconds: 0, // TODO
            automatic_guest_reset: self.inner.automatic_guest_reset,
            efi_diagnostics_log_level: Default::default(),
        };
        #[expect(unreachable_code, reason = "TODO")]
        RestartState {
            manifest,
            running: self.running,
            saved_state,
            shared_memory,
            rpc,
            notify,
            hypervisor: todo!("TODO: RestartState serialization is broken"),
        }
    }
3154
3155    async fn reset(&mut self, reload_firmware: bool) -> anyhow::Result<()> {
3156        let resume = self.pause().await;
3157
3158        self.state_units.reset().await?;
3159        // TODO: _vmnic
3160        // TODO: gdb?
3161
3162        // Load again
3163        if reload_firmware {
3164            self.inner.load_firmware(false).await?;
3165        }
3166
3167        if resume {
3168            self.resume().await;
3169        }
3170        Ok(())
3171    }
3172}
3173
3174#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
3175fn add_devices_to_dsdt_x64(
3176    mem_layout: &MemoryLayout,
3177    dsdt: &mut dsdt::Dsdt,
3178    cfg: &BaseChipsetManifest,
3179    serial_uarts: bool,
3180    virtio_mmio_count: usize,
3181    virtio_mmio_irq: u32,
3182    pci_legacy_interrupts: &[((u8, Option<u8>), u32)], // ((device, function), interrupt)
3183) {
3184    dsdt.add_apic();
3185
3186    // Any serial port configured means all are enabled.
3187    if serial_uarts {
3188        for (name, com_port, ddn, uid) in [
3189            (b"\\_SB.UAR1", ComPort::Com1, b"COM1", 1),
3190            (b"\\_SB.UAR2", ComPort::Com2, b"COM2", 2),
3191            (b"\\_SB.UAR3", ComPort::Com3, b"COM3", 3),
3192            (b"\\_SB.UAR4", ComPort::Com4, b"COM4", 4),
3193        ]
3194        .iter()
3195        .copied()
3196        {
3197            dsdt.add_uart(name, ddn, uid, com_port.io_port(), com_port.irq().into());
3198        }
3199    }
3200
3201    assert!(
3202        mem_layout.mmio().len() >= 2,
3203        "the DSDT describes two MMIO regions"
3204    );
3205    let low_mmio_gap = mem_layout.mmio()[0];
3206    let mut high_mmio_space: std::ops::Range<u64> = mem_layout.mmio()[1].into();
3207    // Device(\_SB.VI00)
3208    // {
3209    //     Name(_HID, "LNRO0005")
3210    //     Name(_UID, 0)
3211    //     Name(_CRS, ResourceTemplate()
3212    //     {
3213    //         QWORDMemory(,,,,,ReadWrite,0,0x1fffff000,0x1ffffffff,0,0x1000)
3214    //         Interrupt(ResourceConsumer, Level, ActiveHigh, Exclusive)
3215    //             {5}
3216    //     })
3217    // }
3218    // TODO: manage MMIO space better than this
3219    for i in 0..virtio_mmio_count {
3220        high_mmio_space.end -= HV_PAGE_SIZE;
3221        let mut device = dsdt::Device::new(format!("\\_SB.VI{i:02}").as_bytes());
3222        device.add_object(&dsdt::NamedString::new(b"_HID", b"LNRO0005"));
3223        device.add_object(&dsdt::NamedInteger::new(b"_UID", i as u64));
3224        let mut crs = dsdt::CurrentResourceSettings::new();
3225        crs.add_resource(&dsdt::QwordMemory::new(high_mmio_space.end, HV_PAGE_SIZE));
3226        let mut intr = dsdt::Interrupt::new(virtio_mmio_irq);
3227        intr.is_edge_triggered = false;
3228        crs.add_resource(&intr);
3229        device.add_object(&crs);
3230        dsdt.add_object(&device);
3231    }
3232
3233    let high_mmio_gap = MemoryRange::new(high_mmio_space);
3234
3235    if cfg.with_generic_pci_bus || cfg.with_i440bx_host_pci_bridge {
3236        // TODO: actually plumb through legacy PCI interrupts
3237        dsdt.add_pci(low_mmio_gap, high_mmio_gap, pci_legacy_interrupts);
3238    } else {
3239        dsdt.add_mmio_module(low_mmio_gap, high_mmio_gap);
3240    }
3241
3242    dsdt.add_vmbus(
3243        cfg.with_generic_pci_bus || cfg.with_i440bx_host_pci_bridge,
3244        None,
3245    );
3246    dsdt.add_rtc();
3247}
3248
/// Populates the ARM64 DSDT: the MMIO module plus vmbus (when running with
/// a hypervisor interface), and the two SBSA-style PL011 UARTs when serial
/// is enabled.
#[cfg(guest_arch = "aarch64")]
fn add_devices_to_dsdt_arm64(
    mem_layout: &MemoryLayout,
    dsdt: &mut dsdt::Dsdt,
    enable_serial: bool,
    with_hv: bool,
) {
    // VMBus GIC INTID (PPI 2 = INTID 16 + 2 = 18), matching the DT path.
    const VMBUS_INTID: u32 = openvmm_defs::config::DEFAULT_VMBUS_PPI;
    // SBSA UART MMIO bases and sizes.
    const UART0_BASE: u64 = 0xEFFEC000;
    const UART1_BASE: u64 = 0xEFFEB000;
    const UART_SIZE: u64 = 0x1000;
    // UART GSIVs (SPI 1 = INTID 33, SPI 2 = INTID 34).
    const UART0_GSIV: u32 = 33;
    const UART1_GSIV: u32 = 34;

    if with_hv {
        // Internal invariant: the memory layout for ARM64 with HV always has
        // at least two MMIO gaps (low + high). This is configured by OpenVMM
        // itself, not by guest input.
        assert!(
            mem_layout.mmio().len() >= 2,
            "need at least two MMIO regions"
        );
        let gaps = mem_layout.mmio();
        let (low_gap, high_gap): (_, MemoryRange) = (gaps[0], gaps[1]);
        dsdt.add_mmio_module(low_gap, high_gap);
        // VMBus on ARM64 ACPI needs a per-CPU interrupt (PPI) in _CRS.
        // Always place under VMOD, not PCI0 — ARM64 doesn't use the x86
        // PCI0 DSDT node.
        dsdt.add_vmbus(false, Some(VMBUS_INTID));
    }

    if enable_serial {
        dsdt.add_sbsa_uart(b"\\_SB.UAR0", 0, UART0_BASE, UART_SIZE, UART0_GSIV);
        dsdt.add_sbsa_uart(b"\\_SB.UAR1", 1, UART1_BASE, UART_SIZE, UART1_GSIV);
    }
}
3300
/// Watchdog timeout callback that injects an NMI into the guest
/// (x86-64 only).
#[cfg(guest_arch = "x86_64")]
struct WatchdogTimeoutNmi {
    // Partition handle used to deliver the NMI into VTL0.
    partition: Arc<dyn HvlitePartition>,
    // Optional channel signaled on each timeout — NOTE(review): consumer
    // not visible in this chunk; presumably a test/telemetry hook.
    watchdog_send: Option<mesh::Sender<()>>,
}
3306
#[cfg(guest_arch = "x86_64")]
#[async_trait::async_trait]
impl WatchdogCallback for WatchdogTimeoutNmi {
    async fn on_timeout(&mut self) {
        // Unlike Hyper-V, we only send the NMI to the BSP.
        let nmi_request =
            virt::irqcon::MsiRequest::new_x86(virt::irqcon::DeliveryMode::NMI, 0, false, 0, false);
        self.partition.request_msi(Vtl::Vtl0, nmi_request);

        // Signal any registered listener that the watchdog fired.
        if let Some(sender) = self.watchdog_send.as_ref() {
            sender.send(());
        }
    }
}
3322
/// Watchdog timeout callback that resets the VM by halting the VPs with
/// `HaltReason::Reset`.
struct WatchdogTimeoutReset {
    // Used to request the halt (reset) of all VPs.
    halt_vps: Arc<Halt>,
    // Optional channel signaled on each timeout — NOTE(review): consumer
    // not visible in this chunk; presumably a test/telemetry hook.
    watchdog_send: Option<mesh::Sender<()>>,
}
3327
3328#[async_trait::async_trait]
3329impl WatchdogCallback for WatchdogTimeoutReset {
3330    async fn on_timeout(&mut self) {
3331        self.halt_vps.halt(HaltReason::Reset);
3332
3333        if let Some(watchdog_send) = &self.watchdog_send {
3334            watchdog_send.send(());
3335        }
3336    }
3337}
3338
/// Marker type carrying OpenVMM's (currently empty) set of remote dynamic
/// resource resolvers for out-of-process chipset device workers.
#[derive(MeshPayload, Clone)]
struct OpenVmmRemoteDynamicResolvers {}
3341
impl chipset_device_worker::RemoteDynamicResolvers for OpenVmmRemoteDynamicResolvers {
    // Worker ID string under which the remote chipset device worker is
    // registered.
    const WORKER_ID_STR: &str = "openvmm_remote_chipset_worker";

    /// No resolvers beyond the defaults are needed for OpenVMM; this is a
    /// no-op.
    async fn register_remote_dynamic_resolvers(
        self,
        _resolver: &mut ResourceResolver,
    ) -> anyhow::Result<()> {
        Ok(())
    }
}
3352
// Register the remote chipset device worker so it can be spawned through
// the mesh worker infrastructure.
mesh_worker::register_workers! {
    chipset_device_worker::worker::RemoteChipsetDeviceWorker<OpenVmmRemoteDynamicResolvers>
}
3356
/// Wrapper around `Weak<CloseableMutex<dyn ChipsetDevice>>` that implements
/// [`GenericPciBusDevice`] for PCIe hotplug devices.
///
/// Holding only a `Weak` reference means the bus does not keep the device
/// alive; once every strong `Arc` is dropped, config-space accesses through
/// this wrapper resolve to `None`.
struct WeakMutexPciBusDevice(
    std::sync::Weak<closeable_mutex::CloseableMutex<dyn chipset_device::ChipsetDevice>>,
);
3362
3363impl pci_bus::GenericPciBusDevice for WeakMutexPciBusDevice {
3364    fn pci_cfg_read(
3365        &mut self,
3366        offset: u16,
3367        value: &mut u32,
3368    ) -> Option<chipset_device::io::IoResult> {
3369        Some(
3370            self.0
3371                .upgrade()?
3372                .lock()
3373                .supports_pci()?
3374                .pci_cfg_read(offset, value),
3375        )
3376    }
3377
3378    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> Option<chipset_device::io::IoResult> {
3379        Some(
3380            self.0
3381                .upgrade()?
3382                .lock()
3383                .supports_pci()?
3384                .pci_cfg_write(offset, value),
3385        )
3386    }
3387
3388    fn pci_cfg_read_with_routing(
3389        &mut self,
3390        secondary_bus: u8,
3391        target_bus: u8,
3392        function: u8,
3393        offset: u16,
3394        value: &mut u32,
3395    ) -> Option<chipset_device::io::IoResult> {
3396        Some(
3397            self.0
3398                .upgrade()?
3399                .lock()
3400                .supports_pci()?
3401                .pci_cfg_read_with_routing(secondary_bus, target_bus, function, offset, value),
3402        )
3403    }
3404
3405    fn pci_cfg_write_with_routing(
3406        &mut self,
3407        secondary_bus: u8,
3408        target_bus: u8,
3409        function: u8,
3410        offset: u16,
3411        value: u32,
3412    ) -> Option<chipset_device::io::IoResult> {
3413        Some(
3414            self.0
3415                .upgrade()?
3416                .lock()
3417                .supports_pci()?
3418                .pci_cfg_write_with_routing(secondary_bus, target_bus, function, offset, value),
3419        )
3420    }
3421}