// openvmm_entry/lib.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! This module implements the interactive control process and the entry point
5//! for the worker process.
6
7#![expect(missing_docs)]
8#![forbid(unsafe_code)]
9
10mod cli_args;
11mod crash_dump;
12mod kvp;
13mod meshworker;
14mod repl;
15mod serial_io;
16mod storage_builder;
17mod tracing_init;
18mod ttrpc;
19mod vm_controller;
20
21// `pub` so that the missing_docs warning fires for options without
22// documentation.
23pub use cli_args::Options;
24use console_relay::ConsoleLaunchOptions;
25
26use crate::cli_args::SecureBootTemplateCli;
27use anyhow::Context;
28use anyhow::bail;
29use chipset_resources::battery::HostBatteryUpdate;
30use clap::Parser;
31use cli_args::DiskCliKind;
32use cli_args::EfiDiagnosticsLogLevelCli;
33use cli_args::EndpointConfigCli;
34use cli_args::NicConfigCli;
35use cli_args::ProvisionVmgs;
36use cli_args::SerialConfigCli;
37use cli_args::UefiConsoleModeCli;
38use cli_args::VirtioBusCli;
39use cli_args::VmgsCli;
40use crash_dump::spawn_dump_handler;
41use disk_backend_resources::DelayDiskHandle;
42use disk_backend_resources::DiskLayerDescription;
43use disk_backend_resources::layer::DiskLayerHandle;
44use disk_backend_resources::layer::RamDiskLayerHandle;
45use disk_backend_resources::layer::SqliteAutoCacheDiskLayerHandle;
46use disk_backend_resources::layer::SqliteDiskLayerHandle;
47use floppy_resources::FloppyDiskConfig;
48use framebuffer::FRAMEBUFFER_SIZE;
49use framebuffer::FramebufferAccess;
50use futures::AsyncReadExt;
51use futures::AsyncWrite;
52use futures::StreamExt;
53use futures::executor::block_on;
54use futures::io::AllowStdIo;
55use gdma_resources::GdmaDeviceHandle;
56use gdma_resources::VportDefinition;
57use guid::Guid;
58use input_core::MultiplexedInputHandle;
59use inspect::InspectMut;
60use io::Read;
61use memory_range::MemoryRange;
62use mesh::CancelContext;
63use mesh::CellUpdater;
64use mesh::rpc::RpcSend;
65use meshworker::VmmMesh;
66use net_backend_resources::mac_address::MacAddress;
67use nvme_resources::NvmeControllerRequest;
68use openvmm_defs::config::Config;
69use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64;
70use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2;
71use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86;
72use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2;
73use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER;
74use openvmm_defs::config::DeviceVtl;
75use openvmm_defs::config::EfiDiagnosticsLogLevelType;
76use openvmm_defs::config::HypervisorConfig;
77use openvmm_defs::config::LateMapVtl0MemoryPolicy;
78use openvmm_defs::config::LoadMode;
79use openvmm_defs::config::MemoryConfig;
80use openvmm_defs::config::PcieDeviceConfig;
81use openvmm_defs::config::PcieRootComplexConfig;
82use openvmm_defs::config::PcieRootPortConfig;
83use openvmm_defs::config::PcieSwitchConfig;
84use openvmm_defs::config::ProcessorTopologyConfig;
85use openvmm_defs::config::SerialInformation;
86use openvmm_defs::config::VirtioBus;
87use openvmm_defs::config::VmbusConfig;
88use openvmm_defs::config::VpciDeviceConfig;
89use openvmm_defs::config::Vtl2BaseAddressType;
90use openvmm_defs::config::Vtl2Config;
91use openvmm_defs::rpc::VmRpc;
92use openvmm_defs::worker::VM_WORKER;
93use openvmm_defs::worker::VmWorkerParameters;
94use openvmm_helpers::disk::OpenDiskOptions;
95use openvmm_helpers::disk::create_disk_type;
96use openvmm_helpers::disk::open_disk_type;
97use pal_async::DefaultDriver;
98use pal_async::DefaultPool;
99use pal_async::socket::PolledSocket;
100use pal_async::task::Spawn;
101use pal_async::task::Task;
102use serial_16550_resources::ComPort;
103use serial_core::resources::DisconnectedSerialBackendHandle;
104use sparse_mmap::alloc_shared_memory;
105use std::cell::RefCell;
106use std::collections::BTreeMap;
107use std::fmt::Write as _;
108use std::future::pending;
109use std::io;
110#[cfg(unix)]
111use std::io::IsTerminal;
112use std::io::Write;
113use std::net::TcpListener;
114use std::path::Path;
115use std::path::PathBuf;
116use std::sync::Arc;
117use std::thread;
118use std::time::Duration;
119use storvsp_resources::ScsiControllerRequest;
120use tpm_resources::TpmDeviceHandle;
121use tpm_resources::TpmRegisterLayout;
122use uidevices_resources::SynthKeyboardHandle;
123use uidevices_resources::SynthMouseHandle;
124use uidevices_resources::SynthVideoHandle;
125use video_core::SharedFramebufferHandle;
126use virtio_resources::VirtioPciDeviceHandle;
127use vm_manifest_builder::BaseChipsetType;
128use vm_manifest_builder::MachineArch;
129use vm_manifest_builder::VmChipsetResult;
130use vm_manifest_builder::VmManifestBuilder;
131use vm_resource::IntoResource;
132use vm_resource::Resource;
133use vm_resource::kind::DiskHandleKind;
134use vm_resource::kind::DiskLayerHandleKind;
135use vm_resource::kind::NetEndpointHandleKind;
136use vm_resource::kind::VirtioDeviceHandle;
137use vm_resource::kind::VmbusDeviceHandleKind;
138use vmbus_serial_resources::VmbusSerialDeviceHandle;
139use vmbus_serial_resources::VmbusSerialPort;
140use vmcore::non_volatile_store::resources::EphemeralNonVolatileStoreHandle;
141use vmgs_resources::GuestStateEncryptionPolicy;
142use vmgs_resources::VmgsDisk;
143use vmgs_resources::VmgsFileHandle;
144use vmgs_resources::VmgsResource;
145use vmotherboard::ChipsetDeviceHandle;
146use vnc_worker_defs::VncParameters;
147
148/// RAII guard that removes the pidfile when dropped. Ensures the pidfile is
149/// cleaned up even if [`do_main`] panics.
150struct PidfileGuard(Option<PathBuf>);
151
152impl Drop for PidfileGuard {
153    fn drop(&mut self) {
154        if let Some(path) = &self.0 {
155            let _ = fs_err::remove_file(path);
156        }
157    }
158}
159
160pub fn openvmm_main() {
161    // Save the current state of the terminal so we can restore it back to
162    // normal before exiting.
163    #[cfg(unix)]
164    let orig_termios = io::stderr().is_terminal().then(term::get_termios);
165
166    let mut pidfile_guard = PidfileGuard(None);
167    let exit_code = match do_main(&mut pidfile_guard.0) {
168        Ok(_) => 0,
169        Err(err) => {
170            eprintln!("fatal error: {:?}", err);
171            1
172        }
173    };
174
175    // Restore the terminal to its initial state.
176    #[cfg(unix)]
177    if let Some(orig_termios) = orig_termios {
178        term::set_termios(orig_termios);
179    }
180
181    // Clean up the pidfile before terminating, since pal::process::terminate
182    // skips destructors.
183    drop(pidfile_guard);
184
185    // Terminate the process immediately without graceful shutdown of DLLs or
186    // C++ destructors or anything like that. This is all unnecessary and saves
187    // time on Windows.
188    //
189    // Do flush stdout, though, since there may be buffered data.
190    let _ = io::stdout().flush();
191    pal::process::terminate(exit_code);
192}
193
/// Host-side handles created while building the VM configuration that must be
/// kept alive alongside the running VM (console plumbing, IC channels, etc.).
#[derive(Default)]
struct VmResources {
    /// Write half of the interactive console serial pair, used to forward
    /// host console input into the guest.
    console_in: Option<Box<dyn AsyncWrite + Send + Unpin>>,
    /// Access to the shared framebuffer; populated when graphics/VNC/PCAT
    /// options request a framebuffer.
    framebuffer_access: Option<FramebufferAccess>,
    /// Channel to the shutdown integration component.
    shutdown_ic: Option<mesh::Sender<hyperv_ic_resources::shutdown::ShutdownRpc>>,
    /// Channel to the key-value pair (KVP) integration component.
    kvp_ic: Option<mesh::Sender<hyperv_ic_resources::kvp::KvpConnectRpc>>,
    /// Channel for sending requests to the SCSI controller.
    scsi_rpc: Option<mesh::Sender<ScsiControllerRequest>>,
    /// Channel for sending requests to the VTL2 NVMe controller.
    nvme_vtl2_rpc: Option<mesh::Sender<NvmeControllerRequest>>,
    /// Channel for guest emulation device (GED) requests.
    ged_rpc: Option<mesh::Sender<get_resources::ged::GuestEmulationRequest>>,
    /// VTL2 settings, when configured for an OpenHCL (VTL2) VM.
    vtl2_settings: Option<vtl2_settings_proto::Vtl2Settings>,
    /// Kernel vmswitch ports, kept alive here for kernel-mode vmNICs.
    #[cfg(windows)]
    switch_ports: Vec<vmswitch::kernel::SwitchPort>,
}
207
/// Tracks which serial device has claimed the interactive console, along with
/// the writer used to forward host console input to it. Only one device may
/// own the console at a time.
struct ConsoleState<'a> {
    /// Name of the guest device bound to the console (e.g. "ttyS0"), used in
    /// diagnostics when a second device tries to claim the console.
    device: &'a str,
    /// Write half of the console serial pair; host input is sent here.
    input: Box<dyn AsyncWrite + Unpin + Send>,
}
212
213/// Build a flat list of switches with their parent port assignments.
214///
215/// This function converts hierarchical CLI switch definitions into a flat list
216/// where each switch specifies its parent port directly.
217fn build_switch_list(all_switches: &[cli_args::GenericPcieSwitchCli]) -> Vec<PcieSwitchConfig> {
218    all_switches
219        .iter()
220        .map(|switch_cli| PcieSwitchConfig {
221            name: switch_cli.name.clone(),
222            num_downstream_ports: switch_cli.num_downstream_ports,
223            parent_port: switch_cli.port_name.clone(),
224            hotplug: switch_cli.hotplug,
225            acs_capabilities_supported: switch_cli.acs_capabilities_supported,
226        })
227        .collect()
228}
229
230async fn vm_config_from_command_line(
231    spawner: impl Spawn,
232    mesh: &VmmMesh,
233    opt: &Options,
234) -> anyhow::Result<(Config, VmResources)> {
235    let (_, serial_driver) = DefaultPool::spawn_on_thread("serial");
236    // Ensure the serial driver stays alive with no tasks.
237    serial_driver.spawn("leak", pending::<()>()).detach();
238
239    let openhcl_vtl = if opt.vtl2 {
240        DeviceVtl::Vtl2
241    } else {
242        DeviceVtl::Vtl0
243    };
244
245    let console_state: RefCell<Option<ConsoleState<'_>>> = RefCell::new(None);
246    let setup_serial = |name: &str, cli_cfg, device| -> anyhow::Result<_> {
247        Ok(match cli_cfg {
248            SerialConfigCli::Console => {
249                if let Some(console_state) = console_state.borrow().as_ref() {
250                    bail!("console already set by {}", console_state.device);
251                }
252                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
253                let (serial_read, serial_write) = AsyncReadExt::split(serial);
254                *console_state.borrow_mut() = Some(ConsoleState {
255                    device,
256                    input: Box::new(serial_write),
257                });
258                thread::Builder::new()
259                    .name(name.to_owned())
260                    .spawn(move || {
261                        let _ = block_on(futures::io::copy(
262                            serial_read,
263                            &mut AllowStdIo::new(term::raw_stdout()),
264                        ));
265                    })
266                    .unwrap();
267                Some(config)
268            }
269            SerialConfigCli::Stderr => {
270                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
271                thread::Builder::new()
272                    .name(name.to_owned())
273                    .spawn(move || {
274                        let _ = block_on(futures::io::copy(
275                            serial,
276                            &mut AllowStdIo::new(term::raw_stderr()),
277                        ));
278                    })
279                    .unwrap();
280                Some(config)
281            }
282            SerialConfigCli::File(path) => {
283                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
284                let file = fs_err::File::create(path).context("failed to create file")?;
285
286                thread::Builder::new()
287                    .name(name.to_owned())
288                    .spawn(move || {
289                        let _ = block_on(futures::io::copy(serial, &mut AllowStdIo::new(file)));
290                    })
291                    .unwrap();
292                Some(config)
293            }
294            SerialConfigCli::None => None,
295            SerialConfigCli::Pipe(path) => {
296                Some(serial_io::bind_serial(&path).context("failed to bind serial")?)
297            }
298            SerialConfigCli::Tcp(addr) => {
299                Some(serial_io::bind_tcp_serial(&addr).context("failed to bind serial")?)
300            }
301            SerialConfigCli::NewConsole(app, window_title) => {
302                let path = console_relay::random_console_path();
303                let config =
304                    serial_io::bind_serial(&path).context("failed to bind console serial")?;
305                let window_title =
306                    window_title.unwrap_or_else(|| name.to_uppercase() + " [OpenVMM]");
307
308                console_relay::launch_console(
309                    app.or_else(openvmm_terminal_app).as_deref(),
310                    &path,
311                    ConsoleLaunchOptions {
312                        window_title: Some(window_title),
313                    },
314                )
315                .context("failed to launch console")?;
316
317                Some(config)
318            }
319        })
320    };
321
322    let mut vmbus_devices = Vec::new();
323
324    let serial0_cfg = setup_serial(
325        "com1",
326        opt.com1.clone().unwrap_or(SerialConfigCli::Console),
327        if cfg!(guest_arch = "x86_64") {
328            "ttyS0"
329        } else {
330            "ttyAMA0"
331        },
332    )?;
333    let serial1_cfg = setup_serial(
334        "com2",
335        opt.com2.clone().unwrap_or(SerialConfigCli::None),
336        if cfg!(guest_arch = "x86_64") {
337            "ttyS1"
338        } else {
339            "ttyAMA1"
340        },
341    )?;
342    let serial2_cfg = setup_serial(
343        "com3",
344        opt.com3.clone().unwrap_or(SerialConfigCli::None),
345        if cfg!(guest_arch = "x86_64") {
346            "ttyS2"
347        } else {
348            "ttyAMA2"
349        },
350    )?;
351    let serial3_cfg = setup_serial(
352        "com4",
353        opt.com4.clone().unwrap_or(SerialConfigCli::None),
354        if cfg!(guest_arch = "x86_64") {
355            "ttyS3"
356        } else {
357            "ttyAMA3"
358        },
359    )?;
360    let with_vmbus_com1_serial = if let Some(vmbus_com1_cfg) = setup_serial(
361        "vmbus_com1",
362        opt.vmbus_com1_serial
363            .clone()
364            .unwrap_or(SerialConfigCli::None),
365        "vmbus_com1",
366    )? {
367        vmbus_devices.push((
368            openhcl_vtl,
369            VmbusSerialDeviceHandle {
370                port: VmbusSerialPort::Com1,
371                backend: vmbus_com1_cfg,
372            }
373            .into_resource(),
374        ));
375        true
376    } else {
377        false
378    };
379    let with_vmbus_com2_serial = if let Some(vmbus_com2_cfg) = setup_serial(
380        "vmbus_com2",
381        opt.vmbus_com2_serial
382            .clone()
383            .unwrap_or(SerialConfigCli::None),
384        "vmbus_com2",
385    )? {
386        vmbus_devices.push((
387            openhcl_vtl,
388            VmbusSerialDeviceHandle {
389                port: VmbusSerialPort::Com2,
390                backend: vmbus_com2_cfg,
391            }
392            .into_resource(),
393        ));
394        true
395    } else {
396        false
397    };
398    let debugcon_cfg = setup_serial(
399        "debugcon",
400        opt.debugcon
401            .clone()
402            .map(|cfg| cfg.serial)
403            .unwrap_or(SerialConfigCli::None),
404        "debugcon",
405    )?;
406
407    let virtio_console_backend = if let Some(serial_cfg) = opt.virtio_console.clone() {
408        setup_serial("virtio-console", serial_cfg, "hvc0")?
409    } else {
410        None
411    };
412
413    let mut resources = VmResources::default();
414    let mut console_str = "";
415    if let Some(ConsoleState { device, input }) = console_state.into_inner() {
416        resources.console_in = Some(input);
417        console_str = device;
418    }
419
420    if opt.shared_memory {
421        tracing::warn!("--shared-memory/-M flag has no effect and will be removed");
422    }
423    if opt.deprecated_prefetch {
424        tracing::warn!("--prefetch is deprecated; use --memory prefetch=on");
425    }
426    if opt.deprecated_private_memory {
427        tracing::warn!("--private-memory is deprecated; use --memory shared=off");
428    }
429    if opt.deprecated_thp {
430        tracing::warn!("--thp is deprecated; use --memory shared=off,thp=on");
431    }
432    if opt.deprecated_memory_backing_file.is_some() {
433        tracing::warn!("--memory-backing-file is deprecated; use --memory file=<path>");
434    }
435
436    opt.validate_memory_options()?;
437
438    const MAX_PROCESSOR_COUNT: u32 = 1024;
439
440    if opt.processors == 0 || opt.processors > MAX_PROCESSOR_COUNT {
441        bail!("invalid proc count: {}", opt.processors);
442    }
443
444    // Total SCSI channel count should not exceed the processor count
445    // (at most, one channel per VP).
446    if opt.scsi_sub_channels > (MAX_PROCESSOR_COUNT - 1) as u16 {
447        bail!(
448            "invalid SCSI sub-channel count: requested {}, max {}",
449            opt.scsi_sub_channels,
450            MAX_PROCESSOR_COUNT - 1
451        );
452    }
453
454    let with_get = opt.get || (opt.vtl2 && !opt.no_get);
455
456    let mut storage = storage_builder::StorageBuilder::new(with_get.then_some(openhcl_vtl));
457    for &cli_args::DiskCli {
458        vtl,
459        ref kind,
460        read_only,
461        is_dvd,
462        underhill,
463        ref pcie_port,
464    } in &opt.disk
465    {
466        if pcie_port.is_some() {
467            anyhow::bail!("`--disk` is incompatible with PCIe");
468        }
469
470        storage
471            .add(
472                vtl,
473                underhill,
474                storage_builder::DiskLocation::Scsi(None),
475                kind,
476                is_dvd,
477                read_only,
478            )
479            .await?;
480    }
481
482    for &cli_args::IdeDiskCli {
483        ref kind,
484        read_only,
485        channel,
486        device,
487        is_dvd,
488    } in &opt.ide
489    {
490        storage
491            .add(
492                DeviceVtl::Vtl0,
493                None,
494                storage_builder::DiskLocation::Ide(channel, device),
495                kind,
496                is_dvd,
497                read_only,
498            )
499            .await?;
500    }
501
502    for &cli_args::DiskCli {
503        vtl,
504        ref kind,
505        read_only,
506        is_dvd,
507        underhill,
508        ref pcie_port,
509    } in &opt.nvme
510    {
511        storage
512            .add(
513                vtl,
514                underhill,
515                storage_builder::DiskLocation::Nvme(None, pcie_port.clone()),
516                kind,
517                is_dvd,
518                read_only,
519            )
520            .await?;
521    }
522
523    for &cli_args::DiskCli {
524        vtl,
525        ref kind,
526        read_only,
527        is_dvd,
528        ref underhill,
529        ref pcie_port,
530    } in &opt.virtio_blk
531    {
532        if underhill.is_some() {
533            anyhow::bail!("underhill not supported with virtio-blk");
534        }
535        storage
536            .add(
537                vtl,
538                None,
539                storage_builder::DiskLocation::VirtioBlk(pcie_port.clone()),
540                kind,
541                is_dvd,
542                read_only,
543            )
544            .await?;
545    }
546
547    let mut floppy_disks = Vec::new();
548    for disk in &opt.floppy {
549        let &cli_args::FloppyDiskCli {
550            ref kind,
551            read_only,
552        } = disk;
553        floppy_disks.push(FloppyDiskConfig {
554            disk_type: disk_open(kind, read_only).await?,
555            read_only,
556        });
557    }
558
559    let mut vpci_mana_nics = [(); 3].map(|()| None);
560    let mut pcie_mana_nics = BTreeMap::<String, GdmaDeviceHandle>::new();
561    let mut underhill_nics = Vec::new();
562    let mut vpci_devices = Vec::new();
563
564    let mut nic_index = 0;
565    for cli_cfg in &opt.net {
566        if cli_cfg.pcie_port.is_some() {
567            anyhow::bail!("`--net` does not support PCIe");
568        }
569        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
570        if cli_cfg.underhill {
571            if !opt.no_alias_map {
572                anyhow::bail!("must specify --no-alias-map to offer NICs to VTL2");
573            }
574            let mana = vpci_mana_nics[openhcl_vtl as usize].get_or_insert_with(|| {
575                let vpci_instance_id = Guid::new_random();
576                underhill_nics.push(vtl2_settings_proto::NicDeviceLegacy {
577                    instance_id: vpci_instance_id.to_string(),
578                    subordinate_instance_id: None,
579                    max_sub_channels: None,
580                });
581                (vpci_instance_id, GdmaDeviceHandle { vports: Vec::new() })
582            });
583            mana.1.vports.push(VportDefinition {
584                mac_address: vport.mac_address,
585                endpoint: vport.endpoint,
586            });
587        } else {
588            vmbus_devices.push(vport.into_netvsp_handle());
589        }
590    }
591
592    if opt.nic {
593        let nic_config = parse_endpoint(
594            &NicConfigCli {
595                vtl: DeviceVtl::Vtl0,
596                endpoint: EndpointConfigCli::Consomme {
597                    cidr: None,
598                    host_fwd: Vec::new(),
599                },
600                max_queues: None,
601                underhill: false,
602                pcie_port: None,
603            },
604            &mut nic_index,
605            &mut resources,
606        )?;
607        vmbus_devices.push(nic_config.into_netvsp_handle());
608    }
609
610    // Build initial PCIe devices list from CLI options. Storage devices
611    // (e.g., NVMe controllers on PCIe ports) are added later by storage_builder.
612    let mut pcie_devices = Vec::new();
613    for (index, cli_cfg) in opt.pcie_remote.iter().enumerate() {
614        tracing::info!(
615            port_name = %cli_cfg.port_name,
616            socket_addr = ?cli_cfg.socket_addr,
617            "instantiating PCIe remote device"
618        );
619
620        // Generate a deterministic instance ID based on index
621        const PCIE_REMOTE_BASE_INSTANCE_ID: Guid =
622            guid::guid!("28ed784d-c059-429f-9d9a-46bea02562c0");
623        let instance_id = Guid {
624            data1: index as u32,
625            ..PCIE_REMOTE_BASE_INSTANCE_ID
626        };
627
628        pcie_devices.push(PcieDeviceConfig {
629            port_name: cli_cfg.port_name.clone(),
630            resource: pcie_remote_resources::PcieRemoteHandle {
631                instance_id,
632                socket_addr: cli_cfg.socket_addr.clone(),
633                hu: cli_cfg.hu,
634                controller: cli_cfg.controller,
635            }
636            .into_resource(),
637        });
638    }
639
640    #[cfg(windows)]
641    let mut kernel_vmnics = Vec::new();
642    #[cfg(windows)]
643    for (index, switch_id) in opt.kernel_vmnic.iter().enumerate() {
644        // Pick a random MAC address.
645        let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
646        getrandom::fill(&mut mac_address[3..]).expect("rng failure");
647
648        // Pick a fixed instance ID based on the index.
649        const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-435d-11ee-9f59-00155d5016fc");
650        let instance_id = Guid {
651            data1: index as u32,
652            ..BASE_INSTANCE_ID
653        };
654
655        let switch_id = if switch_id == "default" {
656            DEFAULT_SWITCH
657        } else {
658            switch_id
659        };
660        let (port_id, port) = new_switch_port(switch_id)?;
661        resources.switch_ports.push(port);
662
663        kernel_vmnics.push(openvmm_defs::config::KernelVmNicConfig {
664            instance_id,
665            mac_address: mac_address.into(),
666            switch_port_id: port_id,
667        });
668    }
669
670    for vport in &opt.mana {
671        let vport = parse_endpoint(vport, &mut nic_index, &mut resources)?;
672        let vport_array = match (vport.vtl as usize, vport.pcie_port) {
673            (vtl, None) => {
674                &mut vpci_mana_nics[vtl]
675                    .get_or_insert_with(|| {
676                        (Guid::new_random(), GdmaDeviceHandle { vports: Vec::new() })
677                    })
678                    .1
679                    .vports
680            }
681            (0, Some(pcie_port)) => {
682                &mut pcie_mana_nics
683                    .entry(pcie_port)
684                    .or_insert(GdmaDeviceHandle { vports: Vec::new() })
685                    .vports
686            }
687            _ => anyhow::bail!("PCIe NICs only supported to VTL0"),
688        };
689        vport_array.push(VportDefinition {
690            mac_address: vport.mac_address,
691            endpoint: vport.endpoint,
692        });
693    }
694
695    vpci_devices.extend(
696        vpci_mana_nics
697            .into_iter()
698            .enumerate()
699            .filter_map(|(vtl, nic)| {
700                nic.map(|(instance_id, handle)| VpciDeviceConfig {
701                    vtl: match vtl {
702                        0 => DeviceVtl::Vtl0,
703                        1 => DeviceVtl::Vtl1,
704                        2 => DeviceVtl::Vtl2,
705                        _ => unreachable!(),
706                    },
707                    instance_id,
708                    resource: handle.into_resource(),
709                })
710            }),
711    );
712
713    pcie_devices.extend(
714        pcie_mana_nics
715            .into_iter()
716            .map(|(pcie_port, handle)| PcieDeviceConfig {
717                port_name: pcie_port,
718                resource: handle.into_resource(),
719            }),
720    );
721
722    // If VTL2 is enabled, and we are not in VTL2 self allocate mode, provide an
723    // mmio gap for VTL2.
724    let use_vtl2_gap = opt.vtl2
725        && !matches!(
726            opt.igvm_vtl2_relocation_type,
727            Vtl2BaseAddressType::Vtl2Allocate { .. },
728        );
729
730    #[cfg(guest_arch = "aarch64")]
731    let arch = MachineArch::Aarch64;
732    #[cfg(guest_arch = "x86_64")]
733    let arch = MachineArch::X86_64;
734
735    let mmio_gaps: Vec<MemoryRange> = match (use_vtl2_gap, arch) {
736        (true, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(),
737        (true, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(),
738        (false, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86.into(),
739        (false, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64.into(),
740    };
741
742    let mut pci_ecam_gaps = Vec::new();
743    let mut pci_mmio_gaps = Vec::new();
744
745    let mut low_mmio_start = mmio_gaps.first().context("expected mmio gap")?.start();
746    let mut high_mmio_end = mmio_gaps.last().context("expected second mmio gap")?.end();
747
748    let mut pcie_root_complexes = Vec::new();
749    for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() {
750        let ports = opt
751            .pcie_root_port
752            .iter()
753            .filter(|port_cli| port_cli.root_complex_name == rc_cli.name)
754            .map(|port_cli| PcieRootPortConfig {
755                name: port_cli.name.clone(),
756                hotplug: port_cli.hotplug,
757                acs_capabilities_supported: port_cli.acs_capabilities_supported,
758            })
759            .collect();
760
761        const ONE_MB: u64 = 1024 * 1024;
762        let low_mmio_size = (rc_cli.low_mmio as u64).next_multiple_of(ONE_MB);
763        let high_mmio_size = rc_cli
764            .high_mmio
765            .checked_next_multiple_of(ONE_MB)
766            .context("high mmio rounding error")?;
767        let ecam_size = (((rc_cli.end_bus - rc_cli.start_bus) as u64) + 1) * 256 * 4096;
768
769        let low_pci_mmio_start = low_mmio_start
770            .checked_sub(low_mmio_size)
771            .context("pci low mmio underflow")?;
772        let ecam_start = low_pci_mmio_start
773            .checked_sub(ecam_size)
774            .context("pci ecam underflow")?;
775        low_mmio_start = ecam_start;
776        high_mmio_end = high_mmio_end
777            .checked_add(high_mmio_size)
778            .context("pci high mmio overflow")?;
779
780        let ecam_range = MemoryRange::new(ecam_start..ecam_start + ecam_size);
781        let low_mmio = MemoryRange::new(low_pci_mmio_start..low_pci_mmio_start + low_mmio_size);
782        let high_mmio = MemoryRange::new(high_mmio_end - high_mmio_size..high_mmio_end);
783
784        pci_ecam_gaps.push(ecam_range);
785        pci_mmio_gaps.push(low_mmio);
786        pci_mmio_gaps.push(high_mmio);
787
788        pcie_root_complexes.push(PcieRootComplexConfig {
789            index: i as u32,
790            name: rc_cli.name.clone(),
791            segment: rc_cli.segment,
792            start_bus: rc_cli.start_bus,
793            end_bus: rc_cli.end_bus,
794            ecam_range,
795            low_mmio,
796            high_mmio,
797            ports,
798        });
799    }
800
801    pci_ecam_gaps.sort();
802    pci_mmio_gaps.sort();
803
804    let pcie_switches = build_switch_list(&opt.pcie_switch);
805
806    #[cfg(target_os = "linux")]
807    let vfio_pcie_devices: Vec<PcieDeviceConfig> = opt
808        .vfio
809        .iter()
810        .map(|cli_cfg| {
811            use vm_resource::IntoResource;
812
813            let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id);
814            let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group"))
815                .with_context(|| format!("failed to read IOMMU group for {}", cli_cfg.pci_id))?;
816            let group_id: u64 = iommu_group_link
817                .file_name()
818                .and_then(|s| s.to_str())
819                .context("invalid iommu_group symlink")?
820                .parse()
821                .context("failed to parse IOMMU group ID")?;
822            let group = std::fs::OpenOptions::new()
823                .read(true)
824                .write(true)
825                .open(format!("/dev/vfio/{group_id}"))
826                .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?;
827
828            Ok(PcieDeviceConfig {
829                port_name: cli_cfg.port_name.clone(),
830                resource: vfio_assigned_device_resources::VfioDeviceHandle {
831                    pci_id: cli_cfg.pci_id.clone(),
832                    group,
833                }
834                .into_resource(),
835            })
836        })
837        .collect::<anyhow::Result<Vec<_>>>()?;
838
839    #[cfg(windows)]
840    let vpci_resources: Vec<_> = opt
841        .device
842        .iter()
843        .map(|path| -> anyhow::Result<_> {
844            Ok(virt_whp::device::DeviceHandle(
845                whp::VpciResource::new(
846                    None,
847                    Default::default(),
848                    &whp::VpciResourceDescriptor::Sriov(path, 0, 0),
849                )
850                .with_context(|| format!("opening PCI device {}", path))?,
851            ))
852        })
853        .collect::<Result<_, _>>()?;
854
855    // Create a vmbusproxy handle if needed by any devices.
856    #[cfg(windows)]
857    let vmbusproxy_handle = if !kernel_vmnics.is_empty() {
858        Some(vmbus_proxy::ProxyHandle::new().context("failed to open vmbusproxy handle")?)
859    } else {
860        None
861    };
862
863    let framebuffer = if opt.gfx || opt.vtl2_gfx || opt.vnc || opt.pcat {
864        let vram = alloc_shared_memory(FRAMEBUFFER_SIZE, "vram")?;
865        let (fb, fba) =
866            framebuffer::framebuffer(vram, FRAMEBUFFER_SIZE, 0).context("creating framebuffer")?;
867        resources.framebuffer_access = Some(fba);
868        Some(fb)
869    } else {
870        None
871    };
872
    // Assigned by exactly one arm of the firmware selection chain below.
    let load_mode;
    let with_hv;

    let any_serial_configured = serial0_cfg.is_some()
        || serial1_cfg.is_some()
        || serial2_cfg.is_some()
        || serial3_cfg.is_some();

    // serial2 corresponds to COM3; its presence feeds the IGVM serial config below.
    let has_com3 = serial2_cfg.is_some();
882
    // Pick the base chipset from the firmware options. The priority order here
    // (igvm > pcat > uefi > hv linux direct > unenlightened linux direct)
    // mirrors the load-mode selection further below.
    let mut chipset = VmManifestBuilder::new(
        if opt.igvm.is_some() {
            BaseChipsetType::HclHost
        } else if opt.pcat {
            BaseChipsetType::HypervGen1
        } else if opt.uefi {
            BaseChipsetType::HypervGen2Uefi
        } else if opt.hv {
            BaseChipsetType::HyperVGen2LinuxDirect
        } else {
            BaseChipsetType::UnenlightenedLinuxDirect
        },
        arch,
    );
897
    // Optional chipset features, each driven by CLI flags.
    if framebuffer.is_some() {
        chipset = chipset.with_framebuffer();
    }
    if opt.guest_watchdog {
        chipset = chipset.with_guest_watchdog();
    }
    if any_serial_configured {
        chipset = chipset.with_serial([serial0_cfg, serial1_cfg, serial2_cfg, serial3_cfg]);
    }
    if opt.battery {
        // Seed the channel with a "battery present" update so the guest sees a
        // battery from the start.
        let (tx, rx) = mesh::channel();
        tx.send(HostBatteryUpdate::default_present());
        chipset = chipset.with_battery(rx);
    }
    if let Some(cfg) = &opt.debugcon {
        // Fall back to a disconnected serial backend if none was configured.
        chipset = chipset.with_debugcon(
            debugcon_cfg.unwrap_or_else(|| DisconnectedSerialBackendHandle.into_resource()),
            cfg.port,
        );
    }
918
    // TODO: load from VMGS file if it exists
    let bios_guid = Guid::new_random();

    // Finalize the manifest into concrete chipset/device/capability lists.
    let VmChipsetResult {
        chipset,
        mut chipset_devices,
        pci_chipset_devices,
        capabilities,
    } = chipset
        .build()
        .context("failed to build chipset configuration")?;
930
    // Select the firmware load mode, and whether hypervisor enlightenments are
    // enabled, based on the boot options. Exactly one arm runs; each arm
    // assigns both `load_mode` and `with_hv`.
    if opt.restore_snapshot.is_some() {
        // Snapshot restore: skip firmware loading entirely. Device state and
        // memory come from the snapshot directory.
        load_mode = LoadMode::None;
        with_hv = true;
    } else if let Some(path) = &opt.igvm {
        // IGVM boot: load the IGVM image with the joined command line.
        let file = fs_err::File::open(path)
            .context("failed to open igvm file")?
            .into();
        let cmdline = opt.cmdline.join(" ");
        with_hv = true;

        load_mode = LoadMode::Igvm {
            file,
            cmdline,
            vtl2_base_address: opt.igvm_vtl2_relocation_type,
            // If COM3 is configured, pass its port/irq so the loaded image can
            // use it.
            com_serial: has_com3.then(|| SerialInformation {
                io_port: ComPort::Com3.io_port(),
                irq: ComPort::Com3.irq().into(),
            }),
        };
    } else if opt.pcat {
        // Emit a nice error early instead of complaining about missing firmware.
        if arch != MachineArch::X86_64 {
            anyhow::bail!("pcat not supported on this architecture");
        }
        with_hv = true;

        let firmware = openvmm_pcat_locator::find_pcat_bios(opt.pcat_firmware.as_deref())?;
        load_mode = LoadMode::Pcat {
            firmware,
            boot_order: opt
                .pcat_boot_order
                .map(|x| x.0)
                .unwrap_or(DEFAULT_PCAT_BOOT_ORDER),
        };
    } else if opt.uefi {
        use openvmm_defs::config::UefiConsoleMode;

        with_hv = true;

        // UEFI firmware must be supplied explicitly; there is no locator here.
        let firmware = fs_err::File::open(
            (opt.uefi_firmware.0)
                .as_ref()
                .context("must provide uefi firmware when booting with uefi")?,
        )
        .context("failed to open uefi firmware")?;

        // TODO: It would be better to default memory protections to on, but currently Linux does not boot via UEFI due to what
        //       appears to be a GRUB memory protection fault. Memory protections are therefore only enabled if configured.
        load_mode = LoadMode::Uefi {
            firmware: firmware.into(),
            enable_debugging: opt.uefi_debug,
            enable_memory_protections: opt.uefi_enable_memory_protections,
            disable_frontpage: opt.disable_frontpage,
            enable_tpm: opt.tpm,
            enable_battery: opt.battery,
            enable_serial: any_serial_configured,
            enable_vpci_boot: false,
            // Map the CLI console mode onto the config-type enum.
            uefi_console_mode: opt.uefi_console_mode.map(|m| match m {
                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
                UefiConsoleModeCli::Com1 => UefiConsoleMode::Com1,
                UefiConsoleModeCli::Com2 => UefiConsoleMode::Com2,
                UefiConsoleModeCli::None => UefiConsoleMode::None,
            }),
            default_boot_always_attempt: opt.default_boot_always_attempt,
            bios_guid,
        };
    } else {
        // Linux Direct
        let mut cmdline = "panic=-1 debug".to_string();

        with_hv = opt.hv;
        // With no PCIe root complexes there is no PCI bus to probe.
        if with_hv && opt.pcie_root_complex.is_empty() {
            cmdline += " pci=off";
        }

        if !console_str.is_empty() {
            let _ = write!(&mut cmdline, " console={}", console_str);
        }

        if opt.gfx {
            cmdline += " console=tty";
        }
        // Append any user-supplied command line fragments.
        for extra in &opt.cmdline {
            let _ = write!(&mut cmdline, " {}", extra);
        }

        let kernel = fs_err::File::open(
            (opt.kernel.0)
                .as_ref()
                .context("must provide kernel when booting with linux direct")?,
        )
        .context("failed to open kernel")?;
        let initrd = (opt.initrd.0)
            .as_ref()
            .map(fs_err::File::open)
            .transpose()
            .context("failed to open initrd")?;

        // Optionally substitute a user-supplied DSDT ACPI table.
        let custom_dsdt = match &opt.custom_dsdt {
            Some(path) => {
                let mut v = Vec::new();
                fs_err::File::open(path)
                    .context("failed to open custom dsdt")?
                    .read_to_end(&mut v)
                    .context("failed to read custom dsdt")?;
                Some(v)
            }
            None => None,
        };

        load_mode = LoadMode::Linux {
            kernel: kernel.into(),
            initrd: initrd.map(Into::into),
            cmdline,
            custom_dsdt,
            enable_serial: any_serial_configured,
            boot_mode: if opt.device_tree {
                openvmm_defs::config::LinuxDirectBootMode::DeviceTree
            } else {
                openvmm_defs::config::LinuxDirectBootMode::Acpi
            },
        };
    }
1056
    // Open the guest state (VMGS) store if configured, otherwise fall back to
    // an ephemeral store. Wrapped in `Some` so ownership can be `take()`n by
    // the GED below when VTL2 hosts guest state.
    let mut vmgs = Some(if let Some(VmgsCli { kind, provision }) = &opt.vmgs {
        let disk = VmgsDisk {
            disk: disk_open(kind, false)
                .await
                .context("failed to open vmgs disk")?,
            // GSP-by-id encryption is only enabled for testing; default policy
            // is unencrypted.
            encryption_policy: if opt.test_gsp_by_id {
                GuestStateEncryptionPolicy::GspById(true)
            } else {
                GuestStateEncryptionPolicy::None(true)
            },
        };
        // Map the CLI provisioning mode onto the VMGS resource variant.
        match provision {
            ProvisionVmgs::OnEmpty => VmgsResource::Disk(disk),
            ProvisionVmgs::OnFailure => VmgsResource::ReprovisionOnFailure(disk),
            ProvisionVmgs::True => VmgsResource::Reprovision(disk),
        }
    } else {
        VmgsResource::Ephemeral
    });
1076
    // When running with a guest emulation transport and a hypervisor, wire up
    // the VTL2 guest emulation log (GEL) and guest emulation device (GED)
    // vmbus channels, seeded with the initial VTL2 settings.
    if with_get && with_hv {
        let vtl2_settings = vtl2_settings_proto::Vtl2Settings {
            version: vtl2_settings_proto::vtl2_settings_base::Version::V1.into(),
            fixed: Some(Default::default()),
            dynamic: Some(vtl2_settings_proto::Vtl2SettingsDynamic {
                storage_controllers: storage.build_underhill(opt.vmbus_redirect),
                nic_devices: underhill_nics,
            }),
            namespace_settings: Vec::default(),
        };

        // Cache the VTL2 settings for later modification via the interactive console.
        resources.vtl2_settings = Some(vtl2_settings.clone());

        let (send, guest_request_recv) = mesh::channel();
        resources.ged_rpc = Some(send);

        // The GED takes ownership of the guest state (VMGS) resource.
        let vmgs = vmgs.take().unwrap();

        vmbus_devices.extend([
            (
                openhcl_vtl,
                get_resources::gel::GuestEmulationLogHandle.into_resource(),
            ),
            (
                openhcl_vtl,
                get_resources::ged::GuestEmulationDeviceHandle {
                    // Describe the firmware configuration (PCAT vs UEFI) to the
                    // emulation device.
                    firmware: if opt.pcat {
                        get_resources::ged::GuestFirmwareConfig::Pcat {
                            boot_order: opt
                                .pcat_boot_order
                                .map_or(DEFAULT_PCAT_BOOT_ORDER, |x| x.0)
                                .map(|x| match x {
                                    openvmm_defs::config::PcatBootDevice::Floppy => {
                                        get_resources::ged::PcatBootDevice::Floppy
                                    }
                                    openvmm_defs::config::PcatBootDevice::HardDrive => {
                                        get_resources::ged::PcatBootDevice::HardDrive
                                    }
                                    openvmm_defs::config::PcatBootDevice::Optical => {
                                        get_resources::ged::PcatBootDevice::Optical
                                    }
                                    openvmm_defs::config::PcatBootDevice::Network => {
                                        get_resources::ged::PcatBootDevice::Network
                                    }
                                }),
                        }
                    } else {
                        use get_resources::ged::UefiConsoleMode;

                        get_resources::ged::GuestFirmwareConfig::Uefi {
                            enable_vpci_boot: storage.has_vtl0_nvme(),
                            firmware_debug: opt.uefi_debug,
                            disable_frontpage: opt.disable_frontpage,
                            console_mode: match opt.uefi_console_mode.unwrap_or(UefiConsoleModeCli::Default) {
                                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
                                UefiConsoleModeCli::Com1 => UefiConsoleMode::COM1,
                                UefiConsoleModeCli::Com2 => UefiConsoleMode::COM2,
                                UefiConsoleModeCli::None => UefiConsoleMode::None,
                            },
                            default_boot_always_attempt: opt.default_boot_always_attempt,
                        }
                    },
                    com1: with_vmbus_com1_serial,
                    com2: with_vmbus_com2_serial,
                    serial_tx_only: opt.serial_tx_only,
                    // The settings are sent as an encoded protobuf blob.
                    vtl2_settings: Some(prost::Message::encode_to_vec(&vtl2_settings)),
                    vmbus_redirection: opt.vmbus_redirect,
                    vmgs,
                    framebuffer: opt
                        .vtl2_gfx
                        .then(|| SharedFramebufferHandle.into_resource()),
                    guest_request_recv,
                    enable_tpm: opt.tpm,
                    firmware_event_send: None,
                    secure_boot_enabled: opt.secure_boot,
                    secure_boot_template: match opt.secure_boot_template {
                        Some(SecureBootTemplateCli::Windows) => {
                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftWindows
                        },
                        Some(SecureBootTemplateCli::UefiCa) => {
                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftUefiCertificateAuthority
                        }
                        None => {
                            get_resources::ged::GuestSecureBootTemplateType::None
                        },
                    },
                    enable_battery: opt.battery,
                    no_persistent_secrets: true,
                    igvm_attest_test_config: None,
                    test_gsp_by_id: opt.test_gsp_by_id,
                    efi_diagnostics_log_level: {
                        match opt.efi_diagnostics_log_level.unwrap_or_default() {
                            EfiDiagnosticsLogLevelCli::Default => get_resources::ged::EfiDiagnosticsLogLevelType::Default,
                            EfiDiagnosticsLogLevelCli::Info => get_resources::ged::EfiDiagnosticsLogLevelType::Info,
                            EfiDiagnosticsLogLevelCli::Full => get_resources::ged::EfiDiagnosticsLogLevelType::Full,
                        }
                    },
                    hv_sint_enabled: false,
                }
                .into_resource(),
            ),
        ]);
    }
1181
    // Add a TPM chipset device when requested and not running VTL2 (with VTL2,
    // `enable_tpm` is instead passed to the GED above).
    if opt.tpm && !opt.vtl2 {
        // x86 guests access the TPM via IO ports; other guest arches use MMIO.
        let register_layout = if cfg!(guest_arch = "x86_64") {
            TpmRegisterLayout::IoPort
        } else {
            TpmRegisterLayout::Mmio
        };

        // Persist PPI and NVRAM state in the VMGS when one is configured;
        // otherwise keep them in ephemeral in-memory stores.
        let (ppi_store, nvram_store) = if opt.vmgs.is_some() {
            (
                VmgsFileHandle::new(vmgs_format::FileId::TPM_PPI, true).into_resource(),
                VmgsFileHandle::new(vmgs_format::FileId::TPM_NVRAM, true).into_resource(),
            )
        } else {
            (
                EphemeralNonVolatileStoreHandle.into_resource(),
                EphemeralNonVolatileStoreHandle.into_resource(),
            )
        };

        // The TPM is hosted on its own mesh worker host via the remote chipset
        // device wrapper.
        chipset_devices.push(ChipsetDeviceHandle {
            name: "tpm".to_string(),
            resource: chipset_device_worker_defs::RemoteChipsetDeviceHandle {
                device: TpmDeviceHandle {
                    ppi_store,
                    nvram_store,
                    nvram_size: None,
                    refresh_tpm_seeds: false,
                    ak_cert_type: tpm_resources::TpmAkCertTypeResource::None,
                    register_layout,
                    guest_secret_key: None,
                    logger: None,
                    is_confidential_vm: false,
                    bios_guid,
                }
                .into_resource(),
                worker_host: mesh.make_host("tpm", None).await?,
            }
            .into_resource(),
        });
    }
1222
    // Compute the UEFI variable store contents: a secure-boot template (if
    // one was requested) with an optional JSON-specified delta applied on top.
    let custom_uefi_vars = {
        use firmware_uefi_custom_vars::CustomVars;

        // load base vars from specified template, or use an empty set of base
        // vars if none was specified.
        let base_vars = match opt.secure_boot_template {
            Some(template) => match (arch, template) {
                (MachineArch::X86_64, SecureBootTemplateCli::Windows) => {
                    hyperv_secure_boot_templates::x64::microsoft_windows()
                }
                (MachineArch::X86_64, SecureBootTemplateCli::UefiCa) => {
                    hyperv_secure_boot_templates::x64::microsoft_uefi_ca()
                }
                (MachineArch::Aarch64, SecureBootTemplateCli::Windows) => {
                    hyperv_secure_boot_templates::aarch64::microsoft_windows()
                }
                (MachineArch::Aarch64, SecureBootTemplateCli::UefiCa) => {
                    hyperv_secure_boot_templates::aarch64::microsoft_uefi_ca()
                }
            },
            None => CustomVars::default(),
        };

        // TODO: fallback to VMGS read if no command line flag was given

        let custom_uefi_json_data = match &opt.custom_uefi_json {
            Some(file) => Some(fs_err::read(file).context("opening custom uefi json file")?),
            None => None,
        };

        // obtain the final custom uefi vars by applying the delta onto the base vars
        match custom_uefi_json_data {
            Some(data) => {
                let delta = hyperv_uefi_custom_vars_json::load_delta_from_json(&data)?;
                base_vars.apply_delta(delta)?
            }
            None => base_vars,
        }
    };
1262
    // PCAT boots also need a VGA BIOS image; locate it (optionally from a
    // user-supplied path).
    let vga_firmware = if opt.pcat {
        Some(openvmm_pcat_locator::find_svga_bios(
            opt.vga_firmware.as_deref(),
        )?)
    } else {
        None
    };
1270
    // Synthetic graphics: video plus keyboard/mouse input devices over vmbus
    // on VTL0.
    if opt.gfx {
        vmbus_devices.extend([
            (
                DeviceVtl::Vtl0,
                SynthVideoHandle {
                    framebuffer: SharedFramebufferHandle.into_resource(),
                }
                .into_resource(),
            ),
            (
                DeviceVtl::Vtl0,
                SynthKeyboardHandle {
                    source: MultiplexedInputHandle {
                        // Save 0 for PS/2
                        elevation: 1,
                    }
                    .into_resource(),
                }
                .into_resource(),
            ),
            (
                DeviceVtl::Vtl0,
                SynthMouseHandle {
                    source: MultiplexedInputHandle {
                        // Save 0 for PS/2
                        elevation: 1,
                    }
                    .into_resource(),
                }
                .into_resource(),
            ),
        ]);
    }
1304
1305    let vsock_listener = |path: Option<&str>| -> anyhow::Result<_> {
1306        if let Some(path) = path {
1307            cleanup_socket(path.as_ref());
1308            let listener = unix_socket::UnixListener::bind(path)
1309                .with_context(|| format!("failed to bind to hybrid vsock path: {}", path))?;
1310            Ok(Some(listener))
1311        } else {
1312            Ok(None)
1313        }
1314    };
1315
    // Bind hybrid vsock listeners for VTL0 and (optionally) VTL2.
    let vtl0_vsock_listener = vsock_listener(opt.vmbus_vsock_path.as_deref())?;
    let vtl2_vsock_listener = vsock_listener(opt.vmbus_vtl2_vsock_path.as_deref())?;

    // Route OpenHCL crash dumps to the given path via a dedicated vmbus
    // device; the handler task runs detached for the VM's lifetime.
    if let Some(path) = &opt.openhcl_dump_path {
        let (resource, task) = spawn_dump_handler(&spawner, path.clone(), None);
        task.detach();
        vmbus_devices.push((openhcl_vtl, resource));
    }
1324
    // Architecture-specific processor topology configuration. Exactly one of
    // these definitions is compiled in, depending on the guest architecture.
    #[cfg(guest_arch = "aarch64")]
    let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64(
        openvmm_defs::config::Aarch64TopologyConfig {
            // TODO: allow this to be configured from the command line
            gic_config: None,
            pmu_gsiv: openvmm_defs::config::PmuGsivConfig::Platform,
            gic_msi: match opt.gic_msi {
                cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
                cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
                cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m,
            },
        },
    );
    #[cfg(guest_arch = "x86_64")]
    let topology_arch =
        openvmm_defs::config::ArchTopologyConfig::X86(openvmm_defs::config::X86TopologyConfig {
            apic_id_offset: opt.apic_id_offset,
            x2apic: opt.x2apic,
        });
1344
    // Validate and map the requested isolation type, enforcing current
    // limitations up front with clear errors.
    let with_isolation = if let Some(isolation) = &opt.isolation {
        // TODO: For now, isolation is only supported with VTL2.
        if !opt.vtl2 {
            anyhow::bail!("isolation is only currently supported with vtl2");
        }

        // TODO: Alias map support is not yet implemented with isolation.
        if !opt.no_alias_map {
            anyhow::bail!("alias map not supported with isolation");
        }

        match isolation {
            cli_args::IsolationCli::Vbs => Some(openvmm_defs::config::IsolationType::Vbs),
        }
    } else {
        None
    };
1362
    // Hyper-V integration components (shutdown, KVP, timesync) on VTL0. The
    // shutdown and KVP senders are kept in `resources` so the control process
    // can drive them.
    if with_hv {
        let (shutdown_send, shutdown_recv) = mesh::channel();
        resources.shutdown_ic = Some(shutdown_send);
        let (kvp_send, kvp_recv) = mesh::channel();
        resources.kvp_ic = Some(kvp_send);
        vmbus_devices.extend(
            [
                hyperv_ic_resources::shutdown::ShutdownIcHandle {
                    recv: shutdown_recv,
                }
                .into_resource(),
                hyperv_ic_resources::kvp::KvpIcHandle { recv: kvp_recv }.into_resource(),
                hyperv_ic_resources::timesync::TimesyncIcHandle.into_resource(),
            ]
            .map(|r| (DeviceVtl::Vtl0, r)),
        );
    }

    // Expose the IMC hive file to the guest via a vmbfs device.
    if let Some(hive_path) = &opt.imc {
        let file = fs_err::File::open(hive_path).context("failed to open imc hive")?;
        vmbus_devices.push((
            DeviceVtl::Vtl0,
            vmbfs_resources::VmbfsImcDeviceHandle { file: file.into() }.into_resource(),
        ));
    }
1388
1389    let mut virtio_devices = Vec::new();
1390    let mut add_virtio_device = |bus, resource: Resource<VirtioDeviceHandle>| {
1391        let bus = match bus {
1392            VirtioBusCli::Auto => {
1393                // Use VPCI when possible (currently only on Windows and macOS due
1394                // to KVM backend limitations).
1395                if with_hv && (cfg!(windows) || cfg!(target_os = "macos")) {
1396                    None
1397                } else {
1398                    Some(VirtioBus::Pci)
1399                }
1400            }
1401            VirtioBusCli::Mmio => Some(VirtioBus::Mmio),
1402            VirtioBusCli::Pci => Some(VirtioBus::Pci),
1403            VirtioBusCli::Vpci => None,
1404        };
1405        if let Some(bus) = bus {
1406            virtio_devices.push((bus, resource));
1407        } else {
1408            vpci_devices.push(VpciDeviceConfig {
1409                vtl: DeviceVtl::Vtl0,
1410                instance_id: Guid::new_random(),
1411                resource: VirtioPciDeviceHandle(resource).into_resource(),
1412            });
1413        }
1414    };
1415
    // virtio-net NICs. Underhill-managed NICs are rejected here and must be
    // added via `--net uh:[...]` instead.
    for cli_cfg in &opt.virtio_net {
        if cli_cfg.underhill {
            anyhow::bail!("use --net uh:[...] to add underhill NICs")
        }
        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
        let resource = virtio_resources::net::VirtioNetHandle {
            max_queues: vport.max_queues,
            mac_address: vport.mac_address,
            endpoint: vport.endpoint,
        }
        .into_resource();
        // Attach behind a named PCIe port when requested; otherwise pick a
        // virtio bus automatically.
        if let Some(pcie_port) = &cli_cfg.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }
1436
    // virtio-fs devices backed by a host directory (HostFs backend).
    for args in &opt.virtio_fs {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
            tag: args.tag.clone(),
            fs: virtio_resources::fs::VirtioFsBackend::HostFs {
                root_path: args.path.clone(),
                mount_options: args.options.clone(),
            },
        }
        .into_resource();
        // PCIe port if named, otherwise the configured virtio-fs bus.
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_fs_bus, resource);
        }
    }

    // virtio-fs devices using the SectionFs backend.
    for args in &opt.virtio_fs_shmem {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
            tag: args.tag.clone(),
            fs: virtio_resources::fs::VirtioFsBackend::SectionFs {
                root_path: args.path.clone(),
            },
        }
        .into_resource();
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_fs_bus, resource);
        }
    }
1473
    // virtio-9p file shares.
    for args in &opt.virtio_9p {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::p9::VirtioPlan9Handle {
            tag: args.tag.clone(),
            root_path: args.path.clone(),
            debug: opt.virtio_9p_debug,
        }
        .into_resource();
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }

    // Optional virtio-pmem device backed by a host path.
    if let Some(pmem_args) = &opt.virtio_pmem {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::pmem::VirtioPmemHandle {
            path: pmem_args.path.clone(),
        }
        .into_resource();
        if let Some(pcie_port) = &pmem_args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }
1505
    // virtio-rng entropy device.
    if opt.virtio_rng {
        let resource: Resource<VirtioDeviceHandle> =
            virtio_resources::rng::VirtioRngHandle.into_resource();
        if let Some(pcie_port) = &opt.virtio_rng_pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_rng_bus, resource);
        }
    }

    // virtio-console, if a console backend was configured earlier.
    if let Some(backend) = virtio_console_backend {
        let resource: Resource<VirtioDeviceHandle> =
            virtio_resources::console::VirtioConsoleHandle { backend }.into_resource();
        if let Some(pcie_port) = &opt.virtio_console_pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }
1531
    // Handle --vhost-user arguments.
    #[cfg(target_os = "linux")]
    for vhost_cli in &opt.vhost_user {
        // Connect to the backend's unix socket up front so failures surface at
        // configuration time rather than at device start.
        let stream =
            unix_socket::UnixStream::connect(&vhost_cli.socket_path).with_context(|| {
                format!(
                    "failed to connect to vhost-user socket: {}",
                    vhost_cli.socket_path
                )
            })?;

        use crate::cli_args::VhostUserDeviceTypeCli;
        // Wrap the connected socket in the device handle matching the
        // requested vhost-user device type.
        let resource: Resource<VirtioDeviceHandle> = match vhost_cli.device_type {
            VhostUserDeviceTypeCli::Fs {
                ref tag,
                num_queues,
                queue_size,
            } => virtio_resources::vhost_user::VhostUserFsHandle {
                socket: stream.into(),
                tag: tag.clone(),
                num_queues,
                queue_size,
            }
            .into_resource(),
            VhostUserDeviceTypeCli::Blk {
                num_queues,
                queue_size,
            } => virtio_resources::vhost_user::VhostUserBlkHandle {
                socket: stream.into(),
                num_queues,
                queue_size,
            }
            .into_resource(),
            VhostUserDeviceTypeCli::Other {
                device_id,
                ref queue_sizes,
            } => virtio_resources::vhost_user::VhostUserGenericHandle {
                socket: stream.into(),
                device_id,
                queue_sizes: queue_sizes.clone(),
            }
            .into_resource(),
        };
        if let Some(pcie_port) = &vhost_cli.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }
1584
    // virtio-vsock relayed over a Unix domain socket.
    if let Some(vsock_path) = &opt.virtio_vsock_path {
        // unwrap is safe: `vsock_listener` always returns Some when given
        // Some(path).
        let listener = vsock_listener(Some(vsock_path))?.unwrap();
        add_virtio_device(
            VirtioBusCli::Auto,
            virtio_resources::vsock::VirtioVsockHandle {
                // The guest CID does not matter since the UDS relay does not use it. It just needs
                // to be some non-reserved value for the guest to use.
                guest_cid: 0x3,
                base_path: vsock_path.clone(),
                listener,
            }
            .into_resource(),
        );
    }
1599
1600    let mut cfg = Config {
1601        chipset,
1602        load_mode,
1603        floppy_disks,
1604        pcie_root_complexes,
1605        #[cfg(target_os = "linux")]
1606        pcie_devices: {
1607            let mut devs = pcie_devices;
1608            devs.extend(vfio_pcie_devices);
1609            devs
1610        },
1611        #[cfg(not(target_os = "linux"))]
1612        pcie_devices,
1613        pcie_switches,
1614        vpci_devices,
1615        ide_disks: Vec::new(),
1616        memory: MemoryConfig {
1617            mem_size: if let Some(ref sizes) = opt.numa_memory {
1618                sizes
1619                    .iter()
1620                    .try_fold(0u64, |acc, &s| acc.checked_add(s))
1621                    .context("numa memory sizes overflow")?
1622            } else {
1623                opt.memory_size()
1624            },
1625            mmio_gaps,
1626            prefetch_memory: opt.prefetch_memory(),
1627            private_memory: opt.private_memory(),
1628            transparent_hugepages: opt.transparent_hugepages(),
1629            hugepages: opt.memory.hugepages,
1630            hugepage_size: opt.memory.hugepage_size,
1631            pci_ecam_gaps,
1632            pci_mmio_gaps,
1633            numa_mem_sizes: opt.numa_memory.clone(),
1634        },
1635        processor_topology: ProcessorTopologyConfig {
1636            proc_count: opt.processors,
1637            vps_per_socket: opt.vps_per_socket,
1638            enable_smt: match opt.smt {
1639                cli_args::SmtConfigCli::Auto => None,
1640                cli_args::SmtConfigCli::Force => Some(true),
1641                cli_args::SmtConfigCli::Off => Some(false),
1642            },
1643            arch: Some(topology_arch),
1644        },
1645        hypervisor: HypervisorConfig {
1646            with_hv,
1647            with_vtl2: opt.vtl2.then_some(Vtl2Config {
1648                vtl0_alias_map: !opt.no_alias_map,
1649                late_map_vtl0_memory: match opt.late_map_vtl0_policy {
1650                    cli_args::Vtl0LateMapPolicyCli::Off => None,
1651                    cli_args::Vtl0LateMapPolicyCli::Log => Some(LateMapVtl0MemoryPolicy::Log),
1652                    cli_args::Vtl0LateMapPolicyCli::Halt => Some(LateMapVtl0MemoryPolicy::Halt),
1653                    cli_args::Vtl0LateMapPolicyCli::Exception => {
1654                        Some(LateMapVtl0MemoryPolicy::InjectException)
1655                    }
1656                },
1657            }),
1658            with_isolation,
1659        },
1660        #[cfg(windows)]
1661        kernel_vmnics,
1662        input: mesh::Receiver::new(),
1663        framebuffer,
1664        vga_firmware,
1665        vtl2_gfx: opt.vtl2_gfx,
1666        virtio_devices,
1667        vmbus: with_hv.then_some(VmbusConfig {
1668            vsock_listener: vtl0_vsock_listener,
1669            vsock_path: opt.vmbus_vsock_path.clone(),
1670            vtl2_redirect: opt.vmbus_redirect,
1671            vmbus_max_version: opt.vmbus_max_version,
1672            #[cfg(windows)]
1673            vmbusproxy_handle,
1674        }),
1675        vtl2_vmbus: (with_hv && opt.vtl2).then_some(VmbusConfig {
1676            vsock_listener: vtl2_vsock_listener,
1677            vsock_path: opt.vmbus_vtl2_vsock_path.clone(),
1678            ..Default::default()
1679        }),
1680        vmbus_devices,
1681        chipset_devices,
1682        pci_chipset_devices,
1683        chipset_capabilities: capabilities,
1684        #[cfg(windows)]
1685        vpci_resources,
1686        vmgs,
1687        secure_boot_enabled: opt.secure_boot,
1688        custom_uefi_vars,
1689        firmware_event_send: None,
1690        debugger_rpc: None,
1691        generation_id_recv: None,
1692        rtc_delta_milliseconds: 0,
1693        automatic_guest_reset: !opt.halt_on_reset,
1694        efi_diagnostics_log_level: {
1695            match opt.efi_diagnostics_log_level.unwrap_or_default() {
1696                EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
1697                EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
1698                EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
1699            }
1700        },
1701    };
1702
1703    storage.build_config(&mut cfg, &mut resources, opt.scsi_sub_channels)?;
1704    Ok((cfg, resources))
1705}
1706
/// Gets the terminal to use for externally launched console windows.
///
/// Reads `OPENVMM_TERM`, falling back to the legacy `HVLITE_TERM` variable;
/// returns `None` when neither is set.
pub(crate) fn openvmm_terminal_app() -> Option<PathBuf> {
    let term = std::env::var_os("OPENVMM_TERM");
    let term = term.or_else(|| std::env::var_os("HVLITE_TERM"));
    term.map(PathBuf::from)
}
1713
// Best-effort removal of `path`, but only after confirming it is actually a
// Unix socket, so a regular file at the same location is never deleted.
fn cleanup_socket(path: &Path) {
    #[cfg(windows)]
    let is_socket = pal::windows::fs::is_unix_socket(path).unwrap_or(false);
    #[cfg(not(windows))]
    let is_socket = match path.metadata() {
        Ok(meta) => std::os::unix::fs::FileTypeExt::is_socket(&meta.file_type()),
        // Missing or unreadable path: nothing to clean up.
        Err(_) => false,
    };

    if is_socket {
        // Removal failure is intentionally ignored; this is best-effort.
        let _ = std::fs::remove_file(path);
    }
}
1727
/// Switch ID used for DIO NICs when the command line does not specify one.
/// NOTE(review): presumably the well-known Hyper-V "Default Switch" GUID —
/// confirm against the vmswitch documentation.
#[cfg(windows)]
const DEFAULT_SWITCH: &str = "C08CB7B8-9B3C-408E-8E30-5E16A3AEB444";
1730
#[cfg(windows)]
/// Creates a new kernel vmswitch port on the switch identified by
/// `switch_id`, returning both the config-facing port ID and the live kernel
/// port object (which must be kept alive for the port to persist).
fn new_switch_port(
    switch_id: &str,
) -> anyhow::Result<(
    openvmm_defs::config::SwitchPortId,
    vmswitch::kernel::SwitchPort,
)> {
    let kernel_id = vmswitch::kernel::SwitchPortId {
        switch: switch_id.parse().context("invalid switch id")?,
        port: Guid::new_random(),
    };

    // Probe the switch via HCN first so a missing switch produces a clear
    // error before port creation is attempted.
    let _ = vmswitch::hcn::Network::open(&kernel_id.switch)
        .with_context(|| format!("could not find switch {}", kernel_id.switch))?;

    let port =
        vmswitch::kernel::SwitchPort::new(&kernel_id).context("failed to create switch port")?;

    let config_id = openvmm_defs::config::SwitchPortId {
        switch: kernel_id.switch,
        port: kernel_id.port,
    };
    Ok((config_id, port))
}
1753
/// Builds a [`NicConfig`] from a command-line NIC description, constructing
/// the requested network backend endpoint.
///
/// `index` is a caller-maintained counter used to derive a unique vmbus
/// instance ID per NIC; it is incremented on success.
fn parse_endpoint(
    cli_cfg: &NicConfigCli,
    index: &mut usize,
    resources: &mut VmResources,
) -> anyhow::Result<NicConfig> {
    // `resources` is only used on the Windows DIO path below; this keeps the
    // parameter "used" on every build configuration.
    let _ = resources;
    let endpoint = match &cli_cfg.endpoint {
        EndpointConfigCli::Consomme { cidr, host_fwd } => {
            // Translate CLI host-forwarding rules into consomme's config type.
            let ports = host_fwd
                .iter()
                .map(|fwd| {
                    use net_backend_resources::consomme::HostPortProtocol;
                    net_backend_resources::consomme::HostPortConfig {
                        protocol: match fwd.protocol {
                            cli_args::HostPortProtocolCli::Tcp => HostPortProtocol::Tcp,
                            cli_args::HostPortProtocolCli::Udp => HostPortProtocol::Udp,
                        },
                        host_address: fwd
                            .host_address
                            .map(net_backend_resources::consomme::HostIpAddress::from),
                        host_port: fwd.host_port,
                        guest_port: fwd.guest_port,
                    }
                })
                .collect();
            net_backend_resources::consomme::ConsommeHandle {
                cidr: cidr.clone(),
                ports,
            }
            .into_resource()
        }
        EndpointConfigCli::None => net_backend_resources::null::NullHandle.into_resource(),
        EndpointConfigCli::Dio { id } => {
            #[cfg(windows)]
            {
                // The kernel switch port object is stashed in `resources` so
                // it stays alive for the lifetime of the VM.
                let (port_id, port) = new_switch_port(id.as_deref().unwrap_or(DEFAULT_SWITCH))?;
                resources.switch_ports.push(port);
                net_backend_resources::dio::WindowsDirectIoHandle {
                    switch_port_id: net_backend_resources::dio::SwitchPortId {
                        switch: port_id.switch,
                        port: port_id.port,
                    },
                }
                .into_resource()
            }

            #[cfg(not(windows))]
            {
                let _ = id;
                bail!("cannot use dio on non-windows platforms")
            }
        }
        EndpointConfigCli::Tap { name } => {
            #[cfg(target_os = "linux")]
            {
                let fd = net_tap::tap::open_tap(name)
                    .with_context(|| format!("failed to open TAP device '{name}'"))?;
                net_backend_resources::tap::TapHandle { fd }.into_resource()
            }

            #[cfg(not(target_os = "linux"))]
            {
                let _ = name;
                bail!("TAP backend is only supported on Linux")
            }
        }
    };

    // Pick a random MAC address, keeping the fixed 00:15:5D prefix and
    // randomizing only the low three bytes.
    let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
    getrandom::fill(&mut mac_address[3..]).expect("rng failure");

    // Pick a fixed instance ID based on the index.
    const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-da43-11ed-936a-00155d6db52f");
    let instance_id = Guid {
        data1: *index as u32,
        ..BASE_INSTANCE_ID
    };
    *index += 1;

    Ok(NicConfig {
        vtl: cli_cfg.vtl,
        instance_id,
        endpoint,
        mac_address: mac_address.into(),
        max_queues: cli_cfg.max_queues,
        pcie_port: cli_cfg.pcie_port.clone(),
    })
}
1843
/// A fully-resolved NIC configuration, ready to be turned into a device
/// handle (see [`NicConfig::into_netvsp_handle`]).
#[derive(Debug)]
struct NicConfig {
    /// VTL the NIC is offered to.
    vtl: DeviceVtl,
    /// Vmbus instance ID, derived from the NIC's index in `parse_endpoint`.
    instance_id: Guid,
    mac_address: MacAddress,
    /// The network backend endpoint (consomme, DIO, TAP, or null).
    endpoint: Resource<NetEndpointHandleKind>,
    /// Optional cap on the number of NIC queues.
    max_queues: Option<u16>,
    /// Optional PCIe port identifier — presumably selects which PCIe port the
    /// NIC attaches to; confirm against the config consumer.
    pcie_port: Option<String>,
}
1853
1854impl NicConfig {
1855    fn into_netvsp_handle(self) -> (DeviceVtl, Resource<VmbusDeviceHandleKind>) {
1856        (
1857            self.vtl,
1858            netvsp_resources::NetvspHandle {
1859                instance_id: self.instance_id,
1860                mac_address: self.mac_address,
1861                endpoint: self.endpoint,
1862                max_queues: self.max_queues,
1863            }
1864            .into_resource(),
1865        )
1866    }
1867}
1868
/// An entry produced while resolving a CLI disk description: either a single
/// layer of a layered disk, or a complete disk in its own right.
enum LayerOrDisk {
    /// A disk layer.
    Layer(DiskLayerDescription),
    /// A full disk.
    Disk(Resource<DiskHandleKind>),
}
1873
1874async fn disk_open(
1875    disk_cli: &DiskCliKind,
1876    read_only: bool,
1877) -> anyhow::Result<Resource<DiskHandleKind>> {
1878    let mut layers = Vec::new();
1879    disk_open_inner(disk_cli, read_only, &mut layers).await?;
1880    if layers.len() == 1 && matches!(layers[0], LayerOrDisk::Disk(_)) {
1881        let LayerOrDisk::Disk(disk) = layers.pop().unwrap() else {
1882            unreachable!()
1883        };
1884        Ok(disk)
1885    } else {
1886        Ok(Resource::new(disk_backend_resources::LayeredDiskHandle {
1887            layers: layers
1888                .into_iter()
1889                .map(|layer| match layer {
1890                    LayerOrDisk::Layer(layer) => layer,
1891                    LayerOrDisk::Disk(disk) => DiskLayerDescription {
1892                        layer: DiskLayerHandle(disk).into_resource(),
1893                        read_cache: false,
1894                        write_through: false,
1895                    },
1896                })
1897                .collect(),
1898        }))
1899    }
1900}
1901
1902fn disk_open_inner<'a>(
1903    disk_cli: &'a DiskCliKind,
1904    read_only: bool,
1905    layers: &'a mut Vec<LayerOrDisk>,
1906) -> futures::future::BoxFuture<'a, anyhow::Result<()>> {
1907    Box::pin(async move {
1908        fn layer<T: IntoResource<DiskLayerHandleKind>>(layer: T) -> LayerOrDisk {
1909            LayerOrDisk::Layer(layer.into_resource().into())
1910        }
1911        fn disk<T: IntoResource<DiskHandleKind>>(disk: T) -> LayerOrDisk {
1912            LayerOrDisk::Disk(disk.into_resource())
1913        }
1914        match disk_cli {
1915            &DiskCliKind::Memory(len) => {
1916                layers.push(layer(RamDiskLayerHandle {
1917                    len: Some(len),
1918                    sector_size: None,
1919                }));
1920            }
1921            DiskCliKind::File {
1922                path,
1923                create_with_len,
1924                direct,
1925            } => layers.push(LayerOrDisk::Disk(if let Some(size) = create_with_len {
1926                create_disk_type(
1927                    path,
1928                    *size,
1929                    OpenDiskOptions {
1930                        read_only: false,
1931                        direct: *direct,
1932                    },
1933                )
1934                .with_context(|| format!("failed to create {}", path.display()))?
1935            } else {
1936                open_disk_type(
1937                    path,
1938                    OpenDiskOptions {
1939                        read_only,
1940                        direct: *direct,
1941                    },
1942                )
1943                .await
1944                .with_context(|| format!("failed to open {}", path.display()))?
1945            })),
1946            DiskCliKind::Blob { kind, url } => {
1947                layers.push(disk(disk_backend_resources::BlobDiskHandle {
1948                    url: url.to_owned(),
1949                    format: match kind {
1950                        cli_args::BlobKind::Flat => disk_backend_resources::BlobDiskFormat::Flat,
1951                        cli_args::BlobKind::Vhd1 => {
1952                            disk_backend_resources::BlobDiskFormat::FixedVhd1
1953                        }
1954                    },
1955                }))
1956            }
1957            DiskCliKind::MemoryDiff(inner) => {
1958                layers.push(layer(RamDiskLayerHandle {
1959                    len: None,
1960                    sector_size: None,
1961                }));
1962                disk_open_inner(inner, true, layers).await?;
1963            }
1964            DiskCliKind::PersistentReservationsWrapper(inner) => {
1965                layers.push(disk(disk_backend_resources::DiskWithReservationsHandle(
1966                    disk_open(inner, read_only).await?,
1967                )))
1968            }
1969            DiskCliKind::DelayDiskWrapper {
1970                delay_ms,
1971                disk: inner,
1972            } => layers.push(disk(DelayDiskHandle {
1973                delay: CellUpdater::new(Duration::from_millis(*delay_ms)).cell(),
1974                disk: disk_open(inner, read_only).await?,
1975            })),
1976            DiskCliKind::Crypt {
1977                disk: inner,
1978                cipher,
1979                key_file,
1980            } => layers.push(disk(disk_crypt_resources::DiskCryptHandle {
1981                disk: disk_open(inner, read_only).await?,
1982                cipher: match cipher {
1983                    cli_args::DiskCipher::XtsAes256 => disk_crypt_resources::Cipher::XtsAes256,
1984                },
1985                key: fs_err::read(key_file).context("failed to read key file")?,
1986            })),
1987            DiskCliKind::Sqlite {
1988                path,
1989                create_with_len,
1990            } => {
1991                // FUTURE: this code should be responsible for opening
1992                // file-handle(s) itself, and passing them into sqlite via a custom
1993                // vfs. For now though - simply check if the file exists or not, and
1994                // perform early validation of filesystem-level create options.
1995                match (create_with_len.is_some(), path.exists()) {
1996                    (true, true) => anyhow::bail!(
1997                        "cannot create new sqlite disk at {} - file already exists",
1998                        path.display()
1999                    ),
2000                    (false, false) => anyhow::bail!(
2001                        "cannot open sqlite disk at {} - file not found",
2002                        path.display()
2003                    ),
2004                    _ => {}
2005                }
2006
2007                layers.push(layer(SqliteDiskLayerHandle {
2008                    dbhd_path: path.display().to_string(),
2009                    format_dbhd: create_with_len.map(|len| {
2010                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2011                            logically_read_only: false,
2012                            len: Some(len),
2013                        }
2014                    }),
2015                }));
2016            }
2017            DiskCliKind::SqliteDiff { path, create, disk } => {
2018                // FUTURE: this code should be responsible for opening
2019                // file-handle(s) itself, and passing them into sqlite via a custom
2020                // vfs. For now though - simply check if the file exists or not, and
2021                // perform early validation of filesystem-level create options.
2022                match (create, path.exists()) {
2023                    (true, true) => anyhow::bail!(
2024                        "cannot create new sqlite disk at {} - file already exists",
2025                        path.display()
2026                    ),
2027                    (false, false) => anyhow::bail!(
2028                        "cannot open sqlite disk at {} - file not found",
2029                        path.display()
2030                    ),
2031                    _ => {}
2032                }
2033
2034                layers.push(layer(SqliteDiskLayerHandle {
2035                    dbhd_path: path.display().to_string(),
2036                    format_dbhd: create.then_some(
2037                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2038                            logically_read_only: false,
2039                            len: None,
2040                        },
2041                    ),
2042                }));
2043                disk_open_inner(disk, true, layers).await?;
2044            }
2045            DiskCliKind::AutoCacheSqlite {
2046                cache_path,
2047                key,
2048                disk,
2049            } => {
2050                layers.push(LayerOrDisk::Layer(DiskLayerDescription {
2051                    read_cache: true,
2052                    write_through: false,
2053                    layer: SqliteAutoCacheDiskLayerHandle {
2054                        cache_path: cache_path.clone(),
2055                        cache_key: key.clone(),
2056                    }
2057                    .into_resource(),
2058                }));
2059                disk_open_inner(disk, read_only, layers).await?;
2060            }
2061        }
2062        Ok(())
2063    })
2064}
2065
2066/// Get the system page size.
2067pub(crate) fn system_page_size() -> u32 {
2068    sparse_mmap::SparseMapping::page_size() as u32
2069}
2070
/// The guest architecture string, derived from the compile-time `guest_arch` cfg.
///
/// Note: any `guest_arch` other than x86_64 is reported as "aarch64".
pub(crate) const GUEST_ARCH: &str = if cfg!(guest_arch = "x86_64") {
    "x86_64"
} else {
    "aarch64"
};
2077
/// Open a snapshot directory and validate it against the current VM config.
/// Returns the shared memory fd (from memory.bin) and the saved device state.
fn prepare_snapshot_restore(
    snapshot_dir: &Path,
    opt: &Options,
) -> anyhow::Result<(
    openvmm_defs::worker::SharedMemoryFd,
    mesh::payload::message::ProtobufMessage,
)> {
    let (manifest, state_bytes) = openvmm_helpers::snapshot::read_snapshot(snapshot_dir)?;

    // Validate manifest against current VM config (guest arch, memory size,
    // processor count, and page size) before touching any files.
    openvmm_helpers::snapshot::validate_manifest(
        &manifest,
        GUEST_ARCH,
        opt.memory_size(),
        opt.processors,
        system_page_size(),
    )?;

    // Open memory.bin (existing file, no create, no resize).
    let memory_file = fs_err::OpenOptions::new()
        .read(true)
        .write(true)
        .open(snapshot_dir.join("memory.bin"))?;

    // Validate file size matches expected memory size.
    let file_size = memory_file.metadata()?.len();
    if file_size != manifest.memory_size_bytes {
        anyhow::bail!(
            "memory.bin size ({file_size} bytes) doesn't match manifest ({} bytes)",
            manifest.memory_size_bytes,
        );
    }

    let shared_memory_fd =
        openvmm_helpers::shared_memory::file_to_shared_memory_fd(memory_file.into())?;

    // Reconstruct ProtobufMessage from the saved state bytes.
    // The save side wrote mesh::payload::encode(ProtobufMessage), so we decode
    // back to ProtobufMessage.
    let state_msg: mesh::payload::message::ProtobufMessage = mesh::payload::decode(&state_bytes)
        .context("failed to decode saved state from snapshot")?;

    Ok((shared_memory_fd, state_msg))
}
2124
/// Body of `main`: initializes tracing, dispatches to the various process
/// roles (mesh worker host, console relay, ttrpc/grpc server), and otherwise
/// runs the interactive control loop.
///
/// `pidfile_path` is set once the pidfile has been written so the caller can
/// remove it on exit.
fn do_main(pidfile_path: &mut Option<PathBuf>) -> anyhow::Result<()> {
    #[cfg(windows)]
    pal::windows::disable_hard_error_dialog();

    tracing_init::enable_tracing()?;

    // Try to run as a worker host.
    // On success the worker runs to completion and then exits the process (does
    // not return). Any worker host setup errors are returned and bubbled up.
    meshworker::run_vmm_mesh_host()?;

    let opt = Options::parse();
    if let Some(path) = &opt.write_saved_state_proto {
        mesh::payload::protofile::DescriptorWriter::new(vmcore::save_restore::saved_state_roots())
            .write_to_path(path)
            .context("failed to write protobuf descriptors")?;
        return Ok(());
    }

    if let Some(ref path) = opt.pidfile {
        std::fs::write(path, format!("{}\n", std::process::id()))
            .context("failed to write pidfile")?;
        *pidfile_path = Some(path.clone());
    }

    if let Some(path) = opt.relay_console_path {
        let console_title = opt.relay_console_title.unwrap_or_default();
        return console_relay::relay_console(&path, console_title.as_str());
    }

    #[cfg(any(feature = "grpc", feature = "ttrpc"))]
    if let Some(path) = opt.ttrpc.as_ref().or(opt.grpc.as_ref()) {
        return block_on(async {
            // Remove any stale socket left behind by a previous run.
            let _ = std::fs::remove_file(path);
            let listener =
                unix_socket::UnixListener::bind(path).context("failed to bind to socket")?;

            // ttrpc takes precedence when both options are present.
            let transport = if opt.ttrpc.is_some() {
                ttrpc::RpcTransport::Ttrpc
            } else {
                ttrpc::RpcTransport::Grpc
            };

            // This is a local launch
            let mut handle =
                mesh_worker::launch_local_worker::<ttrpc::TtrpcWorker>(ttrpc::Parameters {
                    listener,
                    transport,
                })
                .await?;

            tracing::info!(%transport, path = %path.display(), "listening");

            // Signal the parent process that the server is ready.
            pal::close_stdout().context("failed to close stdout")?;

            handle.join().await?;

            Ok(())
        });
    }

    DefaultPool::run_with(async |driver| run_control(&driver, opt).await)
}
2189
2190fn new_hvsock_service_id(port: u32) -> Guid {
2191    // This GUID is an embedding of the AF_VSOCK port into an
2192    // AF_HYPERV service ID.
2193    Guid {
2194        data1: port,
2195        .."00000000-facb-11e6-bd58-64006a7986d3".parse().unwrap()
2196    }
2197}
2198
2199async fn run_control(driver: &DefaultDriver, opt: Options) -> anyhow::Result<()> {
2200    let mut mesh = Some(VmmMesh::new(&driver, opt.single_process)?);
2201    let result = run_control_inner(driver, &mut mesh, opt).await;
2202    // If setup failed before the mesh was handed to the controller, shut it
2203    // down so the child host process exits cleanly without noisy logs.
2204    if let Some(mesh) = mesh {
2205        mesh.shutdown().await;
2206    }
2207    result
2208}
2209
/// Builds the VM configuration, launches the worker processes (VNC, gdbstub,
/// and the VM itself), then runs the interactive REPL until it exits.
///
/// Takes the mesh via `mesh_slot` so ownership can be handed to the
/// [`vm_controller::VmController`] once setup succeeds; on an early error the
/// caller still owns the mesh and can shut it down.
async fn run_control_inner(
    driver: &DefaultDriver,
    mesh_slot: &mut Option<VmmMesh>,
    opt: Options,
) -> anyhow::Result<()> {
    let mesh = mesh_slot.as_ref().unwrap();
    let (mut vm_config, mut resources) = vm_config_from_command_line(driver, mesh, &opt).await?;

    // Spin up the VNC worker when graphics are requested.
    let mut vnc_worker = None;
    if opt.gfx || opt.vnc {
        let listener = TcpListener::bind(format!("127.0.0.1:{}", opt.vnc_port))
            .with_context(|| format!("binding to VNC port {}", opt.vnc_port))?;

        let input_send = vm_config.input.sender();
        let framebuffer = resources
            .framebuffer_access
            .take()
            .expect("synth video enabled");

        let vnc_host = mesh
            .make_host("vnc", None)
            .await
            .context("spawning vnc process failed")?;

        vnc_worker = Some(
            vnc_host
                .launch_worker(
                    vnc_worker_defs::VNC_WORKER_TCP,
                    VncParameters {
                        listener,
                        framebuffer,
                        input_send,
                    },
                )
                .await?,
        )
    }

    // spin up the debug worker
    let gdb_worker = if let Some(port) = opt.gdb {
        let listener = TcpListener::bind(format!("127.0.0.1:{}", port))
            .with_context(|| format!("binding to gdb port {}", port))?;

        let (req_tx, req_rx) = mesh::channel();
        vm_config.debugger_rpc = Some(req_rx);

        let gdb_host = mesh
            .make_host("gdb", None)
            .await
            .context("spawning gdbstub process failed")?;

        Some(
            gdb_host
                .launch_worker(
                    debug_worker_defs::DEBUGGER_WORKER,
                    debug_worker_defs::DebuggerParameters {
                        listener,
                        req_chan: req_tx,
                        vp_count: vm_config.processor_topology.proc_count,
                        target_arch: if cfg!(guest_arch = "x86_64") {
                            debug_worker_defs::TargetArch::X86_64
                        } else {
                            debug_worker_defs::TargetArch::Aarch64
                        },
                    },
                )
                .await
                .context("failed to launch gdbstub worker")?,
        )
    } else {
        None
    };

    // spin up the VM
    let (vm_rpc, rpc_recv) = mesh::channel();
    let (notify_send, notify_recv) = mesh::channel();
    let vm_worker = {
        let vm_host = mesh.make_host("vm", opt.log_file.clone()).await?;

        // When restoring from a snapshot, the memory backing and saved state
        // both come from the snapshot directory; otherwise the optional
        // memory backing file from the command line is used.
        let (shared_memory, saved_state) = if let Some(snapshot_dir) = &opt.restore_snapshot {
            let (fd, state_msg) = prepare_snapshot_restore(snapshot_dir, &opt)?;
            (Some(fd), Some(state_msg))
        } else {
            let shared_memory = opt
                .memory_backing_file()
                .map(|path| {
                    openvmm_helpers::shared_memory::open_memory_backing_file(
                        path,
                        opt.memory_size(),
                    )
                })
                .transpose()?;
            (shared_memory, None)
        };

        let params = VmWorkerParameters {
            hypervisor: match &opt.hypervisor {
                Some(name) => openvmm_helpers::hypervisor::hypervisor_resource(name)?,
                None => openvmm_helpers::hypervisor::choose_hypervisor()?,
            },
            cfg: vm_config,
            saved_state,
            shared_memory,
            rpc: rpc_recv,
            notify: notify_send,
        };
        vm_host
            .launch_worker(VM_WORKER, params)
            .await
            .context("failed to launch vm worker")?
    };

    if opt.restore_snapshot.is_some() {
        tracing::info!("restoring VM from snapshot");
    }

    if !opt.paused {
        vm_rpc.call(VmRpc::Resume, ()).await?;
    }

    // Diagnostics client for the paravisor, dialed lazily over hvsock.
    let paravisor_diag = Arc::new(diag_client::DiagClient::from_dialer(
        driver.clone(),
        DiagDialer {
            driver: driver.clone(),
            vm_rpc: vm_rpc.clone(),
            openhcl_vtl: if opt.vtl2 {
                DeviceVtl::Vtl2
            } else {
                DeviceVtl::Vtl0
            },
        },
    ));

    let diag_inspector = DiagInspector::new(driver.clone(), paravisor_diag.clone());

    // Create channels between the REPL and VmController.
    let (vm_controller_send, vm_controller_recv) = mesh::channel();
    let (vm_controller_event_send, vm_controller_event_recv) = mesh::channel();

    let has_vtl2 = resources.vtl2_settings.is_some();

    // Build the VmController with exclusive resources.
    let controller = vm_controller::VmController {
        mesh: mesh_slot.take().unwrap(),
        vm_worker,
        vnc_worker,
        gdb_worker,
        diag_inspector: Some(diag_inspector),
        vtl2_settings: resources.vtl2_settings,
        ged_rpc: resources.ged_rpc.clone(),
        vm_rpc: vm_rpc.clone(),
        paravisor_diag: Some(paravisor_diag),
        igvm_path: opt.igvm.clone(),
        memory_backing_file: opt.memory_backing_file().cloned(),
        memory: opt.memory_size(),
        processors: opt.processors,
        log_file: opt.log_file.clone(),
    };

    // Spawn the VmController as a task.
    let controller_task = driver.spawn(
        "vm-controller",
        controller.run(vm_controller_recv, vm_controller_event_send, notify_recv),
    );

    // Run the REPL with shareable resources.
    let repl_result = repl::run_repl(
        driver,
        repl::ReplResources {
            vm_rpc,
            vm_controller: vm_controller_send,
            vm_controller_events: vm_controller_event_recv,
            scsi_rpc: resources.scsi_rpc,
            nvme_vtl2_rpc: resources.nvme_vtl2_rpc,
            shutdown_ic: resources.shutdown_ic,
            kvp_ic: resources.kvp_ic,
            console_in: resources.console_in,
            has_vtl2,
        },
    )
    .await;

    // Wait for the controller task to finish (it stops the VM worker and
    // shuts down the mesh).
    controller_task.await;

    repl_result
}
2398
/// [`mesh_rpc::client::Dial`] implementation that connects to the guest's
/// diagnostics server over hvsock, via the VM worker.
struct DiagDialer {
    driver: DefaultDriver,
    /// Channel to the VM worker, used to establish hvsock connections.
    vm_rpc: mesh::Sender<VmRpc>,
    /// VTL to dial into (VTL2 when running with a paravisor, else VTL0).
    openhcl_vtl: DeviceVtl,
}
2404
2405impl mesh_rpc::client::Dial for DiagDialer {
2406    type Stream = PolledSocket<unix_socket::UnixStream>;
2407
2408    async fn dial(&mut self) -> io::Result<Self::Stream> {
2409        let service_id = new_hvsock_service_id(1);
2410        let socket = self
2411            .vm_rpc
2412            .call_failable(
2413                VmRpc::ConnectHvsock,
2414                (
2415                    CancelContext::new().with_timeout(Duration::from_secs(2)),
2416                    service_id,
2417                    self.openhcl_vtl,
2418                ),
2419            )
2420            .await
2421            .map_err(io::Error::other)?;
2422
2423        PolledSocket::new(&self.driver, socket)
2424    }
2425}
2426
/// An object that implements [`InspectMut`] by sending an inspect request over
/// TTRPC to the guest (typically the paravisor running in VTL2), then stitching
/// the response back into the inspect tree.
///
/// This also caches the TTRPC connection to the guest so that only the first
/// inspect request has to wait for the connection to be established.
///
/// The relay itself runs on a lazily spawned background task; see
/// [`DiagInspectorInner`].
pub(crate) struct DiagInspector(DiagInspectorInner);
2434
/// State machine backing [`DiagInspector`].
enum DiagInspectorInner {
    /// The background relay task has not been spawned yet.
    NotStarted(DefaultDriver, Arc<diag_client::DiagClient>),
    /// The relay task is running; requests are forwarded via `send`.
    Started {
        send: mesh::Sender<inspect::Deferred>,
        _task: Task<()>,
    },
    /// Transient placeholder used while transitioning between states; never
    /// observed by callers.
    Invalid,
}
2443
impl DiagInspector {
    /// Creates an inspector that will lazily connect via `diag_client` on
    /// first use.
    pub fn new(driver: DefaultDriver, diag_client: Arc<diag_client::DiagClient>) -> Self {
        Self(DiagInspectorInner::NotStarted(driver, diag_client))
    }

    /// Returns the sender for the background relay task, spawning the task on
    /// first call.
    fn start(&mut self) -> &mesh::Sender<inspect::Deferred> {
        loop {
            match self.0 {
                DiagInspectorInner::NotStarted { .. } => {
                    // Move the driver and client out, leaving `Invalid` in
                    // place momentarily; it is overwritten below before this
                    // method returns.
                    let DiagInspectorInner::NotStarted(driver, client) =
                        std::mem::replace(&mut self.0, DiagInspectorInner::Invalid)
                    else {
                        unreachable!()
                    };
                    let (send, recv) = mesh::channel();
                    let task = driver.clone().spawn("diag-inspect", async move {
                        Self::run(&client, recv).await
                    });

                    self.0 = DiagInspectorInner::Started { send, _task: task };
                    // Loop around so the `Started` arm returns the sender.
                }
                DiagInspectorInner::Started { ref send, .. } => break send,
                DiagInspectorInner::Invalid => unreachable!(),
            }
        }
    }

    /// Services deferred inspect/update requests by relaying them to the
    /// guest through `diag_client` until the channel closes.
    async fn run(
        diag_client: &diag_client::DiagClient,
        mut recv: mesh::Receiver<inspect::Deferred>,
    ) {
        while let Some(deferred) = recv.next().await {
            let info = deferred.external_request();
            let result = match info.request_type {
                inspect::ExternalRequestType::Inspect { depth } => {
                    // Depth 0 means "don't descend"; skip the round trip.
                    if depth == 0 {
                        Ok(inspect::Node::Unevaluated)
                    } else {
                        // TODO: Support taking timeouts from the command line
                        diag_client
                            .inspect(info.path, Some(depth - 1), Some(Duration::from_secs(1)))
                            .await
                    }
                }
                inspect::ExternalRequestType::Update { value } => {
                    (diag_client.update(info.path, value).await).map(inspect::Node::Value)
                }
            };
            // Surface failures as inspect error nodes rather than dropping
            // the request.
            deferred.complete_external(
                result.unwrap_or_else(|err| {
                    inspect::Node::Failed(inspect::Error::Mesh(format!("{err:#}")))
                }),
                inspect::SensitivityLevel::Unspecified,
            )
        }
    }
}
2501
2502impl InspectMut for DiagInspector {
2503    fn inspect_mut(&mut self, req: inspect::Request<'_>) {
2504        self.start().send(req.defer());
2505    }
2506}