// openvmm_entry/lib.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! This module implements the interactive control process and the entry point
5//! for the worker process.
6
7#![expect(missing_docs)]
8#![forbid(unsafe_code)]
9
10mod cli_args;
11mod crash_dump;
12mod kvp;
13mod meshworker;
14mod repl;
15mod serial_io;
16mod storage_builder;
17mod tracing_init;
18mod ttrpc;
19mod vm_controller;
20
21// `pub` so that the missing_docs warning fires for options without
22// documentation.
23pub use cli_args::Options;
24use console_relay::ConsoleLaunchOptions;
25
26use crate::cli_args::SecureBootTemplateCli;
27use anyhow::Context;
28use anyhow::bail;
29use chipset_resources::battery::HostBatteryUpdate;
30use clap::Parser;
31use cli_args::DiskCliKind;
32use cli_args::EfiDiagnosticsLogLevelCli;
33use cli_args::EndpointConfigCli;
34use cli_args::NicConfigCli;
35use cli_args::ProvisionVmgs;
36use cli_args::SerialConfigCli;
37use cli_args::UefiConsoleModeCli;
38use cli_args::VirtioBusCli;
39use cli_args::VmgsCli;
40use crash_dump::spawn_dump_handler;
41use disk_backend_resources::DelayDiskHandle;
42use disk_backend_resources::DiskLayerDescription;
43use disk_backend_resources::layer::DiskLayerHandle;
44use disk_backend_resources::layer::RamDiskLayerHandle;
45use disk_backend_resources::layer::SqliteAutoCacheDiskLayerHandle;
46use disk_backend_resources::layer::SqliteDiskLayerHandle;
47use floppy_resources::FloppyDiskConfig;
48use framebuffer::FRAMEBUFFER_SIZE;
49use framebuffer::FramebufferAccess;
50use futures::AsyncReadExt;
51use futures::AsyncWrite;
52use futures::StreamExt;
53use futures::executor::block_on;
54use futures::io::AllowStdIo;
55use gdma_resources::GdmaDeviceHandle;
56use gdma_resources::VportDefinition;
57use guid::Guid;
58use input_core::MultiplexedInputHandle;
59use inspect::InspectMut;
60use io::Read;
61use memory_range::MemoryRange;
62use mesh::CancelContext;
63use mesh::CellUpdater;
64use mesh::rpc::RpcSend;
65use meshworker::VmmMesh;
66use net_backend_resources::mac_address::MacAddress;
67use nvme_resources::NvmeControllerRequest;
68use openvmm_defs::config::Config;
69use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64;
70use openvmm_defs::config::DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2;
71use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86;
72use openvmm_defs::config::DEFAULT_MMIO_GAPS_X86_WITH_VTL2;
73use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER;
74use openvmm_defs::config::DeviceVtl;
75use openvmm_defs::config::EfiDiagnosticsLogLevelType;
76use openvmm_defs::config::HypervisorConfig;
77use openvmm_defs::config::LateMapVtl0MemoryPolicy;
78use openvmm_defs::config::LoadMode;
79use openvmm_defs::config::MemoryConfig;
80use openvmm_defs::config::PcieDeviceConfig;
81use openvmm_defs::config::PcieRootComplexConfig;
82use openvmm_defs::config::PcieRootPortConfig;
83use openvmm_defs::config::PcieSwitchConfig;
84use openvmm_defs::config::ProcessorTopologyConfig;
85use openvmm_defs::config::SerialInformation;
86use openvmm_defs::config::VirtioBus;
87use openvmm_defs::config::VmbusConfig;
88use openvmm_defs::config::VpciDeviceConfig;
89use openvmm_defs::config::Vtl2BaseAddressType;
90use openvmm_defs::config::Vtl2Config;
91use openvmm_defs::rpc::VmRpc;
92use openvmm_defs::worker::VM_WORKER;
93use openvmm_defs::worker::VmWorkerParameters;
94use openvmm_helpers::disk::OpenDiskOptions;
95use openvmm_helpers::disk::create_disk_type;
96use openvmm_helpers::disk::open_disk_type;
97use pal_async::DefaultDriver;
98use pal_async::DefaultPool;
99use pal_async::socket::PolledSocket;
100use pal_async::task::Spawn;
101use pal_async::task::Task;
102use serial_16550_resources::ComPort;
103use serial_core::resources::DisconnectedSerialBackendHandle;
104use sparse_mmap::alloc_shared_memory;
105use std::cell::RefCell;
106use std::collections::BTreeMap;
107use std::fmt::Write as _;
108use std::future::pending;
109use std::io;
110#[cfg(unix)]
111use std::io::IsTerminal;
112use std::io::Write;
113use std::net::TcpListener;
114use std::path::Path;
115use std::path::PathBuf;
116use std::sync::Arc;
117use std::thread;
118use std::time::Duration;
119use storvsp_resources::ScsiControllerRequest;
120use tpm_resources::TpmDeviceHandle;
121use tpm_resources::TpmRegisterLayout;
122use uidevices_resources::SynthKeyboardHandle;
123use uidevices_resources::SynthMouseHandle;
124use uidevices_resources::SynthVideoHandle;
125use video_core::SharedFramebufferHandle;
126use virtio_resources::VirtioPciDeviceHandle;
127use vm_manifest_builder::BaseChipsetType;
128use vm_manifest_builder::MachineArch;
129use vm_manifest_builder::VmChipsetResult;
130use vm_manifest_builder::VmManifestBuilder;
131use vm_resource::IntoResource;
132use vm_resource::Resource;
133use vm_resource::kind::DiskHandleKind;
134use vm_resource::kind::DiskLayerHandleKind;
135use vm_resource::kind::NetEndpointHandleKind;
136use vm_resource::kind::VirtioDeviceHandle;
137use vm_resource::kind::VmbusDeviceHandleKind;
138use vmbus_serial_resources::VmbusSerialDeviceHandle;
139use vmbus_serial_resources::VmbusSerialPort;
140use vmcore::non_volatile_store::resources::EphemeralNonVolatileStoreHandle;
141use vmgs_resources::GuestStateEncryptionPolicy;
142use vmgs_resources::VmgsDisk;
143use vmgs_resources::VmgsFileHandle;
144use vmgs_resources::VmgsResource;
145use vmotherboard::ChipsetDeviceHandle;
146use vnc_worker_defs::VncParameters;
147
148/// RAII guard that removes the pidfile when dropped. Ensures the pidfile is
149/// cleaned up even if [`do_main`] panics.
150struct PidfileGuard(Option<PathBuf>);
151
152impl Drop for PidfileGuard {
153    fn drop(&mut self) {
154        if let Some(path) = &self.0 {
155            let _ = fs_err::remove_file(path);
156        }
157    }
158}
159
/// Entry point for the OpenVMM control process.
///
/// Runs [`do_main`], then performs teardown in a fixed order: restore the
/// terminal state (Unix only), remove the pidfile, flush stdout, and finally
/// terminate the process without running global destructors.
pub fn openvmm_main() {
    // Save the current state of the terminal so we can restore it back to
    // normal before exiting. Only done when stderr is a terminal.
    #[cfg(unix)]
    let orig_termios = io::stderr().is_terminal().then(term::get_termios);

    // `do_main` records the pidfile path (if one is created) into the guard,
    // so the guard's Drop removes the file even if `do_main` panics.
    let mut pidfile_guard = PidfileGuard(None);
    let exit_code = match do_main(&mut pidfile_guard.0) {
        Ok(_) => 0,
        Err(err) => {
            eprintln!("fatal error: {:?}", err);
            1
        }
    };

    // Restore the terminal to its initial state.
    #[cfg(unix)]
    if let Some(orig_termios) = orig_termios {
        term::set_termios(orig_termios);
    }

    // Clean up the pidfile before terminating, since pal::process::terminate
    // skips destructors.
    drop(pidfile_guard);

    // Terminate the process immediately without graceful shutdown of DLLs or
    // C++ destructors or anything like that. This is all unnecessary and saves
    // time on Windows.
    //
    // Do flush stdout, though, since there may be buffered data.
    let _ = io::stdout().flush();
    pal::process::terminate(exit_code);
}
193
/// Handles and channels to a configured VM's devices that the control process
/// keeps after building the VM configuration.
#[derive(Default)]
struct VmResources {
    /// Writer used to forward host console input to the serial device that
    /// claimed the console, if any.
    console_in: Option<Box<dyn AsyncWrite + Send + Unpin>>,
    /// Access to the shared framebuffer, populated when graphics/VNC/PCAT is
    /// enabled.
    framebuffer_access: Option<FramebufferAccess>,
    /// Channel to the shutdown integration component, if configured.
    shutdown_ic: Option<mesh::Sender<hyperv_ic_resources::shutdown::ShutdownRpc>>,
    /// Channel to the KVP (key-value pair) integration component, if configured.
    kvp_ic: Option<mesh::Sender<hyperv_ic_resources::kvp::KvpConnectRpc>>,
    /// Channel for sending requests to the SCSI controller, if configured.
    scsi_rpc: Option<mesh::Sender<ScsiControllerRequest>>,
    /// Channel for sending requests to the VTL2 NVMe controller, if configured.
    nvme_vtl2_rpc: Option<mesh::Sender<NvmeControllerRequest>>,
    /// Channel to the guest emulation device (GED), if configured.
    ged_rpc: Option<mesh::Sender<get_resources::ged::GuestEmulationRequest>>,
    /// VTL2 settings to send to the paravisor, if configured.
    vtl2_settings: Option<vtl2_settings_proto::Vtl2Settings>,
    // Kernel vmswitch ports must be kept alive for the lifetime of the VM.
    #[cfg(windows)]
    switch_ports: Vec<vmswitch::kernel::SwitchPort>,
}
207
/// Records which serial device has claimed the interactive console, along
/// with the writer used to forward console input to it.
struct ConsoleState<'a> {
    /// Name of the guest device acting as the console (e.g. "ttyS0").
    device: &'a str,
    /// Sink for bytes typed at the host console.
    input: Box<dyn AsyncWrite + Unpin + Send>,
}
212
213/// Build a flat list of switches with their parent port assignments.
214///
215/// This function converts hierarchical CLI switch definitions into a flat list
216/// where each switch specifies its parent port directly.
217fn build_switch_list(all_switches: &[cli_args::GenericPcieSwitchCli]) -> Vec<PcieSwitchConfig> {
218    all_switches
219        .iter()
220        .map(|switch_cli| PcieSwitchConfig {
221            name: switch_cli.name.clone(),
222            num_downstream_ports: switch_cli.num_downstream_ports,
223            parent_port: switch_cli.port_name.clone(),
224            hotplug: switch_cli.hotplug,
225        })
226        .collect()
227}
228
229async fn vm_config_from_command_line(
230    spawner: impl Spawn,
231    mesh: &VmmMesh,
232    opt: &Options,
233) -> anyhow::Result<(Config, VmResources)> {
234    let (_, serial_driver) = DefaultPool::spawn_on_thread("serial");
235    // Ensure the serial driver stays alive with no tasks.
236    serial_driver.spawn("leak", pending::<()>()).detach();
237
238    let openhcl_vtl = if opt.vtl2 {
239        DeviceVtl::Vtl2
240    } else {
241        DeviceVtl::Vtl0
242    };
243
244    let console_state: RefCell<Option<ConsoleState<'_>>> = RefCell::new(None);
245    let setup_serial = |name: &str, cli_cfg, device| -> anyhow::Result<_> {
246        Ok(match cli_cfg {
247            SerialConfigCli::Console => {
248                if let Some(console_state) = console_state.borrow().as_ref() {
249                    bail!("console already set by {}", console_state.device);
250                }
251                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
252                let (serial_read, serial_write) = AsyncReadExt::split(serial);
253                *console_state.borrow_mut() = Some(ConsoleState {
254                    device,
255                    input: Box::new(serial_write),
256                });
257                thread::Builder::new()
258                    .name(name.to_owned())
259                    .spawn(move || {
260                        let _ = block_on(futures::io::copy(
261                            serial_read,
262                            &mut AllowStdIo::new(term::raw_stdout()),
263                        ));
264                    })
265                    .unwrap();
266                Some(config)
267            }
268            SerialConfigCli::Stderr => {
269                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
270                thread::Builder::new()
271                    .name(name.to_owned())
272                    .spawn(move || {
273                        let _ = block_on(futures::io::copy(
274                            serial,
275                            &mut AllowStdIo::new(term::raw_stderr()),
276                        ));
277                    })
278                    .unwrap();
279                Some(config)
280            }
281            SerialConfigCli::File(path) => {
282                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
283                let file = fs_err::File::create(path).context("failed to create file")?;
284
285                thread::Builder::new()
286                    .name(name.to_owned())
287                    .spawn(move || {
288                        let _ = block_on(futures::io::copy(serial, &mut AllowStdIo::new(file)));
289                    })
290                    .unwrap();
291                Some(config)
292            }
293            SerialConfigCli::None => None,
294            SerialConfigCli::Pipe(path) => {
295                Some(serial_io::bind_serial(&path).context("failed to bind serial")?)
296            }
297            SerialConfigCli::Tcp(addr) => {
298                Some(serial_io::bind_tcp_serial(&addr).context("failed to bind serial")?)
299            }
300            SerialConfigCli::NewConsole(app, window_title) => {
301                let path = console_relay::random_console_path();
302                let config =
303                    serial_io::bind_serial(&path).context("failed to bind console serial")?;
304                let window_title =
305                    window_title.unwrap_or_else(|| name.to_uppercase() + " [OpenVMM]");
306
307                console_relay::launch_console(
308                    app.or_else(openvmm_terminal_app).as_deref(),
309                    &path,
310                    ConsoleLaunchOptions {
311                        window_title: Some(window_title),
312                    },
313                )
314                .context("failed to launch console")?;
315
316                Some(config)
317            }
318        })
319    };
320
321    let mut vmbus_devices = Vec::new();
322
323    let serial0_cfg = setup_serial(
324        "com1",
325        opt.com1.clone().unwrap_or(SerialConfigCli::Console),
326        if cfg!(guest_arch = "x86_64") {
327            "ttyS0"
328        } else {
329            "ttyAMA0"
330        },
331    )?;
332    let serial1_cfg = setup_serial(
333        "com2",
334        opt.com2.clone().unwrap_or(SerialConfigCli::None),
335        if cfg!(guest_arch = "x86_64") {
336            "ttyS1"
337        } else {
338            "ttyAMA1"
339        },
340    )?;
341    let serial2_cfg = setup_serial(
342        "com3",
343        opt.com3.clone().unwrap_or(SerialConfigCli::None),
344        if cfg!(guest_arch = "x86_64") {
345            "ttyS2"
346        } else {
347            "ttyAMA2"
348        },
349    )?;
350    let serial3_cfg = setup_serial(
351        "com4",
352        opt.com4.clone().unwrap_or(SerialConfigCli::None),
353        if cfg!(guest_arch = "x86_64") {
354            "ttyS3"
355        } else {
356            "ttyAMA3"
357        },
358    )?;
359    let with_vmbus_com1_serial = if let Some(vmbus_com1_cfg) = setup_serial(
360        "vmbus_com1",
361        opt.vmbus_com1_serial
362            .clone()
363            .unwrap_or(SerialConfigCli::None),
364        "vmbus_com1",
365    )? {
366        vmbus_devices.push((
367            openhcl_vtl,
368            VmbusSerialDeviceHandle {
369                port: VmbusSerialPort::Com1,
370                backend: vmbus_com1_cfg,
371            }
372            .into_resource(),
373        ));
374        true
375    } else {
376        false
377    };
378    let with_vmbus_com2_serial = if let Some(vmbus_com2_cfg) = setup_serial(
379        "vmbus_com2",
380        opt.vmbus_com2_serial
381            .clone()
382            .unwrap_or(SerialConfigCli::None),
383        "vmbus_com2",
384    )? {
385        vmbus_devices.push((
386            openhcl_vtl,
387            VmbusSerialDeviceHandle {
388                port: VmbusSerialPort::Com2,
389                backend: vmbus_com2_cfg,
390            }
391            .into_resource(),
392        ));
393        true
394    } else {
395        false
396    };
397    let debugcon_cfg = setup_serial(
398        "debugcon",
399        opt.debugcon
400            .clone()
401            .map(|cfg| cfg.serial)
402            .unwrap_or(SerialConfigCli::None),
403        "debugcon",
404    )?;
405
406    let virtio_console_backend = if let Some(serial_cfg) = opt.virtio_console.clone() {
407        setup_serial("virtio-console", serial_cfg, "hvc0")?
408    } else {
409        None
410    };
411
412    let mut resources = VmResources::default();
413    let mut console_str = "";
414    if let Some(ConsoleState { device, input }) = console_state.into_inner() {
415        resources.console_in = Some(input);
416        console_str = device;
417    }
418
419    if opt.shared_memory {
420        tracing::warn!("--shared-memory/-M flag has no effect and will be removed");
421    }
422    if opt.deprecated_prefetch {
423        tracing::warn!("--prefetch is deprecated; use --memory prefetch=on");
424    }
425    if opt.deprecated_private_memory {
426        tracing::warn!("--private-memory is deprecated; use --memory shared=off");
427    }
428    if opt.deprecated_thp {
429        tracing::warn!("--thp is deprecated; use --memory shared=off,thp=on");
430    }
431    if opt.deprecated_memory_backing_file.is_some() {
432        tracing::warn!("--memory-backing-file is deprecated; use --memory file=<path>");
433    }
434
435    opt.validate_memory_options()?;
436
437    const MAX_PROCESSOR_COUNT: u32 = 1024;
438
439    if opt.processors == 0 || opt.processors > MAX_PROCESSOR_COUNT {
440        bail!("invalid proc count: {}", opt.processors);
441    }
442
443    // Total SCSI channel count should not exceed the processor count
444    // (at most, one channel per VP).
445    if opt.scsi_sub_channels > (MAX_PROCESSOR_COUNT - 1) as u16 {
446        bail!(
447            "invalid SCSI sub-channel count: requested {}, max {}",
448            opt.scsi_sub_channels,
449            MAX_PROCESSOR_COUNT - 1
450        );
451    }
452
453    let with_get = opt.get || (opt.vtl2 && !opt.no_get);
454
455    let mut storage = storage_builder::StorageBuilder::new(with_get.then_some(openhcl_vtl));
456    for &cli_args::DiskCli {
457        vtl,
458        ref kind,
459        read_only,
460        is_dvd,
461        underhill,
462        ref pcie_port,
463    } in &opt.disk
464    {
465        if pcie_port.is_some() {
466            anyhow::bail!("`--disk` is incompatible with PCIe");
467        }
468
469        storage
470            .add(
471                vtl,
472                underhill,
473                storage_builder::DiskLocation::Scsi(None),
474                kind,
475                is_dvd,
476                read_only,
477            )
478            .await?;
479    }
480
481    for &cli_args::IdeDiskCli {
482        ref kind,
483        read_only,
484        channel,
485        device,
486        is_dvd,
487    } in &opt.ide
488    {
489        storage
490            .add(
491                DeviceVtl::Vtl0,
492                None,
493                storage_builder::DiskLocation::Ide(channel, device),
494                kind,
495                is_dvd,
496                read_only,
497            )
498            .await?;
499    }
500
501    for &cli_args::DiskCli {
502        vtl,
503        ref kind,
504        read_only,
505        is_dvd,
506        underhill,
507        ref pcie_port,
508    } in &opt.nvme
509    {
510        storage
511            .add(
512                vtl,
513                underhill,
514                storage_builder::DiskLocation::Nvme(None, pcie_port.clone()),
515                kind,
516                is_dvd,
517                read_only,
518            )
519            .await?;
520    }
521
522    for &cli_args::DiskCli {
523        vtl,
524        ref kind,
525        read_only,
526        is_dvd,
527        ref underhill,
528        ref pcie_port,
529    } in &opt.virtio_blk
530    {
531        if underhill.is_some() {
532            anyhow::bail!("underhill not supported with virtio-blk");
533        }
534        storage
535            .add(
536                vtl,
537                None,
538                storage_builder::DiskLocation::VirtioBlk(pcie_port.clone()),
539                kind,
540                is_dvd,
541                read_only,
542            )
543            .await?;
544    }
545
546    let mut floppy_disks = Vec::new();
547    for disk in &opt.floppy {
548        let &cli_args::FloppyDiskCli {
549            ref kind,
550            read_only,
551        } = disk;
552        floppy_disks.push(FloppyDiskConfig {
553            disk_type: disk_open(kind, read_only).await?,
554            read_only,
555        });
556    }
557
558    let mut vpci_mana_nics = [(); 3].map(|()| None);
559    let mut pcie_mana_nics = BTreeMap::<String, GdmaDeviceHandle>::new();
560    let mut underhill_nics = Vec::new();
561    let mut vpci_devices = Vec::new();
562
563    let mut nic_index = 0;
564    for cli_cfg in &opt.net {
565        if cli_cfg.pcie_port.is_some() {
566            anyhow::bail!("`--net` does not support PCIe");
567        }
568        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
569        if cli_cfg.underhill {
570            if !opt.no_alias_map {
571                anyhow::bail!("must specify --no-alias-map to offer NICs to VTL2");
572            }
573            let mana = vpci_mana_nics[openhcl_vtl as usize].get_or_insert_with(|| {
574                let vpci_instance_id = Guid::new_random();
575                underhill_nics.push(vtl2_settings_proto::NicDeviceLegacy {
576                    instance_id: vpci_instance_id.to_string(),
577                    subordinate_instance_id: None,
578                    max_sub_channels: None,
579                });
580                (vpci_instance_id, GdmaDeviceHandle { vports: Vec::new() })
581            });
582            mana.1.vports.push(VportDefinition {
583                mac_address: vport.mac_address,
584                endpoint: vport.endpoint,
585            });
586        } else {
587            vmbus_devices.push(vport.into_netvsp_handle());
588        }
589    }
590
591    if opt.nic {
592        let nic_config = parse_endpoint(
593            &NicConfigCli {
594                vtl: DeviceVtl::Vtl0,
595                endpoint: EndpointConfigCli::Consomme {
596                    cidr: None,
597                    host_fwd: Vec::new(),
598                },
599                max_queues: None,
600                underhill: false,
601                pcie_port: None,
602            },
603            &mut nic_index,
604            &mut resources,
605        )?;
606        vmbus_devices.push(nic_config.into_netvsp_handle());
607    }
608
609    // Build initial PCIe devices list from CLI options. Storage devices
610    // (e.g., NVMe controllers on PCIe ports) are added later by storage_builder.
611    let mut pcie_devices = Vec::new();
612    for (index, cli_cfg) in opt.pcie_remote.iter().enumerate() {
613        tracing::info!(
614            port_name = %cli_cfg.port_name,
615            socket_addr = ?cli_cfg.socket_addr,
616            "instantiating PCIe remote device"
617        );
618
619        // Generate a deterministic instance ID based on index
620        const PCIE_REMOTE_BASE_INSTANCE_ID: Guid =
621            guid::guid!("28ed784d-c059-429f-9d9a-46bea02562c0");
622        let instance_id = Guid {
623            data1: index as u32,
624            ..PCIE_REMOTE_BASE_INSTANCE_ID
625        };
626
627        pcie_devices.push(PcieDeviceConfig {
628            port_name: cli_cfg.port_name.clone(),
629            resource: pcie_remote_resources::PcieRemoteHandle {
630                instance_id,
631                socket_addr: cli_cfg.socket_addr.clone(),
632                hu: cli_cfg.hu,
633                controller: cli_cfg.controller,
634            }
635            .into_resource(),
636        });
637    }
638
639    #[cfg(windows)]
640    let mut kernel_vmnics = Vec::new();
641    #[cfg(windows)]
642    for (index, switch_id) in opt.kernel_vmnic.iter().enumerate() {
643        // Pick a random MAC address.
644        let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
645        getrandom::fill(&mut mac_address[3..]).expect("rng failure");
646
647        // Pick a fixed instance ID based on the index.
648        const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-435d-11ee-9f59-00155d5016fc");
649        let instance_id = Guid {
650            data1: index as u32,
651            ..BASE_INSTANCE_ID
652        };
653
654        let switch_id = if switch_id == "default" {
655            DEFAULT_SWITCH
656        } else {
657            switch_id
658        };
659        let (port_id, port) = new_switch_port(switch_id)?;
660        resources.switch_ports.push(port);
661
662        kernel_vmnics.push(openvmm_defs::config::KernelVmNicConfig {
663            instance_id,
664            mac_address: mac_address.into(),
665            switch_port_id: port_id,
666        });
667    }
668
669    for vport in &opt.mana {
670        let vport = parse_endpoint(vport, &mut nic_index, &mut resources)?;
671        let vport_array = match (vport.vtl as usize, vport.pcie_port) {
672            (vtl, None) => {
673                &mut vpci_mana_nics[vtl]
674                    .get_or_insert_with(|| {
675                        (Guid::new_random(), GdmaDeviceHandle { vports: Vec::new() })
676                    })
677                    .1
678                    .vports
679            }
680            (0, Some(pcie_port)) => {
681                &mut pcie_mana_nics
682                    .entry(pcie_port)
683                    .or_insert(GdmaDeviceHandle { vports: Vec::new() })
684                    .vports
685            }
686            _ => anyhow::bail!("PCIe NICs only supported to VTL0"),
687        };
688        vport_array.push(VportDefinition {
689            mac_address: vport.mac_address,
690            endpoint: vport.endpoint,
691        });
692    }
693
694    vpci_devices.extend(
695        vpci_mana_nics
696            .into_iter()
697            .enumerate()
698            .filter_map(|(vtl, nic)| {
699                nic.map(|(instance_id, handle)| VpciDeviceConfig {
700                    vtl: match vtl {
701                        0 => DeviceVtl::Vtl0,
702                        1 => DeviceVtl::Vtl1,
703                        2 => DeviceVtl::Vtl2,
704                        _ => unreachable!(),
705                    },
706                    instance_id,
707                    resource: handle.into_resource(),
708                })
709            }),
710    );
711
712    pcie_devices.extend(
713        pcie_mana_nics
714            .into_iter()
715            .map(|(pcie_port, handle)| PcieDeviceConfig {
716                port_name: pcie_port,
717                resource: handle.into_resource(),
718            }),
719    );
720
721    // If VTL2 is enabled, and we are not in VTL2 self allocate mode, provide an
722    // mmio gap for VTL2.
723    let use_vtl2_gap = opt.vtl2
724        && !matches!(
725            opt.igvm_vtl2_relocation_type,
726            Vtl2BaseAddressType::Vtl2Allocate { .. },
727        );
728
729    #[cfg(guest_arch = "aarch64")]
730    let arch = MachineArch::Aarch64;
731    #[cfg(guest_arch = "x86_64")]
732    let arch = MachineArch::X86_64;
733
734    let mmio_gaps: Vec<MemoryRange> = match (use_vtl2_gap, arch) {
735        (true, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86_WITH_VTL2.into(),
736        (true, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64_WITH_VTL2.into(),
737        (false, MachineArch::X86_64) => DEFAULT_MMIO_GAPS_X86.into(),
738        (false, MachineArch::Aarch64) => DEFAULT_MMIO_GAPS_AARCH64.into(),
739    };
740
741    let mut pci_ecam_gaps = Vec::new();
742    let mut pci_mmio_gaps = Vec::new();
743
744    let mut low_mmio_start = mmio_gaps.first().context("expected mmio gap")?.start();
745    let mut high_mmio_end = mmio_gaps.last().context("expected second mmio gap")?.end();
746
747    let mut pcie_root_complexes = Vec::new();
748    for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() {
749        let ports = opt
750            .pcie_root_port
751            .iter()
752            .filter(|port_cli| port_cli.root_complex_name == rc_cli.name)
753            .map(|port_cli| PcieRootPortConfig {
754                name: port_cli.name.clone(),
755                hotplug: port_cli.hotplug,
756            })
757            .collect();
758
759        const ONE_MB: u64 = 1024 * 1024;
760        let low_mmio_size = (rc_cli.low_mmio as u64).next_multiple_of(ONE_MB);
761        let high_mmio_size = rc_cli
762            .high_mmio
763            .checked_next_multiple_of(ONE_MB)
764            .context("high mmio rounding error")?;
765        let ecam_size = (((rc_cli.end_bus - rc_cli.start_bus) as u64) + 1) * 256 * 4096;
766
767        let low_pci_mmio_start = low_mmio_start
768            .checked_sub(low_mmio_size)
769            .context("pci low mmio underflow")?;
770        let ecam_start = low_pci_mmio_start
771            .checked_sub(ecam_size)
772            .context("pci ecam underflow")?;
773        low_mmio_start = ecam_start;
774        high_mmio_end = high_mmio_end
775            .checked_add(high_mmio_size)
776            .context("pci high mmio overflow")?;
777
778        let ecam_range = MemoryRange::new(ecam_start..ecam_start + ecam_size);
779        let low_mmio = MemoryRange::new(low_pci_mmio_start..low_pci_mmio_start + low_mmio_size);
780        let high_mmio = MemoryRange::new(high_mmio_end - high_mmio_size..high_mmio_end);
781
782        pci_ecam_gaps.push(ecam_range);
783        pci_mmio_gaps.push(low_mmio);
784        pci_mmio_gaps.push(high_mmio);
785
786        pcie_root_complexes.push(PcieRootComplexConfig {
787            index: i as u32,
788            name: rc_cli.name.clone(),
789            segment: rc_cli.segment,
790            start_bus: rc_cli.start_bus,
791            end_bus: rc_cli.end_bus,
792            ecam_range,
793            low_mmio,
794            high_mmio,
795            ports,
796        });
797    }
798
799    pci_ecam_gaps.sort();
800    pci_mmio_gaps.sort();
801
802    let pcie_switches = build_switch_list(&opt.pcie_switch);
803
804    #[cfg(target_os = "linux")]
805    let vfio_pcie_devices: Vec<PcieDeviceConfig> = opt
806        .vfio
807        .iter()
808        .map(|cli_cfg| {
809            use vm_resource::IntoResource;
810
811            let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id);
812            let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group"))
813                .with_context(|| format!("failed to read IOMMU group for {}", cli_cfg.pci_id))?;
814            let group_id: u64 = iommu_group_link
815                .file_name()
816                .and_then(|s| s.to_str())
817                .context("invalid iommu_group symlink")?
818                .parse()
819                .context("failed to parse IOMMU group ID")?;
820            let group = std::fs::OpenOptions::new()
821                .read(true)
822                .write(true)
823                .open(format!("/dev/vfio/{group_id}"))
824                .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?;
825
826            Ok(PcieDeviceConfig {
827                port_name: cli_cfg.port_name.clone(),
828                resource: vfio_assigned_device_resources::VfioDeviceHandle {
829                    pci_id: cli_cfg.pci_id.clone(),
830                    group,
831                }
832                .into_resource(),
833            })
834        })
835        .collect::<anyhow::Result<Vec<_>>>()?;
836
837    #[cfg(windows)]
838    let vpci_resources: Vec<_> = opt
839        .device
840        .iter()
841        .map(|path| -> anyhow::Result<_> {
842            Ok(virt_whp::device::DeviceHandle(
843                whp::VpciResource::new(
844                    None,
845                    Default::default(),
846                    &whp::VpciResourceDescriptor::Sriov(path, 0, 0),
847                )
848                .with_context(|| format!("opening PCI device {}", path))?,
849            ))
850        })
851        .collect::<Result<_, _>>()?;
852
853    // Create a vmbusproxy handle if needed by any devices.
854    #[cfg(windows)]
855    let vmbusproxy_handle = if !kernel_vmnics.is_empty() {
856        Some(vmbus_proxy::ProxyHandle::new().context("failed to open vmbusproxy handle")?)
857    } else {
858        None
859    };
860
861    let framebuffer = if opt.gfx || opt.vtl2_gfx || opt.vnc || opt.pcat {
862        let vram = alloc_shared_memory(FRAMEBUFFER_SIZE, "vram")?;
863        let (fb, fba) =
864            framebuffer::framebuffer(vram, FRAMEBUFFER_SIZE, 0).context("creating framebuffer")?;
865        resources.framebuffer_access = Some(fba);
866        Some(fb)
867    } else {
868        None
869    };
870
    // Both are assigned exactly once in the firmware-selection chain below.
    let load_mode;
    let with_hv;

    let any_serial_configured = serial0_cfg.is_some()
        || serial1_cfg.is_some()
        || serial2_cfg.is_some()
        || serial3_cfg.is_some();

    // serial2 is the zero-indexed third port, i.e. COM3.
    let has_com3 = serial2_cfg.is_some();

    // Pick the base chipset from the firmware options; precedence is
    // IGVM > PCAT > UEFI > enlightened Linux direct > unenlightened.
    let mut chipset = VmManifestBuilder::new(
        if opt.igvm.is_some() {
            BaseChipsetType::HclHost
        } else if opt.pcat {
            BaseChipsetType::HypervGen1
        } else if opt.uefi {
            BaseChipsetType::HypervGen2Uefi
        } else if opt.hv {
            BaseChipsetType::HyperVGen2LinuxDirect
        } else {
            BaseChipsetType::UnenlightenedLinuxDirect
        },
        arch,
    );

    if framebuffer.is_some() {
        chipset = chipset.with_framebuffer();
    }
    if opt.guest_watchdog {
        chipset = chipset.with_guest_watchdog();
    }
    if any_serial_configured {
        chipset = chipset.with_serial([serial0_cfg, serial1_cfg, serial2_cfg, serial3_cfg]);
    }
    if opt.battery {
        // Seed the battery channel with a default "present" state so the
        // guest observes a battery immediately.
        let (tx, rx) = mesh::channel();
        tx.send(HostBatteryUpdate::default_present());
        chipset = chipset.with_battery(rx);
    }
    if let Some(cfg) = &opt.debugcon {
        chipset = chipset.with_debugcon(
            // Fall back to a disconnected backend when no output was given.
            debugcon_cfg.unwrap_or_else(|| DisconnectedSerialBackendHandle.into_resource()),
            cfg.port,
        );
    }

    // TODO: load from VMGS file if it exists
    let bios_guid = Guid::new_random();

    let VmChipsetResult {
        chipset,
        mut chipset_devices,
        pci_chipset_devices,
        capabilities,
    } = chipset
        .build()
        .context("failed to build chipset configuration")?;
928
    if opt.restore_snapshot.is_some() {
        // Snapshot restore: skip firmware loading entirely. Device state and
        // memory come from the snapshot directory.
        load_mode = LoadMode::None;
        with_hv = true;
    } else if let Some(path) = &opt.igvm {
        // IGVM image (e.g. OpenHCL): hand the whole file to the IGVM loader.
        let file = fs_err::File::open(path)
            .context("failed to open igvm file")?
            .into();
        let cmdline = opt.cmdline.join(" ");
        with_hv = true;

        load_mode = LoadMode::Igvm {
            file,
            cmdline,
            vtl2_base_address: opt.igvm_vtl2_relocation_type,
            // Report COM3 to the loader only when that port was configured.
            com_serial: has_com3.then(|| SerialInformation {
                io_port: ComPort::Com3.io_port(),
                irq: ComPort::Com3.irq().into(),
            }),
        };
    } else if opt.pcat {
        // Emit a nice error early instead of complaining about missing firmware.
        if arch != MachineArch::X86_64 {
            anyhow::bail!("pcat not supported on this architecture");
        }
        with_hv = true;

        let firmware = openvmm_pcat_locator::find_pcat_bios(opt.pcat_firmware.as_deref())?;
        load_mode = LoadMode::Pcat {
            firmware,
            boot_order: opt
                .pcat_boot_order
                .map(|x| x.0)
                .unwrap_or(DEFAULT_PCAT_BOOT_ORDER),
        };
    } else if opt.uefi {
        use openvmm_defs::config::UefiConsoleMode;

        with_hv = true;

        let firmware = fs_err::File::open(
            (opt.uefi_firmware.0)
                .as_ref()
                .context("must provide uefi firmware when booting with uefi")?,
        )
        .context("failed to open uefi firmware")?;

        // TODO: It would be better to default memory protections to on, but currently Linux does not boot via UEFI due to what
        //       appears to be a GRUB memory protection fault. Memory protections are therefore only enabled if configured.
        load_mode = LoadMode::Uefi {
            firmware: firmware.into(),
            enable_debugging: opt.uefi_debug,
            enable_memory_protections: opt.uefi_enable_memory_protections,
            disable_frontpage: opt.disable_frontpage,
            enable_tpm: opt.tpm,
            enable_battery: opt.battery,
            enable_serial: any_serial_configured,
            enable_vpci_boot: false,
            // Map the CLI console-mode enum onto the config-layer enum.
            uefi_console_mode: opt.uefi_console_mode.map(|m| match m {
                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
                UefiConsoleModeCli::Com1 => UefiConsoleMode::Com1,
                UefiConsoleModeCli::Com2 => UefiConsoleMode::Com2,
                UefiConsoleModeCli::None => UefiConsoleMode::None,
            }),
            default_boot_always_attempt: opt.default_boot_always_attempt,
            bios_guid,
        };
    } else {
        // Linux Direct
        // Baseline guest command line; extended by the options below.
        let mut cmdline = "panic=-1 debug".to_string();

        with_hv = opt.hv;
        // Disable PCI probing when no PCIe root complex is configured.
        if with_hv && opt.pcie_root_complex.is_empty() {
            cmdline += " pci=off";
        }

        if !console_str.is_empty() {
            let _ = write!(&mut cmdline, " console={}", console_str);
        }

        if opt.gfx {
            cmdline += " console=tty";
        }
        // Append any user-provided extra command line fragments.
        for extra in &opt.cmdline {
            let _ = write!(&mut cmdline, " {}", extra);
        }

        let kernel = fs_err::File::open(
            (opt.kernel.0)
                .as_ref()
                .context("must provide kernel when booting with linux direct")?,
        )
        .context("failed to open kernel")?;
        let initrd = (opt.initrd.0)
            .as_ref()
            .map(fs_err::File::open)
            .transpose()
            .context("failed to open initrd")?;

        // Read an optional replacement DSDT ACPI table from disk.
        let custom_dsdt = match &opt.custom_dsdt {
            Some(path) => {
                let mut v = Vec::new();
                fs_err::File::open(path)
                    .context("failed to open custom dsdt")?
                    .read_to_end(&mut v)
                    .context("failed to read custom dsdt")?;
                Some(v)
            }
            None => None,
        };

        load_mode = LoadMode::Linux {
            kernel: kernel.into(),
            initrd: initrd.map(Into::into),
            cmdline,
            custom_dsdt,
            enable_serial: any_serial_configured,
            boot_mode: if opt.device_tree {
                openvmm_defs::config::LinuxDirectBootMode::DeviceTree
            } else {
                openvmm_defs::config::LinuxDirectBootMode::Acpi
            },
        };
    }
1054
    // Open the guest-state (VMGS) backing disk, if one was configured, and
    // choose how it is provisioned. Wrapped in `Some` so a later consumer can
    // `take()` ownership of it.
    let mut vmgs = Some(if let Some(VmgsCli { kind, provision }) = &opt.vmgs {
        let disk = VmgsDisk {
            disk: disk_open(kind, false)
                .await
                .context("failed to open vmgs disk")?,
            // Encryption is GSP-by-id only when explicitly requested for test.
            encryption_policy: if opt.test_gsp_by_id {
                GuestStateEncryptionPolicy::GspById(true)
            } else {
                GuestStateEncryptionPolicy::None(true)
            },
        };
        match provision {
            ProvisionVmgs::OnEmpty => VmgsResource::Disk(disk),
            ProvisionVmgs::OnFailure => VmgsResource::ReprovisionOnFailure(disk),
            ProvisionVmgs::True => VmgsResource::Reprovision(disk),
        }
    } else {
        // No VMGS disk: guest state lives only for the lifetime of the VM.
        VmgsResource::Ephemeral
    });
1074
    // Offer the guest emulation log (GEL) and guest emulation device (GED)
    // over vmbus to the OpenHCL VTL. NOTE(review): `with_get` is set above
    // this view — presumably it indicates the guest emulation transport is in
    // use; confirm at its definition.
    if with_get && with_hv {
        let vtl2_settings = vtl2_settings_proto::Vtl2Settings {
            version: vtl2_settings_proto::vtl2_settings_base::Version::V1.into(),
            fixed: Some(Default::default()),
            dynamic: Some(vtl2_settings_proto::Vtl2SettingsDynamic {
                storage_controllers: storage.build_underhill(opt.vmbus_redirect),
                nic_devices: underhill_nics,
            }),
            namespace_settings: Vec::default(),
        };

        // Cache the VTL2 settings for later modification via the interactive console.
        resources.vtl2_settings = Some(vtl2_settings.clone());

        let (send, guest_request_recv) = mesh::channel();
        resources.ged_rpc = Some(send);

        // The GED takes ownership of the VMGS resource; `vmgs` was set to
        // `Some` unconditionally above, so the unwrap cannot fail.
        let vmgs = vmgs.take().unwrap();

        vmbus_devices.extend([
            (
                openhcl_vtl,
                get_resources::gel::GuestEmulationLogHandle.into_resource(),
            ),
            (
                openhcl_vtl,
                get_resources::ged::GuestEmulationDeviceHandle {
                    // Translate the CLI firmware choice into the GED's view of
                    // guest firmware configuration.
                    firmware: if opt.pcat {
                        get_resources::ged::GuestFirmwareConfig::Pcat {
                            boot_order: opt
                                .pcat_boot_order
                                .map_or(DEFAULT_PCAT_BOOT_ORDER, |x| x.0)
                                .map(|x| match x {
                                    openvmm_defs::config::PcatBootDevice::Floppy => {
                                        get_resources::ged::PcatBootDevice::Floppy
                                    }
                                    openvmm_defs::config::PcatBootDevice::HardDrive => {
                                        get_resources::ged::PcatBootDevice::HardDrive
                                    }
                                    openvmm_defs::config::PcatBootDevice::Optical => {
                                        get_resources::ged::PcatBootDevice::Optical
                                    }
                                    openvmm_defs::config::PcatBootDevice::Network => {
                                        get_resources::ged::PcatBootDevice::Network
                                    }
                                }),
                        }
                    } else {
                        use get_resources::ged::UefiConsoleMode;

                        get_resources::ged::GuestFirmwareConfig::Uefi {
                            // Enable VPCI boot when any VTL0 NVMe storage exists.
                            enable_vpci_boot: storage.has_vtl0_nvme(),
                            firmware_debug: opt.uefi_debug,
                            disable_frontpage: opt.disable_frontpage,
                            console_mode: match opt.uefi_console_mode.unwrap_or(UefiConsoleModeCli::Default) {
                                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
                                UefiConsoleModeCli::Com1 => UefiConsoleMode::COM1,
                                UefiConsoleModeCli::Com2 => UefiConsoleMode::COM2,
                                UefiConsoleModeCli::None => UefiConsoleMode::None,
                            },
                            default_boot_always_attempt: opt.default_boot_always_attempt,
                        }
                    },
                    com1: with_vmbus_com1_serial,
                    com2: with_vmbus_com2_serial,
                    serial_tx_only: opt.serial_tx_only,
                    // The GED carries the protobuf-encoded settings to VTL2.
                    vtl2_settings: Some(prost::Message::encode_to_vec(&vtl2_settings)),
                    vmbus_redirection: opt.vmbus_redirect,
                    vmgs,
                    framebuffer: opt
                        .vtl2_gfx
                        .then(|| SharedFramebufferHandle.into_resource()),
                    guest_request_recv,
                    enable_tpm: opt.tpm,
                    firmware_event_send: None,
                    secure_boot_enabled: opt.secure_boot,
                    secure_boot_template: match opt.secure_boot_template {
                        Some(SecureBootTemplateCli::Windows) => {
                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftWindows
                        },
                        Some(SecureBootTemplateCli::UefiCa) => {
                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftUefiCertificateAuthority
                        }
                        None => {
                            get_resources::ged::GuestSecureBootTemplateType::None
                        },
                    },
                    enable_battery: opt.battery,
                    no_persistent_secrets: true,
                    igvm_attest_test_config: None,
                    test_gsp_by_id: opt.test_gsp_by_id,
                    efi_diagnostics_log_level: {
                        match opt.efi_diagnostics_log_level.unwrap_or_default() {
                            EfiDiagnosticsLogLevelCli::Default => get_resources::ged::EfiDiagnosticsLogLevelType::Default,
                            EfiDiagnosticsLogLevelCli::Info => get_resources::ged::EfiDiagnosticsLogLevelType::Info,
                            EfiDiagnosticsLogLevelCli::Full => get_resources::ged::EfiDiagnosticsLogLevelType::Full,
                        }
                    },
                    hv_sint_enabled: false,
                }
                .into_resource(),
            ),
        ]);
    }
1179
    // Host-emulated TPM, only when not running with VTL2 (the VTL2 path
    // configures the TPM via the GED above instead).
    if opt.tpm && !opt.vtl2 {
        // x86_64 guests use the io-port register interface; others use MMIO.
        let register_layout = if cfg!(guest_arch = "x86_64") {
            TpmRegisterLayout::IoPort
        } else {
            TpmRegisterLayout::Mmio
        };

        // Persist PPI and NVRAM state in the VMGS when one is configured;
        // otherwise the TPM state is lost when the VM shuts down.
        let (ppi_store, nvram_store) = if opt.vmgs.is_some() {
            (
                VmgsFileHandle::new(vmgs_format::FileId::TPM_PPI, true).into_resource(),
                VmgsFileHandle::new(vmgs_format::FileId::TPM_NVRAM, true).into_resource(),
            )
        } else {
            (
                EphemeralNonVolatileStoreHandle.into_resource(),
                EphemeralNonVolatileStoreHandle.into_resource(),
            )
        };

        // The TPM is hosted on its own mesh worker host (remote chipset device).
        chipset_devices.push(ChipsetDeviceHandle {
            name: "tpm".to_string(),
            resource: chipset_device_worker_defs::RemoteChipsetDeviceHandle {
                device: TpmDeviceHandle {
                    ppi_store,
                    nvram_store,
                    nvram_size: None,
                    refresh_tpm_seeds: false,
                    ak_cert_type: tpm_resources::TpmAkCertTypeResource::None,
                    register_layout,
                    guest_secret_key: None,
                    logger: None,
                    is_confidential_vm: false,
                    bios_guid,
                }
                .into_resource(),
                worker_host: mesh.make_host("tpm", None).await?,
            }
            .into_resource(),
        });
    }
1220
    // Assemble the custom UEFI variable set: a secure-boot template (if any)
    // as the base, with an optional JSON-specified delta applied on top.
    let custom_uefi_vars = {
        use firmware_uefi_custom_vars::CustomVars;

        // load base vars from specified template, or use an empty set of base
        // vars if none was specified.
        let base_vars = match opt.secure_boot_template {
            Some(template) => match (arch, template) {
                (MachineArch::X86_64, SecureBootTemplateCli::Windows) => {
                    hyperv_secure_boot_templates::x64::microsoft_windows()
                }
                (MachineArch::X86_64, SecureBootTemplateCli::UefiCa) => {
                    hyperv_secure_boot_templates::x64::microsoft_uefi_ca()
                }
                (MachineArch::Aarch64, SecureBootTemplateCli::Windows) => {
                    hyperv_secure_boot_templates::aarch64::microsoft_windows()
                }
                (MachineArch::Aarch64, SecureBootTemplateCli::UefiCa) => {
                    hyperv_secure_boot_templates::aarch64::microsoft_uefi_ca()
                }
            },
            None => CustomVars::default(),
        };

        // TODO: fallback to VMGS read if no command line flag was given

        let custom_uefi_json_data = match &opt.custom_uefi_json {
            Some(file) => Some(fs_err::read(file).context("opening custom uefi json file")?),
            None => None,
        };

        // obtain the final custom uefi vars by applying the delta onto the base vars
        match custom_uefi_json_data {
            Some(data) => {
                let delta = hyperv_uefi_custom_vars_json::load_delta_from_json(&data)?;
                base_vars.apply_delta(delta)?
            }
            None => base_vars,
        }
    };
1260
    // PCAT needs a VGA BIOS; locate it (honoring an explicitly given path).
    let vga_firmware = if opt.pcat {
        Some(openvmm_pcat_locator::find_svga_bios(
            opt.vga_firmware.as_deref(),
        )?)
    } else {
        None
    };

    // Synthetic (vmbus) video, keyboard, and mouse for graphical output.
    if opt.gfx {
        vmbus_devices.extend([
            (
                DeviceVtl::Vtl0,
                SynthVideoHandle {
                    framebuffer: SharedFramebufferHandle.into_resource(),
                }
                .into_resource(),
            ),
            (
                DeviceVtl::Vtl0,
                SynthKeyboardHandle {
                    source: MultiplexedInputHandle {
                        // Save 0 for PS/2
                        elevation: 1,
                    }
                    .into_resource(),
                }
                .into_resource(),
            ),
            (
                DeviceVtl::Vtl0,
                SynthMouseHandle {
                    source: MultiplexedInputHandle {
                        // Save 0 for PS/2
                        elevation: 1,
                    }
                    .into_resource(),
                }
                .into_resource(),
            ),
        ]);
    }
1302
1303    let vsock_listener = |path: Option<&str>| -> anyhow::Result<_> {
1304        if let Some(path) = path {
1305            cleanup_socket(path.as_ref());
1306            let listener = unix_socket::UnixListener::bind(path)
1307                .with_context(|| format!("failed to bind to hybrid vsock path: {}", path))?;
1308            Ok(Some(listener))
1309        } else {
1310            Ok(None)
1311        }
1312    };
1313
    // Separate hybrid-vsock listeners for VTL0 and VTL2.
    let vtl0_vsock_listener = vsock_listener(opt.vmbus_vsock_path.as_deref())?;
    let vtl2_vsock_listener = vsock_listener(opt.vmbus_vtl2_vsock_path.as_deref())?;

    // Optional OpenHCL crash-dump target: spawn the handler, detach its task,
    // and offer the resulting device to the OpenHCL VTL over vmbus.
    if let Some(path) = &opt.openhcl_dump_path {
        let (resource, task) = spawn_dump_handler(&spawner, path.clone(), None);
        task.detach();
        vmbus_devices.push((openhcl_vtl, resource));
    }

    // Architecture-specific processor topology; exactly one of these two
    // cfg'd bindings is compiled in.
    #[cfg(guest_arch = "aarch64")]
    let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64(
        openvmm_defs::config::Aarch64TopologyConfig {
            // TODO: allow this to be configured from the command line
            gic_config: None,
            pmu_gsiv: openvmm_defs::config::PmuGsivConfig::Platform,
            gic_msi: match opt.gic_msi {
                cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
                cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
                cli_args::GicMsiCli::V2m => openvmm_defs::config::GicMsiConfig::V2m,
            },
        },
    );
    #[cfg(guest_arch = "x86_64")]
    let topology_arch =
        openvmm_defs::config::ArchTopologyConfig::X86(openvmm_defs::config::X86TopologyConfig {
            apic_id_offset: opt.apic_id_offset,
            x2apic: opt.x2apic,
        });
1342
    // Validate and translate the requested isolation type.
    let with_isolation = if let Some(isolation) = &opt.isolation {
        // TODO: For now, isolation is only supported with VTL2.
        if !opt.vtl2 {
            anyhow::bail!("isolation is only currently supported with vtl2");
        }

        // TODO: Alias map support is not yet implement with isolation.
        if !opt.no_alias_map {
            anyhow::bail!("alias map not supported with isolation");
        }

        match isolation {
            cli_args::IsolationCli::Vbs => Some(openvmm_defs::config::IsolationType::Vbs),
        }
    } else {
        None
    };

    // Hyper-V integration components (shutdown, KVP, timesync), offered to
    // VTL0 whenever the hypervisor interface is enabled. The send halves are
    // kept in `resources` so the control process can drive them.
    if with_hv {
        let (shutdown_send, shutdown_recv) = mesh::channel();
        resources.shutdown_ic = Some(shutdown_send);
        let (kvp_send, kvp_recv) = mesh::channel();
        resources.kvp_ic = Some(kvp_send);
        vmbus_devices.extend(
            [
                hyperv_ic_resources::shutdown::ShutdownIcHandle {
                    recv: shutdown_recv,
                }
                .into_resource(),
                hyperv_ic_resources::kvp::KvpIcHandle { recv: kvp_recv }.into_resource(),
                hyperv_ic_resources::timesync::TimesyncIcHandle.into_resource(),
            ]
            .map(|r| (DeviceVtl::Vtl0, r)),
        );
    }

    // Offer the IMC hive file to VTL0 via a vmbfs device when provided.
    if let Some(hive_path) = &opt.imc {
        let file = fs_err::File::open(hive_path).context("failed to open imc hive")?;
        vmbus_devices.push((
            DeviceVtl::Vtl0,
            vmbfs_resources::VmbfsImcDeviceHandle { file: file.into() }.into_resource(),
        ));
    }
1386
    let mut virtio_devices = Vec::new();
    // Route a virtio device onto the requested bus: MMIO/PCI go into
    // `virtio_devices`; VPCI (bus == None) goes into `vpci_devices` with a
    // freshly generated instance id.
    let mut add_virtio_device = |bus, resource: Resource<VirtioDeviceHandle>| {
        let bus = match bus {
            VirtioBusCli::Auto => {
                // Use VPCI when possible (currently only on Windows and macOS due
                // to KVM backend limitations).
                if with_hv && (cfg!(windows) || cfg!(target_os = "macos")) {
                    None
                } else {
                    Some(VirtioBus::Pci)
                }
            }
            VirtioBusCli::Mmio => Some(VirtioBus::Mmio),
            VirtioBusCli::Pci => Some(VirtioBus::Pci),
            VirtioBusCli::Vpci => None,
        };
        if let Some(bus) = bus {
            virtio_devices.push((bus, resource));
        } else {
            // VPCI: wrap the virtio device in a virtio-over-PCI handle.
            vpci_devices.push(VpciDeviceConfig {
                vtl: DeviceVtl::Vtl0,
                instance_id: Guid::new_random(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        }
    };
1413
    // virtio-net NICs. Each device below follows the same pattern: an
    // explicit `pcie_port` pins the device to that PCIe port; otherwise
    // `add_virtio_device` chooses the bus.
    for cli_cfg in &opt.virtio_net {
        if cli_cfg.underhill {
            anyhow::bail!("use --net uh:[...] to add underhill NICs")
        }
        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
        let resource = virtio_resources::net::VirtioNetHandle {
            max_queues: vport.max_queues,
            mac_address: vport.mac_address,
            endpoint: vport.endpoint,
        }
        .into_resource();
        if let Some(pcie_port) = &cli_cfg.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }

    // virtio-fs devices backed by a host directory.
    for args in &opt.virtio_fs {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
            tag: args.tag.clone(),
            fs: virtio_resources::fs::VirtioFsBackend::HostFs {
                root_path: args.path.clone(),
                mount_options: args.options.clone(),
            },
        }
        .into_resource();
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_fs_bus, resource);
        }
    }

    // virtio-fs devices backed by the section-mapping backend.
    for args in &opt.virtio_fs_shmem {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
            tag: args.tag.clone(),
            fs: virtio_resources::fs::VirtioFsBackend::SectionFs {
                root_path: args.path.clone(),
            },
        }
        .into_resource();
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_fs_bus, resource);
        }
    }

    // virtio 9p (Plan 9 filesystem) devices.
    for args in &opt.virtio_9p {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::p9::VirtioPlan9Handle {
            tag: args.tag.clone(),
            root_path: args.path.clone(),
            debug: opt.virtio_9p_debug,
        }
        .into_resource();
        if let Some(pcie_port) = &args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }

    // Optional virtio-pmem device backed by the given host path.
    if let Some(pmem_args) = &opt.virtio_pmem {
        let resource: Resource<VirtioDeviceHandle> = virtio_resources::pmem::VirtioPmemHandle {
            path: pmem_args.path.clone(),
        }
        .into_resource();
        if let Some(pcie_port) = &pmem_args.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }

    // Optional virtio entropy (rng) device.
    if opt.virtio_rng {
        let resource: Resource<VirtioDeviceHandle> =
            virtio_resources::rng::VirtioRngHandle.into_resource();
        if let Some(pcie_port) = &opt.virtio_rng_pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(opt.virtio_rng_bus, resource);
        }
    }

    // Optional virtio console, when a backend was configured earlier.
    if let Some(backend) = virtio_console_backend {
        let resource: Resource<VirtioDeviceHandle> =
            virtio_resources::console::VirtioConsoleHandle { backend }.into_resource();
        if let Some(pcie_port) = &opt.virtio_console_pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }
1529
    // Handle --vhost-user arguments.
    #[cfg(target_os = "linux")]
    for vhost_cli in &opt.vhost_user {
        let stream =
            unix_socket::UnixStream::connect(&vhost_cli.socket_path).with_context(|| {
                format!(
                    "failed to connect to vhost-user socket: {}",
                    vhost_cli.socket_path
                )
            })?;

        use crate::cli_args::VhostUserDeviceTypeCli;
        // Build the matching vhost-user frontend for the requested device type,
        // handing the connected socket to the backend handle.
        let resource: Resource<VirtioDeviceHandle> = match vhost_cli.device_type {
            VhostUserDeviceTypeCli::Fs {
                ref tag,
                num_queues,
                queue_size,
            } => virtio_resources::vhost_user::VhostUserFsHandle {
                socket: stream.into(),
                tag: tag.clone(),
                num_queues,
                queue_size,
            }
            .into_resource(),
            VhostUserDeviceTypeCli::Blk {
                num_queues,
                queue_size,
            } => virtio_resources::vhost_user::VhostUserBlkHandle {
                socket: stream.into(),
                num_queues,
                queue_size,
            }
            .into_resource(),
            VhostUserDeviceTypeCli::Other {
                device_id,
                ref queue_sizes,
            } => virtio_resources::vhost_user::VhostUserGenericHandle {
                socket: stream.into(),
                device_id,
                queue_sizes: queue_sizes.clone(),
            }
            .into_resource(),
        };
        // Same PCIe-port-vs-auto routing as the other virtio devices above.
        if let Some(pcie_port) = &vhost_cli.pcie_port {
            pcie_devices.push(PcieDeviceConfig {
                port_name: pcie_port.clone(),
                resource: VirtioPciDeviceHandle(resource).into_resource(),
            });
        } else {
            add_virtio_device(VirtioBusCli::Auto, resource);
        }
    }

    if let Some(vsock_path) = &opt.virtio_vsock_path {
        // `Some(..)` input guarantees a `Some(..)` listener, so unwrap is safe.
        let listener = vsock_listener(Some(vsock_path))?.unwrap();
        add_virtio_device(
            VirtioBusCli::Auto,
            virtio_resources::vsock::VirtioVsockHandle {
                // The guest CID does not matter since the UDS relay does not use it. It just needs
                // to be some non-reserved value for the guest to use.
                guest_cid: 0x3,
                base_path: vsock_path.clone(),
                listener,
            }
            .into_resource(),
        );
    }
1597
    // Assemble the final VM configuration from everything built above.
    let mut cfg = Config {
        chipset,
        load_mode,
        floppy_disks,
        pcie_root_complexes,
        // On Linux, merge the VFIO-assigned PCIe devices into the CLI list.
        #[cfg(target_os = "linux")]
        pcie_devices: {
            let mut devs = pcie_devices;
            devs.extend(vfio_pcie_devices);
            devs
        },
        #[cfg(not(target_os = "linux"))]
        pcie_devices,
        pcie_switches,
        vpci_devices,
        ide_disks: Vec::new(),
        memory: MemoryConfig {
            // With explicit per-NUMA-node sizes, total memory is their sum
            // (overflow-checked); otherwise use the plain memory size option.
            mem_size: if let Some(ref sizes) = opt.numa_memory {
                sizes
                    .iter()
                    .try_fold(0u64, |acc, &s| acc.checked_add(s))
                    .context("numa memory sizes overflow")?
            } else {
                opt.memory_size()
            },
            mmio_gaps,
            prefetch_memory: opt.prefetch_memory(),
            private_memory: opt.private_memory(),
            transparent_hugepages: opt.transparent_hugepages(),
            hugepages: opt.memory.hugepages,
            hugepage_size: opt.memory.hugepage_size,
            pci_ecam_gaps,
            pci_mmio_gaps,
            numa_mem_sizes: opt.numa_memory.clone(),
        },
        processor_topology: ProcessorTopologyConfig {
            proc_count: opt.processors,
            vps_per_socket: opt.vps_per_socket,
            // Auto leaves SMT to the backend; Force/Off override explicitly.
            enable_smt: match opt.smt {
                cli_args::SmtConfigCli::Auto => None,
                cli_args::SmtConfigCli::Force => Some(true),
                cli_args::SmtConfigCli::Off => Some(false),
            },
            arch: Some(topology_arch),
        },
        hypervisor: HypervisorConfig {
            with_hv,
            with_vtl2: opt.vtl2.then_some(Vtl2Config {
                vtl0_alias_map: !opt.no_alias_map,
                late_map_vtl0_memory: match opt.late_map_vtl0_policy {
                    cli_args::Vtl0LateMapPolicyCli::Off => None,
                    cli_args::Vtl0LateMapPolicyCli::Log => Some(LateMapVtl0MemoryPolicy::Log),
                    cli_args::Vtl0LateMapPolicyCli::Halt => Some(LateMapVtl0MemoryPolicy::Halt),
                    cli_args::Vtl0LateMapPolicyCli::Exception => {
                        Some(LateMapVtl0MemoryPolicy::InjectException)
                    }
                },
            }),
            with_isolation,
        },
        #[cfg(windows)]
        kernel_vmnics,
        input: mesh::Receiver::new(),
        framebuffer,
        vga_firmware,
        vtl2_gfx: opt.vtl2_gfx,
        virtio_devices,
        // The VTL0 vmbus exists whenever the hypervisor interface does.
        vmbus: with_hv.then_some(VmbusConfig {
            vsock_listener: vtl0_vsock_listener,
            vsock_path: opt.vmbus_vsock_path.clone(),
            vtl2_redirect: opt.vmbus_redirect,
            vmbus_max_version: opt.vmbus_max_version,
            #[cfg(windows)]
            vmbusproxy_handle,
        }),
        // A separate vmbus instance for VTL2, only when VTL2 is enabled.
        vtl2_vmbus: (with_hv && opt.vtl2).then_some(VmbusConfig {
            vsock_listener: vtl2_vsock_listener,
            vsock_path: opt.vmbus_vtl2_vsock_path.clone(),
            ..Default::default()
        }),
        vmbus_devices,
        chipset_devices,
        pci_chipset_devices,
        chipset_capabilities: capabilities,
        #[cfg(windows)]
        vpci_resources,
        vmgs,
        secure_boot_enabled: opt.secure_boot,
        custom_uefi_vars,
        firmware_event_send: None,
        debugger_rpc: None,
        generation_id_recv: None,
        rtc_delta_milliseconds: 0,
        automatic_guest_reset: !opt.halt_on_reset,
1692        efi_diagnostics_log_level: {
1693            match opt.efi_diagnostics_log_level.unwrap_or_default() {
1694                EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
1695                EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
1696                EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
1697            }
1698        },
1699    };
1700
1701    storage.build_config(&mut cfg, &mut resources, opt.scsi_sub_channels)?;
1702    Ok((cfg, resources))
1703}
1704
/// Gets the terminal to use for externally launched console windows, from the
/// `OPENVMM_TERM` environment variable, falling back to the legacy
/// `HVLITE_TERM` variable.
pub(crate) fn openvmm_terminal_app() -> Option<PathBuf> {
    for var in ["OPENVMM_TERM", "HVLITE_TERM"] {
        if let Some(term) = std::env::var_os(var) {
            return Some(term.into());
        }
    }
    None
}
1711
// Best-effort removal of a stale socket file: deletes `path` only when it is
// confirmed to be a Unix-domain socket, leaving any other file type untouched.
// Removal errors are deliberately ignored.
fn cleanup_socket(path: &Path) {
    #[cfg(windows)]
    let is_socket = pal::windows::fs::is_unix_socket(path).unwrap_or(false);
    #[cfg(not(windows))]
    let is_socket = match path.metadata() {
        Ok(meta) => std::os::unix::fs::FileTypeExt::is_socket(&meta.file_type()),
        Err(_) => false,
    };

    if is_socket {
        let _ = std::fs::remove_file(path);
    }
}
1725
/// Switch ID used for dio NIC endpoints when no switch is specified on the
/// command line.
///
/// NOTE(review): this GUID appears to identify the well-known Hyper-V
/// "Default Switch" network — confirm against the host's HCN configuration.
#[cfg(windows)]
const DEFAULT_SWITCH: &str = "C08CB7B8-9B3C-408E-8E30-5E16A3AEB444";
1728
/// Creates a new port on the vmswitch identified by `switch_id` (a GUID
/// string), returning the port's config-level ID along with the kernel
/// switch-port handle that keeps the port alive.
#[cfg(windows)]
fn new_switch_port(
    switch_id: &str,
) -> anyhow::Result<(
    openvmm_defs::config::SwitchPortId,
    vmswitch::kernel::SwitchPort,
)> {
    let kernel_id = vmswitch::kernel::SwitchPortId {
        switch: switch_id.parse().context("invalid switch id")?,
        port: Guid::new_random(),
    };

    // Probe the switch up front so a bad/missing switch produces a clear
    // error rather than a failure at port-creation time.
    let _ = vmswitch::hcn::Network::open(&kernel_id.switch)
        .with_context(|| format!("could not find switch {}", kernel_id.switch))?;

    let port =
        vmswitch::kernel::SwitchPort::new(&kernel_id).context("failed to create switch port")?;

    Ok((
        openvmm_defs::config::SwitchPortId {
            switch: kernel_id.switch,
            port: kernel_id.port,
        },
        port,
    ))
}
1751
/// Builds a [`NicConfig`] from command-line NIC configuration, constructing
/// the requested network backend endpoint resource, a random MAC address, and
/// an instance ID derived from `index` (which is then incremented).
fn parse_endpoint(
    cli_cfg: &NicConfigCli,
    index: &mut usize,
    resources: &mut VmResources,
) -> anyhow::Result<NicConfig> {
    // `resources` is only touched in the windows-only dio branch below; this
    // keeps the parameter "used" on other platforms.
    let _ = resources;
    let endpoint = match &cli_cfg.endpoint {
        // User-mode NAT backend with optional host->guest port forwarding.
        EndpointConfigCli::Consomme { cidr, host_fwd } => {
            let ports = host_fwd
                .iter()
                .map(|fwd| {
                    use net_backend_resources::consomme::HostPortProtocol;
                    net_backend_resources::consomme::HostPortConfig {
                        protocol: match fwd.protocol {
                            cli_args::HostPortProtocolCli::Tcp => HostPortProtocol::Tcp,
                            cli_args::HostPortProtocolCli::Udp => HostPortProtocol::Udp,
                        },
                        host_address: fwd
                            .host_address
                            .map(net_backend_resources::consomme::HostIpAddress::from),
                        host_port: fwd.host_port,
                        guest_port: fwd.guest_port,
                    }
                })
                .collect();
            net_backend_resources::consomme::ConsommeHandle {
                cidr: cidr.clone(),
                ports,
            }
            .into_resource()
        }
        // A NIC with no backing endpoint.
        EndpointConfigCli::None => net_backend_resources::null::NullHandle.into_resource(),
        // Windows direct I/O against a vmswitch port (windows-only).
        EndpointConfigCli::Dio { id } => {
            #[cfg(windows)]
            {
                // The port handle must outlive the VM, so stash it in the
                // resources.
                let (port_id, port) = new_switch_port(id.as_deref().unwrap_or(DEFAULT_SWITCH))?;
                resources.switch_ports.push(port);
                net_backend_resources::dio::WindowsDirectIoHandle {
                    switch_port_id: net_backend_resources::dio::SwitchPortId {
                        switch: port_id.switch,
                        port: port_id.port,
                    },
                }
                .into_resource()
            }

            #[cfg(not(windows))]
            {
                let _ = id;
                bail!("cannot use dio on non-windows platforms")
            }
        }
        // TAP device backend (linux-only).
        EndpointConfigCli::Tap { name } => {
            #[cfg(target_os = "linux")]
            {
                let fd = net_tap::tap::open_tap(name)
                    .with_context(|| format!("failed to open TAP device '{name}'"))?;
                net_backend_resources::tap::TapHandle { fd }.into_resource()
            }

            #[cfg(not(target_os = "linux"))]
            {
                let _ = name;
                bail!("TAP backend is only supported on Linux")
            }
        }
    };

    // Pick a random MAC address with a fixed 00:15:5D prefix and random low
    // three bytes.
    let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
    getrandom::fill(&mut mac_address[3..]).expect("rng failure");

    // Pick a fixed instance ID based on the index.
    const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-da43-11ed-936a-00155d6db52f");
    let instance_id = Guid {
        data1: *index as u32,
        ..BASE_INSTANCE_ID
    };
    *index += 1;

    Ok(NicConfig {
        vtl: cli_cfg.vtl,
        instance_id,
        endpoint,
        mac_address: mac_address.into(),
        max_queues: cli_cfg.max_queues,
        pcie_port: cli_cfg.pcie_port.clone(),
    })
}
1841
/// Fully-resolved configuration for a single guest NIC, produced by
/// [`parse_endpoint`].
#[derive(Debug)]
struct NicConfig {
    // Which VTL the device is offered to.
    vtl: DeviceVtl,
    // Device instance ID: a fixed base GUID with `data1` set to the NIC index.
    instance_id: Guid,
    // Randomly generated MAC address (fixed 00:15:5D prefix).
    mac_address: MacAddress,
    // The network backend endpoint resource.
    endpoint: Resource<NetEndpointHandleKind>,
    // Optional queue-count limit, forwarded to the netvsp handle.
    max_queues: Option<u16>,
    // Optional PCIe port for the NIC. NOTE(review): exact semantics depend on
    // the consumer of this config — confirm with callers.
    pcie_port: Option<String>,
}
1851
1852impl NicConfig {
1853    fn into_netvsp_handle(self) -> (DeviceVtl, Resource<VmbusDeviceHandleKind>) {
1854        (
1855            self.vtl,
1856            netvsp_resources::NetvspHandle {
1857                instance_id: self.instance_id,
1858                mac_address: self.mac_address,
1859                endpoint: self.endpoint,
1860                max_queues: self.max_queues,
1861            }
1862            .into_resource(),
1863        )
1864    }
1865}
1866
/// An element of a disk stack built by [`disk_open_inner`]: either a single
/// disk layer, or a complete disk (which [`disk_open`] wraps in a
/// [`DiskLayerHandle`] when it ends up stacked under other layers).
enum LayerOrDisk {
    /// A single disk layer description.
    Layer(DiskLayerDescription),
    /// A fully-formed disk resource.
    Disk(Resource<DiskHandleKind>),
}
1871
1872async fn disk_open(
1873    disk_cli: &DiskCliKind,
1874    read_only: bool,
1875) -> anyhow::Result<Resource<DiskHandleKind>> {
1876    let mut layers = Vec::new();
1877    disk_open_inner(disk_cli, read_only, &mut layers).await?;
1878    if layers.len() == 1 && matches!(layers[0], LayerOrDisk::Disk(_)) {
1879        let LayerOrDisk::Disk(disk) = layers.pop().unwrap() else {
1880            unreachable!()
1881        };
1882        Ok(disk)
1883    } else {
1884        Ok(Resource::new(disk_backend_resources::LayeredDiskHandle {
1885            layers: layers
1886                .into_iter()
1887                .map(|layer| match layer {
1888                    LayerOrDisk::Layer(layer) => layer,
1889                    LayerOrDisk::Disk(disk) => DiskLayerDescription {
1890                        layer: DiskLayerHandle(disk).into_resource(),
1891                        read_cache: false,
1892                        write_through: false,
1893                    },
1894                })
1895                .collect(),
1896        }))
1897    }
1898}
1899
/// Recursively resolves a [`DiskCliKind`] into a stack of disk layers and/or
/// complete disks, appending to `layers`.
///
/// Returns a boxed future because the function recurses for the wrapper/diff
/// disk kinds.
fn disk_open_inner<'a>(
    disk_cli: &'a DiskCliKind,
    read_only: bool,
    layers: &'a mut Vec<LayerOrDisk>,
) -> futures::future::BoxFuture<'a, anyhow::Result<()>> {
    Box::pin(async move {
        // Small helpers to reduce noise when pushing onto the stack.
        fn layer<T: IntoResource<DiskLayerHandleKind>>(layer: T) -> LayerOrDisk {
            LayerOrDisk::Layer(layer.into_resource().into())
        }
        fn disk<T: IntoResource<DiskHandleKind>>(disk: T) -> LayerOrDisk {
            LayerOrDisk::Disk(disk.into_resource())
        }
        match disk_cli {
            // Fixed-size RAM-backed disk.
            &DiskCliKind::Memory(len) => {
                layers.push(layer(RamDiskLayerHandle {
                    len: Some(len),
                    sector_size: None,
                }));
            }
            // File-backed disk, optionally created with the given size.
            DiskCliKind::File {
                path,
                create_with_len,
                direct,
            } => layers.push(LayerOrDisk::Disk(if let Some(size) = create_with_len {
                create_disk_type(
                    path,
                    *size,
                    OpenDiskOptions {
                        read_only: false,
                        direct: *direct,
                    },
                )
                .with_context(|| format!("failed to create {}", path.display()))?
            } else {
                open_disk_type(
                    path,
                    OpenDiskOptions {
                        read_only,
                        direct: *direct,
                    },
                )
                .await
                .with_context(|| format!("failed to open {}", path.display()))?
            })),
            // Disk backed by a blob at a URL, in flat or fixed-VHD1 format.
            DiskCliKind::Blob { kind, url } => {
                layers.push(disk(disk_backend_resources::BlobDiskHandle {
                    url: url.to_owned(),
                    format: match kind {
                        cli_args::BlobKind::Flat => disk_backend_resources::BlobDiskFormat::Flat,
                        cli_args::BlobKind::Vhd1 => {
                            disk_backend_resources::BlobDiskFormat::FixedVhd1
                        }
                    },
                }))
            }
            // Writable RAM layer over a read-only inner disk.
            DiskCliKind::MemoryDiff(inner) => {
                layers.push(layer(RamDiskLayerHandle {
                    len: None,
                    sector_size: None,
                }));
                disk_open_inner(inner, true, layers).await?;
            }
            // Wrap the inner disk with persistent-reservations support.
            DiskCliKind::PersistentReservationsWrapper(inner) => {
                layers.push(disk(disk_backend_resources::DiskWithReservationsHandle(
                    disk_open(inner, read_only).await?,
                )))
            }
            // Wrap the inner disk so each I/O is delayed by `delay_ms`.
            DiskCliKind::DelayDiskWrapper {
                delay_ms,
                disk: inner,
            } => layers.push(disk(DelayDiskHandle {
                delay: CellUpdater::new(Duration::from_millis(*delay_ms)).cell(),
                disk: disk_open(inner, read_only).await?,
            })),
            // Encrypt the inner disk with the given cipher; the key is read
            // from a file.
            DiskCliKind::Crypt {
                disk: inner,
                cipher,
                key_file,
            } => layers.push(disk(disk_crypt_resources::DiskCryptHandle {
                disk: disk_open(inner, read_only).await?,
                cipher: match cipher {
                    cli_args::DiskCipher::XtsAes256 => disk_crypt_resources::Cipher::XtsAes256,
                },
                key: fs_err::read(key_file).context("failed to read key file")?,
            })),
            // Sqlite-backed disk, optionally created with the given size.
            DiskCliKind::Sqlite {
                path,
                create_with_len,
            } => {
                // FUTURE: this code should be responsible for opening
                // file-handle(s) itself, and passing them into sqlite via a custom
                // vfs. For now though - simply check if the file exists or not, and
                // perform early validation of filesystem-level create options.
                match (create_with_len.is_some(), path.exists()) {
                    (true, true) => anyhow::bail!(
                        "cannot create new sqlite disk at {} - file already exists",
                        path.display()
                    ),
                    (false, false) => anyhow::bail!(
                        "cannot open sqlite disk at {} - file not found",
                        path.display()
                    ),
                    _ => {}
                }

                layers.push(layer(SqliteDiskLayerHandle {
                    dbhd_path: path.display().to_string(),
                    format_dbhd: create_with_len.map(|len| {
                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
                            logically_read_only: false,
                            len: Some(len),
                        }
                    }),
                }));
            }
            // Writable sqlite diff layer over a read-only inner disk.
            DiskCliKind::SqliteDiff { path, create, disk } => {
                // FUTURE: this code should be responsible for opening
                // file-handle(s) itself, and passing them into sqlite via a custom
                // vfs. For now though - simply check if the file exists or not, and
                // perform early validation of filesystem-level create options.
                match (create, path.exists()) {
                    (true, true) => anyhow::bail!(
                        "cannot create new sqlite disk at {} - file already exists",
                        path.display()
                    ),
                    (false, false) => anyhow::bail!(
                        "cannot open sqlite disk at {} - file not found",
                        path.display()
                    ),
                    _ => {}
                }

                layers.push(layer(SqliteDiskLayerHandle {
                    dbhd_path: path.display().to_string(),
                    format_dbhd: create.then_some(
                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
                            logically_read_only: false,
                            len: None,
                        },
                    ),
                }));
                disk_open_inner(disk, true, layers).await?;
            }
            // Sqlite-backed auto read cache layered over the inner disk.
            DiskCliKind::AutoCacheSqlite {
                cache_path,
                key,
                disk,
            } => {
                layers.push(LayerOrDisk::Layer(DiskLayerDescription {
                    read_cache: true,
                    write_through: false,
                    layer: SqliteAutoCacheDiskLayerHandle {
                        cache_path: cache_path.clone(),
                        cache_key: key.clone(),
                    }
                    .into_resource(),
                }));
                disk_open_inner(disk, read_only, layers).await?;
            }
        }
        Ok(())
    })
}
2063
/// Get the system page size.
///
/// Truncates `usize` to `u32`; assumes the page size fits in 32 bits, which
/// holds for the supported targets.
pub(crate) fn system_page_size() -> u32 {
    sparse_mmap::SparseMapping::page_size() as u32
}
2068
/// The guest architecture string, derived from the compile-time `guest_arch` cfg.
///
/// Only two architectures are distinguished here: anything that is not
/// x86_64 is reported as aarch64.
pub(crate) const GUEST_ARCH: &str = if cfg!(guest_arch = "x86_64") {
    "x86_64"
} else {
    "aarch64"
};
2075
2076/// Open a snapshot directory and validate it against the current VM config.
2077/// Returns the shared memory fd (from memory.bin) and the saved device state.
2078fn prepare_snapshot_restore(
2079    snapshot_dir: &Path,
2080    opt: &Options,
2081) -> anyhow::Result<(
2082    openvmm_defs::worker::SharedMemoryFd,
2083    mesh::payload::message::ProtobufMessage,
2084)> {
2085    let (manifest, state_bytes) = openvmm_helpers::snapshot::read_snapshot(snapshot_dir)?;
2086
2087    // Validate manifest against current VM config.
2088    openvmm_helpers::snapshot::validate_manifest(
2089        &manifest,
2090        GUEST_ARCH,
2091        opt.memory_size(),
2092        opt.processors,
2093        system_page_size(),
2094    )?;
2095
2096    // Open memory.bin (existing file, no create, no resize).
2097    let memory_file = fs_err::OpenOptions::new()
2098        .read(true)
2099        .write(true)
2100        .open(snapshot_dir.join("memory.bin"))?;
2101
2102    // Validate file size matches expected memory size.
2103    let file_size = memory_file.metadata()?.len();
2104    if file_size != manifest.memory_size_bytes {
2105        anyhow::bail!(
2106            "memory.bin size ({file_size} bytes) doesn't match manifest ({} bytes)",
2107            manifest.memory_size_bytes,
2108        );
2109    }
2110
2111    let shared_memory_fd =
2112        openvmm_helpers::shared_memory::file_to_shared_memory_fd(memory_file.into())?;
2113
2114    // Reconstruct ProtobufMessage from the saved state bytes.
2115    // The save side wrote mesh::payload::encode(ProtobufMessage), so we decode
2116    // back to ProtobufMessage.
2117    let state_msg: mesh::payload::message::ProtobufMessage = mesh::payload::decode(&state_bytes)
2118        .context("failed to decode saved state from snapshot")?;
2119
2120    Ok((shared_memory_fd, state_msg))
2121}
2122
/// Main entry logic: initializes tracing, dispatches to worker-host mode or
/// one of the special CLI modes, and otherwise runs the interactive control
/// loop on the default executor.
///
/// If a pidfile is written, its path is handed back via `pidfile_path` so the
/// caller can clean it up.
fn do_main(pidfile_path: &mut Option<PathBuf>) -> anyhow::Result<()> {
    #[cfg(windows)]
    pal::windows::disable_hard_error_dialog();

    tracing_init::enable_tracing()?;

    // Try to run as a worker host.
    // On success the worker runs to completion and then exits the process (does
    // not return). Any worker host setup errors are returned and bubbled up.
    meshworker::run_vmm_mesh_host()?;

    let opt = Options::parse();
    // Special mode: dump the saved-state protobuf descriptors and exit.
    if let Some(path) = &opt.write_saved_state_proto {
        mesh::payload::protofile::DescriptorWriter::new(vmcore::save_restore::saved_state_roots())
            .write_to_path(path)
            .context("failed to write protobuf descriptors")?;
        return Ok(());
    }

    // Write a pidfile if requested, reporting the path back to the caller.
    if let Some(ref path) = opt.pidfile {
        std::fs::write(path, format!("{}\n", std::process::id()))
            .context("failed to write pidfile")?;
        *pidfile_path = Some(path.clone());
    }

    // Special mode: act as a console relay and exit.
    if let Some(path) = opt.relay_console_path {
        let console_title = opt.relay_console_title.unwrap_or_default();
        return console_relay::relay_console(&path, console_title.as_str());
    }

    // Special mode: serve the management API over ttrpc or grpc on a Unix
    // socket instead of running the interactive control loop.
    #[cfg(any(feature = "grpc", feature = "ttrpc"))]
    if let Some(path) = opt.ttrpc.as_ref().or(opt.grpc.as_ref()) {
        return block_on(async {
            // Remove any stale socket left behind by a previous run.
            let _ = std::fs::remove_file(path);
            let listener =
                unix_socket::UnixListener::bind(path).context("failed to bind to socket")?;

            let transport = if opt.ttrpc.is_some() {
                ttrpc::RpcTransport::Ttrpc
            } else {
                ttrpc::RpcTransport::Grpc
            };

            // This is a local launch
            let mut handle =
                mesh_worker::launch_local_worker::<ttrpc::TtrpcWorker>(ttrpc::Parameters {
                    listener,
                    transport,
                })
                .await?;

            tracing::info!(%transport, path = %path.display(), "listening");

            // Signal the parent process that the server is ready.
            pal::close_stdout().context("failed to close stdout")?;

            handle.join().await?;

            Ok(())
        });
    }

    DefaultPool::run_with(async |driver| run_control(&driver, opt).await)
}
2187
2188fn new_hvsock_service_id(port: u32) -> Guid {
2189    // This GUID is an embedding of the AF_VSOCK port into an
2190    // AF_HYPERV service ID.
2191    Guid {
2192        data1: port,
2193        .."00000000-facb-11e6-bd58-64006a7986d3".parse().unwrap()
2194    }
2195}
2196
2197async fn run_control(driver: &DefaultDriver, opt: Options) -> anyhow::Result<()> {
2198    let mut mesh = Some(VmmMesh::new(&driver, opt.single_process)?);
2199    let result = run_control_inner(driver, &mut mesh, opt).await;
2200    // If setup failed before the mesh was handed to the controller, shut it
2201    // down so the child host process exits cleanly without noisy logs.
2202    if let Some(mesh) = mesh {
2203        mesh.shutdown().await;
2204    }
2205    result
2206}
2207
/// Builds the VM configuration from the command line, launches the auxiliary
/// workers (VNC, gdbstub) and the VM worker itself, then runs the interactive
/// REPL until it exits.
///
/// Takes the mesh out of `mesh_slot` when handing it to the
/// [`vm_controller::VmController`]; on earlier failure the mesh is left in
/// the slot for the caller to shut down.
async fn run_control_inner(
    driver: &DefaultDriver,
    mesh_slot: &mut Option<VmmMesh>,
    opt: Options,
) -> anyhow::Result<()> {
    let mesh = mesh_slot.as_ref().unwrap();
    let (mut vm_config, mut resources) = vm_config_from_command_line(driver, mesh, &opt).await?;

    // Spin up the VNC worker when graphics were requested.
    let mut vnc_worker = None;
    if opt.gfx || opt.vnc {
        let listener = TcpListener::bind(format!("127.0.0.1:{}", opt.vnc_port))
            .with_context(|| format!("binding to VNC port {}", opt.vnc_port))?;

        let input_send = vm_config.input.sender();
        let framebuffer = resources
            .framebuffer_access
            .take()
            .expect("synth video enabled");

        let vnc_host = mesh
            .make_host("vnc", None)
            .await
            .context("spawning vnc process failed")?;

        vnc_worker = Some(
            vnc_host
                .launch_worker(
                    vnc_worker_defs::VNC_WORKER_TCP,
                    VncParameters {
                        listener,
                        framebuffer,
                        input_send,
                    },
                )
                .await?,
        )
    }

    // spin up the debug worker
    let gdb_worker = if let Some(port) = opt.gdb {
        let listener = TcpListener::bind(format!("127.0.0.1:{}", port))
            .with_context(|| format!("binding to gdb port {}", port))?;

        let (req_tx, req_rx) = mesh::channel();
        vm_config.debugger_rpc = Some(req_rx);

        let gdb_host = mesh
            .make_host("gdb", None)
            .await
            .context("spawning gdbstub process failed")?;

        Some(
            gdb_host
                .launch_worker(
                    debug_worker_defs::DEBUGGER_WORKER,
                    debug_worker_defs::DebuggerParameters {
                        listener,
                        req_chan: req_tx,
                        vp_count: vm_config.processor_topology.proc_count,
                        target_arch: if cfg!(guest_arch = "x86_64") {
                            debug_worker_defs::TargetArch::X86_64
                        } else {
                            debug_worker_defs::TargetArch::Aarch64
                        },
                    },
                )
                .await
                .context("failed to launch gdbstub worker")?,
        )
    } else {
        None
    };

    // spin up the VM
    let (vm_rpc, rpc_recv) = mesh::channel();
    let (notify_send, notify_recv) = mesh::channel();
    let vm_worker = {
        let vm_host = mesh.make_host("vm", opt.log_file.clone()).await?;

        // When restoring from a snapshot, the shared memory fd and saved
        // state come from the snapshot directory; otherwise, optionally back
        // memory with a user-specified file.
        let (shared_memory, saved_state) = if let Some(snapshot_dir) = &opt.restore_snapshot {
            let (fd, state_msg) = prepare_snapshot_restore(snapshot_dir, &opt)?;
            (Some(fd), Some(state_msg))
        } else {
            let shared_memory = opt
                .memory_backing_file()
                .map(|path| {
                    openvmm_helpers::shared_memory::open_memory_backing_file(
                        path,
                        opt.memory_size(),
                    )
                })
                .transpose()?;
            (shared_memory, None)
        };

        let params = VmWorkerParameters {
            hypervisor: match &opt.hypervisor {
                Some(name) => openvmm_helpers::hypervisor::hypervisor_resource(name)?,
                None => openvmm_helpers::hypervisor::choose_hypervisor()?,
            },
            cfg: vm_config,
            saved_state,
            shared_memory,
            rpc: rpc_recv,
            notify: notify_send,
        };
        vm_host
            .launch_worker(VM_WORKER, params)
            .await
            .context("failed to launch vm worker")?
    };

    if opt.restore_snapshot.is_some() {
        tracing::info!("restoring VM from snapshot");
    }

    // Start running unless the user asked to start paused.
    if !opt.paused {
        vm_rpc.call(VmRpc::Resume, ()).await?;
    }

    // Diagnostics client that dials the paravisor over hvsock on demand.
    let paravisor_diag = Arc::new(diag_client::DiagClient::from_dialer(
        driver.clone(),
        DiagDialer {
            driver: driver.clone(),
            vm_rpc: vm_rpc.clone(),
            openhcl_vtl: if opt.vtl2 {
                DeviceVtl::Vtl2
            } else {
                DeviceVtl::Vtl0
            },
        },
    ));

    let diag_inspector = DiagInspector::new(driver.clone(), paravisor_diag.clone());

    // Create channels between the REPL and VmController.
    let (vm_controller_send, vm_controller_recv) = mesh::channel();
    let (vm_controller_event_send, vm_controller_event_recv) = mesh::channel();

    let has_vtl2 = resources.vtl2_settings.is_some();

    // Build the VmController with exclusive resources.
    let controller = vm_controller::VmController {
        mesh: mesh_slot.take().unwrap(),
        vm_worker,
        vnc_worker,
        gdb_worker,
        diag_inspector: Some(diag_inspector),
        vtl2_settings: resources.vtl2_settings,
        ged_rpc: resources.ged_rpc.clone(),
        vm_rpc: vm_rpc.clone(),
        paravisor_diag: Some(paravisor_diag),
        igvm_path: opt.igvm.clone(),
        memory_backing_file: opt.memory_backing_file().cloned(),
        memory: opt.memory_size(),
        processors: opt.processors,
        log_file: opt.log_file.clone(),
    };

    // Spawn the VmController as a task.
    let controller_task = driver.spawn(
        "vm-controller",
        controller.run(vm_controller_recv, vm_controller_event_send, notify_recv),
    );

    // Run the REPL with shareable resources.
    let repl_result = repl::run_repl(
        driver,
        repl::ReplResources {
            vm_rpc,
            vm_controller: vm_controller_send,
            vm_controller_events: vm_controller_event_recv,
            scsi_rpc: resources.scsi_rpc,
            nvme_vtl2_rpc: resources.nvme_vtl2_rpc,
            shutdown_ic: resources.shutdown_ic,
            kvp_ic: resources.kvp_ic,
            console_in: resources.console_in,
            has_vtl2,
        },
    )
    .await;

    // Wait for the controller task to finish (it stops the VM worker and
    // shuts down the mesh).
    controller_task.await;

    repl_result
}
2396
/// [`mesh_rpc::client::Dial`] implementation that connects to the guest
/// diagnostics server over hvsock via the VM worker.
struct DiagDialer {
    // Driver used to create the polled socket.
    driver: DefaultDriver,
    // Channel to the VM worker for issuing hvsock connect requests.
    vm_rpc: mesh::Sender<VmRpc>,
    // The VTL to dial: VTL2 when the VM runs with VTL2 enabled, else VTL0.
    openhcl_vtl: DeviceVtl,
}
2402
2403impl mesh_rpc::client::Dial for DiagDialer {
2404    type Stream = PolledSocket<unix_socket::UnixStream>;
2405
2406    async fn dial(&mut self) -> io::Result<Self::Stream> {
2407        let service_id = new_hvsock_service_id(1);
2408        let socket = self
2409            .vm_rpc
2410            .call_failable(
2411                VmRpc::ConnectHvsock,
2412                (
2413                    CancelContext::new().with_timeout(Duration::from_secs(2)),
2414                    service_id,
2415                    self.openhcl_vtl,
2416                ),
2417            )
2418            .await
2419            .map_err(io::Error::other)?;
2420
2421        PolledSocket::new(&self.driver, socket)
2422    }
2423}
2424
/// An object that implements [`InspectMut`] by sending an inspect request over
/// TTRPC to the guest (typically the paravisor running in VTL2), then stitching
/// the response back into the inspect tree.
///
/// This also caches the TTRPC connection to the guest so that only the first
/// inspect request has to wait for the connection to be established.
pub(crate) struct DiagInspector(DiagInspectorInner);

/// State machine backing [`DiagInspector`]; the relay task is spawned lazily
/// on the first inspect request.
enum DiagInspectorInner {
    /// No request seen yet; holds what's needed to spawn the relay task.
    NotStarted(DefaultDriver, Arc<diag_client::DiagClient>),
    /// Relay task running; `send` forwards deferred inspect requests to it.
    Started {
        send: mesh::Sender<inspect::Deferred>,
        _task: Task<()>,
    },
    /// Transient state held only while switching from `NotStarted` to
    /// `Started`; observing it outside that window is a bug.
    Invalid,
}
2441
impl DiagInspector {
    /// Creates a new inspector; the relay task is not spawned until the first
    /// inspect request arrives.
    pub fn new(driver: DefaultDriver, diag_client: Arc<diag_client::DiagClient>) -> Self {
        Self(DiagInspectorInner::NotStarted(driver, diag_client))
    }

    /// Returns the sender for deferred inspect requests, spawning the relay
    /// task on first use.
    fn start(&mut self) -> &mesh::Sender<inspect::Deferred> {
        loop {
            match self.0 {
                DiagInspectorInner::NotStarted { .. } => {
                    // Move the driver and client out, leaving `Invalid` in
                    // place only while the task is being spawned.
                    let DiagInspectorInner::NotStarted(driver, client) =
                        std::mem::replace(&mut self.0, DiagInspectorInner::Invalid)
                    else {
                        unreachable!()
                    };
                    let (send, recv) = mesh::channel();
                    let task = driver.clone().spawn("diag-inspect", async move {
                        Self::run(&client, recv).await
                    });

                    self.0 = DiagInspectorInner::Started { send, _task: task };
                    // Loop around to return the sender from the new state.
                }
                DiagInspectorInner::Started { ref send, .. } => break send,
                DiagInspectorInner::Invalid => unreachable!(),
            }
        }
    }

    /// Relay loop: forwards each deferred inspect request to the diagnostics
    /// client and completes the request with the response (or the error).
    async fn run(
        diag_client: &diag_client::DiagClient,
        mut recv: mesh::Receiver<inspect::Deferred>,
    ) {
        while let Some(deferred) = recv.next().await {
            let info = deferred.external_request();
            let result = match info.request_type {
                inspect::ExternalRequestType::Inspect { depth } => {
                    if depth == 0 {
                        // Nothing to fetch at depth 0; report unevaluated.
                        Ok(inspect::Node::Unevaluated)
                    } else {
                        // TODO: Support taking timeouts from the command line
                        diag_client
                            .inspect(info.path, Some(depth - 1), Some(Duration::from_secs(1)))
                            .await
                    }
                }
                inspect::ExternalRequestType::Update { value } => {
                    (diag_client.update(info.path, value).await).map(inspect::Node::Value)
                }
            };
            deferred.complete_external(
                result.unwrap_or_else(|err| {
                    inspect::Node::Failed(inspect::Error::Mesh(format!("{err:#}")))
                }),
                inspect::SensitivityLevel::Unspecified,
            )
        }
    }
}
2499
2500impl InspectMut for DiagInspector {
2501    fn inspect_mut(&mut self, req: inspect::Request<'_>) {
2502        self.start().send(req.defer());
2503    }
2504}