Skip to main content

openvmm_entry/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! This module implements the interactive control process and the entry point
5//! for the worker process.
6
7#![expect(missing_docs)]
8#![forbid(unsafe_code)]
9
10mod cli_args;
11mod crash_dump;
12mod kvp;
13mod meshworker;
14mod pidfile;
15mod repl;
16mod serial_io;
17mod storage_builder;
18mod tracing_init;
19mod ttrpc;
20mod vm_controller;
21
22// `pub` so that the missing_docs warning fires for options without
23// documentation.
24pub use cli_args::Options;
25use console_relay::ConsoleLaunchOptions;
26
27use crate::cli_args::SecureBootTemplateCli;
28use anyhow::Context;
29use anyhow::bail;
30use chipset_resources::battery::HostBatteryUpdate;
31use cli_args::DiskCliKind;
32use cli_args::EfiDiagnosticsLogLevelCli;
33use cli_args::EndpointConfigCli;
34use cli_args::GuestPowerAction;
35use cli_args::NicConfigCli;
36use cli_args::ProvisionVmgs;
37use cli_args::SerialConfigCli;
38use cli_args::UefiConsoleModeCli;
39use cli_args::VirtioBusCli;
40use cli_args::VmgsCli;
41use crash_dump::spawn_dump_handler;
42use cxl_spec::test::CxlTestDeviceHandle;
43use disk_backend_resources::DelayDiskHandle;
44use disk_backend_resources::DiskLayerDescription;
45use disk_backend_resources::layer::DiskLayerHandle;
46use disk_backend_resources::layer::RamDiskLayerHandle;
47use disk_backend_resources::layer::SqliteAutoCacheDiskLayerHandle;
48use disk_backend_resources::layer::SqliteDiskLayerHandle;
49use floppy_resources::FloppyDiskConfig;
50use framebuffer::FRAMEBUFFER_SIZE;
51use framebuffer::FramebufferAccess;
52use futures::AsyncReadExt;
53use futures::AsyncWrite;
54use futures::StreamExt;
55use futures::executor::block_on;
56use futures::io::AllowStdIo;
57use gdma_resources::GdmaDeviceHandle;
58use gdma_resources::VportDefinition;
59use guid::Guid;
60use input_core::MultiplexedInputHandle;
61use inspect::InspectMut;
62use io::Read;
63use mesh::CancelContext;
64use mesh::CellUpdater;
65use mesh::rpc::RpcSend;
66use meshworker::VmmMesh;
67use net_backend_resources::mac_address::MacAddress;
68use nvme_resources::NvmeControllerRequest;
69use openvmm_defs::config::Config;
70use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER;
71use openvmm_defs::config::DeviceVtl;
72use openvmm_defs::config::EfiDiagnosticsLogLevelType;
73use openvmm_defs::config::HypervisorConfig;
74use openvmm_defs::config::LateMapVtl0MemoryPolicy;
75use openvmm_defs::config::LoadMode;
76use openvmm_defs::config::MemoryConfig;
77use openvmm_defs::config::NumaDistance;
78use openvmm_defs::config::NumaNode;
79use openvmm_defs::config::NumaTopology;
80use openvmm_defs::config::PcieDeviceConfig;
81use openvmm_defs::config::PcieMmioRangeConfig;
82use openvmm_defs::config::PciePortConfig;
83use openvmm_defs::config::PcieRootComplexConfig;
84use openvmm_defs::config::PcieSwitchConfig;
85use openvmm_defs::config::ProcessorTopologyConfig;
86use openvmm_defs::config::RootComplexCxlConfig;
87use openvmm_defs::config::SerialInformation;
88use openvmm_defs::config::VirtioBus;
89use openvmm_defs::config::VmbusConfig;
90use openvmm_defs::config::VpAssignment;
91use openvmm_defs::config::VpciDeviceConfig;
92use openvmm_defs::config::Vtl2Config;
93use openvmm_defs::rpc::VmRpc;
94use openvmm_defs::worker::VM_WORKER;
95use openvmm_defs::worker::VmWorkerParameters;
96use openvmm_helpers::disk::OpenDiskOptions;
97use openvmm_helpers::disk::create_disk_type;
98use openvmm_helpers::disk::open_disk_type;
99use pal_async::DefaultDriver;
100use pal_async::DefaultPool;
101use pal_async::socket::PolledSocket;
102use pal_async::task::Spawn;
103use pal_async::task::Task;
104use serial_16550_resources::ComPort;
105use serial_core::resources::DisconnectedSerialBackendHandle;
106use sparse_mmap::alloc_shared_memory;
107use std::cell::RefCell;
108use std::collections::BTreeMap;
109use std::fmt::Write as _;
110use std::future::pending;
111use std::io;
112#[cfg(unix)]
113use std::io::IsTerminal;
114use std::io::Write;
115use std::net::TcpListener;
116use std::path::Path;
117use std::path::PathBuf;
118use std::sync::Arc;
119use std::thread;
120use std::time::Duration;
121use storvsp_resources::ScsiControllerRequest;
122use tpm_resources::TpmDeviceHandle;
123use tpm_resources::TpmRegisterLayout;
124use uidevices_resources::SynthKeyboardHandle;
125use uidevices_resources::SynthMouseHandle;
126use uidevices_resources::SynthVideoHandle;
127use video_core::SharedFramebufferHandle;
128use virtio_resources::VirtioPciDeviceHandle;
129use vm_manifest_builder::BaseChipsetType;
130use vm_manifest_builder::MachineArch;
131use vm_manifest_builder::VmChipsetResult;
132use vm_manifest_builder::VmManifestBuilder;
133use vm_resource::IntoResource;
134use vm_resource::Resource;
135use vm_resource::kind::DiskHandleKind;
136use vm_resource::kind::DiskLayerHandleKind;
137use vm_resource::kind::NetEndpointHandleKind;
138use vm_resource::kind::VirtioDeviceHandle;
139use vm_resource::kind::VmbusDeviceHandleKind;
140use vmbus_serial_resources::VmbusSerialDeviceHandle;
141use vmbus_serial_resources::VmbusSerialPort;
142use vmcore::non_volatile_store::resources::EphemeralNonVolatileStoreHandle;
143use vmgs_resources::GuestStateEncryptionPolicy;
144use vmgs_resources::VmgsDisk;
145use vmgs_resources::VmgsFileHandle;
146use vmgs_resources::VmgsResource;
147use vmotherboard::ChipsetDeviceHandle;
148use vnc_worker_defs::VncParameters;
149
150pub fn openvmm_main() {
151    // Save the current state of the terminal so we can restore it back to
152    // normal before exiting.
153    #[cfg(unix)]
154    let orig_termios = io::stderr().is_terminal().then(term::get_termios);
155
156    let mut pidfile_guard: Option<pidfile::Pidfile> = None;
157    let exit_code = match do_main(&mut pidfile_guard) {
158        Ok(code) => code,
159        Err(err) => {
160            eprintln!("fatal error: {:?}", err);
161            1
162        }
163    };
164
165    // Restore the terminal to its initial state.
166    #[cfg(unix)]
167    if let Some(orig_termios) = orig_termios {
168        term::set_termios(orig_termios);
169    }
170
171    // Clean up the pidfile before terminating, since
172    // pal::process::terminate skips destructors.
173    drop(pidfile_guard);
174
175    // Terminate the process immediately without graceful shutdown of DLLs or
176    // C++ destructors or anything like that. This is all unnecessary and saves
177    // time on Windows.
178    //
179    // Do flush stdout, though, since there may be buffered data.
180    let _ = io::stdout().flush();
181    pal::process::terminate(exit_code);
182}
183
/// Host-side handles gathered while a VM configuration is being built.
///
/// `vm_config_from_command_line` starts from `VmResources::default()` and
/// returns the filled-in value alongside the `Config`. Every field is
/// optional (or an empty `Vec`) until the corresponding device is configured.
184#[derive(Default)]
185struct VmResources {
    /// Write half of the serial pair backing the interactive console; set when
    /// a serial port is configured as `SerialConfigCli::Console`.
186    console_in: Option<Box<dyn AsyncWrite + Send + Unpin>>,
    /// Framebuffer access handle. NOTE(review): populated outside the code
    /// visible here — confirm where it is set and who consumes it.
187    framebuffer_access: Option<FramebufferAccess>,
    /// Sender for `ShutdownRpc` messages (presumably to the shutdown IC
    /// device — confirm at the site that populates it).
188    shutdown_ic: Option<mesh::Sender<hyperv_ic_resources::shutdown::ShutdownRpc>>,
    /// Sender for `KvpConnectRpc` messages (presumably to the KVP IC device —
    /// confirm at the site that populates it).
189    kvp_ic: Option<mesh::Sender<hyperv_ic_resources::kvp::KvpConnectRpc>>,
    /// Sender for `ScsiControllerRequest` messages.
190    scsi_rpc: Option<mesh::Sender<ScsiControllerRequest>>,
    /// Sender for `NvmeControllerRequest` messages; by its name this targets a
    /// VTL2 NVMe controller — TODO confirm.
191    nvme_vtl2_rpc: Option<mesh::Sender<NvmeControllerRequest>>,
    /// Sender for `ConsommeRequest` messages to the consomme network backend.
192    consomme_rpc: Option<mesh::Sender<net_backend_resources::consomme::ConsommeRequest>>,
    /// Sender for `GuestEmulationRequest` messages.
193    ged_rpc: Option<mesh::Sender<get_resources::ged::GuestEmulationRequest>>,
    /// VTL2 settings, when any were produced during configuration.
194    vtl2_settings: Option<vtl2_settings_proto::Vtl2Settings>,
195    /// Receives dirty rectangles from the synthetic video device for the VNC worker.
196    dirty_rect_recv: Option<mesh::Receiver<Vec<video_core::DirtyRect>>>,
    /// Switch ports created for the kernel VM NICs, one pushed per
    /// `opt.kernel_vmnic` entry. Windows only.
197    #[cfg(windows)]
198    switch_ports: Vec<vmswitch::kernel::SwitchPort>,
199}
200
/// Records which serial device has claimed the interactive console.
///
/// At most one serial port may use `SerialConfigCli::Console`; the serial
/// setup code stores this state when the first one is configured and fails
/// with "console already set by ..." if a second one asks for it.
201struct ConsoleState<'a> {
    /// Guest-visible device name that owns the console (e.g. "ttyS0" or
    /// "ttyAMA0"); used in the duplicate-console error message.
202    device: &'a str,
    /// Write half of the console's serial pair; later moved into
    /// `VmResources::console_in`.
203    input: Box<dyn AsyncWrite + Unpin + Send>,
204}
205
206/// Build a flat list of switches with their parent port assignments.
207///
208/// This function converts hierarchical CLI switch definitions into a flat list
209/// where each switch specifies its parent port directly.
210fn build_switch_list(all_switches: &[cli_args::GenericPcieSwitchCli]) -> Vec<PcieSwitchConfig> {
211    all_switches
212        .iter()
213        .map(|switch_cli| PcieSwitchConfig {
214            name: switch_cli.name.clone(),
215            parent_port: switch_cli.port_name.clone(),
216            ports: (0..switch_cli.num_downstream_ports)
217                .map(|i| PciePortConfig {
218                    name: format!("{}-downstream-{}", switch_cli.name, i),
219                    devfn: None,
220                    hotplug: switch_cli.hotplug,
221                    acs_capabilities_supported: switch_cli.acs_capabilities_supported,
222                    cxl: false,
223                })
224                .collect(),
225        })
226        .collect()
227}
228
229async fn vm_config_from_command_line(
230    spawner: impl Spawn,
231    mesh: &VmmMesh,
232    opt: &Options,
233) -> anyhow::Result<(Config, VmResources)> {
234    let (_, serial_driver) = DefaultPool::spawn_on_thread("serial");
235    // Ensure the serial driver stays alive with no tasks.
236    serial_driver.spawn("leak", pending::<()>()).detach();
237
238    let openhcl_vtl = if opt.vtl2 {
239        DeviceVtl::Vtl2
240    } else {
241        DeviceVtl::Vtl0
242    };
243
244    let console_state: RefCell<Option<ConsoleState<'_>>> = RefCell::new(None);
245    let setup_serial = |name: &str, cli_cfg, device| -> anyhow::Result<_> {
246        Ok(match cli_cfg {
247            SerialConfigCli::Console => {
248                if let Some(console_state) = console_state.borrow().as_ref() {
249                    bail!("console already set by {}", console_state.device);
250                }
251                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
252                let (serial_read, serial_write) = AsyncReadExt::split(serial);
253                *console_state.borrow_mut() = Some(ConsoleState {
254                    device,
255                    input: Box::new(serial_write),
256                });
257                thread::Builder::new()
258                    .name(name.to_owned())
259                    .spawn(move || {
260                        let _ = block_on(futures::io::copy(
261                            serial_read,
262                            &mut AllowStdIo::new(term::raw_stdout()),
263                        ));
264                    })
265                    .unwrap();
266                Some(config)
267            }
268            SerialConfigCli::Stderr => {
269                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
270                thread::Builder::new()
271                    .name(name.to_owned())
272                    .spawn(move || {
273                        let _ = block_on(futures::io::copy(
274                            serial,
275                            &mut AllowStdIo::new(term::raw_stderr()),
276                        ));
277                    })
278                    .unwrap();
279                Some(config)
280            }
281            SerialConfigCli::File(path) => {
282                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
283                let file = fs_err::File::create(path).context("failed to create file")?;
284
285                thread::Builder::new()
286                    .name(name.to_owned())
287                    .spawn(move || {
288                        let _ = block_on(futures::io::copy(serial, &mut AllowStdIo::new(file)));
289                    })
290                    .unwrap();
291                Some(config)
292            }
293            SerialConfigCli::None => None,
294            SerialConfigCli::Pipe(path) => {
295                Some(serial_io::bind_serial(&path).context("failed to bind serial")?)
296            }
297            SerialConfigCli::Tcp(addr) => {
298                Some(serial_io::bind_tcp_serial(&addr).context("failed to bind serial")?)
299            }
300            SerialConfigCli::NewConsole(app, window_title) => {
301                let path = console_relay::random_console_path();
302                let config =
303                    serial_io::bind_serial(&path).context("failed to bind console serial")?;
304                let window_title =
305                    window_title.unwrap_or_else(|| name.to_uppercase() + " [OpenVMM]");
306
307                console_relay::launch_console(
308                    app.or_else(openvmm_terminal_app).as_deref(),
309                    &path,
310                    ConsoleLaunchOptions {
311                        window_title: Some(window_title),
312                    },
313                )
314                .context("failed to launch console")?;
315
316                Some(config)
317            }
318        })
319    };
320
321    let mut vmbus_devices = Vec::new();
322
323    let serial0_cfg = setup_serial(
324        "com1",
325        opt.com1.clone().unwrap_or(SerialConfigCli::Console),
326        if cfg!(guest_arch = "x86_64") {
327            "ttyS0"
328        } else {
329            "ttyAMA0"
330        },
331    )?;
332    let serial1_cfg = setup_serial(
333        "com2",
334        opt.com2.clone().unwrap_or(SerialConfigCli::None),
335        if cfg!(guest_arch = "x86_64") {
336            "ttyS1"
337        } else {
338            "ttyAMA1"
339        },
340    )?;
341    let serial2_cfg = setup_serial(
342        "com3",
343        opt.com3.clone().unwrap_or(SerialConfigCli::None),
344        if cfg!(guest_arch = "x86_64") {
345            "ttyS2"
346        } else {
347            "ttyAMA2"
348        },
349    )?;
350    let serial3_cfg = setup_serial(
351        "com4",
352        opt.com4.clone().unwrap_or(SerialConfigCli::None),
353        if cfg!(guest_arch = "x86_64") {
354            "ttyS3"
355        } else {
356            "ttyAMA3"
357        },
358    )?;
359    let with_vmbus_com1_serial = if let Some(vmbus_com1_cfg) = setup_serial(
360        "vmbus_com1",
361        opt.vmbus_com1_serial
362            .clone()
363            .unwrap_or(SerialConfigCli::None),
364        "vmbus_com1",
365    )? {
366        vmbus_devices.push((
367            openhcl_vtl,
368            VmbusSerialDeviceHandle {
369                port: VmbusSerialPort::Com1,
370                backend: vmbus_com1_cfg,
371            }
372            .into_resource(),
373        ));
374        true
375    } else {
376        false
377    };
378    let with_vmbus_com2_serial = if let Some(vmbus_com2_cfg) = setup_serial(
379        "vmbus_com2",
380        opt.vmbus_com2_serial
381            .clone()
382            .unwrap_or(SerialConfigCli::None),
383        "vmbus_com2",
384    )? {
385        vmbus_devices.push((
386            openhcl_vtl,
387            VmbusSerialDeviceHandle {
388                port: VmbusSerialPort::Com2,
389                backend: vmbus_com2_cfg,
390            }
391            .into_resource(),
392        ));
393        true
394    } else {
395        false
396    };
397    let debugcon_cfg = setup_serial(
398        "debugcon",
399        opt.debugcon
400            .clone()
401            .map(|cfg| cfg.serial)
402            .unwrap_or(SerialConfigCli::None),
403        "debugcon",
404    )?;
405
406    let virtio_console_backend = if let Some(serial_cfg) = opt.virtio_console.clone() {
407        setup_serial("virtio-console", serial_cfg, "hvc0")?
408    } else {
409        None
410    };
411
412    let mut resources = VmResources::default();
413    let mut console_str = "";
414    if let Some(ConsoleState { device, input }) = console_state.into_inner() {
415        resources.console_in = Some(input);
416        console_str = device;
417    }
418
419    if opt.shared_memory {
420        tracing::warn!("--shared-memory/-M flag has no effect and will be removed");
421    }
422    if opt.deprecated_prefetch {
423        tracing::warn!("--prefetch is deprecated; use --memory prefetch=on");
424    }
425    if opt.deprecated_private_memory {
426        tracing::warn!("--private-memory is deprecated; use --memory shared=off");
427    }
428    if opt.deprecated_thp {
429        tracing::warn!("--thp is deprecated; use --memory shared=off,thp=on");
430    }
431    if opt.deprecated_memory_backing_file.is_some() {
432        tracing::warn!("--memory-backing-file is deprecated; use --memory file=<path>");
433    }
434
435    opt.validate_memory_options()?;
436
437    const MAX_PROCESSOR_COUNT: u32 = 1024;
438
439    if opt.processors == 0 || opt.processors > MAX_PROCESSOR_COUNT {
440        bail!("invalid proc count: {}", opt.processors);
441    }
442
443    // Total SCSI channel count should not exceed the processor count
444    // (at most, one channel per VP).
445    if opt.scsi_sub_channels > (MAX_PROCESSOR_COUNT - 1) as u16 {
446        bail!(
447            "invalid SCSI sub-channel count: requested {}, max {}",
448            opt.scsi_sub_channels,
449            MAX_PROCESSOR_COUNT - 1
450        );
451    }
452
453    let with_get = opt.get || (opt.vtl2 && !opt.no_get);
454
455    let mut storage = storage_builder::StorageBuilder::new(with_get.then_some(openhcl_vtl));
456
457    // Register named controllers first, so that --disk on=<name>
458    // references can be resolved.
459    for ctrl in &opt.nvme_pci {
460        let transport = match &ctrl.transport {
461            cli_args::NvmeControllerTransport::Pcie(port) => {
462                storage_builder::NvmeControllerTransport::Pcie(port.clone())
463            }
464            cli_args::NvmeControllerTransport::Vpci(guid) => {
465                let guid = guid.unwrap_or_else(|| storage_builder::deterministic_guid(&ctrl.id));
466                storage_builder::NvmeControllerTransport::Vpci(guid)
467            }
468        };
469        storage.add_nvme_controller(ctrl.id.clone(), ctrl.vtl, transport, None)?;
470    }
471
472    for ctrl in &opt.vmbus_scsi {
473        let instance_id = storage_builder::deterministic_guid(&ctrl.id);
474        storage.add_scsi_controller(ctrl.id.clone(), ctrl.vtl, instance_id, ctrl.sub_channels)?;
475    }
476
477    for ctrl in &opt.openhcl_controller {
478        let controller_type = match ctrl.controller_type {
479            cli_args::OpenhclControllerType::Scsi => storage_builder::OpenhclControllerType::Scsi,
480            cli_args::OpenhclControllerType::Nvme => storage_builder::OpenhclControllerType::Nvme,
481        };
482        let instance_id = ctrl
483            .guid
484            .unwrap_or_else(|| storage_builder::deterministic_guid(&ctrl.id));
485        storage.add_openhcl_controller(ctrl.id.clone(), controller_type, instance_id)?;
486    }
487
488    for &cli_args::DiskCli {
489        vtl,
490        ref kind,
491        read_only,
492        is_dvd,
493        underhill,
494        ref pcie_port,
495        ref controller,
496        nsid,
497        lun,
498        ref relay,
499    } in &opt.disk
500    {
501        if controller.is_none() && underhill.is_none() && relay.is_none() {
502            tracing::warn!(
503                "--disk without `on` is deprecated; \
504                 use --vmbus-scsi and --disk on=<name> instead"
505            );
506        }
507
508        let relay_target = relay
509            .as_ref()
510            .map(|(name, loc)| storage_builder::RelayTarget {
511                controller: name.clone(),
512                location: *loc,
513            });
514
515        let target = if let Some(name) = controller {
516            if pcie_port.is_some() {
517                anyhow::bail!("`on` is incompatible with `pcie_port` on `--disk`");
518            }
519            storage_builder::DiskLocation::Named {
520                controller: name.clone(),
521                nsid,
522                lun,
523            }
524        } else if pcie_port.is_some() {
525            anyhow::bail!("`--disk` is incompatible with `pcie_port` without `controller`");
526        } else {
527            if opt.no_vmbus {
528                anyhow::bail!(
529                    "`--disk` without `on=` attaches to the default VMBus SCSI controller and \
530                     cannot be used with `--no-vmbus`; use `on=<name>` to attach to a named controller"
531                );
532            }
533            storage_builder::DiskLocation::Scsi(None)
534        };
535
536        storage
537            .add(
538                vtl,
539                underhill,
540                relay_target,
541                target,
542                kind,
543                is_dvd,
544                read_only,
545            )
546            .await?;
547    }
548
549    for &cli_args::IdeDiskCli {
550        ref kind,
551        read_only,
552        channel,
553        device,
554        is_dvd,
555    } in &opt.ide
556    {
557        storage
558            .add(
559                DeviceVtl::Vtl0,
560                None,
561                None,
562                storage_builder::DiskLocation::Ide(channel, device),
563                kind,
564                is_dvd,
565                read_only,
566            )
567            .await?;
568    }
569
570    if !opt.nvme.is_empty() {
571        tracing::warn!("--nvme is deprecated; use --nvme-pci and --disk on=<name> instead");
572
573        // Pre-register implicit PCIe controllers for unique port names.
574        let mut registered_ports = std::collections::BTreeSet::new();
575        for disk in &opt.nvme {
576            if let Some(port) = &disk.pcie_port {
577                if registered_ports.insert(port.clone()) {
578                    storage.add_nvme_controller(
579                        port.clone(),
580                        DeviceVtl::Vtl0,
581                        storage_builder::NvmeControllerTransport::Pcie(port.clone()),
582                        None,
583                    ).with_context(|| format!(
584                        "legacy --nvme flag conflicts with an explicit controller named '{port}'; \
585                         use --nvme-pci and --disk on=<name> instead"
586                    ))?;
587                }
588            }
589        }
590    }
591
592    for &cli_args::DiskCli {
593        vtl,
594        ref kind,
595        read_only,
596        is_dvd,
597        underhill,
598        ref pcie_port,
599        controller: _,
600        nsid: _,
601        lun: _,
602        relay: _,
603    } in &opt.nvme
604    {
605        let target = if let Some(port) = pcie_port {
606            storage_builder::DiskLocation::Named {
607                controller: port.clone(),
608                nsid: None,
609                lun: None,
610            }
611        } else {
612            storage_builder::DiskLocation::Nvme(None)
613        };
614        storage
615            .add(vtl, underhill, None, target, kind, is_dvd, read_only)
616            .await?;
617    }
618
619    for &cli_args::DiskCli {
620        vtl,
621        ref kind,
622        read_only,
623        is_dvd,
624        ref underhill,
625        ref pcie_port,
626        controller: _,
627        nsid: _,
628        lun: _,
629        relay: _,
630    } in &opt.virtio_blk
631    {
632        if underhill.is_some() {
633            anyhow::bail!("underhill not supported with virtio-blk");
634        }
635        storage
636            .add(
637                vtl,
638                None,
639                None,
640                storage_builder::DiskLocation::VirtioBlk(pcie_port.clone()),
641                kind,
642                is_dvd,
643                read_only,
644            )
645            .await?;
646    }
647
648    let mut floppy_disks = Vec::new();
649    for disk in &opt.floppy {
650        let &cli_args::FloppyDiskCli {
651            ref kind,
652            read_only,
653        } = disk;
654        floppy_disks.push(FloppyDiskConfig {
655            disk_type: disk_open(kind, read_only).await?,
656            read_only,
657        });
658    }
659
660    let mut vpci_mana_nics = [(); 3].map(|()| None);
661    let mut pcie_mana_nics = BTreeMap::<String, GdmaDeviceHandle>::new();
662    let mut underhill_nics = Vec::new();
663    let mut vpci_devices = Vec::new();
664
665    let mut nic_index = 0;
666    for cli_cfg in &opt.net {
667        if cli_cfg.pcie_port.is_some() {
668            anyhow::bail!("`--net` does not support PCIe");
669        }
670        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
671        if cli_cfg.underhill {
672            if !opt.no_alias_map {
673                anyhow::bail!("must specify --no-alias-map to offer NICs to VTL2");
674            }
675            let mana = vpci_mana_nics[openhcl_vtl as usize].get_or_insert_with(|| {
676                let vpci_instance_id = Guid::new_random();
677                underhill_nics.push(vtl2_settings_proto::NicDeviceLegacy {
678                    instance_id: vpci_instance_id.to_string(),
679                    subordinate_instance_id: None,
680                    max_sub_channels: None,
681                });
682                (vpci_instance_id, GdmaDeviceHandle { vports: Vec::new() })
683            });
684            mana.1.vports.push(VportDefinition {
685                mac_address: vport.mac_address,
686                endpoint: vport.endpoint,
687            });
688        } else {
689            vmbus_devices.push(vport.into_netvsp_handle());
690        }
691    }
692
693    if opt.nic {
694        let nic_config = parse_endpoint(
695            &NicConfigCli {
696                vtl: DeviceVtl::Vtl0,
697                endpoint: EndpointConfigCli::Consomme {
698                    cidr: None,
699                    host_fwd: Vec::new(),
700                },
701                max_queues: None,
702                underhill: false,
703                pcie_port: None,
704            },
705            &mut nic_index,
706            &mut resources,
707        )?;
708        vmbus_devices.push(nic_config.into_netvsp_handle());
709    }
710
711    // Build initial PCIe devices list from CLI options. Storage devices
712    // (e.g., NVMe controllers on PCIe ports) are added later by storage_builder.
713    let mut pcie_devices = Vec::new();
714    for (index, cli_cfg) in opt.pcie_remote.iter().enumerate() {
715        tracing::info!(
716            port_name = %cli_cfg.port_name,
717            socket_addr = ?cli_cfg.socket_addr,
718            "instantiating PCIe remote device"
719        );
720
721        // Generate a deterministic instance ID based on index
722        const PCIE_REMOTE_BASE_INSTANCE_ID: Guid =
723            guid::guid!("28ed784d-c059-429f-9d9a-46bea02562c0");
724        let instance_id = Guid {
725            data1: index as u32,
726            ..PCIE_REMOTE_BASE_INSTANCE_ID
727        };
728
729        pcie_devices.push(PcieDeviceConfig {
730            port_name: cli_cfg.port_name.clone(),
731            resource: pcie_remote_resources::PcieRemoteHandle {
732                instance_id,
733                socket_addr: cli_cfg.socket_addr.clone(),
734                hu: cli_cfg.hu,
735                controller: cli_cfg.controller,
736            }
737            .into_resource(),
738        });
739    }
740
741    #[cfg(windows)]
742    let mut kernel_vmnics = Vec::new();
743    #[cfg(windows)]
744    for (index, switch_id) in opt.kernel_vmnic.iter().enumerate() {
745        // Pick a random MAC address.
746        let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
747        getrandom::fill(&mut mac_address[3..]).expect("rng failure");
748
749        // Pick a fixed instance ID based on the index.
750        const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-435d-11ee-9f59-00155d5016fc");
751        let instance_id = Guid {
752            data1: index as u32,
753            ..BASE_INSTANCE_ID
754        };
755
756        let switch_id = if switch_id == "default" {
757            None
758        } else {
759            Some(switch_id.as_str())
760        };
761        let (port_id, port) = new_switch_port(switch_id)?;
762        resources.switch_ports.push(port);
763
764        kernel_vmnics.push(openvmm_defs::config::KernelVmNicConfig {
765            instance_id,
766            mac_address: mac_address.into(),
767            switch_port_id: port_id,
768        });
769    }
770
771    for vport in &opt.mana {
772        let vport = parse_endpoint(vport, &mut nic_index, &mut resources)?;
773        let vport_array = match (vport.vtl as usize, vport.pcie_port) {
774            (vtl, None) => {
775                &mut vpci_mana_nics[vtl]
776                    .get_or_insert_with(|| {
777                        (Guid::new_random(), GdmaDeviceHandle { vports: Vec::new() })
778                    })
779                    .1
780                    .vports
781            }
782            (0, Some(pcie_port)) => {
783                &mut pcie_mana_nics
784                    .entry(pcie_port)
785                    .or_insert(GdmaDeviceHandle { vports: Vec::new() })
786                    .vports
787            }
788            _ => anyhow::bail!("PCIe NICs only supported to VTL0"),
789        };
790        vport_array.push(VportDefinition {
791            mac_address: vport.mac_address,
792            endpoint: vport.endpoint,
793        });
794    }
795
796    vpci_devices.extend(
797        vpci_mana_nics
798            .into_iter()
799            .enumerate()
800            .filter_map(|(vtl, nic)| {
801                nic.map(|(instance_id, handle)| VpciDeviceConfig {
802                    vtl: match vtl {
803                        0 => DeviceVtl::Vtl0,
804                        1 => DeviceVtl::Vtl1,
805                        2 => DeviceVtl::Vtl2,
806                        _ => unreachable!(),
807                    },
808                    instance_id,
809                    resource: handle.into_resource(),
810                    vnode: None,
811                })
812            }),
813    );
814
815    pcie_devices.extend(
816        pcie_mana_nics
817            .into_iter()
818            .map(|(pcie_port, handle)| PcieDeviceConfig {
819                port_name: pcie_port,
820                resource: handle.into_resource(),
821            }),
822    );
823
824    for cxl_test in &opt.cxl_test {
825        pcie_devices.push(PcieDeviceConfig {
826            port_name: cxl_test.pcie_port.clone(),
827            resource: CxlTestDeviceHandle {
828                hdm_size_bytes: cxl_test.hdm_size,
829            }
830            .into_resource(),
831        });
832    }
833
834    #[cfg(guest_arch = "aarch64")]
835    let arch = MachineArch::Aarch64;
836    #[cfg(guest_arch = "x86_64")]
837    let arch = MachineArch::X86_64;
838
839    #[cfg(guest_arch = "x86_64")]
840    anyhow::ensure!(
841        opt.amd_iommu.is_empty() || opt.intel_vtd.is_empty(),
842        "--amd-iommu and --intel-vtd cannot both be used in the same VM"
843    );
844
845    #[cfg(guest_arch = "aarch64")]
846    let mut smmu_names: std::collections::HashSet<&str> =
847        opt.smmu.iter().map(|s| s.as_str()).collect();
848    #[cfg(guest_arch = "x86_64")]
849    let mut amd_iommu_names: std::collections::HashSet<&str> =
850        opt.amd_iommu.iter().map(|s| s.as_str()).collect();
851    #[cfg(guest_arch = "x86_64")]
852    let mut vtd_names: std::collections::HashSet<&str> =
853        opt.intel_vtd.iter().map(|s| s.as_str()).collect();
854
855    let mut pcie_root_complexes = Vec::new();
856    for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() {
857        let ports: Vec<PciePortConfig> = opt
858            .pcie_root_port
859            .iter()
860            .filter(|port_cli| port_cli.root_complex_name == rc_cli.name)
861            .map(|port_cli| PciePortConfig {
862                name: port_cli.name.clone(),
863                devfn: port_cli.devfn,
864                hotplug: port_cli.hotplug,
865                acs_capabilities_supported: port_cli.acs_capabilities_supported,
866                cxl: port_cli.cxl,
867            })
868            .collect();
869
870        const ONE_MB: u64 = 1024 * 1024;
871        // Keep all PCI windows 1MB-granular to match layout and downstream placement rules.
872        let low_mmio_size = (rc_cli.low_mmio as u64).next_multiple_of(ONE_MB);
873        let high_mmio_size = rc_cli
874            .high_mmio
875            .checked_next_multiple_of(ONE_MB)
876            .context("high mmio rounding error")?;
877
878        // Count CXL-capable ports under the root bus. If the root bus has CXL root ports, it needs CHBCR.
879        let cxl_port_count = ports.iter().filter(|port| port.cxl).count() as u64;
880
881        let cxl = if cxl_port_count != 0 {
882            Some(RootComplexCxlConfig {
883                hdm_size: rc_cli.hdm,
884                hdm_window_restrictions: rc_cli.hdm_window_restrictions.bits(),
885            })
886        } else {
887            None
888        };
889        pcie_root_complexes.push(PcieRootComplexConfig {
890            index: i as u32,
891            name: rc_cli.name.clone(),
892            segment: rc_cli.segment,
893            start_bus: rc_cli.start_bus,
894            end_bus: rc_cli.end_bus,
895            low_mmio: if let Some(base) = rc_cli.low_mmio_base {
896                PcieMmioRangeConfig::Fixed(
897                    memory_range::MemoryRange::try_new(base..base.wrapping_add(low_mmio_size))
898                        .context("invalid low MMIO range")?,
899                )
900            } else {
901                PcieMmioRangeConfig::Dynamic {
902                    size: low_mmio_size,
903                }
904            },
905            high_mmio: if let Some(base) = rc_cli.high_mmio_base {
906                PcieMmioRangeConfig::Fixed(
907                    memory_range::MemoryRange::try_new(base..base.wrapping_add(high_mmio_size))
908                        .context("invalid high MMIO range")?,
909                )
910            } else {
911                PcieMmioRangeConfig::Dynamic {
912                    size: high_mmio_size,
913                }
914            },
915            cxl,
916            ports,
917            #[cfg(guest_arch = "aarch64")]
918            iommu: smmu_names
919                .remove(rc_cli.name.as_str())
920                .then_some(openvmm_defs::config::PcieIommuConfig::Smmu),
921            #[cfg(guest_arch = "x86_64")]
922            iommu: if amd_iommu_names.remove(rc_cli.name.as_str()) {
923                Some(openvmm_defs::config::PcieIommuConfig::AmdVi)
924            } else if vtd_names.remove(rc_cli.name.as_str()) {
925                Some(openvmm_defs::config::PcieIommuConfig::IntelVtd)
926            } else {
927                None
928            },
929            vnode: rc_cli.vnode,
930            preserve_bars: rc_cli.preserve_bars,
931        });
932    }
933
934    #[cfg(guest_arch = "aarch64")]
935    if let Some(name) = smmu_names.into_iter().next() {
936        anyhow::bail!("--smmu refers to unknown root complex '{name}'");
937    }
938    #[cfg(guest_arch = "x86_64")]
939    if let Some(name) = amd_iommu_names.into_iter().next() {
940        anyhow::bail!("--amd-iommu refers to unknown root complex '{name}'");
941    }
942    #[cfg(guest_arch = "x86_64")]
943    if let Some(name) = vtd_names.into_iter().next() {
944        anyhow::bail!("--intel-vtd refers to unknown root complex '{name}'");
945    }
946
947    let pcie_switches = build_switch_list(&opt.pcie_switch);
948    let pcie_generic_initiators = opt
949        .pcie_generic_initiator
950        .iter()
951        .map(|gi| openvmm_defs::config::PcieGenericInitiatorConfig {
952            port_name: gi.port_name.clone(),
953            node: gi.node,
954        })
955        .collect();
956    #[cfg(target_os = "linux")]
957    let vfio_pcie_devices: Vec<PcieDeviceConfig> = {
958        use std::collections::HashMap;
959        use vm_resource::IntoResource;
960
961        // Process --iommu flags: open /dev/iommu for each declared context.
962        let mut iommu_map: HashMap<String, std::fs::File> = HashMap::new();
963        for iommu_cli in &opt.iommu {
964            anyhow::ensure!(
965                !iommu_map.contains_key(&iommu_cli.id),
966                "duplicate --iommu id={}",
967                iommu_cli.id
968            );
969            let file = std::fs::OpenOptions::new()
970                .read(true)
971                .write(true)
972                .open("/dev/iommu")
973                .context("failed to open /dev/iommu (is iommufd available?)")?;
974            iommu_map.insert(iommu_cli.id.clone(), file);
975        }
976
977        opt.vfio
978            .iter()
979            .map(|cli_cfg| {
980                let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id);
981
982                if let Some(iommu_id) = &cli_cfg.iommu {
983                    // cdev + iommufd path
984                    let iommufd = iommu_map.get(iommu_id).with_context(|| {
985                        format!(
986                            "--vfio device {} references iommu={iommu_id}, \
987                             but no --iommu id={iommu_id} was specified",
988                            cli_cfg.pci_id
989                        )
990                    })?;
991                    // Clone the iommufd fd so the per-iommu manager can own it.
992                    // The first device for a given iommu ID uses the cloned fd
993                    // to create the IoasManager; subsequent devices reuse the
994                    // existing manager and the cloned fd is dropped.
995                    let iommufd = iommufd.try_clone().with_context(|| {
996                        format!("failed to dup iommufd fd for iommu={iommu_id}")
997                    })?;
998
999                    // Open the cdev device node.
1000                    let vfio_dev_dir = sysfs_path.join("vfio-dev");
1001                    let entry = std::fs::read_dir(&vfio_dev_dir)
1002                        .with_context(|| {
1003                            format!(
1004                                "failed to read {}: is {} bound to vfio-pci?",
1005                                vfio_dev_dir.display(),
1006                                cli_cfg.pci_id
1007                            )
1008                        })?
1009                        .next()
1010                        .context("no vfio-dev entry found")?
1011                        .context("failed to read vfio-dev entry")?;
1012                    let dev_path = Path::new("/dev/vfio/devices").join(entry.file_name());
1013                    let cdev = std::fs::OpenOptions::new()
1014                        .read(true)
1015                        .write(true)
1016                        .open(&dev_path)
1017                        .with_context(|| format!("failed to open {}", dev_path.display()))?;
1018
1019                    Ok(PcieDeviceConfig {
1020                        port_name: cli_cfg.port_name.clone(),
1021                        resource: vfio_assigned_device_resources::VfioCdevDeviceHandle {
1022                            pci_id: cli_cfg.pci_id.clone(),
1023                            cdev,
1024                            iommufd,
1025                            iommu_id: iommu_id.clone(),
1026                            bar_pt: cli_cfg.bar_pt,
1027                        }
1028                        .into_resource(),
1029                    })
1030                } else {
1031                    // Legacy group/container path
1032                    let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group"))
1033                        .with_context(|| {
1034                            format!("failed to read IOMMU group for {}", cli_cfg.pci_id)
1035                        })?;
1036                    let group_id: u64 = iommu_group_link
1037                        .file_name()
1038                        .and_then(|s| s.to_str())
1039                        .context("invalid iommu_group symlink")?
1040                        .parse()
1041                        .context("failed to parse IOMMU group ID")?;
1042                    let group = std::fs::OpenOptions::new()
1043                        .read(true)
1044                        .write(true)
1045                        .open(format!("/dev/vfio/{group_id}"))
1046                        .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?;
1047
1048                    Ok(PcieDeviceConfig {
1049                        port_name: cli_cfg.port_name.clone(),
1050                        resource: vfio_assigned_device_resources::VfioDeviceHandle {
1051                            pci_id: cli_cfg.pci_id.clone(),
1052                            group,
1053                            bar_pt: cli_cfg.bar_pt,
1054                        }
1055                        .into_resource(),
1056                    })
1057                }
1058            })
1059            .collect::<anyhow::Result<Vec<_>>>()?
1060    };
1061
1062    #[cfg(windows)]
1063    let vpci_resources: Vec<_> = opt
1064        .device
1065        .iter()
1066        .map(|path| -> anyhow::Result<_> {
1067            Ok(virt_whp::device::DeviceHandle(
1068                whp::VpciResource::new(
1069                    None,
1070                    Default::default(),
1071                    &whp::VpciResourceDescriptor::Sriov(path, 0, 0),
1072                )
1073                .with_context(|| format!("opening PCI device {}", path))?,
1074            ))
1075        })
1076        .collect::<Result<_, _>>()?;
1077
1078    // Create a vmbusproxy handle if needed by any devices.
1079    #[cfg(windows)]
1080    let vmbusproxy_handle = if !kernel_vmnics.is_empty() {
1081        Some(vmbus_proxy::ProxyHandle::new().context("failed to open vmbusproxy handle")?)
1082    } else {
1083        None
1084    };
1085
1086    let framebuffer = if opt.gfx || opt.vtl2_gfx || opt.vnc.vnc || opt.pcat {
1087        let vram = alloc_shared_memory(FRAMEBUFFER_SIZE, "vram")?;
1088        let (fb, fba) =
1089            framebuffer::framebuffer(vram, FRAMEBUFFER_SIZE, 0).context("creating framebuffer")?;
1090        resources.framebuffer_access = Some(fba);
1091        Some(fb)
1092    } else {
1093        None
1094    };
1095
1096    let load_mode;
1097    let with_hv;
1098
1099    let any_serial_configured = serial0_cfg.is_some()
1100        || serial1_cfg.is_some()
1101        || serial2_cfg.is_some()
1102        || serial3_cfg.is_some();
1103
1104    let has_com3 = serial2_cfg.is_some();
1105
1106    let mut chipset = VmManifestBuilder::new(
1107        if opt.igvm.is_some() {
1108            BaseChipsetType::HclHost
1109        } else if opt.pcat {
1110            BaseChipsetType::HypervGen1
1111        } else if opt.uefi {
1112            BaseChipsetType::HypervGen2Uefi
1113        } else if opt.hv {
1114            BaseChipsetType::HyperVGen2LinuxDirect
1115        } else {
1116            BaseChipsetType::UnenlightenedLinuxDirect
1117        },
1118        arch,
1119    );
1120
1121    if framebuffer.is_some() {
1122        chipset = chipset.with_framebuffer();
1123    }
1124    if opt.guest_watchdog {
1125        chipset = chipset.with_guest_watchdog();
1126    }
1127    if any_serial_configured {
1128        chipset = chipset.with_serial([serial0_cfg, serial1_cfg, serial2_cfg, serial3_cfg]);
1129    }
1130    if opt.battery {
1131        let (tx, rx) = mesh::channel();
1132        tx.send(HostBatteryUpdate::default_present());
1133        chipset = chipset.with_battery(rx);
1134    }
1135    if opt.no_vmbus {
1136        chipset = chipset.without_vmbus();
1137    }
1138    if let Some(cfg) = &opt.debugcon {
1139        chipset = chipset.with_debugcon(
1140            debugcon_cfg.unwrap_or_else(|| DisconnectedSerialBackendHandle.into_resource()),
1141            cfg.port,
1142        );
1143    }
1144
1145    let custom_uefi_vars = {
1146        use firmware_uefi_custom_vars::CustomVars;
1147
1148        // load base vars from specified template, or use an empty set of base
1149        // vars if none was specified.
1150        let base_vars = match opt.secure_boot_template {
1151            Some(template) => match (arch, template) {
1152                (MachineArch::X86_64, SecureBootTemplateCli::Windows) => {
1153                    hyperv_secure_boot_templates::x64::microsoft_windows()
1154                }
1155                (MachineArch::X86_64, SecureBootTemplateCli::UefiCa) => {
1156                    hyperv_secure_boot_templates::x64::microsoft_uefi_ca()
1157                }
1158                (MachineArch::Aarch64, SecureBootTemplateCli::Windows) => {
1159                    hyperv_secure_boot_templates::aarch64::microsoft_windows()
1160                }
1161                (MachineArch::Aarch64, SecureBootTemplateCli::UefiCa) => {
1162                    hyperv_secure_boot_templates::aarch64::microsoft_uefi_ca()
1163                }
1164            },
1165            None => CustomVars::default(),
1166        };
1167
1168        // TODO: fallback to VMGS read if no command line flag was given
1169
1170        let custom_uefi_json_data = match &opt.custom_uefi_json {
1171            Some(file) => Some(fs_err::read(file).context("opening custom uefi json file")?),
1172            None => None,
1173        };
1174
1175        // obtain the final custom uefi vars by applying the delta onto the base vars
1176        match custom_uefi_json_data {
1177            Some(data) => {
1178                let delta = hyperv_uefi_custom_vars_json::load_delta_from_json(&data)?;
1179                base_vars.apply_delta(delta)?
1180            }
1181            None => base_vars,
1182        }
1183    };
1184
1185    let efi_diagnostics_log_level = match opt.efi_diagnostics_log_level.unwrap_or_default() {
1186        EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
1187        EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
1188        EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
1189    };
1190
1191    if opt.uefi {
1192        let log_level = match efi_diagnostics_log_level {
1193            EfiDiagnosticsLogLevelType::Default => {
1194                firmware_uefi_resources::LogLevel::make_default()
1195            }
1196            EfiDiagnosticsLogLevelType::Info => firmware_uefi_resources::LogLevel::make_info(),
1197            EfiDiagnosticsLogLevelType::Full => firmware_uefi_resources::LogLevel::make_full(),
1198        };
1199        let nvram_storage = if opt.vmgs.is_some() {
1200            VmgsFileHandle::new(vmgs_format::FileId::BIOS_NVRAM, true).into_resource()
1201        } else {
1202            EphemeralNonVolatileStoreHandle.into_resource()
1203        };
1204        chipset = chipset.with_uefi(vm_manifest_builder::UefiManifest::new(
1205            arch,
1206            custom_uefi_vars.clone(),
1207            opt.secure_boot,
1208            log_level,
1209            None,
1210            nvram_storage,
1211            None,
1212        ));
1213    }
1214
1215    // TODO: load from VMGS file if it exists
1216    let bios_guid = Guid::new_random();
1217
1218    let layout_config = chipset.layout_config();
1219    let VmChipsetResult {
1220        chipset,
1221        mut chipset_devices,
1222        pci_chipset_devices,
1223        isa_dma_controller,
1224        capabilities,
1225    } = chipset
1226        .build()
1227        .context("failed to build chipset configuration")?;
1228
1229    if opt.restore_snapshot.is_some() {
1230        // Snapshot restore: skip firmware loading entirely. Device state and
1231        // memory come from the snapshot directory.
1232        load_mode = LoadMode::None;
1233        with_hv = true;
1234    } else if let Some(path) = &opt.igvm {
1235        let file = fs_err::File::open(path)
1236            .context("failed to open igvm file")?
1237            .into();
1238        let cmdline = opt.cmdline.join(" ");
1239        with_hv = true;
1240
1241        load_mode = LoadMode::Igvm {
1242            file,
1243            cmdline,
1244            vtl2_base_address: opt.igvm_vtl2_relocation_type,
1245            com_serial: has_com3.then(|| SerialInformation {
1246                io_port: ComPort::Com3.io_port(),
1247                irq: ComPort::Com3.irq().into(),
1248            }),
1249        };
1250    } else if opt.pcat {
1251        // Emit a nice error early instead of complaining about missing firmware.
1252        if arch != MachineArch::X86_64 {
1253            anyhow::bail!("pcat not supported on this architecture");
1254        }
1255        with_hv = true;
1256
1257        let firmware = openvmm_pcat_locator::find_pcat_bios(opt.pcat_firmware.as_deref())?;
1258        load_mode = LoadMode::Pcat {
1259            firmware,
1260            boot_order: opt
1261                .pcat_boot_order
1262                .map(|x| x.0)
1263                .unwrap_or(DEFAULT_PCAT_BOOT_ORDER),
1264        };
1265    } else if opt.uefi {
1266        use openvmm_defs::config::UefiConsoleMode;
1267
1268        with_hv = true;
1269
1270        let firmware = fs_err::File::open(
1271            (opt.uefi_firmware.0)
1272                .as_ref()
1273                .context("must provide uefi firmware when booting with uefi")?,
1274        )
1275        .context("failed to open uefi firmware")?;
1276
1277        // TODO: It would be better to default memory protections to on, but currently Linux does not boot via UEFI due to what
1278        //       appears to be a GRUB memory protection fault. Memory protections are therefore only enabled if configured.
1279        load_mode = LoadMode::Uefi {
1280            firmware: firmware.into(),
1281            enable_debugging: opt.uefi_debug,
1282            enable_memory_protections: opt.uefi_enable_memory_protections,
1283            disable_frontpage: opt.disable_frontpage,
1284            enable_tpm: opt.tpm,
1285            enable_battery: opt.battery,
1286            enable_serial: any_serial_configured,
1287            enable_vpci_boot: false,
1288            uefi_console_mode: opt.uefi_console_mode.map(|m| match m {
1289                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
1290                UefiConsoleModeCli::Com1 => UefiConsoleMode::Com1,
1291                UefiConsoleModeCli::Com2 => UefiConsoleMode::Com2,
1292                UefiConsoleModeCli::None => UefiConsoleMode::None,
1293            }),
1294            default_boot_always_attempt: opt.default_boot_always_attempt,
1295            bios_guid,
1296            enable_vmbus: !opt.no_vmbus,
1297            force_dma_bounce: opt.uefi_force_dma_bounce,
1298        };
1299    } else {
1300        // Linux Direct
1301        let mut cmdline = "panic=-1 debug".to_string();
1302
1303        with_hv = opt.hv;
1304        if with_hv && opt.pcie_root_complex.is_empty() {
1305            cmdline += " pci=off";
1306        }
1307
1308        if !console_str.is_empty() {
1309            let _ = write!(&mut cmdline, " console={}", console_str);
1310        }
1311
1312        if opt.gfx {
1313            cmdline += " console=tty";
1314        }
1315        for extra in &opt.cmdline {
1316            let _ = write!(&mut cmdline, " {}", extra);
1317        }
1318
1319        let kernel = fs_err::File::open(
1320            (opt.kernel.0)
1321                .as_ref()
1322                .context("must provide kernel when booting with linux direct")?,
1323        )
1324        .context("failed to open kernel")?;
1325        let initrd = (opt.initrd.0)
1326            .as_ref()
1327            .map(fs_err::File::open)
1328            .transpose()
1329            .context("failed to open initrd")?;
1330
1331        let custom_dsdt = match &opt.custom_dsdt {
1332            Some(path) => {
1333                let mut v = Vec::new();
1334                fs_err::File::open(path)
1335                    .context("failed to open custom dsdt")?
1336                    .read_to_end(&mut v)
1337                    .context("failed to read custom dsdt")?;
1338                Some(v)
1339            }
1340            None => None,
1341        };
1342
1343        load_mode = LoadMode::Linux {
1344            kernel: kernel.into(),
1345            initrd: initrd.map(Into::into),
1346            cmdline,
1347            custom_dsdt,
1348            enable_serial: any_serial_configured,
1349            boot_mode: if opt.device_tree {
1350                openvmm_defs::config::LinuxDirectBootMode::DeviceTree
1351            } else {
1352                openvmm_defs::config::LinuxDirectBootMode::Acpi
1353            },
1354        };
1355    }
1356
1357    let mut vmgs = Some(if let Some(VmgsCli { kind, provision }) = &opt.vmgs {
1358        let disk = VmgsDisk {
1359            disk: disk_open(kind, false)
1360                .await
1361                .context("failed to open vmgs disk")?,
1362            encryption_policy: if opt.test_gsp_by_id {
1363                GuestStateEncryptionPolicy::GspById(true)
1364            } else {
1365                GuestStateEncryptionPolicy::None(true)
1366            },
1367        };
1368        match provision {
1369            ProvisionVmgs::OnEmpty => VmgsResource::Disk(disk),
1370            ProvisionVmgs::OnFailure => VmgsResource::ReprovisionOnFailure(disk),
1371            ProvisionVmgs::True => VmgsResource::Reprovision(disk),
1372        }
1373    } else {
1374        VmgsResource::Ephemeral
1375    });
1376
1377    if with_get && with_hv {
1378        let has_vtl0_nvme = storage.has_vtl0_nvme();
1379        let vtl2_settings = vtl2_settings_proto::Vtl2Settings {
1380            version: vtl2_settings_proto::vtl2_settings_base::Version::V1.into(),
1381            fixed: Some(Default::default()),
1382            dynamic: Some(vtl2_settings_proto::Vtl2SettingsDynamic {
1383                storage_controllers: storage.build_openhcl_settings(opt.vmbus_redirect),
1384                nic_devices: underhill_nics,
1385            }),
1386            namespace_settings: Vec::default(),
1387        };
1388
1389        // Cache the VTL2 settings for later modification via the interactive console.
1390        resources.vtl2_settings = Some(vtl2_settings.clone());
1391
1392        let (send, guest_request_recv) = mesh::channel();
1393        resources.ged_rpc = Some(send);
1394
1395        let vmgs = vmgs.take().unwrap();
1396
1397        vmbus_devices.extend([
1398            (
1399                openhcl_vtl,
1400                get_resources::gel::GuestEmulationLogHandle.into_resource(),
1401            ),
1402            (
1403                openhcl_vtl,
1404                get_resources::ged::GuestEmulationDeviceHandle {
1405                    firmware: if opt.pcat {
1406                        get_resources::ged::GuestFirmwareConfig::Pcat {
1407                            boot_order: opt
1408                                .pcat_boot_order
1409                                .map_or(DEFAULT_PCAT_BOOT_ORDER, |x| x.0)
1410                                .map(|x| match x {
1411                                    openvmm_defs::config::PcatBootDevice::Floppy => {
1412                                        get_resources::ged::PcatBootDevice::Floppy
1413                                    }
1414                                    openvmm_defs::config::PcatBootDevice::HardDrive => {
1415                                        get_resources::ged::PcatBootDevice::HardDrive
1416                                    }
1417                                    openvmm_defs::config::PcatBootDevice::Optical => {
1418                                        get_resources::ged::PcatBootDevice::Optical
1419                                    }
1420                                    openvmm_defs::config::PcatBootDevice::Network => {
1421                                        get_resources::ged::PcatBootDevice::Network
1422                                    }
1423                                }),
1424                        }
1425                    } else {
1426                        use get_resources::ged::UefiConsoleMode;
1427
1428                        get_resources::ged::GuestFirmwareConfig::Uefi {
1429                            enable_vpci_boot: has_vtl0_nvme,
1430                            firmware_debug: opt.uefi_debug,
1431                            disable_frontpage: opt.disable_frontpage,
1432                            console_mode: match opt.uefi_console_mode.unwrap_or(UefiConsoleModeCli::Default) {
1433                                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
1434                                UefiConsoleModeCli::Com1 => UefiConsoleMode::COM1,
1435                                UefiConsoleModeCli::Com2 => UefiConsoleMode::COM2,
1436                                UefiConsoleModeCli::None => UefiConsoleMode::None,
1437                            },
1438                            default_boot_always_attempt: opt.default_boot_always_attempt,
1439                        }
1440                    },
1441                    com1: with_vmbus_com1_serial,
1442                    com2: with_vmbus_com2_serial,
1443                    serial_tx_only: opt.serial_tx_only,
1444                    vtl2_settings: Some(prost::Message::encode_to_vec(&vtl2_settings)),
1445                    vmbus_redirection: opt.vmbus_redirect,
1446                    vmgs,
1447                    framebuffer: opt
1448                        .vtl2_gfx
1449                        .then(|| SharedFramebufferHandle.into_resource()),
1450                    guest_request_recv,
1451                    enable_tpm: opt.tpm,
1452                    firmware_event_send: None,
1453                    secure_boot_enabled: opt.secure_boot,
1454                    secure_boot_template: match opt.secure_boot_template {
1455                        Some(SecureBootTemplateCli::Windows) => {
1456                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftWindows
1457                        },
1458                        Some(SecureBootTemplateCli::UefiCa) => {
1459                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftUefiCertificateAuthority
1460                        }
1461                        None => {
1462                            get_resources::ged::GuestSecureBootTemplateType::None
1463                        },
1464                    },
1465                    enable_battery: opt.battery,
1466                    no_persistent_secrets: true,
1467                    igvm_attest_test_config: None,
1468                    test_gsp_by_id: opt.test_gsp_by_id,
1469                    efi_diagnostics_log_level: {
1470                        match opt.efi_diagnostics_log_level.unwrap_or_default() {
1471                            EfiDiagnosticsLogLevelCli::Default => get_resources::ged::EfiDiagnosticsLogLevelType::Default,
1472                            EfiDiagnosticsLogLevelCli::Info => get_resources::ged::EfiDiagnosticsLogLevelType::Info,
1473                            EfiDiagnosticsLogLevelCli::Full => get_resources::ged::EfiDiagnosticsLogLevelType::Full,
1474                        }
1475                    },
1476                    force_dma_bounce_enabled: opt.uefi_force_dma_bounce,
1477                }
1478                .into_resource(),
1479            ),
1480        ]);
1481    }
1482
1483    if opt.tpm && !opt.vtl2 {
1484        let register_layout = if cfg!(guest_arch = "x86_64") {
1485            TpmRegisterLayout::IoPort
1486        } else {
1487            TpmRegisterLayout::Mmio
1488        };
1489
1490        let (ppi_store, nvram_store) = if opt.vmgs.is_some() {
1491            (
1492                VmgsFileHandle::new(vmgs_format::FileId::TPM_PPI, true).into_resource(),
1493                VmgsFileHandle::new(vmgs_format::FileId::TPM_NVRAM, true).into_resource(),
1494            )
1495        } else {
1496            (
1497                EphemeralNonVolatileStoreHandle.into_resource(),
1498                EphemeralNonVolatileStoreHandle.into_resource(),
1499            )
1500        };
1501
1502        chipset_devices.push(ChipsetDeviceHandle {
1503            name: "tpm".to_string(),
1504            resource: chipset_device_worker_defs::RemoteChipsetDeviceHandle {
1505                device: TpmDeviceHandle {
1506                    ppi_store,
1507                    nvram_store,
1508                    nvram_size: None,
1509                    refresh_tpm_seeds: false,
1510                    ak_cert_type: tpm_resources::TpmAkCertTypeResource::None,
1511                    register_layout,
1512                    guest_secret_key: None,
1513                    logger: None,
1514                    is_confidential_vm: false,
1515                    bios_guid,
1516                }
1517                .into_resource(),
1518                worker_host: mesh.make_host("tpm", None).await?,
1519            }
1520            .into_resource(),
1521        });
1522    }
1523
1524    let vga_firmware = if opt.pcat {
1525        Some(openvmm_pcat_locator::find_svga_bios(
1526            opt.vga_firmware.as_deref(),
1527        )?)
1528    } else {
1529        None
1530    };
1531
1532    if opt.gfx {
1533        // Channel for the video device to report dirty rectangles to the VNC worker.
1534        let (dirt_send, dirt_recv) = mesh::channel();
1535        resources.dirty_rect_recv = Some(dirt_recv);
1536
1537        vmbus_devices.extend([
1538            (
1539                DeviceVtl::Vtl0,
1540                SynthVideoHandle {
1541                    framebuffer: SharedFramebufferHandle.into_resource(),
1542                    dirt_send: Some(dirt_send),
1543                }
1544                .into_resource(),
1545            ),
1546            (
1547                DeviceVtl::Vtl0,
1548                SynthKeyboardHandle {
1549                    source: MultiplexedInputHandle {
1550                        // Save 0 for PS/2
1551                        elevation: 1,
1552                    }
1553                    .into_resource(),
1554                }
1555                .into_resource(),
1556            ),
1557            (
1558                DeviceVtl::Vtl0,
1559                SynthMouseHandle {
1560                    source: MultiplexedInputHandle {
1561                        // Save 0 for PS/2
1562                        elevation: 1,
1563                    }
1564                    .into_resource(),
1565                }
1566                .into_resource(),
1567            ),
1568        ]);
1569    }
1570
1571    let vsock_listener = |path: Option<&str>| -> anyhow::Result<_> {
1572        if let Some(path) = path {
1573            cleanup_socket(path.as_ref());
1574            let listener = unix_socket::UnixListener::bind(path)
1575                .with_context(|| format!("failed to bind to hybrid vsock path: {}", path))?;
1576            Ok(Some(listener))
1577        } else {
1578            Ok(None)
1579        }
1580    };
1581
1582    let vtl0_vsock_listener = vsock_listener(opt.vmbus_vsock_path.as_deref())?;
1583    let vtl2_vsock_listener = vsock_listener(opt.vmbus_vtl2_vsock_path.as_deref())?;
1584
1585    if let Some(path) = &opt.openhcl_dump_path {
1586        let (resource, task) = spawn_dump_handler(&spawner, path.clone(), None);
1587        task.detach();
1588        vmbus_devices.push((openhcl_vtl, resource));
1589    }
1590
1591    #[cfg(guest_arch = "aarch64")]
1592    let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64(
1593        openvmm_defs::config::Aarch64TopologyConfig {
1594            // TODO: allow this to be configured from the command line
1595            gic_config: None,
1596            pmu_gsiv: openvmm_defs::config::PmuGsivConfig::Platform,
1597            gic_msi: match opt.gic_msi {
1598                cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
1599                cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
1600                cli_args::GicMsiCli::V2m => {
1601                    openvmm_defs::config::GicMsiConfig::V2m { spi_count: None }
1602                }
1603            },
1604        },
1605    );
1606    #[cfg(guest_arch = "x86_64")]
1607    let topology_arch =
1608        openvmm_defs::config::ArchTopologyConfig::X86(openvmm_defs::config::X86TopologyConfig {
1609            apic_id_offset: opt.apic_id_offset,
1610            x2apic: opt.x2apic,
1611        });
1612
1613    let with_isolation = if let Some(isolation) = &opt.isolation {
1614        // TODO: For now, isolation is only supported with VTL2.
1615        if !opt.vtl2 {
1616            anyhow::bail!("isolation is only currently supported with vtl2");
1617        }
1618
1619        // TODO: Alias map support is not yet implemented with isolation.
1620        if !opt.no_alias_map {
1621            anyhow::bail!("alias map not supported with isolation");
1622        }
1623
1624        match isolation {
1625            cli_args::IsolationCli::Vbs => Some(openvmm_defs::config::IsolationType::Vbs),
1626        }
1627    } else {
1628        None
1629    };
1630
1631    if with_hv && !opt.no_vmbus {
1632        let (shutdown_send, shutdown_recv) = mesh::channel();
1633        resources.shutdown_ic = Some(shutdown_send);
1634        let (kvp_send, kvp_recv) = mesh::channel();
1635        resources.kvp_ic = Some(kvp_send);
1636        vmbus_devices.extend(
1637            [
1638                hyperv_ic_resources::shutdown::ShutdownIcHandle {
1639                    recv: shutdown_recv,
1640                }
1641                .into_resource(),
1642                hyperv_ic_resources::kvp::KvpIcHandle { recv: kvp_recv }.into_resource(),
1643                hyperv_ic_resources::timesync::TimesyncIcHandle.into_resource(),
1644            ]
1645            .map(|r| (DeviceVtl::Vtl0, r)),
1646        );
1647    }
1648
1649    if let Some(hive_path) = &opt.imc {
1650        let file = fs_err::File::open(hive_path).context("failed to open imc hive")?;
1651        vmbus_devices.push((
1652            DeviceVtl::Vtl0,
1653            vmbfs_resources::VmbfsImcDeviceHandle { file: file.into() }.into_resource(),
1654        ));
1655    }
1656
1657    let mut virtio_devices = Vec::new();
1658    let mut add_virtio_device = |bus, resource: Resource<VirtioDeviceHandle>| {
1659        let bus = match bus {
1660            VirtioBusCli::Auto => {
1661                // Use VPCI when possible (currently only on Windows and macOS due
1662                // to KVM backend limitations).
1663                if with_hv && (cfg!(windows) || cfg!(target_os = "macos")) {
1664                    None
1665                } else {
1666                    Some(VirtioBus::Pci)
1667                }
1668            }
1669            VirtioBusCli::Mmio => Some(VirtioBus::Mmio),
1670            VirtioBusCli::Pci => Some(VirtioBus::Pci),
1671            VirtioBusCli::Vpci => None,
1672        };
1673        if let Some(bus) = bus {
1674            virtio_devices.push((bus, resource));
1675        } else {
1676            vpci_devices.push(VpciDeviceConfig {
1677                vtl: DeviceVtl::Vtl0,
1678                instance_id: Guid::new_random(),
1679                resource: VirtioPciDeviceHandle(resource).into_resource(),
1680                vnode: None,
1681            });
1682        }
1683    };
1684
1685    for cli_cfg in &opt.virtio_net {
1686        if cli_cfg.underhill {
1687            anyhow::bail!("use --net uh:[...] to add underhill NICs")
1688        }
1689        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
1690        let resource = virtio_resources::net::VirtioNetHandle {
1691            max_queues: vport.max_queues,
1692            mac_address: vport.mac_address,
1693            endpoint: vport.endpoint,
1694        }
1695        .into_resource();
1696        if let Some(pcie_port) = &cli_cfg.pcie_port {
1697            pcie_devices.push(PcieDeviceConfig {
1698                port_name: pcie_port.clone(),
1699                resource: VirtioPciDeviceHandle(resource).into_resource(),
1700            });
1701        } else {
1702            add_virtio_device(VirtioBusCli::Auto, resource);
1703        }
1704    }
1705
1706    for args in &opt.virtio_fs {
1707        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
1708            tag: args.tag.clone(),
1709            fs: virtio_resources::fs::VirtioFsBackend::HostFs {
1710                root_path: args.path.clone(),
1711                mount_options: args.options.clone(),
1712            },
1713        }
1714        .into_resource();
1715        if let Some(pcie_port) = &args.pcie_port {
1716            pcie_devices.push(PcieDeviceConfig {
1717                port_name: pcie_port.clone(),
1718                resource: VirtioPciDeviceHandle(resource).into_resource(),
1719            });
1720        } else {
1721            add_virtio_device(opt.virtio_fs_bus, resource);
1722        }
1723    }
1724
1725    for args in &opt.virtio_fs_shmem {
1726        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
1727            tag: args.tag.clone(),
1728            fs: virtio_resources::fs::VirtioFsBackend::SectionFs {
1729                root_path: args.path.clone(),
1730            },
1731        }
1732        .into_resource();
1733        if let Some(pcie_port) = &args.pcie_port {
1734            pcie_devices.push(PcieDeviceConfig {
1735                port_name: pcie_port.clone(),
1736                resource: VirtioPciDeviceHandle(resource).into_resource(),
1737            });
1738        } else {
1739            add_virtio_device(opt.virtio_fs_bus, resource);
1740        }
1741    }
1742
1743    for args in &opt.virtio_9p {
1744        let resource: Resource<VirtioDeviceHandle> = virtio_resources::p9::VirtioPlan9Handle {
1745            tag: args.tag.clone(),
1746            root_path: args.path.clone(),
1747            debug: opt.virtio_9p_debug,
1748        }
1749        .into_resource();
1750        if let Some(pcie_port) = &args.pcie_port {
1751            pcie_devices.push(PcieDeviceConfig {
1752                port_name: pcie_port.clone(),
1753                resource: VirtioPciDeviceHandle(resource).into_resource(),
1754            });
1755        } else {
1756            add_virtio_device(VirtioBusCli::Auto, resource);
1757        }
1758    }
1759
1760    if let Some(pmem_args) = &opt.virtio_pmem {
1761        let resource: Resource<VirtioDeviceHandle> = virtio_resources::pmem::VirtioPmemHandle {
1762            path: pmem_args.path.clone(),
1763        }
1764        .into_resource();
1765        if let Some(pcie_port) = &pmem_args.pcie_port {
1766            pcie_devices.push(PcieDeviceConfig {
1767                port_name: pcie_port.clone(),
1768                resource: VirtioPciDeviceHandle(resource).into_resource(),
1769            });
1770        } else {
1771            add_virtio_device(VirtioBusCli::Auto, resource);
1772        }
1773    }
1774
1775    if opt.virtio_rng {
1776        let resource: Resource<VirtioDeviceHandle> =
1777            virtio_resources::rng::VirtioRngHandle.into_resource();
1778        if let Some(pcie_port) = &opt.virtio_rng_pcie_port {
1779            pcie_devices.push(PcieDeviceConfig {
1780                port_name: pcie_port.clone(),
1781                resource: VirtioPciDeviceHandle(resource).into_resource(),
1782            });
1783        } else {
1784            add_virtio_device(opt.virtio_rng_bus, resource);
1785        }
1786    }
1787
1788    if let Some(backend) = virtio_console_backend {
1789        let resource: Resource<VirtioDeviceHandle> =
1790            virtio_resources::console::VirtioConsoleHandle { backend }.into_resource();
1791        if let Some(pcie_port) = &opt.virtio_console_pcie_port {
1792            pcie_devices.push(PcieDeviceConfig {
1793                port_name: pcie_port.clone(),
1794                resource: VirtioPciDeviceHandle(resource).into_resource(),
1795            });
1796        } else {
1797            add_virtio_device(VirtioBusCli::Auto, resource);
1798        }
1799    }
1800
1801    // Handle --vhost-user arguments.
1802    #[cfg(target_os = "linux")]
1803    for vhost_cli in &opt.vhost_user {
1804        let stream =
1805            unix_socket::UnixStream::connect(&vhost_cli.socket_path).with_context(|| {
1806                format!(
1807                    "failed to connect to vhost-user socket: {}",
1808                    vhost_cli.socket_path
1809                )
1810            })?;
1811
1812        use crate::cli_args::VhostUserDeviceTypeCli;
1813        let resource: Resource<VirtioDeviceHandle> = match vhost_cli.device_type {
1814            VhostUserDeviceTypeCli::Fs {
1815                ref tag,
1816                num_queues,
1817                queue_size,
1818            } => virtio_resources::vhost_user::VhostUserFsHandle {
1819                socket: stream.into(),
1820                tag: tag.clone(),
1821                num_queues,
1822                queue_size,
1823            }
1824            .into_resource(),
1825            VhostUserDeviceTypeCli::Blk {
1826                num_queues,
1827                queue_size,
1828            } => virtio_resources::vhost_user::VhostUserBlkHandle {
1829                socket: stream.into(),
1830                num_queues,
1831                queue_size,
1832            }
1833            .into_resource(),
1834            VhostUserDeviceTypeCli::Other {
1835                device_id,
1836                ref queue_sizes,
1837            } => virtio_resources::vhost_user::VhostUserGenericHandle {
1838                socket: stream.into(),
1839                device_id,
1840                queue_sizes: queue_sizes.clone(),
1841            }
1842            .into_resource(),
1843        };
1844        if let Some(pcie_port) = &vhost_cli.pcie_port {
1845            pcie_devices.push(PcieDeviceConfig {
1846                port_name: pcie_port.clone(),
1847                resource: VirtioPciDeviceHandle(resource).into_resource(),
1848            });
1849        } else {
1850            add_virtio_device(VirtioBusCli::Auto, resource);
1851        }
1852    }
1853
1854    if let Some(vsock_path) = &opt.virtio_vsock_path {
1855        let listener = vsock_listener(Some(vsock_path))?.unwrap();
1856        add_virtio_device(
1857            VirtioBusCli::Auto,
1858            virtio_resources::vsock::VirtioVsockHandle {
1859                // The guest CID does not matter since the UDS relay does not use it. It just needs
1860                // to be some non-reserved value for the guest to use.
1861                guest_cid: 0x3,
1862                base_path: vsock_path.clone(),
1863                listener,
1864            }
1865            .into_resource(),
1866        );
1867    }
1868
1869    let mut cfg = Config {
1870        chipset,
1871        load_mode,
1872        floppy_disks,
1873        pcie_root_complexes,
1874        #[cfg(target_os = "linux")]
1875        pcie_devices: {
1876            let mut devs = pcie_devices;
1877            devs.extend(vfio_pcie_devices);
1878            devs
1879        },
1880        #[cfg(not(target_os = "linux"))]
1881        pcie_devices,
1882        pcie_switches,
1883        pcie_generic_initiators,
1884        vpci_devices,
1885        ide_disks: Vec::new(),
1886        numa: {
1887            if let Some(ref nodes) = opt.numa {
1888                // --numa mode: each --numa flag defines a node.
1889                NumaTopology {
1890                    nodes: nodes
1891                        .iter()
1892                        .map(|n| NumaNode {
1893                            mem: Some(MemoryConfig {
1894                                mem_size: n.memory.mem_size,
1895                                prefetch_memory: n.memory.prefetch,
1896                                private_memory: n.memory.shared == Some(false),
1897                                transparent_hugepages: n.memory.transparent_hugepages,
1898                                hugepages: n.memory.hugepages,
1899                                hugepage_size: n.memory.hugepage_size,
1900                                host_numa_node: n.host_numa_node,
1901                            }),
1902                            vps: match &n.vps {
1903                                Some(vps) if vps.is_empty() => VpAssignment::Empty,
1904                                Some(vps) => VpAssignment::Explicit(vps.clone()),
1905                                None => VpAssignment::FromTopology,
1906                            },
1907                        })
1908                        .collect(),
1909                    distances: opt
1910                        .numa_distance
1911                        .as_deref()
1912                        .unwrap_or(&[])
1913                        .iter()
1914                        .map(|d| NumaDistance {
1915                            src: d.src,
1916                            dst: d.dst,
1917                            distance: d.distance,
1918                        })
1919                        .collect(),
1920                }
1921            } else {
1922                // Single-node default from --memory.
1923                NumaTopology {
1924                    nodes: vec![NumaNode {
1925                        mem: Some(MemoryConfig {
1926                            mem_size: opt.memory_size(),
1927                            prefetch_memory: opt.prefetch_memory(),
1928                            private_memory: opt.private_memory(),
1929                            transparent_hugepages: opt.transparent_hugepages(),
1930                            hugepages: opt.memory.hugepages,
1931                            hugepage_size: opt.memory.hugepage_size,
1932                            host_numa_node: None,
1933                        }),
1934                        vps: VpAssignment::FromTopology,
1935                    }],
1936                    distances: vec![],
1937                }
1938            }
1939        },
1940        processor_topology: ProcessorTopologyConfig {
1941            proc_count: opt.processors,
1942            vps_per_socket: opt.vps_per_socket,
1943            enable_smt: match opt.smt {
1944                cli_args::SmtConfigCli::Auto => None,
1945                cli_args::SmtConfigCli::Force => Some(true),
1946                cli_args::SmtConfigCli::Off => Some(false),
1947            },
1948            arch: Some(topology_arch),
1949        },
1950        hypervisor: HypervisorConfig {
1951            with_hv,
1952            with_vtl2: opt.vtl2.then_some(Vtl2Config {
1953                vtl0_alias_map: !opt.no_alias_map,
1954                late_map_vtl0_memory: match opt.late_map_vtl0_policy {
1955                    cli_args::Vtl0LateMapPolicyCli::Off => None,
1956                    cli_args::Vtl0LateMapPolicyCli::Log => Some(LateMapVtl0MemoryPolicy::Log),
1957                    cli_args::Vtl0LateMapPolicyCli::Halt => Some(LateMapVtl0MemoryPolicy::Halt),
1958                    cli_args::Vtl0LateMapPolicyCli::Exception => {
1959                        Some(LateMapVtl0MemoryPolicy::InjectException)
1960                    }
1961                },
1962            }),
1963            with_isolation,
1964        },
1965        #[cfg(windows)]
1966        kernel_vmnics,
1967        input: mesh::Receiver::new(),
1968        framebuffer,
1969        vga_firmware,
1970        vtl2_gfx: opt.vtl2_gfx,
1971        virtio_devices,
1972        vmbus: (with_hv && !opt.no_vmbus).then_some(VmbusConfig {
1973            vsock_listener: vtl0_vsock_listener,
1974            vsock_path: opt.vmbus_vsock_path.clone(),
1975            vtl2_redirect: opt.vmbus_redirect,
1976            vmbus_max_version: opt.vmbus_max_version,
1977            #[cfg(windows)]
1978            vmbusproxy_handle,
1979        }),
1980        vtl2_vmbus: (with_hv && opt.vtl2).then_some(VmbusConfig {
1981            vsock_listener: vtl2_vsock_listener,
1982            vsock_path: opt.vmbus_vtl2_vsock_path.clone(),
1983            ..Default::default()
1984        }),
1985        vmbus_devices,
1986        chipset_devices,
1987        pci_chipset_devices,
1988        isa_dma_controller,
1989        chipset_capabilities: capabilities,
1990        layout: layout_config,
1991        #[cfg(windows)]
1992        vpci_resources,
1993        vmgs,
1994        secure_boot_enabled: opt.secure_boot,
1995        custom_uefi_vars,
1996        firmware_event_send: None,
1997        debugger_rpc: None,
1998        rtc_delta_milliseconds: 0,
1999        // Only let the partition auto-reset when the reset action is `reset`.
2000        // For `halt` or `exit`, the guest reset must surface as a halt event so
2001        // the controller can hold the VM or exit instead of rebooting in place.
2002        automatic_guest_reset: matches!(opt.guest_reset_action, GuestPowerAction::Reset),
2003        efi_diagnostics_log_level: {
2004            match opt.efi_diagnostics_log_level.unwrap_or_default() {
2005                EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
2006                EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
2007                EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
2008            }
2009        },
2010    };
2011
2012    storage.build_config(&mut cfg, &mut resources, opt.scsi_sub_channels)?;
2013    Ok((cfg, resources))
2014}
2015
/// Gets the terminal to use for externally launched console windows.
///
/// The `OPENVMM_TERM` environment variable takes precedence. When it is not
/// set, `HVLITE_TERM` is consulted instead. Returns `None` when neither
/// variable is set.
pub(crate) fn openvmm_terminal_app() -> Option<PathBuf> {
    let terminal = match std::env::var_os("OPENVMM_TERM") {
        Some(value) => value,
        // Fall back to the second variable; `?` yields `None` if it is unset too.
        None => std::env::var_os("HVLITE_TERM")?,
    };
    Some(PathBuf::from(terminal))
}
2022
/// Tries to remove `path` if it is confirmed to be a Unix socket.
///
/// Nothing is removed when the file type cannot be determined (for example
/// because the metadata lookup fails) or when the path is not a socket. A
/// failure to remove the socket is ignored.
fn cleanup_socket(path: &Path) {
    #[cfg(windows)]
    let confirmed_socket = pal::windows::fs::is_unix_socket(path).unwrap_or(false);
    #[cfg(not(windows))]
    let confirmed_socket = match path.metadata() {
        Ok(meta) => {
            use std::os::unix::fs::FileTypeExt;
            meta.file_type().is_socket()
        }
        // Any metadata error is treated as "not a socket".
        Err(_) => false,
    };

    if !confirmed_socket {
        return;
    }

    // Best effort: the result of the removal is deliberately discarded.
    let _ = std::fs::remove_file(path);
}
2036
/// Creates a new kernel switch port on the requested switch.
///
/// `switch_id` is parsed as the switch identifier; when it is `None`,
/// `vmswitch::hcn::DEFAULT_SWITCH` is used. The port identifier is a freshly
/// generated random GUID.
///
/// Returns the port identifier in its `openvmm_defs` form together with the
/// created kernel switch port.
///
/// # Errors
///
/// Fails if `switch_id` cannot be parsed, if the switch cannot be opened, or
/// if the switch port cannot be created.
#[cfg(windows)]
fn new_switch_port(
    switch_id: Option<&str>,
) -> anyhow::Result<(
    openvmm_defs::config::SwitchPortId,
    vmswitch::kernel::SwitchPort,
)> {
    let switch = match switch_id {
        None => vmswitch::hcn::DEFAULT_SWITCH,
        Some(text) => text.parse().context("invalid switch id")?,
    };
    let kernel_id = vmswitch::kernel::SwitchPortId {
        switch,
        port: Guid::new_random(),
    };

    // Open the network only to confirm the switch exists; the handle is
    // discarded immediately.
    let _ = vmswitch::hcn::Network::open(&kernel_id.switch)
        .with_context(|| format!("could not find switch {}", kernel_id.switch))?;

    let switch_port =
        vmswitch::kernel::SwitchPort::new(&kernel_id).context("failed to create switch port")?;

    let config_id = openvmm_defs::config::SwitchPortId {
        switch: kernel_id.switch,
        port: kernel_id.port,
    };
    Ok((config_id, switch_port))
}
2062
2063fn parse_endpoint(
2064    cli_cfg: &NicConfigCli,
2065    index: &mut usize,
2066    resources: &mut VmResources,
2067) -> anyhow::Result<NicConfig> {
2068    let _ = resources;
2069    let endpoint = match &cli_cfg.endpoint {
2070        EndpointConfigCli::Consomme { cidr, host_fwd } => {
2071            let ports = host_fwd
2072                .iter()
2073                .map(|fwd| {
2074                    use net_backend_resources::consomme::HostPortProtocol;
2075                    net_backend_resources::consomme::HostPortConfig {
2076                        protocol: match fwd.protocol {
2077                            cli_args::HostPortProtocolCli::Tcp => HostPortProtocol::Tcp,
2078                            cli_args::HostPortProtocolCli::Udp => HostPortProtocol::Udp,
2079                        },
2080                        host_address: fwd
2081                            .host_address
2082                            .map(net_backend_resources::consomme::HostIpAddress::from),
2083                        host_port: net_backend_resources::consomme::HostPort::Fixed(fwd.host_port),
2084                        guest_port: fwd.guest_port,
2085                    }
2086                })
2087                .collect();
2088            // Only wire the bind/unbind RPC channel to the first consomme
2089            // endpoint. Additional consomme NICs work normally but cannot be
2090            // targeted by runtime bind/unbind commands.
2091            let recv = if resources.consomme_rpc.is_none() {
2092                let (send, recv) = mesh::channel();
2093                resources.consomme_rpc = Some(send);
2094                Some(recv)
2095            } else {
2096                None
2097            };
2098            net_backend_resources::consomme::ConsommeHandle {
2099                cidr: cidr.clone(),
2100                ports,
2101                recv,
2102            }
2103            .into_resource()
2104        }
2105        EndpointConfigCli::None => net_backend_resources::null::NullHandle.into_resource(),
2106        EndpointConfigCli::Dio { id } => {
2107            #[cfg(windows)]
2108            {
2109                let (port_id, port) = new_switch_port(id.as_deref())?;
2110                resources.switch_ports.push(port);
2111                net_backend_resources::dio::WindowsDirectIoHandle {
2112                    switch_port_id: net_backend_resources::dio::SwitchPortId {
2113                        switch: port_id.switch,
2114                        port: port_id.port,
2115                    },
2116                }
2117                .into_resource()
2118            }
2119
2120            #[cfg(not(windows))]
2121            {
2122                let _ = id;
2123                bail!("cannot use dio on non-windows platforms")
2124            }
2125        }
2126        EndpointConfigCli::Tap { name } => {
2127            #[cfg(target_os = "linux")]
2128            {
2129                let fd = net_tap::tap::open_tap(name)
2130                    .with_context(|| format!("failed to open TAP device '{name}'"))?;
2131                net_backend_resources::tap::TapHandle { fd }.into_resource()
2132            }
2133
2134            #[cfg(not(target_os = "linux"))]
2135            {
2136                let _ = name;
2137                bail!("TAP backend is only supported on Linux")
2138            }
2139        }
2140    };
2141
2142    // Pick a random MAC address.
2143    let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
2144    getrandom::fill(&mut mac_address[3..]).expect("rng failure");
2145
2146    // Pick a fixed instance ID based on the index.
2147    const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-da43-11ed-936a-00155d6db52f");
2148    let instance_id = Guid {
2149        data1: *index as u32,
2150        ..BASE_INSTANCE_ID
2151    };
2152    *index += 1;
2153
2154    Ok(NicConfig {
2155        vtl: cli_cfg.vtl,
2156        instance_id,
2157        endpoint,
2158        mac_address: mac_address.into(),
2159        max_queues: cli_cfg.max_queues,
2160        pcie_port: cli_cfg.pcie_port.clone(),
2161    })
2162}
2163
2164#[derive(Debug)]
2165struct NicConfig {
2166    vtl: DeviceVtl,
2167    instance_id: Guid,
2168    mac_address: MacAddress,
2169    endpoint: Resource<NetEndpointHandleKind>,
2170    max_queues: Option<u16>,
2171    pcie_port: Option<String>,
2172}
2173
2174impl NicConfig {
2175    fn into_netvsp_handle(self) -> (DeviceVtl, Resource<VmbusDeviceHandleKind>) {
2176        (
2177            self.vtl,
2178            netvsp_resources::NetvspHandle {
2179                instance_id: self.instance_id,
2180                mac_address: self.mac_address,
2181                endpoint: self.endpoint,
2182                max_queues: self.max_queues,
2183            }
2184            .into_resource(),
2185        )
2186    }
2187}
2188
/// An entry collected while expanding a disk command line description:
/// either a single disk layer or a complete disk.
enum LayerOrDisk {
    /// A disk layer description.
    Layer(DiskLayerDescription),
    /// A complete disk resource.
    Disk(Resource<DiskHandleKind>),
}
2193
2194async fn disk_open(
2195    disk_cli: &DiskCliKind,
2196    read_only: bool,
2197) -> anyhow::Result<Resource<DiskHandleKind>> {
2198    let mut layers = Vec::new();
2199    disk_open_inner(disk_cli, read_only, &mut layers).await?;
2200    if layers.len() == 1 && matches!(layers[0], LayerOrDisk::Disk(_)) {
2201        let LayerOrDisk::Disk(disk) = layers.pop().unwrap() else {
2202            unreachable!()
2203        };
2204        Ok(disk)
2205    } else {
2206        Ok(Resource::new(disk_backend_resources::LayeredDiskHandle {
2207            layers: layers
2208                .into_iter()
2209                .map(|layer| match layer {
2210                    LayerOrDisk::Layer(layer) => layer,
2211                    LayerOrDisk::Disk(disk) => DiskLayerDescription {
2212                        layer: DiskLayerHandle(disk).into_resource(),
2213                        read_cache: false,
2214                        write_through: false,
2215                    },
2216                })
2217                .collect(),
2218        }))
2219    }
2220}
2221
2222fn disk_open_inner<'a>(
2223    disk_cli: &'a DiskCliKind,
2224    read_only: bool,
2225    layers: &'a mut Vec<LayerOrDisk>,
2226) -> futures::future::BoxFuture<'a, anyhow::Result<()>> {
2227    Box::pin(async move {
2228        fn layer<T: IntoResource<DiskLayerHandleKind>>(layer: T) -> LayerOrDisk {
2229            LayerOrDisk::Layer(layer.into_resource().into())
2230        }
2231        fn disk<T: IntoResource<DiskHandleKind>>(disk: T) -> LayerOrDisk {
2232            LayerOrDisk::Disk(disk.into_resource())
2233        }
2234        match disk_cli {
2235            &DiskCliKind::Memory(len) => {
2236                layers.push(layer(RamDiskLayerHandle {
2237                    len: Some(len),
2238                    sector_size: None,
2239                }));
2240            }
2241            DiskCliKind::File {
2242                path,
2243                create_with_len,
2244                direct,
2245            } => layers.push(LayerOrDisk::Disk(if let Some(size) = create_with_len {
2246                create_disk_type(
2247                    path,
2248                    *size,
2249                    OpenDiskOptions {
2250                        read_only: false,
2251                        direct: *direct,
2252                    },
2253                )
2254                .with_context(|| format!("failed to create {}", path.display()))?
2255            } else {
2256                open_disk_type(
2257                    path,
2258                    OpenDiskOptions {
2259                        read_only,
2260                        direct: *direct,
2261                    },
2262                )
2263                .await
2264                .with_context(|| format!("failed to open {}", path.display()))?
2265            })),
2266            DiskCliKind::Blob { kind, url } => {
2267                layers.push(disk(disk_backend_resources::BlobDiskHandle {
2268                    url: url.to_owned(),
2269                    format: match kind {
2270                        cli_args::BlobKind::Flat => disk_backend_resources::BlobDiskFormat::Flat,
2271                        cli_args::BlobKind::Vhd1 => {
2272                            disk_backend_resources::BlobDiskFormat::FixedVhd1
2273                        }
2274                    },
2275                }))
2276            }
2277            DiskCliKind::MemoryDiff(inner) => {
2278                layers.push(layer(RamDiskLayerHandle {
2279                    len: None,
2280                    sector_size: None,
2281                }));
2282                disk_open_inner(inner, true, layers).await?;
2283            }
2284            DiskCliKind::PersistentReservationsWrapper(inner) => {
2285                layers.push(disk(disk_backend_resources::DiskWithReservationsHandle(
2286                    disk_open(inner, read_only).await?,
2287                )))
2288            }
2289            DiskCliKind::DelayDiskWrapper {
2290                delay_ms,
2291                disk: inner,
2292            } => layers.push(disk(DelayDiskHandle {
2293                delay: CellUpdater::new(Duration::from_millis(*delay_ms)).cell(),
2294                disk: disk_open(inner, read_only).await?,
2295            })),
2296            DiskCliKind::Crypt {
2297                disk: inner,
2298                cipher,
2299                key_file,
2300            } => layers.push(disk(disk_crypt_resources::DiskCryptHandle {
2301                disk: disk_open(inner, read_only).await?,
2302                cipher: match cipher {
2303                    cli_args::DiskCipher::XtsAes256 => disk_crypt_resources::Cipher::XtsAes256,
2304                },
2305                key: fs_err::read(key_file).context("failed to read key file")?,
2306            })),
2307            DiskCliKind::Sqlite {
2308                path,
2309                create_with_len,
2310            } => {
2311                // FUTURE: this code should be responsible for opening
2312                // file-handle(s) itself, and passing them into sqlite via a custom
2313                // vfs. For now though - simply check if the file exists or not, and
2314                // perform early validation of filesystem-level create options.
2315                match (create_with_len.is_some(), path.exists()) {
2316                    (true, true) => anyhow::bail!(
2317                        "cannot create new sqlite disk at {} - file already exists",
2318                        path.display()
2319                    ),
2320                    (false, false) => anyhow::bail!(
2321                        "cannot open sqlite disk at {} - file not found",
2322                        path.display()
2323                    ),
2324                    _ => {}
2325                }
2326
2327                layers.push(layer(SqliteDiskLayerHandle {
2328                    dbhd_path: path.display().to_string(),
2329                    format_dbhd: create_with_len.map(|len| {
2330                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2331                            logically_read_only: false,
2332                            len: Some(len),
2333                        }
2334                    }),
2335                }));
2336            }
2337            DiskCliKind::SqliteDiff { path, create, disk } => {
2338                // FUTURE: this code should be responsible for opening
2339                // file-handle(s) itself, and passing them into sqlite via a custom
2340                // vfs. For now though - simply check if the file exists or not, and
2341                // perform early validation of filesystem-level create options.
2342                match (create, path.exists()) {
2343                    (true, true) => anyhow::bail!(
2344                        "cannot create new sqlite disk at {} - file already exists",
2345                        path.display()
2346                    ),
2347                    (false, false) => anyhow::bail!(
2348                        "cannot open sqlite disk at {} - file not found",
2349                        path.display()
2350                    ),
2351                    _ => {}
2352                }
2353
2354                layers.push(layer(SqliteDiskLayerHandle {
2355                    dbhd_path: path.display().to_string(),
2356                    format_dbhd: create.then_some(
2357                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2358                            logically_read_only: false,
2359                            len: None,
2360                        },
2361                    ),
2362                }));
2363                disk_open_inner(disk, true, layers).await?;
2364            }
2365            DiskCliKind::AutoCacheSqlite {
2366                cache_path,
2367                key,
2368                disk,
2369            } => {
2370                layers.push(LayerOrDisk::Layer(DiskLayerDescription {
2371                    read_cache: true,
2372                    write_through: false,
2373                    layer: SqliteAutoCacheDiskLayerHandle {
2374                        cache_path: cache_path.clone(),
2375                        cache_key: key.clone(),
2376                    }
2377                    .into_resource(),
2378                }));
2379                disk_open_inner(disk, read_only, layers).await?;
2380            }
2381        }
2382        Ok(())
2383    })
2384}
2385
2386/// Get the system page size.
2387pub(crate) fn system_page_size() -> u32 {
2388    sparse_mmap::SparseMapping::page_size() as u32
2389}
2390
/// Name of the guest architecture, chosen at compile time from the
/// `guest_arch` cfg: `"x86_64"` when `guest_arch = "x86_64"` is set, and
/// `"aarch64"` otherwise.
pub(crate) const GUEST_ARCH: &str = match cfg!(guest_arch = "x86_64") {
    true => "x86_64",
    false => "aarch64",
};
2397
2398/// Open a snapshot directory and validate it against the current VM config.
2399/// Returns the shared memory fd (from memory.bin) and the saved device state.
2400fn prepare_snapshot_restore(
2401    snapshot_dir: &Path,
2402    opt: &Options,
2403) -> anyhow::Result<(
2404    openvmm_defs::worker::SharedMemoryFd,
2405    mesh::payload::message::ProtobufMessage,
2406)> {
2407    let (manifest, state_bytes) = openvmm_helpers::snapshot::read_snapshot(snapshot_dir)?;
2408
2409    // Validate manifest against current VM config.
2410    openvmm_helpers::snapshot::validate_manifest(
2411        &manifest,
2412        GUEST_ARCH,
2413        opt.memory_size(),
2414        opt.processors,
2415        system_page_size(),
2416    )?;
2417
2418    // Open memory.bin (existing file, no create, no resize).
2419    let memory_file = fs_err::OpenOptions::new()
2420        .read(true)
2421        .write(true)
2422        .open(snapshot_dir.join("memory.bin"))?;
2423
2424    // Validate file size matches expected memory size.
2425    let file_size = memory_file.metadata()?.len();
2426    if file_size != manifest.memory_size_bytes {
2427        anyhow::bail!(
2428            "memory.bin size ({file_size} bytes) doesn't match manifest ({} bytes)",
2429            manifest.memory_size_bytes,
2430        );
2431    }
2432
2433    let shared_memory_fd =
2434        openvmm_helpers::shared_memory::file_to_shared_memory_fd(memory_file.into())?;
2435
2436    // Reconstruct ProtobufMessage from the saved state bytes.
2437    // The save side wrote mesh::payload::encode(ProtobufMessage), so we decode
2438    // back to ProtobufMessage.
2439    let state_msg: mesh::payload::message::ProtobufMessage = mesh::payload::decode(&state_bytes)
2440        .context("failed to decode saved state from snapshot")?;
2441
2442    Ok((shared_memory_fd, state_msg))
2443}
2444
2445fn do_main(pidfile_guard: &mut Option<pidfile::Pidfile>) -> anyhow::Result<i32> {
2446    #[cfg(windows)]
2447    pal::windows::disable_hard_error_dialog();
2448
2449    tracing_init::enable_tracing()?;
2450
2451    // Try to run as a worker host.
2452    // On success the worker runs to completion and then exits the process (does
2453    // not return). Any worker host setup errors are return and bubbled up.
2454    meshworker::run_vmm_mesh_host()?;
2455
2456    let opt = cli_args::parse_options();
2457    if let Some(path) = &opt.write_saved_state_proto {
2458        mesh::payload::protofile::DescriptorWriter::new(vmcore::save_restore::saved_state_roots())
2459            .write_to_path(path)
2460            .context("failed to write protobuf descriptors")?;
2461        return Ok(0);
2462    }
2463
2464    if let Some(ref path) = opt.pidfile {
2465        *pidfile_guard = Some(pidfile::Pidfile::new(path).context("failed to create pidfile")?);
2466    }
2467
2468    if let Some(path) = opt.relay_console_path {
2469        let console_title = opt.relay_console_title.unwrap_or_default();
2470        return console_relay::relay_console(&path, console_title.as_str()).map(|()| 0);
2471    }
2472
2473    #[cfg(any(feature = "grpc", feature = "ttrpc"))]
2474    if let Some(path) = opt.ttrpc.as_ref().or(opt.grpc.as_ref()) {
2475        return block_on(async {
2476            let _ = std::fs::remove_file(path);
2477            let listener =
2478                unix_socket::UnixListener::bind(path).context("failed to bind to socket")?;
2479
2480            let transport = if opt.ttrpc.is_some() {
2481                ttrpc::RpcTransport::Ttrpc
2482            } else {
2483                ttrpc::RpcTransport::Grpc
2484            };
2485
2486            // This is a local launch
2487            let mut handle =
2488                mesh_worker::launch_local_worker::<ttrpc::TtrpcWorker>(ttrpc::Parameters {
2489                    listener,
2490                    transport,
2491                })
2492                .await?;
2493
2494            tracing::info!(%transport, path = %path.display(), "listening");
2495
2496            // Signal the the parent process that the server is ready.
2497            pal::close_stdout().context("failed to close stdout")?;
2498
2499            handle.join().await?;
2500
2501            Ok(0)
2502        });
2503    }
2504
2505    DefaultPool::run_with(async |driver| run_control(&driver, opt).await)
2506}
2507
2508fn new_hvsock_service_id(port: u32) -> Guid {
2509    // This GUID is an embedding of the AF_VSOCK port into an
2510    // AF_HYPERV service ID.
2511    Guid {
2512        data1: port,
2513        .."00000000-facb-11e6-bd58-64006a7986d3".parse().unwrap()
2514    }
2515}
2516
2517async fn run_control(driver: &DefaultDriver, opt: Options) -> anyhow::Result<i32> {
2518    let mut mesh = Some(VmmMesh::new(&driver, opt.single_process)?);
2519    let result = run_control_inner(driver, &mut mesh, opt).await;
2520    // If setup failed before the mesh was handed to the controller, shut it
2521    // down so the child host process exits cleanly without noisy logs.
2522    if let Some(mesh) = mesh {
2523        mesh.shutdown().await;
2524    }
2525    result
2526}
2527
2528async fn run_control_inner(
2529    driver: &DefaultDriver,
2530    mesh_slot: &mut Option<VmmMesh>,
2531    opt: Options,
2532) -> anyhow::Result<i32> {
2533    let mesh = mesh_slot.as_ref().unwrap();
2534    let (mut vm_config, mut resources) = vm_config_from_command_line(driver, mesh, &opt).await?;
2535
2536    let mut vnc_worker = None;
2537    if opt.gfx || opt.vnc.vnc {
2538        // Parse the listen address. Try as a full SocketAddr (host:port) first;
2539        // fall back to a bare IP, using the configured port.
2540        let addr: std::net::SocketAddr = if let Ok(sa) =
2541            opt.vnc.vnc_listen.parse::<std::net::SocketAddr>()
2542        {
2543            sa
2544        } else {
2545            let ip: std::net::IpAddr = opt.vnc.vnc_listen.parse().with_context(|| {
2546                format!(
2547                    "invalid VNC listen address: {} (expected IP address or socket address like [::1]:5900)",
2548                    opt.vnc.vnc_listen
2549                )
2550            })?;
2551            std::net::SocketAddr::new(ip, opt.vnc.vnc_port)
2552        };
2553
2554        let socket = socket2::Socket::new(
2555            if addr.is_ipv6() {
2556                socket2::Domain::IPV6
2557            } else {
2558                socket2::Domain::IPV4
2559            },
2560            socket2::Type::STREAM,
2561            None,
2562        )
2563        .with_context(|| format!("creating VNC socket for {}", addr))?;
2564
2565        if addr.is_ipv6() {
2566            if let Err(e) = socket.set_only_v6(false) {
2567                tracing::warn!(
2568                    error = %e,
2569                    "failed to enable dual-stack on IPv6 VNC socket, IPv4 clients may not be able to connect"
2570                );
2571            }
2572        }
2573        socket.set_reuse_address(true)?;
2574        socket
2575            .bind(&addr.into())
2576            .with_context(|| format!("binding VNC socket to {}", addr))?;
2577        socket
2578            .listen(128)
2579            .with_context(|| format!("listening on VNC socket {}", addr))?;
2580        let listener: TcpListener = socket.into();
2581
2582        if !addr.ip().is_loopback() {
2583            tracing::warn!(
2584                address = %addr,
2585                "VNC server listening on non-localhost address without authentication"
2586            );
2587        }
2588
2589        let input_send = vm_config.input.sender();
2590        let framebuffer = resources
2591            .framebuffer_access
2592            .take()
2593            .expect("synth video enabled");
2594
2595        let vnc_host = mesh
2596            .make_host("vnc", None)
2597            .await
2598            .context("spawning vnc process failed")?;
2599
2600        vnc_worker = Some(
2601            vnc_host
2602                .launch_worker(
2603                    vnc_worker_defs::VNC_WORKER_TCP,
2604                    VncParameters {
2605                        listener,
2606                        framebuffer,
2607                        input_send,
2608                        dirty_recv: resources.dirty_rect_recv.take(),
2609                        max_clients: opt.vnc.vnc_max_clients,
2610                        evict_oldest: opt.vnc.vnc_evict_oldest,
2611                    },
2612                )
2613                .await?,
2614        )
2615    }
2616
2617    // spin up the debug worker
2618    let gdb_worker = if let Some(port) = opt.gdb {
2619        let listener = TcpListener::bind(format!("127.0.0.1:{}", port))
2620            .with_context(|| format!("binding to gdb port {}", port))?;
2621
2622        let (req_tx, req_rx) = mesh::channel();
2623        vm_config.debugger_rpc = Some(req_rx);
2624
2625        let gdb_host = mesh
2626            .make_host("gdb", None)
2627            .await
2628            .context("spawning gdbstub process failed")?;
2629
2630        Some(
2631            gdb_host
2632                .launch_worker(
2633                    debug_worker_defs::DEBUGGER_WORKER,
2634                    debug_worker_defs::DebuggerParameters {
2635                        listener,
2636                        req_chan: req_tx,
2637                        vp_count: vm_config.processor_topology.proc_count,
2638                        target_arch: if cfg!(guest_arch = "x86_64") {
2639                            debug_worker_defs::TargetArch::X86_64
2640                        } else {
2641                            debug_worker_defs::TargetArch::Aarch64
2642                        },
2643                    },
2644                )
2645                .await
2646                .context("failed to launch gdbstub worker")?,
2647        )
2648    } else {
2649        None
2650    };
2651
2652    // spin up the VM
2653    let (vm_rpc, rpc_recv) = mesh::channel();
2654    let (notify_send, notify_recv) = mesh::channel();
2655    let vm_worker = {
2656        let vm_host = mesh.make_host("vm", opt.log_file.clone()).await?;
2657
2658        let (shared_memory, saved_state) = if let Some(snapshot_dir) = &opt.restore_snapshot {
2659            let (fd, state_msg) = prepare_snapshot_restore(snapshot_dir, &opt)?;
2660            (Some(fd), Some(state_msg))
2661        } else {
2662            let shared_memory = opt
2663                .memory_backing_file()
2664                .map(|path| {
2665                    openvmm_helpers::shared_memory::open_memory_backing_file(
2666                        path,
2667                        opt.memory_size(),
2668                    )
2669                })
2670                .transpose()?;
2671            (shared_memory, None)
2672        };
2673
2674        let params = VmWorkerParameters {
2675            hypervisor: match &opt.hypervisor {
2676                Some(name) => openvmm_helpers::hypervisor::hypervisor_resource(name)?,
2677                None => openvmm_helpers::hypervisor::choose_hypervisor()?,
2678            },
2679            cfg: vm_config,
2680            saved_state,
2681            shared_memory,
2682            rpc: rpc_recv,
2683            notify: notify_send,
2684        };
2685        vm_host
2686            .launch_worker(VM_WORKER, params)
2687            .await
2688            .context("failed to launch vm worker")?
2689    };
2690
2691    if opt.restore_snapshot.is_some() {
2692        tracing::info!("restoring VM from snapshot");
2693    }
2694
2695    if !opt.paused {
2696        vm_rpc.call(VmRpc::Resume, ()).await?;
2697    }
2698
2699    let paravisor_diag = Arc::new(diag_client::DiagClient::from_dialer(
2700        driver.clone(),
2701        DiagDialer {
2702            driver: driver.clone(),
2703            vm_rpc: vm_rpc.clone(),
2704            openhcl_vtl: if opt.vtl2 {
2705                DeviceVtl::Vtl2
2706            } else {
2707                DeviceVtl::Vtl0
2708            },
2709        },
2710    ));
2711
2712    let diag_inspector = DiagInspector::new(driver.clone(), paravisor_diag.clone());
2713
2714    // Create channels between the REPL and VmController.
2715    let (vm_controller_send, vm_controller_recv) = mesh::channel();
2716    let (vm_controller_event_send, vm_controller_event_recv) = mesh::channel();
2717
2718    let has_vtl2 = resources.vtl2_settings.is_some();
2719
2720    // Build the VmController with exclusive resources.
2721    let controller = vm_controller::VmController {
2722        mesh: mesh_slot.take().unwrap(),
2723        vm_worker,
2724        vnc_worker,
2725        gdb_worker,
2726        diag_inspector: Some(diag_inspector),
2727        vtl2_settings: resources.vtl2_settings,
2728        ged_rpc: resources.ged_rpc.clone(),
2729        vm_rpc: vm_rpc.clone(),
2730        paravisor_diag: Some(paravisor_diag),
2731        igvm_path: opt.igvm.clone(),
2732        memory_backing_file: opt.memory_backing_file().cloned(),
2733        memory: opt.memory_size(),
2734        processors: opt.processors,
2735        log_file: opt.log_file.clone(),
2736        guest_power_actions: vm_controller::GuestPowerActions {
2737            shutdown: opt.guest_shutdown_action,
2738            reset: opt.guest_reset_action,
2739            crash: opt.guest_crash_action,
2740            watchdog: opt.guest_watchdog_action,
2741        },
2742    };
2743
2744    // Spawn the VmController as a task.
2745    let controller_task = driver.spawn(
2746        "vm-controller",
2747        controller.run(vm_controller_recv, vm_controller_event_send, notify_recv),
2748    );
2749
2750    // Run the REPL with shareable resources.
2751    let repl_result = repl::run_repl(
2752        driver,
2753        repl::ReplResources {
2754            vm_rpc,
2755            vm_controller: vm_controller_send,
2756            vm_controller_events: vm_controller_event_recv,
2757            scsi_rpc: resources.scsi_rpc,
2758            nvme_vtl2_rpc: resources.nvme_vtl2_rpc,
2759            consomme_rpc: resources.consomme_rpc,
2760            shutdown_ic: resources.shutdown_ic,
2761            kvp_ic: resources.kvp_ic,
2762            console_in: resources.console_in,
2763            has_vtl2,
2764        },
2765    )
2766    .await;
2767
2768    // Wait for the controller task to finish (it stops the VM worker and
2769    // shuts down the mesh).
2770    controller_task.await;
2771
2772    // run_repl returns the exit status: the code the guest drove via an opt-in
2773    // exit (VmControllerEvent::ExitRequested), or 0 when the VM stopped normally.
2774    repl_result
2775}
2776
2777struct DiagDialer {
2778    driver: DefaultDriver,
2779    vm_rpc: mesh::Sender<VmRpc>,
2780    openhcl_vtl: DeviceVtl,
2781}
2782
2783impl mesh_rpc::client::Dial for DiagDialer {
2784    type Stream = PolledSocket<unix_socket::UnixStream>;
2785
2786    async fn dial(&mut self) -> io::Result<Self::Stream> {
2787        let service_id = new_hvsock_service_id(1);
2788        let socket = self
2789            .vm_rpc
2790            .call_failable(
2791                VmRpc::ConnectHvsock,
2792                (
2793                    CancelContext::new().with_timeout(Duration::from_secs(2)),
2794                    service_id,
2795                    self.openhcl_vtl,
2796                ),
2797            )
2798            .await
2799            .map_err(io::Error::other)?;
2800
2801        PolledSocket::new(&self.driver, socket)
2802    }
2803}
2804
2805/// An object that implements [`InspectMut`] by sending an inspect request over
2806/// TTRPC to the guest (typically the paravisor running in VTL2), then stitching
2807/// the response back into the inspect tree.
2808///
2809/// This also caches the TTRPC connection to the guest so that only the first
2810/// inspect request has to wait for the connection to be established.
2811pub(crate) struct DiagInspector(DiagInspectorInner);
2812
2813enum DiagInspectorInner {
2814    NotStarted(DefaultDriver, Arc<diag_client::DiagClient>),
2815    Started {
2816        send: mesh::Sender<inspect::Deferred>,
2817        _task: Task<()>,
2818    },
2819    Invalid,
2820}
2821
2822impl DiagInspector {
2823    pub fn new(driver: DefaultDriver, diag_client: Arc<diag_client::DiagClient>) -> Self {
2824        Self(DiagInspectorInner::NotStarted(driver, diag_client))
2825    }
2826
2827    fn start(&mut self) -> &mesh::Sender<inspect::Deferred> {
2828        loop {
2829            match self.0 {
2830                DiagInspectorInner::NotStarted { .. } => {
2831                    let DiagInspectorInner::NotStarted(driver, client) =
2832                        std::mem::replace(&mut self.0, DiagInspectorInner::Invalid)
2833                    else {
2834                        unreachable!()
2835                    };
2836                    let (send, recv) = mesh::channel();
2837                    let task = driver.clone().spawn("diag-inspect", async move {
2838                        Self::run(&client, recv).await
2839                    });
2840
2841                    self.0 = DiagInspectorInner::Started { send, _task: task };
2842                }
2843                DiagInspectorInner::Started { ref send, .. } => break send,
2844                DiagInspectorInner::Invalid => unreachable!(),
2845            }
2846        }
2847    }
2848
2849    async fn run(
2850        diag_client: &diag_client::DiagClient,
2851        mut recv: mesh::Receiver<inspect::Deferred>,
2852    ) {
2853        while let Some(deferred) = recv.next().await {
2854            let info = deferred.external_request();
2855            let result = match info.request_type {
2856                inspect::ExternalRequestType::Inspect { depth } => {
2857                    if depth == 0 {
2858                        Ok(inspect::Node::Unevaluated)
2859                    } else {
2860                        // TODO: Support taking timeouts from the command line
2861                        diag_client
2862                            .inspect(info.path, Some(depth - 1), Some(Duration::from_secs(1)))
2863                            .await
2864                    }
2865                }
2866                inspect::ExternalRequestType::Update { value } => {
2867                    (diag_client.update(info.path, value).await).map(inspect::Node::Value)
2868                }
2869            };
2870            deferred.complete_external(
2871                result.unwrap_or_else(|err| {
2872                    inspect::Node::Failed(inspect::Error::Mesh(format!("{err:#}")))
2873                }),
2874                inspect::SensitivityLevel::Unspecified,
2875            )
2876        }
2877    }
2878}
2879
2880impl InspectMut for DiagInspector {
2881    fn inspect_mut(&mut self, req: inspect::Request<'_>) {
2882        self.start().send(req.defer());
2883    }
2884}