Skip to main content

openvmm_entry/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! This module implements the interactive control process and the entry point
5//! for the worker process.
6
7#![expect(missing_docs)]
8#![forbid(unsafe_code)]
9
10mod cli_args;
11mod crash_dump;
12mod kvp;
13mod meshworker;
14mod pidfile;
15mod repl;
16mod serial_io;
17mod storage_builder;
18mod tracing_init;
19mod ttrpc;
20mod vm_controller;
21
22// `pub` so that the missing_docs warning fires for options without
23// documentation.
24pub use cli_args::Options;
25use console_relay::ConsoleLaunchOptions;
26
27use crate::cli_args::SecureBootTemplateCli;
28use anyhow::Context;
29use anyhow::bail;
30use chipset_resources::battery::HostBatteryUpdate;
31use cli_args::DiskCliKind;
32use cli_args::EfiDiagnosticsLogLevelCli;
33use cli_args::EndpointConfigCli;
34use cli_args::GuestPowerAction;
35use cli_args::NicConfigCli;
36use cli_args::ProvisionVmgs;
37use cli_args::SerialConfigCli;
38use cli_args::UefiConsoleModeCli;
39use cli_args::VirtioBusCli;
40use cli_args::VmgsCli;
41use crash_dump::spawn_dump_handler;
42use cxl_spec::test::CxlTestDeviceHandle;
43use disk_backend_resources::DelayDiskHandle;
44use disk_backend_resources::DiskLayerDescription;
45use disk_backend_resources::layer::DiskLayerHandle;
46use disk_backend_resources::layer::RamDiskLayerHandle;
47use disk_backend_resources::layer::SqliteAutoCacheDiskLayerHandle;
48use disk_backend_resources::layer::SqliteDiskLayerHandle;
49use floppy_resources::FloppyDiskConfig;
50use framebuffer::FRAMEBUFFER_SIZE;
51use framebuffer::FramebufferAccess;
52use futures::AsyncReadExt;
53use futures::AsyncWrite;
54use futures::StreamExt;
55use futures::executor::block_on;
56use futures::io::AllowStdIo;
57use gdma_resources::GdmaDeviceHandle;
58use gdma_resources::VportDefinition;
59use guid::Guid;
60use input_core::MultiplexedInputHandle;
61use inspect::InspectMut;
62use io::Read;
63use mesh::CancelContext;
64use mesh::CellUpdater;
65use mesh::rpc::RpcSend;
66use meshworker::VmmMesh;
67use net_backend_resources::mac_address::MacAddress;
68use nvme_resources::NvmeControllerRequest;
69use openvmm_defs::config::Config;
70use openvmm_defs::config::DEFAULT_PCAT_BOOT_ORDER;
71use openvmm_defs::config::DeviceVtl;
72use openvmm_defs::config::EfiDiagnosticsLogLevelType;
73use openvmm_defs::config::HypervisorConfig;
74use openvmm_defs::config::LateMapVtl0MemoryPolicy;
75use openvmm_defs::config::LoadMode;
76use openvmm_defs::config::MemoryConfig;
77use openvmm_defs::config::NumaDistance;
78use openvmm_defs::config::NumaNode;
79use openvmm_defs::config::NumaTopology;
80use openvmm_defs::config::PcieDeviceConfig;
81use openvmm_defs::config::PcieMmioRangeConfig;
82use openvmm_defs::config::PcieRootComplexConfig;
83use openvmm_defs::config::PcieRootPortConfig;
84use openvmm_defs::config::PcieSwitchConfig;
85use openvmm_defs::config::ProcessorTopologyConfig;
86use openvmm_defs::config::RootComplexCxlConfig;
87use openvmm_defs::config::SerialInformation;
88use openvmm_defs::config::VirtioBus;
89use openvmm_defs::config::VmbusConfig;
90use openvmm_defs::config::VpAssignment;
91use openvmm_defs::config::VpciDeviceConfig;
92use openvmm_defs::config::Vtl2Config;
93use openvmm_defs::rpc::VmRpc;
94use openvmm_defs::worker::VM_WORKER;
95use openvmm_defs::worker::VmWorkerParameters;
96use openvmm_helpers::disk::OpenDiskOptions;
97use openvmm_helpers::disk::create_disk_type;
98use openvmm_helpers::disk::open_disk_type;
99use pal_async::DefaultDriver;
100use pal_async::DefaultPool;
101use pal_async::socket::PolledSocket;
102use pal_async::task::Spawn;
103use pal_async::task::Task;
104use serial_16550_resources::ComPort;
105use serial_core::resources::DisconnectedSerialBackendHandle;
106use sparse_mmap::alloc_shared_memory;
107use std::cell::RefCell;
108use std::collections::BTreeMap;
109use std::fmt::Write as _;
110use std::future::pending;
111use std::io;
112#[cfg(unix)]
113use std::io::IsTerminal;
114use std::io::Write;
115use std::net::TcpListener;
116use std::path::Path;
117use std::path::PathBuf;
118use std::sync::Arc;
119use std::thread;
120use std::time::Duration;
121use storvsp_resources::ScsiControllerRequest;
122use tpm_resources::TpmDeviceHandle;
123use tpm_resources::TpmRegisterLayout;
124use uidevices_resources::SynthKeyboardHandle;
125use uidevices_resources::SynthMouseHandle;
126use uidevices_resources::SynthVideoHandle;
127use video_core::SharedFramebufferHandle;
128use virtio_resources::VirtioPciDeviceHandle;
129use vm_manifest_builder::BaseChipsetType;
130use vm_manifest_builder::MachineArch;
131use vm_manifest_builder::VmChipsetResult;
132use vm_manifest_builder::VmManifestBuilder;
133use vm_resource::IntoResource;
134use vm_resource::Resource;
135use vm_resource::kind::DiskHandleKind;
136use vm_resource::kind::DiskLayerHandleKind;
137use vm_resource::kind::NetEndpointHandleKind;
138use vm_resource::kind::VirtioDeviceHandle;
139use vm_resource::kind::VmbusDeviceHandleKind;
140use vmbus_serial_resources::VmbusSerialDeviceHandle;
141use vmbus_serial_resources::VmbusSerialPort;
142use vmcore::non_volatile_store::resources::EphemeralNonVolatileStoreHandle;
143use vmgs_resources::GuestStateEncryptionPolicy;
144use vmgs_resources::VmgsDisk;
145use vmgs_resources::VmgsFileHandle;
146use vmgs_resources::VmgsResource;
147use vmotherboard::ChipsetDeviceHandle;
148use vnc_worker_defs::VncParameters;
149
150pub fn openvmm_main() {
151    // Save the current state of the terminal so we can restore it back to
152    // normal before exiting.
153    #[cfg(unix)]
154    let orig_termios = io::stderr().is_terminal().then(term::get_termios);
155
156    let mut pidfile_guard: Option<pidfile::Pidfile> = None;
157    let exit_code = match do_main(&mut pidfile_guard) {
158        Ok(code) => code,
159        Err(err) => {
160            eprintln!("fatal error: {:?}", err);
161            1
162        }
163    };
164
165    // Restore the terminal to its initial state.
166    #[cfg(unix)]
167    if let Some(orig_termios) = orig_termios {
168        term::set_termios(orig_termios);
169    }
170
171    // Clean up the pidfile before terminating, since
172    // pal::process::terminate skips destructors.
173    drop(pidfile_guard);
174
175    // Terminate the process immediately without graceful shutdown of DLLs or
176    // C++ destructors or anything like that. This is all unnecessary and saves
177    // time on Windows.
178    //
179    // Do flush stdout, though, since there may be buffered data.
180    let _ = io::stdout().flush();
181    pal::process::terminate(exit_code);
182}
183
/// Handles and channels produced while building the VM configuration that are
/// returned alongside the `Config` by `vm_config_from_command_line`.
///
/// Every field starts out empty via `Default` and is filled in only when the
/// corresponding feature is configured.
#[derive(Default)]
struct VmResources {
    /// Write half of the anonymous serial pair created for the port configured
    /// as `SerialConfigCli::Console`; `None` when no port uses the console.
    console_in: Option<Box<dyn AsyncWrite + Send + Unpin>>,
    /// Framebuffer access handle, if one was created.
    // NOTE(review): the code that populates this is not visible in this part
    // of the file.
    framebuffer_access: Option<FramebufferAccess>,
    /// Sender for `ShutdownRpc` messages.
    // NOTE(review): presumably connected to the shutdown IC device — confirm.
    shutdown_ic: Option<mesh::Sender<hyperv_ic_resources::shutdown::ShutdownRpc>>,
    /// Sender for `KvpConnectRpc` messages.
    // NOTE(review): presumably connected to the KVP IC device — confirm.
    kvp_ic: Option<mesh::Sender<hyperv_ic_resources::kvp::KvpConnectRpc>>,
    /// Sender for `ScsiControllerRequest` messages.
    scsi_rpc: Option<mesh::Sender<ScsiControllerRequest>>,
    /// Sender for `NvmeControllerRequest` messages.
    // NOTE(review): the name suggests this targets a VTL2 NVMe controller —
    // confirm against the code that sets it.
    nvme_vtl2_rpc: Option<mesh::Sender<NvmeControllerRequest>>,
    /// Sender for `GuestEmulationRequest` messages.
    ged_rpc: Option<mesh::Sender<get_resources::ged::GuestEmulationRequest>>,
    /// VTL2 settings, if any were built.
    vtl2_settings: Option<vtl2_settings_proto::Vtl2Settings>,
    /// Receives dirty rectangles from the synthetic video device for the VNC worker.
    dirty_rect_recv: Option<mesh::Receiver<Vec<video_core::DirtyRect>>>,
    /// Switch ports returned by `new_switch_port`, one per `opt.kernel_vmnic`
    /// entry (Windows only).
    // NOTE(review): presumably stored here to keep the ports alive for the
    // lifetime of the VM — confirm.
    #[cfg(windows)]
    switch_ports: Vec<vmswitch::kernel::SwitchPort>,
}
199
/// Records which serial device has claimed the interactive console.
///
/// `vm_config_from_command_line` stores at most one of these; a second port
/// configured as `SerialConfigCli::Console` is rejected with an error that
/// names `device`.
struct ConsoleState<'a> {
    /// Device name passed to `setup_serial` by the port that owns the console
    /// (for example `"ttyS0"` or `"ttyAMA0"`).
    device: &'a str,
    /// Boxed write half of the serial pair created for the console port.
    input: Box<dyn AsyncWrite + Unpin + Send>,
}
204
205/// Build a flat list of switches with their parent port assignments.
206///
207/// This function converts hierarchical CLI switch definitions into a flat list
208/// where each switch specifies its parent port directly.
209fn build_switch_list(all_switches: &[cli_args::GenericPcieSwitchCli]) -> Vec<PcieSwitchConfig> {
210    all_switches
211        .iter()
212        .map(|switch_cli| PcieSwitchConfig {
213            name: switch_cli.name.clone(),
214            num_downstream_ports: switch_cli.num_downstream_ports,
215            parent_port: switch_cli.port_name.clone(),
216            hotplug: switch_cli.hotplug,
217            acs_capabilities_supported: switch_cli.acs_capabilities_supported,
218        })
219        .collect()
220}
221
222async fn vm_config_from_command_line(
223    spawner: impl Spawn,
224    mesh: &VmmMesh,
225    opt: &Options,
226) -> anyhow::Result<(Config, VmResources)> {
227    let (_, serial_driver) = DefaultPool::spawn_on_thread("serial");
228    // Ensure the serial driver stays alive with no tasks.
229    serial_driver.spawn("leak", pending::<()>()).detach();
230
231    let openhcl_vtl = if opt.vtl2 {
232        DeviceVtl::Vtl2
233    } else {
234        DeviceVtl::Vtl0
235    };
236
237    let console_state: RefCell<Option<ConsoleState<'_>>> = RefCell::new(None);
238    let setup_serial = |name: &str, cli_cfg, device| -> anyhow::Result<_> {
239        Ok(match cli_cfg {
240            SerialConfigCli::Console => {
241                if let Some(console_state) = console_state.borrow().as_ref() {
242                    bail!("console already set by {}", console_state.device);
243                }
244                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
245                let (serial_read, serial_write) = AsyncReadExt::split(serial);
246                *console_state.borrow_mut() = Some(ConsoleState {
247                    device,
248                    input: Box::new(serial_write),
249                });
250                thread::Builder::new()
251                    .name(name.to_owned())
252                    .spawn(move || {
253                        let _ = block_on(futures::io::copy(
254                            serial_read,
255                            &mut AllowStdIo::new(term::raw_stdout()),
256                        ));
257                    })
258                    .unwrap();
259                Some(config)
260            }
261            SerialConfigCli::Stderr => {
262                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
263                thread::Builder::new()
264                    .name(name.to_owned())
265                    .spawn(move || {
266                        let _ = block_on(futures::io::copy(
267                            serial,
268                            &mut AllowStdIo::new(term::raw_stderr()),
269                        ));
270                    })
271                    .unwrap();
272                Some(config)
273            }
274            SerialConfigCli::File(path) => {
275                let (config, serial) = serial_io::anonymous_serial_pair(&serial_driver)?;
276                let file = fs_err::File::create(path).context("failed to create file")?;
277
278                thread::Builder::new()
279                    .name(name.to_owned())
280                    .spawn(move || {
281                        let _ = block_on(futures::io::copy(serial, &mut AllowStdIo::new(file)));
282                    })
283                    .unwrap();
284                Some(config)
285            }
286            SerialConfigCli::None => None,
287            SerialConfigCli::Pipe(path) => {
288                Some(serial_io::bind_serial(&path).context("failed to bind serial")?)
289            }
290            SerialConfigCli::Tcp(addr) => {
291                Some(serial_io::bind_tcp_serial(&addr).context("failed to bind serial")?)
292            }
293            SerialConfigCli::NewConsole(app, window_title) => {
294                let path = console_relay::random_console_path();
295                let config =
296                    serial_io::bind_serial(&path).context("failed to bind console serial")?;
297                let window_title =
298                    window_title.unwrap_or_else(|| name.to_uppercase() + " [OpenVMM]");
299
300                console_relay::launch_console(
301                    app.or_else(openvmm_terminal_app).as_deref(),
302                    &path,
303                    ConsoleLaunchOptions {
304                        window_title: Some(window_title),
305                    },
306                )
307                .context("failed to launch console")?;
308
309                Some(config)
310            }
311        })
312    };
313
314    let mut vmbus_devices = Vec::new();
315
316    let serial0_cfg = setup_serial(
317        "com1",
318        opt.com1.clone().unwrap_or(SerialConfigCli::Console),
319        if cfg!(guest_arch = "x86_64") {
320            "ttyS0"
321        } else {
322            "ttyAMA0"
323        },
324    )?;
325    let serial1_cfg = setup_serial(
326        "com2",
327        opt.com2.clone().unwrap_or(SerialConfigCli::None),
328        if cfg!(guest_arch = "x86_64") {
329            "ttyS1"
330        } else {
331            "ttyAMA1"
332        },
333    )?;
334    let serial2_cfg = setup_serial(
335        "com3",
336        opt.com3.clone().unwrap_or(SerialConfigCli::None),
337        if cfg!(guest_arch = "x86_64") {
338            "ttyS2"
339        } else {
340            "ttyAMA2"
341        },
342    )?;
343    let serial3_cfg = setup_serial(
344        "com4",
345        opt.com4.clone().unwrap_or(SerialConfigCli::None),
346        if cfg!(guest_arch = "x86_64") {
347            "ttyS3"
348        } else {
349            "ttyAMA3"
350        },
351    )?;
352    let with_vmbus_com1_serial = if let Some(vmbus_com1_cfg) = setup_serial(
353        "vmbus_com1",
354        opt.vmbus_com1_serial
355            .clone()
356            .unwrap_or(SerialConfigCli::None),
357        "vmbus_com1",
358    )? {
359        vmbus_devices.push((
360            openhcl_vtl,
361            VmbusSerialDeviceHandle {
362                port: VmbusSerialPort::Com1,
363                backend: vmbus_com1_cfg,
364            }
365            .into_resource(),
366        ));
367        true
368    } else {
369        false
370    };
371    let with_vmbus_com2_serial = if let Some(vmbus_com2_cfg) = setup_serial(
372        "vmbus_com2",
373        opt.vmbus_com2_serial
374            .clone()
375            .unwrap_or(SerialConfigCli::None),
376        "vmbus_com2",
377    )? {
378        vmbus_devices.push((
379            openhcl_vtl,
380            VmbusSerialDeviceHandle {
381                port: VmbusSerialPort::Com2,
382                backend: vmbus_com2_cfg,
383            }
384            .into_resource(),
385        ));
386        true
387    } else {
388        false
389    };
390    let debugcon_cfg = setup_serial(
391        "debugcon",
392        opt.debugcon
393            .clone()
394            .map(|cfg| cfg.serial)
395            .unwrap_or(SerialConfigCli::None),
396        "debugcon",
397    )?;
398
399    let virtio_console_backend = if let Some(serial_cfg) = opt.virtio_console.clone() {
400        setup_serial("virtio-console", serial_cfg, "hvc0")?
401    } else {
402        None
403    };
404
405    let mut resources = VmResources::default();
406    let mut console_str = "";
407    if let Some(ConsoleState { device, input }) = console_state.into_inner() {
408        resources.console_in = Some(input);
409        console_str = device;
410    }
411
412    if opt.shared_memory {
413        tracing::warn!("--shared-memory/-M flag has no effect and will be removed");
414    }
415    if opt.deprecated_prefetch {
416        tracing::warn!("--prefetch is deprecated; use --memory prefetch=on");
417    }
418    if opt.deprecated_private_memory {
419        tracing::warn!("--private-memory is deprecated; use --memory shared=off");
420    }
421    if opt.deprecated_thp {
422        tracing::warn!("--thp is deprecated; use --memory shared=off,thp=on");
423    }
424    if opt.deprecated_memory_backing_file.is_some() {
425        tracing::warn!("--memory-backing-file is deprecated; use --memory file=<path>");
426    }
427
428    opt.validate_memory_options()?;
429
430    const MAX_PROCESSOR_COUNT: u32 = 1024;
431
432    if opt.processors == 0 || opt.processors > MAX_PROCESSOR_COUNT {
433        bail!("invalid proc count: {}", opt.processors);
434    }
435
436    // Total SCSI channel count should not exceed the processor count
437    // (at most, one channel per VP).
438    if opt.scsi_sub_channels > (MAX_PROCESSOR_COUNT - 1) as u16 {
439        bail!(
440            "invalid SCSI sub-channel count: requested {}, max {}",
441            opt.scsi_sub_channels,
442            MAX_PROCESSOR_COUNT - 1
443        );
444    }
445
446    let with_get = opt.get || (opt.vtl2 && !opt.no_get);
447
448    let mut storage = storage_builder::StorageBuilder::new(with_get.then_some(openhcl_vtl));
449
450    // Register named controllers first, so that --disk on=<name>
451    // references can be resolved.
452    for ctrl in &opt.nvme_pci {
453        let transport = match &ctrl.transport {
454            cli_args::NvmeControllerTransport::Pcie(port) => {
455                storage_builder::NvmeControllerTransport::Pcie(port.clone())
456            }
457            cli_args::NvmeControllerTransport::Vpci(guid) => {
458                let guid = guid.unwrap_or_else(|| storage_builder::deterministic_guid(&ctrl.id));
459                storage_builder::NvmeControllerTransport::Vpci(guid)
460            }
461        };
462        storage.add_nvme_controller(ctrl.id.clone(), ctrl.vtl, transport, None)?;
463    }
464
465    for ctrl in &opt.vmbus_scsi {
466        let instance_id = storage_builder::deterministic_guid(&ctrl.id);
467        storage.add_scsi_controller(ctrl.id.clone(), ctrl.vtl, instance_id, ctrl.sub_channels)?;
468    }
469
470    for ctrl in &opt.openhcl_controller {
471        let controller_type = match ctrl.controller_type {
472            cli_args::OpenhclControllerType::Scsi => storage_builder::OpenhclControllerType::Scsi,
473            cli_args::OpenhclControllerType::Nvme => storage_builder::OpenhclControllerType::Nvme,
474        };
475        let instance_id = ctrl
476            .guid
477            .unwrap_or_else(|| storage_builder::deterministic_guid(&ctrl.id));
478        storage.add_openhcl_controller(ctrl.id.clone(), controller_type, instance_id)?;
479    }
480
481    for &cli_args::DiskCli {
482        vtl,
483        ref kind,
484        read_only,
485        is_dvd,
486        underhill,
487        ref pcie_port,
488        ref controller,
489        nsid,
490        lun,
491        ref relay,
492    } in &opt.disk
493    {
494        if controller.is_none() && underhill.is_none() && relay.is_none() {
495            tracing::warn!(
496                "--disk without `on` is deprecated; \
497                 use --vmbus-scsi and --disk on=<name> instead"
498            );
499        }
500
501        let relay_target = relay
502            .as_ref()
503            .map(|(name, loc)| storage_builder::RelayTarget {
504                controller: name.clone(),
505                location: *loc,
506            });
507
508        let target = if let Some(name) = controller {
509            if pcie_port.is_some() {
510                anyhow::bail!("`on` is incompatible with `pcie_port` on `--disk`");
511            }
512            storage_builder::DiskLocation::Named {
513                controller: name.clone(),
514                nsid,
515                lun,
516            }
517        } else if pcie_port.is_some() {
518            anyhow::bail!("`--disk` is incompatible with `pcie_port` without `controller`");
519        } else {
520            if opt.no_vmbus {
521                anyhow::bail!(
522                    "`--disk` without `on=` attaches to the default VMBus SCSI controller and \
523                     cannot be used with `--no-vmbus`; use `on=<name>` to attach to a named controller"
524                );
525            }
526            storage_builder::DiskLocation::Scsi(None)
527        };
528
529        storage
530            .add(
531                vtl,
532                underhill,
533                relay_target,
534                target,
535                kind,
536                is_dvd,
537                read_only,
538            )
539            .await?;
540    }
541
542    for &cli_args::IdeDiskCli {
543        ref kind,
544        read_only,
545        channel,
546        device,
547        is_dvd,
548    } in &opt.ide
549    {
550        storage
551            .add(
552                DeviceVtl::Vtl0,
553                None,
554                None,
555                storage_builder::DiskLocation::Ide(channel, device),
556                kind,
557                is_dvd,
558                read_only,
559            )
560            .await?;
561    }
562
563    if !opt.nvme.is_empty() {
564        tracing::warn!("--nvme is deprecated; use --nvme-pci and --disk on=<name> instead");
565
566        // Pre-register implicit PCIe controllers for unique port names.
567        let mut registered_ports = std::collections::BTreeSet::new();
568        for disk in &opt.nvme {
569            if let Some(port) = &disk.pcie_port {
570                if registered_ports.insert(port.clone()) {
571                    storage.add_nvme_controller(
572                        port.clone(),
573                        DeviceVtl::Vtl0,
574                        storage_builder::NvmeControllerTransport::Pcie(port.clone()),
575                        None,
576                    ).with_context(|| format!(
577                        "legacy --nvme flag conflicts with an explicit controller named '{port}'; \
578                         use --nvme-pci and --disk on=<name> instead"
579                    ))?;
580                }
581            }
582        }
583    }
584
585    for &cli_args::DiskCli {
586        vtl,
587        ref kind,
588        read_only,
589        is_dvd,
590        underhill,
591        ref pcie_port,
592        controller: _,
593        nsid: _,
594        lun: _,
595        relay: _,
596    } in &opt.nvme
597    {
598        let target = if let Some(port) = pcie_port {
599            storage_builder::DiskLocation::Named {
600                controller: port.clone(),
601                nsid: None,
602                lun: None,
603            }
604        } else {
605            storage_builder::DiskLocation::Nvme(None)
606        };
607        storage
608            .add(vtl, underhill, None, target, kind, is_dvd, read_only)
609            .await?;
610    }
611
612    for &cli_args::DiskCli {
613        vtl,
614        ref kind,
615        read_only,
616        is_dvd,
617        ref underhill,
618        ref pcie_port,
619        controller: _,
620        nsid: _,
621        lun: _,
622        relay: _,
623    } in &opt.virtio_blk
624    {
625        if underhill.is_some() {
626            anyhow::bail!("underhill not supported with virtio-blk");
627        }
628        storage
629            .add(
630                vtl,
631                None,
632                None,
633                storage_builder::DiskLocation::VirtioBlk(pcie_port.clone()),
634                kind,
635                is_dvd,
636                read_only,
637            )
638            .await?;
639    }
640
641    let mut floppy_disks = Vec::new();
642    for disk in &opt.floppy {
643        let &cli_args::FloppyDiskCli {
644            ref kind,
645            read_only,
646        } = disk;
647        floppy_disks.push(FloppyDiskConfig {
648            disk_type: disk_open(kind, read_only).await?,
649            read_only,
650        });
651    }
652
653    let mut vpci_mana_nics = [(); 3].map(|()| None);
654    let mut pcie_mana_nics = BTreeMap::<String, GdmaDeviceHandle>::new();
655    let mut underhill_nics = Vec::new();
656    let mut vpci_devices = Vec::new();
657
658    let mut nic_index = 0;
659    for cli_cfg in &opt.net {
660        if cli_cfg.pcie_port.is_some() {
661            anyhow::bail!("`--net` does not support PCIe");
662        }
663        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
664        if cli_cfg.underhill {
665            if !opt.no_alias_map {
666                anyhow::bail!("must specify --no-alias-map to offer NICs to VTL2");
667            }
668            let mana = vpci_mana_nics[openhcl_vtl as usize].get_or_insert_with(|| {
669                let vpci_instance_id = Guid::new_random();
670                underhill_nics.push(vtl2_settings_proto::NicDeviceLegacy {
671                    instance_id: vpci_instance_id.to_string(),
672                    subordinate_instance_id: None,
673                    max_sub_channels: None,
674                });
675                (vpci_instance_id, GdmaDeviceHandle { vports: Vec::new() })
676            });
677            mana.1.vports.push(VportDefinition {
678                mac_address: vport.mac_address,
679                endpoint: vport.endpoint,
680            });
681        } else {
682            vmbus_devices.push(vport.into_netvsp_handle());
683        }
684    }
685
686    if opt.nic {
687        let nic_config = parse_endpoint(
688            &NicConfigCli {
689                vtl: DeviceVtl::Vtl0,
690                endpoint: EndpointConfigCli::Consomme {
691                    cidr: None,
692                    host_fwd: Vec::new(),
693                },
694                max_queues: None,
695                underhill: false,
696                pcie_port: None,
697            },
698            &mut nic_index,
699            &mut resources,
700        )?;
701        vmbus_devices.push(nic_config.into_netvsp_handle());
702    }
703
704    // Build initial PCIe devices list from CLI options. Storage devices
705    // (e.g., NVMe controllers on PCIe ports) are added later by storage_builder.
706    let mut pcie_devices = Vec::new();
707    for (index, cli_cfg) in opt.pcie_remote.iter().enumerate() {
708        tracing::info!(
709            port_name = %cli_cfg.port_name,
710            socket_addr = ?cli_cfg.socket_addr,
711            "instantiating PCIe remote device"
712        );
713
714        // Generate a deterministic instance ID based on index
715        const PCIE_REMOTE_BASE_INSTANCE_ID: Guid =
716            guid::guid!("28ed784d-c059-429f-9d9a-46bea02562c0");
717        let instance_id = Guid {
718            data1: index as u32,
719            ..PCIE_REMOTE_BASE_INSTANCE_ID
720        };
721
722        pcie_devices.push(PcieDeviceConfig {
723            port_name: cli_cfg.port_name.clone(),
724            resource: pcie_remote_resources::PcieRemoteHandle {
725                instance_id,
726                socket_addr: cli_cfg.socket_addr.clone(),
727                hu: cli_cfg.hu,
728                controller: cli_cfg.controller,
729            }
730            .into_resource(),
731        });
732    }
733
734    #[cfg(windows)]
735    let mut kernel_vmnics = Vec::new();
736    #[cfg(windows)]
737    for (index, switch_id) in opt.kernel_vmnic.iter().enumerate() {
738        // Pick a random MAC address.
739        let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
740        getrandom::fill(&mut mac_address[3..]).expect("rng failure");
741
742        // Pick a fixed instance ID based on the index.
743        const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-435d-11ee-9f59-00155d5016fc");
744        let instance_id = Guid {
745            data1: index as u32,
746            ..BASE_INSTANCE_ID
747        };
748
749        let switch_id = if switch_id == "default" {
750            None
751        } else {
752            Some(switch_id.as_str())
753        };
754        let (port_id, port) = new_switch_port(switch_id)?;
755        resources.switch_ports.push(port);
756
757        kernel_vmnics.push(openvmm_defs::config::KernelVmNicConfig {
758            instance_id,
759            mac_address: mac_address.into(),
760            switch_port_id: port_id,
761        });
762    }
763
764    for vport in &opt.mana {
765        let vport = parse_endpoint(vport, &mut nic_index, &mut resources)?;
766        let vport_array = match (vport.vtl as usize, vport.pcie_port) {
767            (vtl, None) => {
768                &mut vpci_mana_nics[vtl]
769                    .get_or_insert_with(|| {
770                        (Guid::new_random(), GdmaDeviceHandle { vports: Vec::new() })
771                    })
772                    .1
773                    .vports
774            }
775            (0, Some(pcie_port)) => {
776                &mut pcie_mana_nics
777                    .entry(pcie_port)
778                    .or_insert(GdmaDeviceHandle { vports: Vec::new() })
779                    .vports
780            }
781            _ => anyhow::bail!("PCIe NICs only supported to VTL0"),
782        };
783        vport_array.push(VportDefinition {
784            mac_address: vport.mac_address,
785            endpoint: vport.endpoint,
786        });
787    }
788
789    vpci_devices.extend(
790        vpci_mana_nics
791            .into_iter()
792            .enumerate()
793            .filter_map(|(vtl, nic)| {
794                nic.map(|(instance_id, handle)| VpciDeviceConfig {
795                    vtl: match vtl {
796                        0 => DeviceVtl::Vtl0,
797                        1 => DeviceVtl::Vtl1,
798                        2 => DeviceVtl::Vtl2,
799                        _ => unreachable!(),
800                    },
801                    instance_id,
802                    resource: handle.into_resource(),
803                    vnode: None,
804                })
805            }),
806    );
807
808    pcie_devices.extend(
809        pcie_mana_nics
810            .into_iter()
811            .map(|(pcie_port, handle)| PcieDeviceConfig {
812                port_name: pcie_port,
813                resource: handle.into_resource(),
814            }),
815    );
816
817    for cxl_test in &opt.cxl_test {
818        pcie_devices.push(PcieDeviceConfig {
819            port_name: cxl_test.pcie_port.clone(),
820            resource: CxlTestDeviceHandle {
821                hdm_size_bytes: cxl_test.hdm_size,
822            }
823            .into_resource(),
824        });
825    }
826
827    #[cfg(guest_arch = "aarch64")]
828    let arch = MachineArch::Aarch64;
829    #[cfg(guest_arch = "x86_64")]
830    let arch = MachineArch::X86_64;
831
832    let mut pcie_root_complexes = Vec::new();
833    for (i, rc_cli) in opt.pcie_root_complex.iter().enumerate() {
834        let ports: Vec<PcieRootPortConfig> = opt
835            .pcie_root_port
836            .iter()
837            .filter(|port_cli| port_cli.root_complex_name == rc_cli.name)
838            .map(|port_cli| PcieRootPortConfig {
839                name: port_cli.name.clone(),
840                hotplug: port_cli.hotplug,
841                acs_capabilities_supported: port_cli.acs_capabilities_supported,
842                cxl: port_cli.cxl,
843            })
844            .collect();
845
846        const ONE_MB: u64 = 1024 * 1024;
847        // Keep all PCI windows 1MB-granular to match layout and downstream placement rules.
848        let low_mmio_size = (rc_cli.low_mmio as u64).next_multiple_of(ONE_MB);
849        let high_mmio_size = rc_cli
850            .high_mmio
851            .checked_next_multiple_of(ONE_MB)
852            .context("high mmio rounding error")?;
853
854        // Count CXL-capable ports under the root bus. If the root bus has CXL root ports, it needs CHBCR.
855        let cxl_port_count = ports.iter().filter(|port| port.cxl).count() as u64;
856
857        let cxl = if cxl_port_count != 0 {
858            Some(RootComplexCxlConfig {
859                hdm_size: rc_cli.hdm,
860                hdm_window_restrictions: rc_cli.hdm_window_restrictions.bits(),
861            })
862        } else {
863            None
864        };
865        pcie_root_complexes.push(PcieRootComplexConfig {
866            index: i as u32,
867            name: rc_cli.name.clone(),
868            segment: rc_cli.segment,
869            start_bus: rc_cli.start_bus,
870            end_bus: rc_cli.end_bus,
871            low_mmio: if let Some(base) = rc_cli.low_mmio_base {
872                PcieMmioRangeConfig::Fixed(
873                    memory_range::MemoryRange::try_new(base..base.wrapping_add(low_mmio_size))
874                        .context("invalid low MMIO range")?,
875                )
876            } else {
877                PcieMmioRangeConfig::Dynamic {
878                    size: low_mmio_size,
879                }
880            },
881            high_mmio: if let Some(base) = rc_cli.high_mmio_base {
882                PcieMmioRangeConfig::Fixed(
883                    memory_range::MemoryRange::try_new(base..base.wrapping_add(high_mmio_size))
884                        .context("invalid high MMIO range")?,
885                )
886            } else {
887                PcieMmioRangeConfig::Dynamic {
888                    size: high_mmio_size,
889                }
890            },
891            cxl,
892            ports,
893            #[cfg(guest_arch = "aarch64")]
894            iommu: opt
895                .smmu
896                .iter()
897                .any(|s| s == &rc_cli.name)
898                .then_some(openvmm_defs::config::PcieIommuConfig::Smmu),
899            #[cfg(guest_arch = "x86_64")]
900            iommu: opt
901                .amd_iommu
902                .iter()
903                .any(|s| s == &rc_cli.name)
904                .then_some(openvmm_defs::config::PcieIommuConfig::AmdVi),
905            vnode: rc_cli.vnode,
906            preserve_bars: rc_cli.preserve_bars,
907        });
908    }
909
910    // Validate that all --smmu / --amd-iommu names refer to known root complexes.
911    #[cfg(guest_arch = "aarch64")]
912    for name in &opt.smmu {
913        anyhow::ensure!(
914            pcie_root_complexes.iter().any(|rc| rc.name == *name),
915            "--smmu refers to unknown root complex '{name}'"
916        );
917    }
918    #[cfg(guest_arch = "x86_64")]
919    for name in &opt.amd_iommu {
920        anyhow::ensure!(
921            pcie_root_complexes.iter().any(|rc| rc.name == *name),
922            "--amd-iommu refers to unknown root complex '{name}'"
923        );
924    }
925
926    let pcie_switches = build_switch_list(&opt.pcie_switch);
927    let pcie_generic_initiators = opt
928        .pcie_generic_initiator
929        .iter()
930        .map(|gi| openvmm_defs::config::PcieGenericInitiatorConfig {
931            port_name: gi.port_name.clone(),
932            node: gi.node,
933        })
934        .collect();
935    #[cfg(target_os = "linux")]
936    let vfio_pcie_devices: Vec<PcieDeviceConfig> = {
937        use std::collections::HashMap;
938        use vm_resource::IntoResource;
939
940        // Process --iommu flags: open /dev/iommu for each declared context.
941        let mut iommu_map: HashMap<String, std::fs::File> = HashMap::new();
942        for iommu_cli in &opt.iommu {
943            anyhow::ensure!(
944                !iommu_map.contains_key(&iommu_cli.id),
945                "duplicate --iommu id={}",
946                iommu_cli.id
947            );
948            let file = std::fs::OpenOptions::new()
949                .read(true)
950                .write(true)
951                .open("/dev/iommu")
952                .context("failed to open /dev/iommu (is iommufd available?)")?;
953            iommu_map.insert(iommu_cli.id.clone(), file);
954        }
955
956        opt.vfio
957            .iter()
958            .map(|cli_cfg| {
959                let sysfs_path = Path::new("/sys/bus/pci/devices").join(&cli_cfg.pci_id);
960
961                if let Some(iommu_id) = &cli_cfg.iommu {
962                    // cdev + iommufd path
963                    let iommufd = iommu_map.get(iommu_id).with_context(|| {
964                        format!(
965                            "--vfio device {} references iommu={iommu_id}, \
966                             but no --iommu id={iommu_id} was specified",
967                            cli_cfg.pci_id
968                        )
969                    })?;
970                    // Clone the iommufd fd so the per-iommu manager can own it.
971                    // The first device for a given iommu ID uses the cloned fd
972                    // to create the IoasManager; subsequent devices reuse the
973                    // existing manager and the cloned fd is dropped.
974                    let iommufd = iommufd.try_clone().with_context(|| {
975                        format!("failed to dup iommufd fd for iommu={iommu_id}")
976                    })?;
977
978                    // Open the cdev device node.
979                    let vfio_dev_dir = sysfs_path.join("vfio-dev");
980                    let entry = std::fs::read_dir(&vfio_dev_dir)
981                        .with_context(|| {
982                            format!(
983                                "failed to read {}: is {} bound to vfio-pci?",
984                                vfio_dev_dir.display(),
985                                cli_cfg.pci_id
986                            )
987                        })?
988                        .next()
989                        .context("no vfio-dev entry found")?
990                        .context("failed to read vfio-dev entry")?;
991                    let dev_path = Path::new("/dev/vfio/devices").join(entry.file_name());
992                    let cdev = std::fs::OpenOptions::new()
993                        .read(true)
994                        .write(true)
995                        .open(&dev_path)
996                        .with_context(|| format!("failed to open {}", dev_path.display()))?;
997
998                    Ok(PcieDeviceConfig {
999                        port_name: cli_cfg.port_name.clone(),
1000                        resource: vfio_assigned_device_resources::VfioCdevDeviceHandle {
1001                            pci_id: cli_cfg.pci_id.clone(),
1002                            cdev,
1003                            iommufd,
1004                            iommu_id: iommu_id.clone(),
1005                            bar_pt: cli_cfg.bar_pt,
1006                        }
1007                        .into_resource(),
1008                    })
1009                } else {
1010                    // Legacy group/container path
1011                    let iommu_group_link = std::fs::read_link(sysfs_path.join("iommu_group"))
1012                        .with_context(|| {
1013                            format!("failed to read IOMMU group for {}", cli_cfg.pci_id)
1014                        })?;
1015                    let group_id: u64 = iommu_group_link
1016                        .file_name()
1017                        .and_then(|s| s.to_str())
1018                        .context("invalid iommu_group symlink")?
1019                        .parse()
1020                        .context("failed to parse IOMMU group ID")?;
1021                    let group = std::fs::OpenOptions::new()
1022                        .read(true)
1023                        .write(true)
1024                        .open(format!("/dev/vfio/{group_id}"))
1025                        .with_context(|| format!("failed to open /dev/vfio/{group_id}"))?;
1026
1027                    Ok(PcieDeviceConfig {
1028                        port_name: cli_cfg.port_name.clone(),
1029                        resource: vfio_assigned_device_resources::VfioDeviceHandle {
1030                            pci_id: cli_cfg.pci_id.clone(),
1031                            group,
1032                            bar_pt: cli_cfg.bar_pt,
1033                        }
1034                        .into_resource(),
1035                    })
1036                }
1037            })
1038            .collect::<anyhow::Result<Vec<_>>>()?
1039    };
1040
1041    #[cfg(windows)]
1042    let vpci_resources: Vec<_> = opt
1043        .device
1044        .iter()
1045        .map(|path| -> anyhow::Result<_> {
1046            Ok(virt_whp::device::DeviceHandle(
1047                whp::VpciResource::new(
1048                    None,
1049                    Default::default(),
1050                    &whp::VpciResourceDescriptor::Sriov(path, 0, 0),
1051                )
1052                .with_context(|| format!("opening PCI device {}", path))?,
1053            ))
1054        })
1055        .collect::<Result<_, _>>()?;
1056
1057    // Create a vmbusproxy handle if needed by any devices.
1058    #[cfg(windows)]
1059    let vmbusproxy_handle = if !kernel_vmnics.is_empty() {
1060        Some(vmbus_proxy::ProxyHandle::new().context("failed to open vmbusproxy handle")?)
1061    } else {
1062        None
1063    };
1064
1065    let framebuffer = if opt.gfx || opt.vtl2_gfx || opt.vnc.vnc || opt.pcat {
1066        let vram = alloc_shared_memory(FRAMEBUFFER_SIZE, "vram")?;
1067        let (fb, fba) =
1068            framebuffer::framebuffer(vram, FRAMEBUFFER_SIZE, 0).context("creating framebuffer")?;
1069        resources.framebuffer_access = Some(fba);
1070        Some(fb)
1071    } else {
1072        None
1073    };
1074
1075    let load_mode;
1076    let with_hv;
1077
1078    let any_serial_configured = serial0_cfg.is_some()
1079        || serial1_cfg.is_some()
1080        || serial2_cfg.is_some()
1081        || serial3_cfg.is_some();
1082
1083    let has_com3 = serial2_cfg.is_some();
1084
1085    let mut chipset = VmManifestBuilder::new(
1086        if opt.igvm.is_some() {
1087            BaseChipsetType::HclHost
1088        } else if opt.pcat {
1089            BaseChipsetType::HypervGen1
1090        } else if opt.uefi {
1091            BaseChipsetType::HypervGen2Uefi
1092        } else if opt.hv {
1093            BaseChipsetType::HyperVGen2LinuxDirect
1094        } else {
1095            BaseChipsetType::UnenlightenedLinuxDirect
1096        },
1097        arch,
1098    );
1099
1100    if framebuffer.is_some() {
1101        chipset = chipset.with_framebuffer();
1102    }
1103    if opt.guest_watchdog {
1104        chipset = chipset.with_guest_watchdog();
1105    }
1106    if any_serial_configured {
1107        chipset = chipset.with_serial([serial0_cfg, serial1_cfg, serial2_cfg, serial3_cfg]);
1108    }
1109    if opt.battery {
1110        let (tx, rx) = mesh::channel();
1111        tx.send(HostBatteryUpdate::default_present());
1112        chipset = chipset.with_battery(rx);
1113    }
1114    if opt.no_vmbus {
1115        chipset = chipset.without_vmbus();
1116    }
1117    if let Some(cfg) = &opt.debugcon {
1118        chipset = chipset.with_debugcon(
1119            debugcon_cfg.unwrap_or_else(|| DisconnectedSerialBackendHandle.into_resource()),
1120            cfg.port,
1121        );
1122    }
1123
1124    let custom_uefi_vars = {
1125        use firmware_uefi_custom_vars::CustomVars;
1126
1127        // load base vars from specified template, or use an empty set of base
1128        // vars if none was specified.
1129        let base_vars = match opt.secure_boot_template {
1130            Some(template) => match (arch, template) {
1131                (MachineArch::X86_64, SecureBootTemplateCli::Windows) => {
1132                    hyperv_secure_boot_templates::x64::microsoft_windows()
1133                }
1134                (MachineArch::X86_64, SecureBootTemplateCli::UefiCa) => {
1135                    hyperv_secure_boot_templates::x64::microsoft_uefi_ca()
1136                }
1137                (MachineArch::Aarch64, SecureBootTemplateCli::Windows) => {
1138                    hyperv_secure_boot_templates::aarch64::microsoft_windows()
1139                }
1140                (MachineArch::Aarch64, SecureBootTemplateCli::UefiCa) => {
1141                    hyperv_secure_boot_templates::aarch64::microsoft_uefi_ca()
1142                }
1143            },
1144            None => CustomVars::default(),
1145        };
1146
1147        // TODO: fallback to VMGS read if no command line flag was given
1148
1149        let custom_uefi_json_data = match &opt.custom_uefi_json {
1150            Some(file) => Some(fs_err::read(file).context("opening custom uefi json file")?),
1151            None => None,
1152        };
1153
1154        // obtain the final custom uefi vars by applying the delta onto the base vars
1155        match custom_uefi_json_data {
1156            Some(data) => {
1157                let delta = hyperv_uefi_custom_vars_json::load_delta_from_json(&data)?;
1158                base_vars.apply_delta(delta)?
1159            }
1160            None => base_vars,
1161        }
1162    };
1163
1164    let efi_diagnostics_log_level = match opt.efi_diagnostics_log_level.unwrap_or_default() {
1165        EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
1166        EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
1167        EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
1168    };
1169
1170    if opt.uefi {
1171        let log_level = match efi_diagnostics_log_level {
1172            EfiDiagnosticsLogLevelType::Default => {
1173                firmware_uefi_resources::LogLevel::make_default()
1174            }
1175            EfiDiagnosticsLogLevelType::Info => firmware_uefi_resources::LogLevel::make_info(),
1176            EfiDiagnosticsLogLevelType::Full => firmware_uefi_resources::LogLevel::make_full(),
1177        };
1178        let nvram_storage = if opt.vmgs.is_some() {
1179            VmgsFileHandle::new(vmgs_format::FileId::BIOS_NVRAM, true).into_resource()
1180        } else {
1181            EphemeralNonVolatileStoreHandle.into_resource()
1182        };
1183        chipset = chipset.with_uefi(vm_manifest_builder::UefiManifest::new(
1184            arch,
1185            custom_uefi_vars.clone(),
1186            opt.secure_boot,
1187            log_level,
1188            None,
1189            nvram_storage,
1190            None,
1191        ));
1192    }
1193
1194    // TODO: load from VMGS file if it exists
1195    let bios_guid = Guid::new_random();
1196
1197    let layout_config = chipset.layout_config();
1198    let VmChipsetResult {
1199        chipset,
1200        mut chipset_devices,
1201        pci_chipset_devices,
1202        isa_dma_controller,
1203        capabilities,
1204    } = chipset
1205        .build()
1206        .context("failed to build chipset configuration")?;
1207
1208    if opt.restore_snapshot.is_some() {
1209        // Snapshot restore: skip firmware loading entirely. Device state and
1210        // memory come from the snapshot directory.
1211        load_mode = LoadMode::None;
1212        with_hv = true;
1213    } else if let Some(path) = &opt.igvm {
1214        let file = fs_err::File::open(path)
1215            .context("failed to open igvm file")?
1216            .into();
1217        let cmdline = opt.cmdline.join(" ");
1218        with_hv = true;
1219
1220        load_mode = LoadMode::Igvm {
1221            file,
1222            cmdline,
1223            vtl2_base_address: opt.igvm_vtl2_relocation_type,
1224            com_serial: has_com3.then(|| SerialInformation {
1225                io_port: ComPort::Com3.io_port(),
1226                irq: ComPort::Com3.irq().into(),
1227            }),
1228        };
1229    } else if opt.pcat {
1230        // Emit a nice error early instead of complaining about missing firmware.
1231        if arch != MachineArch::X86_64 {
1232            anyhow::bail!("pcat not supported on this architecture");
1233        }
1234        with_hv = true;
1235
1236        let firmware = openvmm_pcat_locator::find_pcat_bios(opt.pcat_firmware.as_deref())?;
1237        load_mode = LoadMode::Pcat {
1238            firmware,
1239            boot_order: opt
1240                .pcat_boot_order
1241                .map(|x| x.0)
1242                .unwrap_or(DEFAULT_PCAT_BOOT_ORDER),
1243        };
1244    } else if opt.uefi {
1245        use openvmm_defs::config::UefiConsoleMode;
1246
1247        with_hv = true;
1248
1249        let firmware = fs_err::File::open(
1250            (opt.uefi_firmware.0)
1251                .as_ref()
1252                .context("must provide uefi firmware when booting with uefi")?,
1253        )
1254        .context("failed to open uefi firmware")?;
1255
1256        // TODO: It would be better to default memory protections to on, but currently Linux does not boot via UEFI due to what
1257        //       appears to be a GRUB memory protection fault. Memory protections are therefore only enabled if configured.
1258        load_mode = LoadMode::Uefi {
1259            firmware: firmware.into(),
1260            enable_debugging: opt.uefi_debug,
1261            enable_memory_protections: opt.uefi_enable_memory_protections,
1262            disable_frontpage: opt.disable_frontpage,
1263            enable_tpm: opt.tpm,
1264            enable_battery: opt.battery,
1265            enable_serial: any_serial_configured,
1266            enable_vpci_boot: false,
1267            uefi_console_mode: opt.uefi_console_mode.map(|m| match m {
1268                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
1269                UefiConsoleModeCli::Com1 => UefiConsoleMode::Com1,
1270                UefiConsoleModeCli::Com2 => UefiConsoleMode::Com2,
1271                UefiConsoleModeCli::None => UefiConsoleMode::None,
1272            }),
1273            default_boot_always_attempt: opt.default_boot_always_attempt,
1274            bios_guid,
1275            enable_vmbus: !opt.no_vmbus,
1276            force_dma_bounce: opt.uefi_force_dma_bounce,
1277        };
1278    } else {
1279        // Linux Direct
1280        let mut cmdline = "panic=-1 debug".to_string();
1281
1282        with_hv = opt.hv;
1283        if with_hv && opt.pcie_root_complex.is_empty() {
1284            cmdline += " pci=off";
1285        }
1286
1287        if !console_str.is_empty() {
1288            let _ = write!(&mut cmdline, " console={}", console_str);
1289        }
1290
1291        if opt.gfx {
1292            cmdline += " console=tty";
1293        }
1294        for extra in &opt.cmdline {
1295            let _ = write!(&mut cmdline, " {}", extra);
1296        }
1297
1298        let kernel = fs_err::File::open(
1299            (opt.kernel.0)
1300                .as_ref()
1301                .context("must provide kernel when booting with linux direct")?,
1302        )
1303        .context("failed to open kernel")?;
1304        let initrd = (opt.initrd.0)
1305            .as_ref()
1306            .map(fs_err::File::open)
1307            .transpose()
1308            .context("failed to open initrd")?;
1309
1310        let custom_dsdt = match &opt.custom_dsdt {
1311            Some(path) => {
1312                let mut v = Vec::new();
1313                fs_err::File::open(path)
1314                    .context("failed to open custom dsdt")?
1315                    .read_to_end(&mut v)
1316                    .context("failed to read custom dsdt")?;
1317                Some(v)
1318            }
1319            None => None,
1320        };
1321
1322        load_mode = LoadMode::Linux {
1323            kernel: kernel.into(),
1324            initrd: initrd.map(Into::into),
1325            cmdline,
1326            custom_dsdt,
1327            enable_serial: any_serial_configured,
1328            boot_mode: if opt.device_tree {
1329                openvmm_defs::config::LinuxDirectBootMode::DeviceTree
1330            } else {
1331                openvmm_defs::config::LinuxDirectBootMode::Acpi
1332            },
1333        };
1334    }
1335
1336    let mut vmgs = Some(if let Some(VmgsCli { kind, provision }) = &opt.vmgs {
1337        let disk = VmgsDisk {
1338            disk: disk_open(kind, false)
1339                .await
1340                .context("failed to open vmgs disk")?,
1341            encryption_policy: if opt.test_gsp_by_id {
1342                GuestStateEncryptionPolicy::GspById(true)
1343            } else {
1344                GuestStateEncryptionPolicy::None(true)
1345            },
1346        };
1347        match provision {
1348            ProvisionVmgs::OnEmpty => VmgsResource::Disk(disk),
1349            ProvisionVmgs::OnFailure => VmgsResource::ReprovisionOnFailure(disk),
1350            ProvisionVmgs::True => VmgsResource::Reprovision(disk),
1351        }
1352    } else {
1353        VmgsResource::Ephemeral
1354    });
1355
1356    if with_get && with_hv {
1357        let has_vtl0_nvme = storage.has_vtl0_nvme();
1358        let vtl2_settings = vtl2_settings_proto::Vtl2Settings {
1359            version: vtl2_settings_proto::vtl2_settings_base::Version::V1.into(),
1360            fixed: Some(Default::default()),
1361            dynamic: Some(vtl2_settings_proto::Vtl2SettingsDynamic {
1362                storage_controllers: storage.build_openhcl_settings(opt.vmbus_redirect),
1363                nic_devices: underhill_nics,
1364            }),
1365            namespace_settings: Vec::default(),
1366        };
1367
1368        // Cache the VTL2 settings for later modification via the interactive console.
1369        resources.vtl2_settings = Some(vtl2_settings.clone());
1370
1371        let (send, guest_request_recv) = mesh::channel();
1372        resources.ged_rpc = Some(send);
1373
1374        let vmgs = vmgs.take().unwrap();
1375
1376        vmbus_devices.extend([
1377            (
1378                openhcl_vtl,
1379                get_resources::gel::GuestEmulationLogHandle.into_resource(),
1380            ),
1381            (
1382                openhcl_vtl,
1383                get_resources::ged::GuestEmulationDeviceHandle {
1384                    firmware: if opt.pcat {
1385                        get_resources::ged::GuestFirmwareConfig::Pcat {
1386                            boot_order: opt
1387                                .pcat_boot_order
1388                                .map_or(DEFAULT_PCAT_BOOT_ORDER, |x| x.0)
1389                                .map(|x| match x {
1390                                    openvmm_defs::config::PcatBootDevice::Floppy => {
1391                                        get_resources::ged::PcatBootDevice::Floppy
1392                                    }
1393                                    openvmm_defs::config::PcatBootDevice::HardDrive => {
1394                                        get_resources::ged::PcatBootDevice::HardDrive
1395                                    }
1396                                    openvmm_defs::config::PcatBootDevice::Optical => {
1397                                        get_resources::ged::PcatBootDevice::Optical
1398                                    }
1399                                    openvmm_defs::config::PcatBootDevice::Network => {
1400                                        get_resources::ged::PcatBootDevice::Network
1401                                    }
1402                                }),
1403                        }
1404                    } else {
1405                        use get_resources::ged::UefiConsoleMode;
1406
1407                        get_resources::ged::GuestFirmwareConfig::Uefi {
1408                            enable_vpci_boot: has_vtl0_nvme,
1409                            firmware_debug: opt.uefi_debug,
1410                            disable_frontpage: opt.disable_frontpage,
1411                            console_mode: match opt.uefi_console_mode.unwrap_or(UefiConsoleModeCli::Default) {
1412                                UefiConsoleModeCli::Default => UefiConsoleMode::Default,
1413                                UefiConsoleModeCli::Com1 => UefiConsoleMode::COM1,
1414                                UefiConsoleModeCli::Com2 => UefiConsoleMode::COM2,
1415                                UefiConsoleModeCli::None => UefiConsoleMode::None,
1416                            },
1417                            default_boot_always_attempt: opt.default_boot_always_attempt,
1418                        }
1419                    },
1420                    com1: with_vmbus_com1_serial,
1421                    com2: with_vmbus_com2_serial,
1422                    serial_tx_only: opt.serial_tx_only,
1423                    vtl2_settings: Some(prost::Message::encode_to_vec(&vtl2_settings)),
1424                    vmbus_redirection: opt.vmbus_redirect,
1425                    vmgs,
1426                    framebuffer: opt
1427                        .vtl2_gfx
1428                        .then(|| SharedFramebufferHandle.into_resource()),
1429                    guest_request_recv,
1430                    enable_tpm: opt.tpm,
1431                    firmware_event_send: None,
1432                    secure_boot_enabled: opt.secure_boot,
1433                    secure_boot_template: match opt.secure_boot_template {
1434                        Some(SecureBootTemplateCli::Windows) => {
1435                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftWindows
1436                        },
1437                        Some(SecureBootTemplateCli::UefiCa) => {
1438                            get_resources::ged::GuestSecureBootTemplateType::MicrosoftUefiCertificateAuthority
1439                        }
1440                        None => {
1441                            get_resources::ged::GuestSecureBootTemplateType::None
1442                        },
1443                    },
1444                    enable_battery: opt.battery,
1445                    no_persistent_secrets: true,
1446                    igvm_attest_test_config: None,
1447                    test_gsp_by_id: opt.test_gsp_by_id,
1448                    efi_diagnostics_log_level: {
1449                        match opt.efi_diagnostics_log_level.unwrap_or_default() {
1450                            EfiDiagnosticsLogLevelCli::Default => get_resources::ged::EfiDiagnosticsLogLevelType::Default,
1451                            EfiDiagnosticsLogLevelCli::Info => get_resources::ged::EfiDiagnosticsLogLevelType::Info,
1452                            EfiDiagnosticsLogLevelCli::Full => get_resources::ged::EfiDiagnosticsLogLevelType::Full,
1453                        }
1454                    },
1455                    force_dma_bounce_enabled: opt.uefi_force_dma_bounce,
1456                }
1457                .into_resource(),
1458            ),
1459        ]);
1460    }
1461
1462    if opt.tpm && !opt.vtl2 {
1463        let register_layout = if cfg!(guest_arch = "x86_64") {
1464            TpmRegisterLayout::IoPort
1465        } else {
1466            TpmRegisterLayout::Mmio
1467        };
1468
1469        let (ppi_store, nvram_store) = if opt.vmgs.is_some() {
1470            (
1471                VmgsFileHandle::new(vmgs_format::FileId::TPM_PPI, true).into_resource(),
1472                VmgsFileHandle::new(vmgs_format::FileId::TPM_NVRAM, true).into_resource(),
1473            )
1474        } else {
1475            (
1476                EphemeralNonVolatileStoreHandle.into_resource(),
1477                EphemeralNonVolatileStoreHandle.into_resource(),
1478            )
1479        };
1480
1481        chipset_devices.push(ChipsetDeviceHandle {
1482            name: "tpm".to_string(),
1483            resource: chipset_device_worker_defs::RemoteChipsetDeviceHandle {
1484                device: TpmDeviceHandle {
1485                    ppi_store,
1486                    nvram_store,
1487                    nvram_size: None,
1488                    refresh_tpm_seeds: false,
1489                    ak_cert_type: tpm_resources::TpmAkCertTypeResource::None,
1490                    register_layout,
1491                    guest_secret_key: None,
1492                    logger: None,
1493                    is_confidential_vm: false,
1494                    bios_guid,
1495                }
1496                .into_resource(),
1497                worker_host: mesh.make_host("tpm", None).await?,
1498            }
1499            .into_resource(),
1500        });
1501    }
1502
1503    let vga_firmware = if opt.pcat {
1504        Some(openvmm_pcat_locator::find_svga_bios(
1505            opt.vga_firmware.as_deref(),
1506        )?)
1507    } else {
1508        None
1509    };
1510
1511    if opt.gfx {
1512        // Channel for the video device to report dirty rectangles to the VNC worker.
1513        let (dirt_send, dirt_recv) = mesh::channel();
1514        resources.dirty_rect_recv = Some(dirt_recv);
1515
1516        vmbus_devices.extend([
1517            (
1518                DeviceVtl::Vtl0,
1519                SynthVideoHandle {
1520                    framebuffer: SharedFramebufferHandle.into_resource(),
1521                    dirt_send: Some(dirt_send),
1522                }
1523                .into_resource(),
1524            ),
1525            (
1526                DeviceVtl::Vtl0,
1527                SynthKeyboardHandle {
1528                    source: MultiplexedInputHandle {
1529                        // Save 0 for PS/2
1530                        elevation: 1,
1531                    }
1532                    .into_resource(),
1533                }
1534                .into_resource(),
1535            ),
1536            (
1537                DeviceVtl::Vtl0,
1538                SynthMouseHandle {
1539                    source: MultiplexedInputHandle {
1540                        // Save 0 for PS/2
1541                        elevation: 1,
1542                    }
1543                    .into_resource(),
1544                }
1545                .into_resource(),
1546            ),
1547        ]);
1548    }
1549
1550    let vsock_listener = |path: Option<&str>| -> anyhow::Result<_> { // Optionally binds a Unix listener for a hybrid vsock path.
1551        if let Some(path) = path {
1552            cleanup_socket(path.as_ref()); // Runs before the bind; presumably clears a stale socket at `path` (TODO confirm in cleanup_socket).
1553            let listener = unix_socket::UnixListener::bind(path)
1554                .with_context(|| format!("failed to bind to hybrid vsock path: {}", path))?; // Bind errors propagate with the path in the message.
1555            Ok(Some(listener))
1556        } else {
1557            Ok(None) // No path supplied: not an error, there is simply no listener.
1558        }
1559    };
1560
1561    let vtl0_vsock_listener = vsock_listener(opt.vmbus_vsock_path.as_deref())?;
1562    let vtl2_vsock_listener = vsock_listener(opt.vmbus_vtl2_vsock_path.as_deref())?;
1563
1564    if let Some(path) = &opt.openhcl_dump_path {
1565        let (resource, task) = spawn_dump_handler(&spawner, path.clone(), None);
1566        task.detach();
1567        vmbus_devices.push((openhcl_vtl, resource));
1568    }
1569
1570    #[cfg(guest_arch = "aarch64")]
1571    let topology_arch = openvmm_defs::config::ArchTopologyConfig::Aarch64(
1572        openvmm_defs::config::Aarch64TopologyConfig {
1573            // TODO: allow this to be configured from the command line
1574            gic_config: None,
1575            pmu_gsiv: openvmm_defs::config::PmuGsivConfig::Platform,
1576            gic_msi: match opt.gic_msi {
1577                cli_args::GicMsiCli::Auto => openvmm_defs::config::GicMsiConfig::Auto,
1578                cli_args::GicMsiCli::Its => openvmm_defs::config::GicMsiConfig::Its,
1579                cli_args::GicMsiCli::V2m => {
1580                    openvmm_defs::config::GicMsiConfig::V2m { spi_count: None }
1581                }
1582            },
1583        },
1584    );
1585    #[cfg(guest_arch = "x86_64")]
1586    let topology_arch =
1587        openvmm_defs::config::ArchTopologyConfig::X86(openvmm_defs::config::X86TopologyConfig {
1588            apic_id_offset: opt.apic_id_offset,
1589            x2apic: opt.x2apic,
1590        });
1591
1592    let with_isolation = if let Some(isolation) = &opt.isolation {
1593        // TODO: For now, isolation is only supported with VTL2.
1594        if !opt.vtl2 {
1595            anyhow::bail!("isolation is only currently supported with vtl2");
1596        }
1597
1598        // TODO: Alias map support is not yet implemented with isolation.
1599        if !opt.no_alias_map {
1600            anyhow::bail!("alias map not supported with isolation");
1601        }
1602
1603        match isolation {
1604            cli_args::IsolationCli::Vbs => Some(openvmm_defs::config::IsolationType::Vbs),
1605        }
1606    } else {
1607        None
1608    };
1609
1610    if with_hv && !opt.no_vmbus {
1611        let (shutdown_send, shutdown_recv) = mesh::channel();
1612        resources.shutdown_ic = Some(shutdown_send);
1613        let (kvp_send, kvp_recv) = mesh::channel();
1614        resources.kvp_ic = Some(kvp_send);
1615        vmbus_devices.extend(
1616            [
1617                hyperv_ic_resources::shutdown::ShutdownIcHandle {
1618                    recv: shutdown_recv,
1619                }
1620                .into_resource(),
1621                hyperv_ic_resources::kvp::KvpIcHandle { recv: kvp_recv }.into_resource(),
1622                hyperv_ic_resources::timesync::TimesyncIcHandle.into_resource(),
1623            ]
1624            .map(|r| (DeviceVtl::Vtl0, r)),
1625        );
1626    }
1627
1628    if let Some(hive_path) = &opt.imc {
1629        let file = fs_err::File::open(hive_path).context("failed to open imc hive")?;
1630        vmbus_devices.push((
1631            DeviceVtl::Vtl0,
1632            vmbfs_resources::VmbfsImcDeviceHandle { file: file.into() }.into_resource(),
1633        ));
1634    }
1635
1636    let mut virtio_devices = Vec::new();
1637    let mut add_virtio_device = |bus, resource: Resource<VirtioDeviceHandle>| {
1638        let bus = match bus {
1639            VirtioBusCli::Auto => {
1640                // Use VPCI when possible (currently only on Windows and macOS due
1641                // to KVM backend limitations).
1642                if with_hv && (cfg!(windows) || cfg!(target_os = "macos")) {
1643                    None
1644                } else {
1645                    Some(VirtioBus::Pci)
1646                }
1647            }
1648            VirtioBusCli::Mmio => Some(VirtioBus::Mmio),
1649            VirtioBusCli::Pci => Some(VirtioBus::Pci),
1650            VirtioBusCli::Vpci => None,
1651        };
1652        if let Some(bus) = bus {
1653            virtio_devices.push((bus, resource));
1654        } else {
1655            vpci_devices.push(VpciDeviceConfig {
1656                vtl: DeviceVtl::Vtl0,
1657                instance_id: Guid::new_random(),
1658                resource: VirtioPciDeviceHandle(resource).into_resource(),
1659                vnode: None,
1660            });
1661        }
1662    };
1663
1664    for cli_cfg in &opt.virtio_net {
1665        if cli_cfg.underhill {
1666            anyhow::bail!("use --net uh:[...] to add underhill NICs")
1667        }
1668        let vport = parse_endpoint(cli_cfg, &mut nic_index, &mut resources)?;
1669        let resource = virtio_resources::net::VirtioNetHandle {
1670            max_queues: vport.max_queues,
1671            mac_address: vport.mac_address,
1672            endpoint: vport.endpoint,
1673        }
1674        .into_resource();
1675        if let Some(pcie_port) = &cli_cfg.pcie_port {
1676            pcie_devices.push(PcieDeviceConfig {
1677                port_name: pcie_port.clone(),
1678                resource: VirtioPciDeviceHandle(resource).into_resource(),
1679            });
1680        } else {
1681            add_virtio_device(VirtioBusCli::Auto, resource);
1682        }
1683    }
1684
1685    for args in &opt.virtio_fs {
1686        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
1687            tag: args.tag.clone(),
1688            fs: virtio_resources::fs::VirtioFsBackend::HostFs {
1689                root_path: args.path.clone(),
1690                mount_options: args.options.clone(),
1691            },
1692        }
1693        .into_resource();
1694        if let Some(pcie_port) = &args.pcie_port {
1695            pcie_devices.push(PcieDeviceConfig {
1696                port_name: pcie_port.clone(),
1697                resource: VirtioPciDeviceHandle(resource).into_resource(),
1698            });
1699        } else {
1700            add_virtio_device(opt.virtio_fs_bus, resource);
1701        }
1702    }
1703
1704    for args in &opt.virtio_fs_shmem {
1705        let resource: Resource<VirtioDeviceHandle> = virtio_resources::fs::VirtioFsHandle {
1706            tag: args.tag.clone(),
1707            fs: virtio_resources::fs::VirtioFsBackend::SectionFs {
1708                root_path: args.path.clone(),
1709            },
1710        }
1711        .into_resource();
1712        if let Some(pcie_port) = &args.pcie_port {
1713            pcie_devices.push(PcieDeviceConfig {
1714                port_name: pcie_port.clone(),
1715                resource: VirtioPciDeviceHandle(resource).into_resource(),
1716            });
1717        } else {
1718            add_virtio_device(opt.virtio_fs_bus, resource);
1719        }
1720    }
1721
1722    for args in &opt.virtio_9p {
1723        let resource: Resource<VirtioDeviceHandle> = virtio_resources::p9::VirtioPlan9Handle {
1724            tag: args.tag.clone(),
1725            root_path: args.path.clone(),
1726            debug: opt.virtio_9p_debug,
1727        }
1728        .into_resource();
1729        if let Some(pcie_port) = &args.pcie_port {
1730            pcie_devices.push(PcieDeviceConfig {
1731                port_name: pcie_port.clone(),
1732                resource: VirtioPciDeviceHandle(resource).into_resource(),
1733            });
1734        } else {
1735            add_virtio_device(VirtioBusCli::Auto, resource);
1736        }
1737    }
1738
1739    if let Some(pmem_args) = &opt.virtio_pmem {
1740        let resource: Resource<VirtioDeviceHandle> = virtio_resources::pmem::VirtioPmemHandle {
1741            path: pmem_args.path.clone(),
1742        }
1743        .into_resource();
1744        if let Some(pcie_port) = &pmem_args.pcie_port {
1745            pcie_devices.push(PcieDeviceConfig {
1746                port_name: pcie_port.clone(),
1747                resource: VirtioPciDeviceHandle(resource).into_resource(),
1748            });
1749        } else {
1750            add_virtio_device(VirtioBusCli::Auto, resource);
1751        }
1752    }
1753
1754    if opt.virtio_rng {
1755        let resource: Resource<VirtioDeviceHandle> =
1756            virtio_resources::rng::VirtioRngHandle.into_resource();
1757        if let Some(pcie_port) = &opt.virtio_rng_pcie_port {
1758            pcie_devices.push(PcieDeviceConfig {
1759                port_name: pcie_port.clone(),
1760                resource: VirtioPciDeviceHandle(resource).into_resource(),
1761            });
1762        } else {
1763            add_virtio_device(opt.virtio_rng_bus, resource);
1764        }
1765    }
1766
1767    if let Some(backend) = virtio_console_backend {
1768        let resource: Resource<VirtioDeviceHandle> =
1769            virtio_resources::console::VirtioConsoleHandle { backend }.into_resource();
1770        if let Some(pcie_port) = &opt.virtio_console_pcie_port {
1771            pcie_devices.push(PcieDeviceConfig {
1772                port_name: pcie_port.clone(),
1773                resource: VirtioPciDeviceHandle(resource).into_resource(),
1774            });
1775        } else {
1776            add_virtio_device(VirtioBusCli::Auto, resource);
1777        }
1778    }
1779
1780    // Handle --vhost-user arguments.
1781    #[cfg(target_os = "linux")]
1782    for vhost_cli in &opt.vhost_user {
1783        let stream =
1784            unix_socket::UnixStream::connect(&vhost_cli.socket_path).with_context(|| {
1785                format!(
1786                    "failed to connect to vhost-user socket: {}",
1787                    vhost_cli.socket_path
1788                )
1789            })?;
1790
1791        use crate::cli_args::VhostUserDeviceTypeCli;
1792        let resource: Resource<VirtioDeviceHandle> = match vhost_cli.device_type {
1793            VhostUserDeviceTypeCli::Fs {
1794                ref tag,
1795                num_queues,
1796                queue_size,
1797            } => virtio_resources::vhost_user::VhostUserFsHandle {
1798                socket: stream.into(),
1799                tag: tag.clone(),
1800                num_queues,
1801                queue_size,
1802            }
1803            .into_resource(),
1804            VhostUserDeviceTypeCli::Blk {
1805                num_queues,
1806                queue_size,
1807            } => virtio_resources::vhost_user::VhostUserBlkHandle {
1808                socket: stream.into(),
1809                num_queues,
1810                queue_size,
1811            }
1812            .into_resource(),
1813            VhostUserDeviceTypeCli::Other {
1814                device_id,
1815                ref queue_sizes,
1816            } => virtio_resources::vhost_user::VhostUserGenericHandle {
1817                socket: stream.into(),
1818                device_id,
1819                queue_sizes: queue_sizes.clone(),
1820            }
1821            .into_resource(),
1822        };
1823        if let Some(pcie_port) = &vhost_cli.pcie_port {
1824            pcie_devices.push(PcieDeviceConfig {
1825                port_name: pcie_port.clone(),
1826                resource: VirtioPciDeviceHandle(resource).into_resource(),
1827            });
1828        } else {
1829            add_virtio_device(VirtioBusCli::Auto, resource);
1830        }
1831    }
1832
1833    if let Some(vsock_path) = &opt.virtio_vsock_path {
1834        let listener = vsock_listener(Some(vsock_path))?.unwrap();
1835        add_virtio_device(
1836            VirtioBusCli::Auto,
1837            virtio_resources::vsock::VirtioVsockHandle {
1838                // The guest CID does not matter since the UDS relay does not use it. It just needs
1839                // to be some non-reserved value for the guest to use.
1840                guest_cid: 0x3,
1841                base_path: vsock_path.clone(),
1842                listener,
1843            }
1844            .into_resource(),
1845        );
1846    }
1847
1848    let mut cfg = Config {
1849        chipset,
1850        load_mode,
1851        floppy_disks,
1852        pcie_root_complexes,
1853        #[cfg(target_os = "linux")]
1854        pcie_devices: {
1855            let mut devs = pcie_devices;
1856            devs.extend(vfio_pcie_devices);
1857            devs
1858        },
1859        #[cfg(not(target_os = "linux"))]
1860        pcie_devices,
1861        pcie_switches,
1862        pcie_generic_initiators,
1863        vpci_devices,
1864        ide_disks: Vec::new(),
1865        numa: {
1866            if let Some(ref nodes) = opt.numa {
1867                // --numa mode: each --numa flag defines a node.
1868                NumaTopology {
1869                    nodes: nodes
1870                        .iter()
1871                        .map(|n| NumaNode {
1872                            mem: Some(MemoryConfig {
1873                                mem_size: n.memory.mem_size,
1874                                prefetch_memory: n.memory.prefetch,
1875                                private_memory: n.memory.shared == Some(false),
1876                                transparent_hugepages: n.memory.transparent_hugepages,
1877                                hugepages: n.memory.hugepages,
1878                                hugepage_size: n.memory.hugepage_size,
1879                                host_numa_node: n.host_numa_node,
1880                            }),
1881                            vps: match &n.vps {
1882                                Some(vps) if vps.is_empty() => VpAssignment::Empty,
1883                                Some(vps) => VpAssignment::Explicit(vps.clone()),
1884                                None => VpAssignment::FromTopology,
1885                            },
1886                        })
1887                        .collect(),
1888                    distances: opt
1889                        .numa_distance
1890                        .as_deref()
1891                        .unwrap_or(&[])
1892                        .iter()
1893                        .map(|d| NumaDistance {
1894                            src: d.src,
1895                            dst: d.dst,
1896                            distance: d.distance,
1897                        })
1898                        .collect(),
1899                }
1900            } else {
1901                // Single-node default from --memory.
1902                NumaTopology {
1903                    nodes: vec![NumaNode {
1904                        mem: Some(MemoryConfig {
1905                            mem_size: opt.memory_size(),
1906                            prefetch_memory: opt.prefetch_memory(),
1907                            private_memory: opt.private_memory(),
1908                            transparent_hugepages: opt.transparent_hugepages(),
1909                            hugepages: opt.memory.hugepages,
1910                            hugepage_size: opt.memory.hugepage_size,
1911                            host_numa_node: None,
1912                        }),
1913                        vps: VpAssignment::FromTopology,
1914                    }],
1915                    distances: vec![],
1916                }
1917            }
1918        },
1919        processor_topology: ProcessorTopologyConfig {
1920            proc_count: opt.processors,
1921            vps_per_socket: opt.vps_per_socket,
1922            enable_smt: match opt.smt {
1923                cli_args::SmtConfigCli::Auto => None,
1924                cli_args::SmtConfigCli::Force => Some(true),
1925                cli_args::SmtConfigCli::Off => Some(false),
1926            },
1927            arch: Some(topology_arch),
1928        },
1929        hypervisor: HypervisorConfig {
1930            with_hv,
1931            with_vtl2: opt.vtl2.then_some(Vtl2Config {
1932                vtl0_alias_map: !opt.no_alias_map,
1933                late_map_vtl0_memory: match opt.late_map_vtl0_policy {
1934                    cli_args::Vtl0LateMapPolicyCli::Off => None,
1935                    cli_args::Vtl0LateMapPolicyCli::Log => Some(LateMapVtl0MemoryPolicy::Log),
1936                    cli_args::Vtl0LateMapPolicyCli::Halt => Some(LateMapVtl0MemoryPolicy::Halt),
1937                    cli_args::Vtl0LateMapPolicyCli::Exception => {
1938                        Some(LateMapVtl0MemoryPolicy::InjectException)
1939                    }
1940                },
1941            }),
1942            with_isolation,
1943        },
1944        #[cfg(windows)]
1945        kernel_vmnics,
1946        input: mesh::Receiver::new(),
1947        framebuffer,
1948        vga_firmware,
1949        vtl2_gfx: opt.vtl2_gfx,
1950        virtio_devices,
1951        vmbus: (with_hv && !opt.no_vmbus).then_some(VmbusConfig {
1952            vsock_listener: vtl0_vsock_listener,
1953            vsock_path: opt.vmbus_vsock_path.clone(),
1954            vtl2_redirect: opt.vmbus_redirect,
1955            vmbus_max_version: opt.vmbus_max_version,
1956            #[cfg(windows)]
1957            vmbusproxy_handle,
1958        }),
1959        vtl2_vmbus: (with_hv && opt.vtl2).then_some(VmbusConfig {
1960            vsock_listener: vtl2_vsock_listener,
1961            vsock_path: opt.vmbus_vtl2_vsock_path.clone(),
1962            ..Default::default()
1963        }),
1964        vmbus_devices,
1965        chipset_devices,
1966        pci_chipset_devices,
1967        isa_dma_controller,
1968        chipset_capabilities: capabilities,
1969        layout: layout_config,
1970        #[cfg(windows)]
1971        vpci_resources,
1972        vmgs,
1973        secure_boot_enabled: opt.secure_boot,
1974        custom_uefi_vars,
1975        firmware_event_send: None,
1976        debugger_rpc: None,
1977        rtc_delta_milliseconds: 0,
1978        // Only let the partition auto-reset when the reset action is `reset`.
1979        // For `halt` or `exit`, the guest reset must surface as a halt event so
1980        // the controller can hold the VM or exit instead of rebooting in place.
1981        automatic_guest_reset: matches!(opt.guest_reset_action, GuestPowerAction::Reset),
1982        efi_diagnostics_log_level: {
1983            match opt.efi_diagnostics_log_level.unwrap_or_default() {
1984                EfiDiagnosticsLogLevelCli::Default => EfiDiagnosticsLogLevelType::Default,
1985                EfiDiagnosticsLogLevelCli::Info => EfiDiagnosticsLogLevelType::Info,
1986                EfiDiagnosticsLogLevelCli::Full => EfiDiagnosticsLogLevelType::Full,
1987            }
1988        },
1989    };
1990
1991    storage.build_config(&mut cfg, &mut resources, opt.scsi_sub_channels)?;
1992    Ok((cfg, resources))
1993}
1994
/// Returns the terminal application used to host externally launched
/// console windows, if one has been configured.
///
/// The `OPENVMM_TERM` environment variable is consulted first; when it is
/// unset, `HVLITE_TERM` is used as a fallback. Returns `None` if neither
/// variable is set.
pub(crate) fn openvmm_terminal_app() -> Option<PathBuf> {
    let term = match std::env::var_os("OPENVMM_TERM") {
        Some(term) => term,
        // Fall back to the older variable name; bail out with `None` if that
        // is missing as well.
        None => std::env::var_os("HVLITE_TERM")?,
    };
    Some(PathBuf::from(term))
}
2001
/// Best-effort removal of `path`, performed only when the path is confirmed
/// to be a Unix socket.
///
/// Anything that cannot be positively identified as a socket (including a
/// missing path or a metadata failure) is left untouched, and a failure to
/// remove the socket is ignored.
fn cleanup_socket(path: &Path) {
    #[cfg(windows)]
    let is_socket = matches!(pal::windows::fs::is_unix_socket(path), Ok(true));
    #[cfg(not(windows))]
    let is_socket = match path.metadata() {
        Ok(meta) => {
            use std::os::unix::fs::FileTypeExt;
            meta.file_type().is_socket()
        }
        // Treat an unreadable or missing path as "not a socket".
        Err(_) => false,
    };

    if !is_socket {
        return;
    }

    // Removal is opportunistic; errors are deliberately discarded.
    let _ = std::fs::remove_file(path);
}
2015
/// Creates a new kernel switch port with a random port ID.
///
/// `switch_id` is parsed as the switch identifier when provided; otherwise
/// `vmswitch::hcn::DEFAULT_SWITCH` is used. The switch is opened once up front
/// so that an unknown switch is reported before the port is created.
///
/// Returns the port identifier in `openvmm_defs` form together with the
/// created kernel port.
#[cfg(windows)]
fn new_switch_port(
    switch_id: Option<&str>,
) -> anyhow::Result<(
    openvmm_defs::config::SwitchPortId,
    vmswitch::kernel::SwitchPort,
)> {
    let switch = if let Some(s) = switch_id {
        s.parse().context("invalid switch id")?
    } else {
        vmswitch::hcn::DEFAULT_SWITCH
    };
    let kernel_id = vmswitch::kernel::SwitchPortId {
        switch,
        port: Guid::new_random(),
    };

    // Opening the network is only an existence check; the handle is dropped
    // immediately.
    let _ = vmswitch::hcn::Network::open(&kernel_id.switch)
        .with_context(|| format!("could not find switch {}", kernel_id.switch))?;

    let switch_port =
        vmswitch::kernel::SwitchPort::new(&kernel_id).context("failed to create switch port")?;

    let config_id = openvmm_defs::config::SwitchPortId {
        switch: kernel_id.switch,
        port: kernel_id.port,
    };
    Ok((config_id, switch_port))
}
2041
2042fn parse_endpoint(
2043    cli_cfg: &NicConfigCli,
2044    index: &mut usize,
2045    resources: &mut VmResources,
2046) -> anyhow::Result<NicConfig> {
2047    let _ = resources;
2048    let endpoint = match &cli_cfg.endpoint {
2049        EndpointConfigCli::Consomme { cidr, host_fwd } => {
2050            let ports = host_fwd
2051                .iter()
2052                .map(|fwd| {
2053                    use net_backend_resources::consomme::HostPortProtocol;
2054                    net_backend_resources::consomme::HostPortConfig {
2055                        protocol: match fwd.protocol {
2056                            cli_args::HostPortProtocolCli::Tcp => HostPortProtocol::Tcp,
2057                            cli_args::HostPortProtocolCli::Udp => HostPortProtocol::Udp,
2058                        },
2059                        host_address: fwd
2060                            .host_address
2061                            .map(net_backend_resources::consomme::HostIpAddress::from),
2062                        host_port: net_backend_resources::consomme::HostPort::Fixed(fwd.host_port),
2063                        guest_port: fwd.guest_port,
2064                    }
2065                })
2066                .collect();
2067            net_backend_resources::consomme::ConsommeHandle {
2068                cidr: cidr.clone(),
2069                ports,
2070            }
2071            .into_resource()
2072        }
2073        EndpointConfigCli::None => net_backend_resources::null::NullHandle.into_resource(),
2074        EndpointConfigCli::Dio { id } => {
2075            #[cfg(windows)]
2076            {
2077                let (port_id, port) = new_switch_port(id.as_deref())?;
2078                resources.switch_ports.push(port);
2079                net_backend_resources::dio::WindowsDirectIoHandle {
2080                    switch_port_id: net_backend_resources::dio::SwitchPortId {
2081                        switch: port_id.switch,
2082                        port: port_id.port,
2083                    },
2084                }
2085                .into_resource()
2086            }
2087
2088            #[cfg(not(windows))]
2089            {
2090                let _ = id;
2091                bail!("cannot use dio on non-windows platforms")
2092            }
2093        }
2094        EndpointConfigCli::Tap { name } => {
2095            #[cfg(target_os = "linux")]
2096            {
2097                let fd = net_tap::tap::open_tap(name)
2098                    .with_context(|| format!("failed to open TAP device '{name}'"))?;
2099                net_backend_resources::tap::TapHandle { fd }.into_resource()
2100            }
2101
2102            #[cfg(not(target_os = "linux"))]
2103            {
2104                let _ = name;
2105                bail!("TAP backend is only supported on Linux")
2106            }
2107        }
2108    };
2109
2110    // Pick a random MAC address.
2111    let mut mac_address = [0x00, 0x15, 0x5D, 0, 0, 0];
2112    getrandom::fill(&mut mac_address[3..]).expect("rng failure");
2113
2114    // Pick a fixed instance ID based on the index.
2115    const BASE_INSTANCE_ID: Guid = guid::guid!("00000000-da43-11ed-936a-00155d6db52f");
2116    let instance_id = Guid {
2117        data1: *index as u32,
2118        ..BASE_INSTANCE_ID
2119    };
2120    *index += 1;
2121
2122    Ok(NicConfig {
2123        vtl: cli_cfg.vtl,
2124        instance_id,
2125        endpoint,
2126        mac_address: mac_address.into(),
2127        max_queues: cli_cfg.max_queues,
2128        pcie_port: cli_cfg.pcie_port.clone(),
2129    })
2130}
2131
2132#[derive(Debug)]
2133struct NicConfig {
2134    vtl: DeviceVtl,
2135    instance_id: Guid,
2136    mac_address: MacAddress,
2137    endpoint: Resource<NetEndpointHandleKind>,
2138    max_queues: Option<u16>,
2139    pcie_port: Option<String>,
2140}
2141
2142impl NicConfig {
2143    fn into_netvsp_handle(self) -> (DeviceVtl, Resource<VmbusDeviceHandleKind>) {
2144        (
2145            self.vtl,
2146            netvsp_resources::NetvspHandle {
2147                instance_id: self.instance_id,
2148                mac_address: self.mac_address,
2149                endpoint: self.endpoint,
2150                max_queues: self.max_queues,
2151            }
2152            .into_resource(),
2153        )
2154    }
2155}
2156
2157enum LayerOrDisk {
2158    Layer(DiskLayerDescription),
2159    Disk(Resource<DiskHandleKind>),
2160}
2161
2162async fn disk_open(
2163    disk_cli: &DiskCliKind,
2164    read_only: bool,
2165) -> anyhow::Result<Resource<DiskHandleKind>> {
2166    let mut layers = Vec::new();
2167    disk_open_inner(disk_cli, read_only, &mut layers).await?;
2168    if layers.len() == 1 && matches!(layers[0], LayerOrDisk::Disk(_)) {
2169        let LayerOrDisk::Disk(disk) = layers.pop().unwrap() else {
2170            unreachable!()
2171        };
2172        Ok(disk)
2173    } else {
2174        Ok(Resource::new(disk_backend_resources::LayeredDiskHandle {
2175            layers: layers
2176                .into_iter()
2177                .map(|layer| match layer {
2178                    LayerOrDisk::Layer(layer) => layer,
2179                    LayerOrDisk::Disk(disk) => DiskLayerDescription {
2180                        layer: DiskLayerHandle(disk).into_resource(),
2181                        read_cache: false,
2182                        write_through: false,
2183                    },
2184                })
2185                .collect(),
2186        }))
2187    }
2188}
2189
2190fn disk_open_inner<'a>(
2191    disk_cli: &'a DiskCliKind,
2192    read_only: bool,
2193    layers: &'a mut Vec<LayerOrDisk>,
2194) -> futures::future::BoxFuture<'a, anyhow::Result<()>> {
2195    Box::pin(async move {
2196        fn layer<T: IntoResource<DiskLayerHandleKind>>(layer: T) -> LayerOrDisk {
2197            LayerOrDisk::Layer(layer.into_resource().into())
2198        }
2199        fn disk<T: IntoResource<DiskHandleKind>>(disk: T) -> LayerOrDisk {
2200            LayerOrDisk::Disk(disk.into_resource())
2201        }
2202        match disk_cli {
2203            &DiskCliKind::Memory(len) => {
2204                layers.push(layer(RamDiskLayerHandle {
2205                    len: Some(len),
2206                    sector_size: None,
2207                }));
2208            }
2209            DiskCliKind::File {
2210                path,
2211                create_with_len,
2212                direct,
2213            } => layers.push(LayerOrDisk::Disk(if let Some(size) = create_with_len {
2214                create_disk_type(
2215                    path,
2216                    *size,
2217                    OpenDiskOptions {
2218                        read_only: false,
2219                        direct: *direct,
2220                    },
2221                )
2222                .with_context(|| format!("failed to create {}", path.display()))?
2223            } else {
2224                open_disk_type(
2225                    path,
2226                    OpenDiskOptions {
2227                        read_only,
2228                        direct: *direct,
2229                    },
2230                )
2231                .await
2232                .with_context(|| format!("failed to open {}", path.display()))?
2233            })),
2234            DiskCliKind::Blob { kind, url } => {
2235                layers.push(disk(disk_backend_resources::BlobDiskHandle {
2236                    url: url.to_owned(),
2237                    format: match kind {
2238                        cli_args::BlobKind::Flat => disk_backend_resources::BlobDiskFormat::Flat,
2239                        cli_args::BlobKind::Vhd1 => {
2240                            disk_backend_resources::BlobDiskFormat::FixedVhd1
2241                        }
2242                    },
2243                }))
2244            }
2245            DiskCliKind::MemoryDiff(inner) => {
2246                layers.push(layer(RamDiskLayerHandle {
2247                    len: None,
2248                    sector_size: None,
2249                }));
2250                disk_open_inner(inner, true, layers).await?;
2251            }
2252            DiskCliKind::PersistentReservationsWrapper(inner) => {
2253                layers.push(disk(disk_backend_resources::DiskWithReservationsHandle(
2254                    disk_open(inner, read_only).await?,
2255                )))
2256            }
2257            DiskCliKind::DelayDiskWrapper {
2258                delay_ms,
2259                disk: inner,
2260            } => layers.push(disk(DelayDiskHandle {
2261                delay: CellUpdater::new(Duration::from_millis(*delay_ms)).cell(),
2262                disk: disk_open(inner, read_only).await?,
2263            })),
2264            DiskCliKind::Crypt {
2265                disk: inner,
2266                cipher,
2267                key_file,
2268            } => layers.push(disk(disk_crypt_resources::DiskCryptHandle {
2269                disk: disk_open(inner, read_only).await?,
2270                cipher: match cipher {
2271                    cli_args::DiskCipher::XtsAes256 => disk_crypt_resources::Cipher::XtsAes256,
2272                },
2273                key: fs_err::read(key_file).context("failed to read key file")?,
2274            })),
2275            DiskCliKind::Sqlite {
2276                path,
2277                create_with_len,
2278            } => {
2279                // FUTURE: this code should be responsible for opening
2280                // file-handle(s) itself, and passing them into sqlite via a custom
2281                // vfs. For now though - simply check if the file exists or not, and
2282                // perform early validation of filesystem-level create options.
2283                match (create_with_len.is_some(), path.exists()) {
2284                    (true, true) => anyhow::bail!(
2285                        "cannot create new sqlite disk at {} - file already exists",
2286                        path.display()
2287                    ),
2288                    (false, false) => anyhow::bail!(
2289                        "cannot open sqlite disk at {} - file not found",
2290                        path.display()
2291                    ),
2292                    _ => {}
2293                }
2294
2295                layers.push(layer(SqliteDiskLayerHandle {
2296                    dbhd_path: path.display().to_string(),
2297                    format_dbhd: create_with_len.map(|len| {
2298                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2299                            logically_read_only: false,
2300                            len: Some(len),
2301                        }
2302                    }),
2303                }));
2304            }
2305            DiskCliKind::SqliteDiff { path, create, disk } => {
2306                // FUTURE: this code should be responsible for opening
2307                // file-handle(s) itself, and passing them into sqlite via a custom
2308                // vfs. For now though - simply check if the file exists or not, and
2309                // perform early validation of filesystem-level create options.
2310                match (create, path.exists()) {
2311                    (true, true) => anyhow::bail!(
2312                        "cannot create new sqlite disk at {} - file already exists",
2313                        path.display()
2314                    ),
2315                    (false, false) => anyhow::bail!(
2316                        "cannot open sqlite disk at {} - file not found",
2317                        path.display()
2318                    ),
2319                    _ => {}
2320                }
2321
2322                layers.push(layer(SqliteDiskLayerHandle {
2323                    dbhd_path: path.display().to_string(),
2324                    format_dbhd: create.then_some(
2325                        disk_backend_resources::layer::SqliteDiskLayerFormatParams {
2326                            logically_read_only: false,
2327                            len: None,
2328                        },
2329                    ),
2330                }));
2331                disk_open_inner(disk, true, layers).await?;
2332            }
2333            DiskCliKind::AutoCacheSqlite {
2334                cache_path,
2335                key,
2336                disk,
2337            } => {
2338                layers.push(LayerOrDisk::Layer(DiskLayerDescription {
2339                    read_cache: true,
2340                    write_through: false,
2341                    layer: SqliteAutoCacheDiskLayerHandle {
2342                        cache_path: cache_path.clone(),
2343                        cache_key: key.clone(),
2344                    }
2345                    .into_resource(),
2346                }));
2347                disk_open_inner(disk, read_only, layers).await?;
2348            }
2349        }
2350        Ok(())
2351    })
2352}
2353
2354/// Get the system page size.
2355pub(crate) fn system_page_size() -> u32 {
2356    sparse_mmap::SparseMapping::page_size() as u32
2357}
2358
/// Name of the guest architecture this binary was built for, selected at
/// compile time from the `guest_arch` cfg: `"x86_64"` when that cfg is
/// `x86_64`, and `"aarch64"` otherwise.
pub(crate) const GUEST_ARCH: &str = match cfg!(guest_arch = "x86_64") {
    true => "x86_64",
    false => "aarch64",
};
2365
2366/// Open a snapshot directory and validate it against the current VM config.
2367/// Returns the shared memory fd (from memory.bin) and the saved device state.
2368fn prepare_snapshot_restore(
2369    snapshot_dir: &Path,
2370    opt: &Options,
2371) -> anyhow::Result<(
2372    openvmm_defs::worker::SharedMemoryFd,
2373    mesh::payload::message::ProtobufMessage,
2374)> {
2375    let (manifest, state_bytes) = openvmm_helpers::snapshot::read_snapshot(snapshot_dir)?;
2376
2377    // Validate manifest against current VM config.
2378    openvmm_helpers::snapshot::validate_manifest(
2379        &manifest,
2380        GUEST_ARCH,
2381        opt.memory_size(),
2382        opt.processors,
2383        system_page_size(),
2384    )?;
2385
2386    // Open memory.bin (existing file, no create, no resize).
2387    let memory_file = fs_err::OpenOptions::new()
2388        .read(true)
2389        .write(true)
2390        .open(snapshot_dir.join("memory.bin"))?;
2391
2392    // Validate file size matches expected memory size.
2393    let file_size = memory_file.metadata()?.len();
2394    if file_size != manifest.memory_size_bytes {
2395        anyhow::bail!(
2396            "memory.bin size ({file_size} bytes) doesn't match manifest ({} bytes)",
2397            manifest.memory_size_bytes,
2398        );
2399    }
2400
2401    let shared_memory_fd =
2402        openvmm_helpers::shared_memory::file_to_shared_memory_fd(memory_file.into())?;
2403
2404    // Reconstruct ProtobufMessage from the saved state bytes.
2405    // The save side wrote mesh::payload::encode(ProtobufMessage), so we decode
2406    // back to ProtobufMessage.
2407    let state_msg: mesh::payload::message::ProtobufMessage = mesh::payload::decode(&state_bytes)
2408        .context("failed to decode saved state from snapshot")?;
2409
2410    Ok((shared_memory_fd, state_msg))
2411}
2412
2413fn do_main(pidfile_guard: &mut Option<pidfile::Pidfile>) -> anyhow::Result<i32> {
2414    #[cfg(windows)]
2415    pal::windows::disable_hard_error_dialog();
2416
2417    tracing_init::enable_tracing()?;
2418
2419    // Try to run as a worker host.
2420    // On success the worker runs to completion and then exits the process (does
2421    // not return). Any worker host setup errors are return and bubbled up.
2422    meshworker::run_vmm_mesh_host()?;
2423
2424    let opt = cli_args::parse_options();
2425    if let Some(path) = &opt.write_saved_state_proto {
2426        mesh::payload::protofile::DescriptorWriter::new(vmcore::save_restore::saved_state_roots())
2427            .write_to_path(path)
2428            .context("failed to write protobuf descriptors")?;
2429        return Ok(0);
2430    }
2431
2432    if let Some(ref path) = opt.pidfile {
2433        *pidfile_guard = Some(pidfile::Pidfile::new(path).context("failed to create pidfile")?);
2434    }
2435
2436    if let Some(path) = opt.relay_console_path {
2437        let console_title = opt.relay_console_title.unwrap_or_default();
2438        return console_relay::relay_console(&path, console_title.as_str()).map(|()| 0);
2439    }
2440
2441    #[cfg(any(feature = "grpc", feature = "ttrpc"))]
2442    if let Some(path) = opt.ttrpc.as_ref().or(opt.grpc.as_ref()) {
2443        return block_on(async {
2444            let _ = std::fs::remove_file(path);
2445            let listener =
2446                unix_socket::UnixListener::bind(path).context("failed to bind to socket")?;
2447
2448            let transport = if opt.ttrpc.is_some() {
2449                ttrpc::RpcTransport::Ttrpc
2450            } else {
2451                ttrpc::RpcTransport::Grpc
2452            };
2453
2454            // This is a local launch
2455            let mut handle =
2456                mesh_worker::launch_local_worker::<ttrpc::TtrpcWorker>(ttrpc::Parameters {
2457                    listener,
2458                    transport,
2459                })
2460                .await?;
2461
2462            tracing::info!(%transport, path = %path.display(), "listening");
2463
2464            // Signal the the parent process that the server is ready.
2465            pal::close_stdout().context("failed to close stdout")?;
2466
2467            handle.join().await?;
2468
2469            Ok(0)
2470        });
2471    }
2472
2473    DefaultPool::run_with(async |driver| run_control(&driver, opt).await)
2474}
2475
2476fn new_hvsock_service_id(port: u32) -> Guid {
2477    // This GUID is an embedding of the AF_VSOCK port into an
2478    // AF_HYPERV service ID.
2479    Guid {
2480        data1: port,
2481        .."00000000-facb-11e6-bd58-64006a7986d3".parse().unwrap()
2482    }
2483}
2484
2485async fn run_control(driver: &DefaultDriver, opt: Options) -> anyhow::Result<i32> {
2486    let mut mesh = Some(VmmMesh::new(&driver, opt.single_process)?);
2487    let result = run_control_inner(driver, &mut mesh, opt).await;
2488    // If setup failed before the mesh was handed to the controller, shut it
2489    // down so the child host process exits cleanly without noisy logs.
2490    if let Some(mesh) = mesh {
2491        mesh.shutdown().await;
2492    }
2493    result
2494}
2495
2496async fn run_control_inner(
2497    driver: &DefaultDriver,
2498    mesh_slot: &mut Option<VmmMesh>,
2499    opt: Options,
2500) -> anyhow::Result<i32> {
2501    let mesh = mesh_slot.as_ref().unwrap();
2502    let (mut vm_config, mut resources) = vm_config_from_command_line(driver, mesh, &opt).await?;
2503
2504    let mut vnc_worker = None;
2505    if opt.gfx || opt.vnc.vnc {
2506        // Parse the listen address. Try as a full SocketAddr (host:port) first;
2507        // fall back to a bare IP, using the configured port.
2508        let addr: std::net::SocketAddr = if let Ok(sa) =
2509            opt.vnc.vnc_listen.parse::<std::net::SocketAddr>()
2510        {
2511            sa
2512        } else {
2513            let ip: std::net::IpAddr = opt.vnc.vnc_listen.parse().with_context(|| {
2514                format!(
2515                    "invalid VNC listen address: {} (expected IP address or socket address like [::1]:5900)",
2516                    opt.vnc.vnc_listen
2517                )
2518            })?;
2519            std::net::SocketAddr::new(ip, opt.vnc.vnc_port)
2520        };
2521
2522        let socket = socket2::Socket::new(
2523            if addr.is_ipv6() {
2524                socket2::Domain::IPV6
2525            } else {
2526                socket2::Domain::IPV4
2527            },
2528            socket2::Type::STREAM,
2529            None,
2530        )
2531        .with_context(|| format!("creating VNC socket for {}", addr))?;
2532
2533        if addr.is_ipv6() {
2534            if let Err(e) = socket.set_only_v6(false) {
2535                tracing::warn!(
2536                    error = %e,
2537                    "failed to enable dual-stack on IPv6 VNC socket, IPv4 clients may not be able to connect"
2538                );
2539            }
2540        }
2541        socket.set_reuse_address(true)?;
2542        socket
2543            .bind(&addr.into())
2544            .with_context(|| format!("binding VNC socket to {}", addr))?;
2545        socket
2546            .listen(128)
2547            .with_context(|| format!("listening on VNC socket {}", addr))?;
2548        let listener: TcpListener = socket.into();
2549
2550        if !addr.ip().is_loopback() {
2551            tracing::warn!(
2552                address = %addr,
2553                "VNC server listening on non-localhost address without authentication"
2554            );
2555        }
2556
2557        let input_send = vm_config.input.sender();
2558        let framebuffer = resources
2559            .framebuffer_access
2560            .take()
2561            .expect("synth video enabled");
2562
2563        let vnc_host = mesh
2564            .make_host("vnc", None)
2565            .await
2566            .context("spawning vnc process failed")?;
2567
2568        vnc_worker = Some(
2569            vnc_host
2570                .launch_worker(
2571                    vnc_worker_defs::VNC_WORKER_TCP,
2572                    VncParameters {
2573                        listener,
2574                        framebuffer,
2575                        input_send,
2576                        dirty_recv: resources.dirty_rect_recv.take(),
2577                        max_clients: opt.vnc.vnc_max_clients,
2578                        evict_oldest: opt.vnc.vnc_evict_oldest,
2579                    },
2580                )
2581                .await?,
2582        )
2583    }
2584
2585    // spin up the debug worker
2586    let gdb_worker = if let Some(port) = opt.gdb {
2587        let listener = TcpListener::bind(format!("127.0.0.1:{}", port))
2588            .with_context(|| format!("binding to gdb port {}", port))?;
2589
2590        let (req_tx, req_rx) = mesh::channel();
2591        vm_config.debugger_rpc = Some(req_rx);
2592
2593        let gdb_host = mesh
2594            .make_host("gdb", None)
2595            .await
2596            .context("spawning gdbstub process failed")?;
2597
2598        Some(
2599            gdb_host
2600                .launch_worker(
2601                    debug_worker_defs::DEBUGGER_WORKER,
2602                    debug_worker_defs::DebuggerParameters {
2603                        listener,
2604                        req_chan: req_tx,
2605                        vp_count: vm_config.processor_topology.proc_count,
2606                        target_arch: if cfg!(guest_arch = "x86_64") {
2607                            debug_worker_defs::TargetArch::X86_64
2608                        } else {
2609                            debug_worker_defs::TargetArch::Aarch64
2610                        },
2611                    },
2612                )
2613                .await
2614                .context("failed to launch gdbstub worker")?,
2615        )
2616    } else {
2617        None
2618    };
2619
2620    // spin up the VM
2621    let (vm_rpc, rpc_recv) = mesh::channel();
2622    let (notify_send, notify_recv) = mesh::channel();
2623    let vm_worker = {
2624        let vm_host = mesh.make_host("vm", opt.log_file.clone()).await?;
2625
2626        let (shared_memory, saved_state) = if let Some(snapshot_dir) = &opt.restore_snapshot {
2627            let (fd, state_msg) = prepare_snapshot_restore(snapshot_dir, &opt)?;
2628            (Some(fd), Some(state_msg))
2629        } else {
2630            let shared_memory = opt
2631                .memory_backing_file()
2632                .map(|path| {
2633                    openvmm_helpers::shared_memory::open_memory_backing_file(
2634                        path,
2635                        opt.memory_size(),
2636                    )
2637                })
2638                .transpose()?;
2639            (shared_memory, None)
2640        };
2641
2642        let params = VmWorkerParameters {
2643            hypervisor: match &opt.hypervisor {
2644                Some(name) => openvmm_helpers::hypervisor::hypervisor_resource(name)?,
2645                None => openvmm_helpers::hypervisor::choose_hypervisor()?,
2646            },
2647            cfg: vm_config,
2648            saved_state,
2649            shared_memory,
2650            rpc: rpc_recv,
2651            notify: notify_send,
2652        };
2653        vm_host
2654            .launch_worker(VM_WORKER, params)
2655            .await
2656            .context("failed to launch vm worker")?
2657    };
2658
2659    if opt.restore_snapshot.is_some() {
2660        tracing::info!("restoring VM from snapshot");
2661    }
2662
2663    if !opt.paused {
2664        vm_rpc.call(VmRpc::Resume, ()).await?;
2665    }
2666
2667    let paravisor_diag = Arc::new(diag_client::DiagClient::from_dialer(
2668        driver.clone(),
2669        DiagDialer {
2670            driver: driver.clone(),
2671            vm_rpc: vm_rpc.clone(),
2672            openhcl_vtl: if opt.vtl2 {
2673                DeviceVtl::Vtl2
2674            } else {
2675                DeviceVtl::Vtl0
2676            },
2677        },
2678    ));
2679
2680    let diag_inspector = DiagInspector::new(driver.clone(), paravisor_diag.clone());
2681
2682    // Create channels between the REPL and VmController.
2683    let (vm_controller_send, vm_controller_recv) = mesh::channel();
2684    let (vm_controller_event_send, vm_controller_event_recv) = mesh::channel();
2685
2686    let has_vtl2 = resources.vtl2_settings.is_some();
2687
2688    // Build the VmController with exclusive resources.
2689    let controller = vm_controller::VmController {
2690        mesh: mesh_slot.take().unwrap(),
2691        vm_worker,
2692        vnc_worker,
2693        gdb_worker,
2694        diag_inspector: Some(diag_inspector),
2695        vtl2_settings: resources.vtl2_settings,
2696        ged_rpc: resources.ged_rpc.clone(),
2697        vm_rpc: vm_rpc.clone(),
2698        paravisor_diag: Some(paravisor_diag),
2699        igvm_path: opt.igvm.clone(),
2700        memory_backing_file: opt.memory_backing_file().cloned(),
2701        memory: opt.memory_size(),
2702        processors: opt.processors,
2703        log_file: opt.log_file.clone(),
2704        guest_power_actions: vm_controller::GuestPowerActions {
2705            shutdown: opt.guest_shutdown_action,
2706            reset: opt.guest_reset_action,
2707            crash: opt.guest_crash_action,
2708            watchdog: opt.guest_watchdog_action,
2709        },
2710    };
2711
2712    // Spawn the VmController as a task.
2713    let controller_task = driver.spawn(
2714        "vm-controller",
2715        controller.run(vm_controller_recv, vm_controller_event_send, notify_recv),
2716    );
2717
2718    // Run the REPL with shareable resources.
2719    let repl_result = repl::run_repl(
2720        driver,
2721        repl::ReplResources {
2722            vm_rpc,
2723            vm_controller: vm_controller_send,
2724            vm_controller_events: vm_controller_event_recv,
2725            scsi_rpc: resources.scsi_rpc,
2726            nvme_vtl2_rpc: resources.nvme_vtl2_rpc,
2727            shutdown_ic: resources.shutdown_ic,
2728            kvp_ic: resources.kvp_ic,
2729            console_in: resources.console_in,
2730            has_vtl2,
2731        },
2732    )
2733    .await;
2734
2735    // Wait for the controller task to finish (it stops the VM worker and
2736    // shuts down the mesh).
2737    controller_task.await;
2738
2739    // run_repl returns the exit status: the code the guest drove via an opt-in
2740    // exit (VmControllerEvent::ExitRequested), or 0 when the VM stopped normally.
2741    repl_result
2742}
2743
2744struct DiagDialer {
2745    driver: DefaultDriver,
2746    vm_rpc: mesh::Sender<VmRpc>,
2747    openhcl_vtl: DeviceVtl,
2748}
2749
2750impl mesh_rpc::client::Dial for DiagDialer {
2751    type Stream = PolledSocket<unix_socket::UnixStream>;
2752
2753    async fn dial(&mut self) -> io::Result<Self::Stream> {
2754        let service_id = new_hvsock_service_id(1);
2755        let socket = self
2756            .vm_rpc
2757            .call_failable(
2758                VmRpc::ConnectHvsock,
2759                (
2760                    CancelContext::new().with_timeout(Duration::from_secs(2)),
2761                    service_id,
2762                    self.openhcl_vtl,
2763                ),
2764            )
2765            .await
2766            .map_err(io::Error::other)?;
2767
2768        PolledSocket::new(&self.driver, socket)
2769    }
2770}
2771
2772/// An object that implements [`InspectMut`] by sending an inspect request over
2773/// TTRPC to the guest (typically the paravisor running in VTL2), then stitching
2774/// the response back into the inspect tree.
2775///
2776/// This also caches the TTRPC connection to the guest so that only the first
2777/// inspect request has to wait for the connection to be established.
2778pub(crate) struct DiagInspector(DiagInspectorInner);
2779
2780enum DiagInspectorInner {
2781    NotStarted(DefaultDriver, Arc<diag_client::DiagClient>),
2782    Started {
2783        send: mesh::Sender<inspect::Deferred>,
2784        _task: Task<()>,
2785    },
2786    Invalid,
2787}
2788
2789impl DiagInspector {
2790    pub fn new(driver: DefaultDriver, diag_client: Arc<diag_client::DiagClient>) -> Self {
2791        Self(DiagInspectorInner::NotStarted(driver, diag_client))
2792    }
2793
2794    fn start(&mut self) -> &mesh::Sender<inspect::Deferred> {
2795        loop {
2796            match self.0 {
2797                DiagInspectorInner::NotStarted { .. } => {
2798                    let DiagInspectorInner::NotStarted(driver, client) =
2799                        std::mem::replace(&mut self.0, DiagInspectorInner::Invalid)
2800                    else {
2801                        unreachable!()
2802                    };
2803                    let (send, recv) = mesh::channel();
2804                    let task = driver.clone().spawn("diag-inspect", async move {
2805                        Self::run(&client, recv).await
2806                    });
2807
2808                    self.0 = DiagInspectorInner::Started { send, _task: task };
2809                }
2810                DiagInspectorInner::Started { ref send, .. } => break send,
2811                DiagInspectorInner::Invalid => unreachable!(),
2812            }
2813        }
2814    }
2815
2816    async fn run(
2817        diag_client: &diag_client::DiagClient,
2818        mut recv: mesh::Receiver<inspect::Deferred>,
2819    ) {
2820        while let Some(deferred) = recv.next().await {
2821            let info = deferred.external_request();
2822            let result = match info.request_type {
2823                inspect::ExternalRequestType::Inspect { depth } => {
2824                    if depth == 0 {
2825                        Ok(inspect::Node::Unevaluated)
2826                    } else {
2827                        // TODO: Support taking timeouts from the command line
2828                        diag_client
2829                            .inspect(info.path, Some(depth - 1), Some(Duration::from_secs(1)))
2830                            .await
2831                    }
2832                }
2833                inspect::ExternalRequestType::Update { value } => {
2834                    (diag_client.update(info.path, value).await).map(inspect::Node::Value)
2835                }
2836            };
2837            deferred.complete_external(
2838                result.unwrap_or_else(|err| {
2839                    inspect::Node::Failed(inspect::Error::Mesh(format!("{err:#}")))
2840                }),
2841                inspect::SensitivityLevel::Unspecified,
2842            )
2843        }
2844    }
2845}
2846
2847impl InspectMut for DiagInspector {
2848    fn inspect_mut(&mut self, req: inspect::Request<'_>) {
2849        self.start().send(req.defer());
2850    }
2851}