Skip to main content

openhcl_boot/
main.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The openhcl boot loader, which loads before the kernel to set up the
5//! kernel's boot parameters.
6
7// See build.rs.
8#![cfg_attr(minimal_rt, no_std, no_main)]
9// UNSAFETY: Interacting with low level hardware and bootloader primitives.
10#![expect(unsafe_code)]
11// Allow the allocator api when compiling with `RUSTFLAGS="--cfg nightly"`. This
12// is used for some miri tests for testing the bump allocator.
13//
14// Do not use a normal feature, as that shows errors with rust-analyzer since
15// most people are using stable and enable all features. We could remove this
16// once the allocator_api feature is stable.
17#![cfg_attr(nightly, feature(allocator_api))]
18
19mod arch;
20mod boot_logger;
21mod cmdline;
22mod dt;
23mod host_params;
24mod hypercall;
25mod memory;
26mod rt;
27mod sidecar;
28mod single_threaded;
29
30use crate::arch::setup_vtl2_memory;
31use crate::arch::setup_vtl2_vp;
32#[cfg(target_arch = "x86_64")]
33use crate::arch::tdx::get_tdx_tsc_reftime;
34use crate::arch::verify_imported_regions_hash;
35use crate::boot_logger::boot_logger_memory_init;
36use crate::boot_logger::boot_logger_runtime_init;
37use crate::hypercall::hvcall;
38use crate::memory::AddressSpaceManager;
39use crate::single_threaded::OffStackRef;
40use crate::single_threaded::off_stack;
41use arrayvec::ArrayString;
42use arrayvec::ArrayVec;
43use cmdline::BootCommandLineOptions;
44use core::fmt::Write;
45use dt::BootTimes;
46use dt::write_dt;
47use host_fdt_parser::ComInfo;
48use host_params::COMMAND_LINE_SIZE;
49use host_params::PartitionInfo;
50use host_params::shim_params::IsolationType;
51use host_params::shim_params::ShimParams;
52use hvdef::Vtl;
53use loader_defs::linux::SETUP_DTB;
54use loader_defs::linux::setup_data;
55use loader_defs::shim::ShimParamsRaw;
56use memory_range::RangeWalkResult;
57use memory_range::walk_ranges;
58use minimal_rt::enlightened_panic::enable_enlightened_panic;
59use sidecar::SidecarConfig;
60use sidecar_defs::SidecarOutput;
61use sidecar_defs::SidecarParams;
62use zerocopy::FromBytes;
63use zerocopy::FromZeros;
64use zerocopy::Immutable;
65use zerocopy::IntoBytes;
66use zerocopy::KnownLayout;
67
/// Error returned when the kernel command line does not fit into its
/// fixed-capacity buffer.
#[derive(Debug)]
struct CommandLineTooLong;

/// Lets `?` convert a failed formatted write into [`CommandLineTooLong`].
impl From<core::fmt::Error> for CommandLineTooLong {
    fn from(_fmt_error: core::fmt::Error) -> Self {
        CommandLineTooLong
    }
}
76
/// Inputs to [`build_kernel_command_line`].
struct BuildKernelCommandLineParams<'a> {
    /// Shim parameters; the isolation type selects the isolation-specific
    /// kernel arguments.
    params: &'a ShimParams,
    /// Buffer the kernel command line is written into.
    cmdline: &'a mut ArrayString<COMMAND_LINE_SIZE>,
    /// Partition information supplying the COM3 serial configuration, the
    /// NVMe keep-alive and vmbus settings, and the command line that is
    /// appended after the computed parameters.
    partition_info: &'a PartitionInfo,
    /// Whether the host is trusted; the COM3 serial console is only selected
    /// when this is `true`.
    can_trust_host: bool,
    /// Whether to emit the confidential-debug environment variable.
    is_confidential_debug: bool,
    /// Sidecar configuration whose kernel command line is added when present.
    sidecar: Option<&'a SidecarConfig<'a>>,
    /// Whether the VTL2 private pool is available; reported through the
    /// `OPENHCL_NVME_KEEP_ALIVE` value.
    vtl2_pool_supported: bool,
}
86
87/// Read and setup the underhill kernel command line into the specified buffer.
88fn build_kernel_command_line(
89    fn_params: BuildKernelCommandLineParams<'_>,
90) -> Result<(), CommandLineTooLong> {
91    let BuildKernelCommandLineParams {
92        params,
93        cmdline,
94        partition_info,
95        can_trust_host,
96        is_confidential_debug,
97        sidecar,
98        vtl2_pool_supported,
99    } = fn_params;
100
101    // For reference:
102    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
103    const KERNEL_PARAMETERS: &[&str] = &[
104        // If a console is specified, then write everything to it.
105        "loglevel=8",
106        // Use a fixed 128KB log buffer by default.
107        "log_buf_len=128K",
108        // Enable time output on console for ohcldiag-dev.
109        "printk.time=1",
110        // Enable facility and level output on console for ohcldiag-dev.
111        "console_msg_format=syslog",
112        // Set uio parameter to configure vmbus ring buffer behavior.
113        "uio_hv_generic.no_mask=1",
114        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
115        // huge pages and the shared pages.
116        "coredump_filter=0x33",
117        // PERF: No processor frequency governing.
118        "cpufreq.off=1",
119        // PERF: Disable the CPU idle time management entirely. It does not
120        // prevent the idle loop from running on idle CPUs, but it prevents
121        // the CPU idle time governors and drivers from being invoked.
122        "cpuidle.off=1",
123        // PERF: No perf checks for crypto algorithms to boot faster.
124        // Would have to evaluate the perf wins on the crypto manager vs
125        // delaying the boot up.
126        "cryptomgr.notests",
127        // PERF: Idle threads use HLT on x64 if there is no work.
128        // Believed to be a compromise between waking up the processor
129        // and the power consumption.
130        "idle=halt",
131        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
132        // Boot Flag) or allocate the real-mode trampoline for APs.
133        "initcall_blacklist=init_real_mode,sbf_init",
134        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
135        "lpj=3000000",
136        // PERF: No broken timer check to boot faster.
137        "no_timer_check",
138        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions being
139        // much slower. The xsave state is shared between VTLs, and we don't
140        // context switch it in the kernel when leaving/entering VTL2.
141        // Removing this will lead to corrupting register state and the
142        // undefined behaviour.
143        "noxsave",
144        // RELIABILITY: Panic on MCEs and faults in the kernel.
145        "oops=panic",
146        // RELIABILITY: Don't panic on kernel warnings.
147        "panic_on_warn=0",
148        // PERF, RELIABILITY: Don't print detailed information about the failing
149        // processes (memory maps, threads).
150        "panic_print=0",
151        // RELIABILITY: Reboot immediately on panic, no timeout.
152        "panic=-1",
153        // RELIABILITY: Don't print processor context information on a fatal
154        // signal. Our crash dump collection infrastructure seems reliable, and
155        // this information doesn't seem useful without a dump anyways.
156        // Additionally it may push important logs off the end of the kmsg
157        // page logged by the host.
158        //"print_fatal_signals=0",
159        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
160        "printk.devkmsg=on",
161        // RELIABILITY: Reboot using a triple fault as the fastest method.
162        // That is also the method used for compatibility with earlier versions
163        // of the Microsoft HCL.
164        "reboot=t",
165        // CONFIG-STATIC: Type of the root file system.
166        "rootfstype=tmpfs",
167        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
168        // scheduler timer periodically, which introduces jitters for VTL0.
169        "sysctl.vm.compaction_proactiveness=0",
170        // PERF: No TSC stability check when booting up to boot faster,
171        // also no validation during runtime.
172        "tsc=reliable",
173        // RELIABILITY: Panic on receiving an NMI.
174        "unknown_nmi_panic=1",
175        // Use vfio for MANA devices.
176        "vfio_pci.ids=1414:00ba",
177        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
178        // and no DMA translation.
179        "vfio.enable_unsafe_noiommu_mode=1",
180        // Specify the init path.
181        "rdinit=/underhill-init",
182        // Default to user-mode NVMe driver.
183        "OPENHCL_NVME_VFIO=1",
184        // The next three items reduce the memory overhead of the storvsc driver.
185        // Since it is only used for DVD, performance is not critical.
186        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
187        // Fix number of hardware queues at 2.
188        "hv_storvsc.storvsc_max_hw_queues=2",
189        // Reduce the ring buffer size to 32K.
190        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
191        // Disable eager mimalloc commit to prevent core dumps from being overly large
192        "MIMALLOC_ARENA_EAGER_COMMIT=0",
193        // Disable acpi runtime support. Unused in underhill, but some support
194        // is compiled in for the kernel (ie TDX mailbox protocol).
195        "acpi=off",
196    ];
197
198    const X86_KERNEL_PARAMETERS: &[&str] = &[
199        // Disable pcid support. This is a temporary fix to allow
200        // Underhill to run nested inside AMD VMs. Otherwise, the
201        // Underhill kernel tries to start APs with PCID bits set in CR3
202        // without the PCIDE bit set in CR4, which is an invalid
203        // VP state (according to the mshv nested implementation).
204        //
205        // TODO: remove this once we figure out the root cause and apply
206        // a workaround/fix elsewhere.
207        "clearcpuid=pcid",
208        // Disable all attempts to use an IOMMU, including swiotlb.
209        "iommu=off",
210        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
211        // this changes, we will explicitly enumerate a PCI bus via devicetree.
212        "pci=off",
213    ];
214
215    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];
216
217    for p in KERNEL_PARAMETERS {
218        write!(cmdline, "{p} ")?;
219    }
220
221    let arch_parameters = if cfg!(target_arch = "x86_64") {
222        X86_KERNEL_PARAMETERS
223    } else {
224        AARCH64_KERNEL_PARAMETERS
225    };
226    for p in arch_parameters {
227        write!(cmdline, "{p} ")?;
228    }
229
230    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
231        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
232        // (iommu=off ignored entirely), and CVMs (memory encryption forces it
233        // on). Set it to a single area in 8MB. The first parameter controls the
234        // area size in slabs (2KB per slab), the second controls the number of
235        // areas (default is # of CPUs).
236        //
237        // This is set to 8MB on hardware isolated VMs since there are some
238        // scenarios, such as provisioning over DVD, which require a larger size
239        // since the buffer is being used.
240        "swiotlb=4096,1",
241    ];
242
243    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
244        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
245        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
246        // The first parameter controls the area size, the second controls the
247        // number of areas (default is # of CPUs). Set them both to the minimum.
248        "swiotlb=1,1",
249    ];
250
251    if params.isolation_type.is_hardware_isolated() {
252        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
253            write!(cmdline, "{p} ")?;
254        }
255    } else {
256        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
257            write!(cmdline, "{p} ")?;
258        }
259    }
260
261    // Enable the com3 console by default if it's available and we're not
262    // isolated, or if we are isolated but also have debugging enabled.
263    //
264    // Otherwise, set the console to ttynull so the kernel does not default to
265    // com1. This is overridden by any user customizations in the static or
266    // dynamic command line, as this console argument provided by the bootloader
267    // comes first.
268    write!(cmdline, "console=")?;
269    match (&partition_info.com3_serial, can_trust_host) {
270        (ComInfo::Ns16550 { current_speed, .. }, true) => {
271            write!(cmdline, "ttyS2,{current_speed} ")?
272        }
273        (ComInfo::Pl011 { current_speed, .. }, true) => {
274            write!(cmdline, "ttyAMA0,{current_speed} ")?
275        }
276        _ => write!(cmdline, "ttynull ")?,
277    }
278
279    if params.isolation_type != IsolationType::None {
280        write!(
281            cmdline,
282            "{}=1 ",
283            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
284        )?;
285    }
286
287    if is_confidential_debug {
288        write!(
289            cmdline,
290            "{}=1 ",
291            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
292        )?;
293    }
294
295    // Generate the NVMe keep alive command line which should look something
296    // like: OPENHCL_NVME_KEEP_ALIVE=disabled,host,privatepool
297    // TODO: Move from command line to device tree when stabilized.
298    write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=")?;
299
300    if partition_info.boot_options.disable_nvme_keep_alive {
301        write!(cmdline, "disabled,")?;
302    }
303
304    if partition_info.nvme_keepalive {
305        write!(cmdline, "host,")?;
306    } else {
307        write!(cmdline, "nohost,")?;
308    }
309
310    if vtl2_pool_supported {
311        write!(cmdline, "privatepool ")?;
312    } else {
313        write!(cmdline, "noprivatepool ")?;
314    }
315
316    if let Some(sidecar) = sidecar {
317        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
318    }
319
320    if !cmdline.contains("hv_vmbus.message_connection_id") {
321        // HACK: Set the vmbus connection id via kernel commandline if we haven't
322        // gotten one from elsewhere.
323        //
324        // This code will be removed when the kernel supports setting connection id
325        // via device tree.
326        write!(
327            cmdline,
328            "hv_vmbus.message_connection_id=0x{:x} ",
329            partition_info.vmbus_vtl2.connection_id
330        )?;
331    }
332
333    // Prepend the computed parameters to the original command line.
334    cmdline.write_str(&partition_info.cmdline)?;
335
336    Ok(())
337}
338
/// Size in bytes of the buffer handed to the kernel as its flattened device
/// tree (including the leading `setup_data` header, see [`Fdt`]).
///
/// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
/// that is the maximum size the kernel can use during its early boot processes.
/// We also want our FDT to be as large as possible to support as many vCPUs as
/// possible. We set it to 256KB, but it must also be page-aligned, as leaving it
/// unaligned runs the possibility of it taking up 1 too many pages, resulting in
/// a 260KB mapping, which will fail.
const FDT_SIZE: usize = 256 * 1024;
346
/// Page-aligned buffer of exactly `FDT_SIZE` bytes: a `setup_data` header
/// followed by the bytes the device tree is written into.
#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    /// `setup_data` header describing `data`.
    header: setup_data,
    /// Device tree storage; sized so that header plus data total `FDT_SIZE`.
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
353
354/// Raw shim parameters are provided via a relative offset from the base of
355/// where the shim is loaded. Return a ShimParams structure based on the raw
356/// offset based RawShimParams.
357fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
358    unsafe extern "C" {
359        static __ehdr_start: u8;
360    }
361
362    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;
363
364    // SAFETY: The host is required to relocate everything by the same bias, so
365    //         the shim parameters should be at the build time specified offset
366    //         from the base address of the image.
367    let raw_shim_params = unsafe {
368        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
369    };
370
371    ShimParams::new(shim_base as u64, raw_shim_params)
372}
373
374#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
375mod x86_boot {
376    use crate::PageAlign;
377    use crate::memory::AddressSpaceManager;
378    use crate::single_threaded::OffStackRef;
379    use crate::single_threaded::off_stack;
380    use crate::zeroed;
381    use core::mem::size_of;
382    use core::ops::Range;
383    use core::ptr;
384    use loader_defs::linux::E820_RAM;
385    use loader_defs::linux::E820_RESERVED;
386    use loader_defs::linux::SETUP_E820_EXT;
387    use loader_defs::linux::boot_params;
388    use loader_defs::linux::e820entry;
389    use loader_defs::linux::setup_data;
390    use loader_defs::shim::MemoryVtlType;
391    use memory_range::MemoryRange;
392    use zerocopy::FromZeros;
393    use zerocopy::Immutable;
394    use zerocopy::KnownLayout;
395
    /// Extended e820 table: a `setup_data` header followed by up to 512
    /// additional entries, used once the table inside `boot_params` is full.
    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        /// `setup_data` header; its `len` is set to the number of bytes of
        /// `entries` actually in use.
        pub header: setup_data,
        /// Overflow e820 entries.
        pub entries: [e820entry; 512],
    }
402
403    fn add_e820_entry(
404        entry: Option<&mut e820entry>,
405        range: MemoryRange,
406        typ: u32,
407    ) -> Result<(), BuildE820MapError> {
408        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
409            addr: range.start().into(),
410            size: range.len().into(),
411            typ: typ.into(),
412        };
413        Ok(())
414    }
415
    /// Errors that can occur while building the e820 map.
    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Out of e820 entries: both the table in `boot_params` and the
        /// extended table are full.
        OutOfE820Entries,
    }
421
422    /// Build the e820 map for the kernel representing usable VTL2 ram.
423    pub fn build_e820_map(
424        boot_params: &mut boot_params,
425        ext: &mut E820Ext,
426        address_space: &AddressSpaceManager,
427    ) -> Result<bool, BuildE820MapError> {
428        boot_params.e820_entries = 0;
429        let mut entries = boot_params
430            .e820_map
431            .iter_mut()
432            .chain(ext.entries.iter_mut());
433
434        let mut n = 0;
435        for (range, typ) in address_space.vtl2_ranges() {
436            match typ {
437                MemoryVtlType::VTL2_RAM => {
438                    add_e820_entry(entries.next(), range, E820_RAM)?;
439                    n += 1;
440                }
441                MemoryVtlType::VTL2_CONFIG
442                | MemoryVtlType::VTL2_SIDECAR_IMAGE
443                | MemoryVtlType::VTL2_SIDECAR_NODE
444                | MemoryVtlType::VTL2_RESERVED
445                | MemoryVtlType::VTL2_GPA_POOL
446                | MemoryVtlType::VTL2_TDX_PAGE_TABLES
447                | MemoryVtlType::VTL2_BOOTSHIM_LOG_BUFFER
448                | MemoryVtlType::VTL2_PERSISTED_STATE_HEADER
449                | MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF => {
450                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
451                    n += 1;
452                }
453
454                _ => {
455                    panic!("unexpected vtl2 ram type {typ:?} for range {range:#?}");
456                }
457            }
458        }
459
460        let base = n.min(boot_params.e820_map.len());
461        boot_params.e820_entries = base as u8;
462
463        if base < n {
464            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
465            Ok(true)
466        } else {
467            Ok(false)
468        }
469    }
470
471    pub fn build_boot_params(
472        address_space: &AddressSpaceManager,
473        initrd: Range<u64>,
474        cmdline: &str,
475        setup_data_head: *const setup_data,
476        setup_data_tail: &mut &mut setup_data,
477    ) -> OffStackRef<'static, PageAlign<boot_params>> {
478        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
479        let boot_params = &mut boot_params_storage.0;
480        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type
481
482        // HACK: A kernel change just in the Underhill kernel tree has a workaround
483        // to disable probe_roms and reserve_bios_regions when X86_SUBARCH_LGUEST
484        // (1) is set by the bootloader. This stops the kernel from reading VTL0
485        // memory during kernel boot, which can have catastrophic consequences
486        // during a servicing operation when VTL0 has written values to memory, or
487        // unaccepted page accesses in an isolated partition.
488        //
489        // This is only intended as a stopgap until a suitable upstreamable kernel
490        // patch is made.
491        boot_params.hdr.hardware_subarch = 1.into();
492
493        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
494        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
495        let initrd_len = initrd.end - initrd.start;
496        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
497        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;
498
499        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));
500
501        let used_ext = build_e820_map(boot_params, e820_ext, address_space)
502            .expect("building e820 map must succeed");
503
504        if used_ext {
505            e820_ext.header.ty = SETUP_E820_EXT;
506            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
507            *setup_data_tail = &mut e820_ext.header;
508        }
509
510        let cmd_line_addr = cmdline.as_ptr() as u64;
511        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
512        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;
513
514        boot_params.hdr.setup_data = (setup_data_head as u64).into();
515
516        boot_params_storage
517    }
518}
519
520/// Build the cc_blob containing the location of different parameters associated with SEV.
521#[cfg(target_arch = "x86_64")]
522fn build_cc_blob_sev_info(
523    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
524    shim_params: &ShimParams,
525) {
526    // TODO SNP: Currently only the first CPUID page is passed through.
527    // Consider changing this.
528    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
529    cc_blob.version = 0;
530    cc_blob._reserved = 0;
531    cc_blob.secrets_phys = shim_params.secrets_start();
532    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
533    cc_blob._rsvd1 = 0;
534    cc_blob.cpuid_phys = shim_params.cpuid_start();
535    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
536    cc_blob._rsvd2 = 0;
537}
538
/// Wrapper that forces its contents to 4096-byte alignment.
#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);
542
543const fn zeroed<T: FromZeros>() -> T {
544    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
545    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
546}
547
548fn get_ref_time(isolation: IsolationType) -> Option<u64> {
549    match isolation {
550        #[cfg(target_arch = "x86_64")]
551        IsolationType::Tdx => get_tdx_tsc_reftime(),
552        #[cfg(target_arch = "x86_64")]
553        IsolationType::Snp => None,
554        _ => Some(minimal_rt::reftime::reference_time()),
555    }
556}
557
/// Main body of the boot shim.
///
/// Reads the shim parameters and the host device tree, sets up VTL2 memory
/// and VPs, starts the sidecar, builds the kernel command line, the device
/// tree and (on x86_64) the boot parameters, and finally jumps to the kernel
/// entry point. Never returns.
///
/// `shim_params_raw_offset` is passed straight to [`shim_parameters`].
fn shim_main(shim_params_raw_offset: isize) -> ! {
    let p = shim_parameters(shim_params_raw_offset);
    if p.isolation_type == IsolationType::None {
        enable_enlightened_panic();
    }

    #[cfg(feature = "cvm_boot_log")]
    arch::initialize_serial_io(&p);

    // Enable the in-memory log.
    boot_logger_memory_init(p.log_buffer);

    // Enable global log crate.
    log::set_logger(&boot_logger::BOOT_LOGGER).unwrap();
    // TODO: allow overriding filter at runtime
    log::set_max_level(log::LevelFilter::Info);

    // Captured as early as possible; paired with the end time taken just
    // before the device tree is written.
    let boot_reftime = get_ref_time(p.isolation_type);

    // The support code for the fast hypercalls does not set
    // the Guest ID if it is not set yet as opposed to the slow
    // hypercall code path where that is done automatically.
    // Thus the fast hypercalls will fail as the Guest ID has
    // to be set first hence initialize hypercall support
    // explicitly.
    if !p.isolation_type.is_hardware_isolated() {
        hvcall().initialize();
    }

    let mut static_options = BootCommandLineOptions::new();
    if let Some(cmdline) = p.command_line().command_line() {
        static_options.parse(cmdline);
    }

    // The host is trusted when not isolated, or when confidential debug was
    // requested through the static command line.
    let static_confidential_debug = static_options.confidential_debug;
    let can_trust_host = p.isolation_type == IsolationType::None || static_confidential_debug;

    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
    let address_space = OffStackRef::leak(off_stack!(
        AddressSpaceManager,
        AddressSpaceManager::new_const()
    ));
    let partition_info = match PartitionInfo::read_from_dt(
        &p,
        &mut dt_storage,
        address_space,
        static_options,
        can_trust_host,
    ) {
        Ok(val) => val,
        Err(e) => panic!("unable to read device tree params {:?}", e),
    };

    // Enable logging ASAP. This is fine even when isolated, as we don't have
    // any access to secrets in the boot shim.
    boot_logger_runtime_init(p.isolation_type, partition_info.com3_serial.clone());
    log::info!("openhcl_boot: logging enabled");
    log::info!("serial configuration: {:#x?}", partition_info.com3_serial);

    // Confidential debug will show up in boot_options only if included in the
    // static command line, or if can_trust_host is true (so the dynamic command
    // line has been parsed).
    let is_confidential_debug =
        static_confidential_debug || partition_info.boot_options.confidential_debug;

    // Fill out the non-devicetree derived parts of PartitionInfo.
    if !p.isolation_type.is_hardware_isolated()
        && hvcall().vtl() == Vtl::Vtl2
        && hvdef::HvRegisterVsmCapabilities::from(
            hvcall()
                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
                .expect("failed to query vsm capabilities")
                .as_u64(),
        )
        .vtl0_alias_map_available()
    {
        // If the vtl0 alias map was not provided in the devicetree, attempt to
        // derive it from the architectural physical address bits.
        //
        // The value in the ID_AA64MMFR0_EL1 register used to determine the
        // physical address bits can only represent multiples of 4. As a result,
        // the Surface Pro X (and systems with similar CPUs) cannot properly
        // report their address width of 39 bits. This causes the calculated
        // alias map to be incorrect, which results in panics when trying to
        // read memory and getting invalid data.
        if partition_info.vtl0_alias_map.is_none() {
            partition_info.vtl0_alias_map =
                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
        }
    } else {
        // Ignore any devicetree-provided alias map if the conditions above
        // aren't met.
        partition_info.vtl0_alias_map = None;
    }

    // Rebind partition_info as no longer mutable.
    let partition_info: &PartitionInfo = partition_info;

    if partition_info.cpus.is_empty() {
        panic!("no cpus");
    }

    validate_vp_hw_ids(partition_info);

    setup_vtl2_memory(&p, partition_info, address_space);
    setup_vtl2_vp(partition_info);

    verify_imported_regions_hash(&p);

    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
    let sidecar = sidecar::start_sidecar(
        &p,
        partition_info,
        address_space,
        &mut sidecar_params.0,
        &mut sidecar_output.0,
    );

    // Rebind address_space as no longer mutable.
    let address_space: &AddressSpaceManager = address_space;

    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
    build_kernel_command_line(BuildKernelCommandLineParams {
        params: &p,
        cmdline: &mut cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar: sidecar.as_ref(),
        vtl2_pool_supported: address_space.has_vtl2_pool(),
    })
    .unwrap();

    let mut fdt = off_stack!(Fdt, zeroed());
    fdt.header.len = fdt.data.len() as u32;
    fdt.header.ty = SETUP_DTB;

    // On x86_64 the FDT header is the head of the setup_data list; further
    // entries are chained onto the tail below.
    #[cfg(target_arch = "x86_64")]
    let mut setup_data_tail = &mut fdt.header;
    #[cfg(target_arch = "x86_64")]
    let setup_data_head = core::ptr::from_ref(setup_data_tail);

    #[cfg(target_arch = "x86_64")]
    if p.isolation_type == IsolationType::Snp {
        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
        build_cc_blob_sev_info(cc_blob, &p);

        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;

        // Chain in the setup data.
        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
        setup_data_tail = &mut cc_data.header;
    }

    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;

    // Validate the initrd crc matches what was put at file generation time.
    let computed_crc = crc32fast::hash(p.initrd());
    assert_eq!(
        computed_crc, p.initrd_crc,
        "computed initrd crc does not match build time calculated crc"
    );

    #[cfg(target_arch = "x86_64")]
    let boot_params = x86_boot::build_boot_params(
        address_space,
        initrd.clone(),
        &cmdline,
        setup_data_head,
        &mut setup_data_tail,
    );

    // Compute the ending boot time. This has to be before writing to device
    // tree, so this is as late as we can do it.

    let boot_times = boot_reftime.map(|start| BootTimes {
        start,
        end: get_ref_time(p.isolation_type).unwrap_or(0),
    });

    // Validate that every imported region that is not part of vtl2 ram was
    // already accepted (i.e. is not pending).
    for (range, result) in walk_ranges(
        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
        p.imported_regions(),
    ) {
        match result {
            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
            RangeWalkResult::Right(accepted) => {
                // Ranges that are not a part of VTL2 ram must have been
                // preaccepted, as usermode expects that to be the case.
                assert!(
                    accepted,
                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
                    range
                );
            }
        }
    }

    write_dt(
        &mut fdt.data,
        partition_info,
        address_space,
        p.imported_regions().map(|r| {
            // Discard if the range was previously pending - the bootloader has
            // accepted all pending ranges.
            //
            // NOTE: No VTL0 memory today is marked as pending. The check above
            // validates that, and this code may need to change if this becomes
            // no longer true.
            r.0
        }),
        initrd,
        &cmdline,
        sidecar.as_ref(),
        boot_times,
        p.isolation_type,
    )
    .unwrap();

    rt::verify_stack_cookie();

    log::info!("uninitializing hypercalls");
    #[cfg(not(feature = "cvm_boot_log"))]
    log::info!("about to jump to kernel");

    hvcall().uninitialize();

    #[cfg(feature = "cvm_boot_log")]
    {
        log::info!("uninitializing serial io");
        log::info!("about to jump to kernel");
        arch::uninitialize_serial_io(&p);
    }

    cfg_if::cfg_if! {
        if #[cfg(target_arch = "x86_64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            kernel_entry(0, &boot_params.0)
        } else if #[cfg(target_arch = "aarch64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
            // Flush (and invalidate) the caches, as that is required for disabling MMU.
            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
            unsafe {
                core::arch::asm!(
                    "
                    mrs     {0}, sctlr_el1
                    bic     {0}, {0}, #0x1
                    msr     sctlr_el1, {0}
                    tlbi    vmalle1
                    dsb     sy
                    isb     sy",
                    lateout(reg) _,
                );
            }
            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
        } else {
            panic!("unsupported arch")
        }
    }
}
829
830/// Ensure that mshv VP indexes for the CPUs listed in the partition info
831/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
832/// this will be the case.
833fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
834    use host_params::MAX_CPU_COUNT;
835    use hypercall::HwId;
836
837    if partition_info.isolation.is_hardware_isolated() {
838        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
839        // with the hypervisor here, so we can't easily perform the check. Since
840        // there is no security impact to this check, we can skip it for now; if
841        // the VM fails to boot, then this is due to a host contract violation.
842        //
843        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
844        // indexes correspond to the APIC IDs in the right order. I am not
845        // certain if there are places where we depend on this mapping today.
846        return;
847    }
848
849    if hvcall().vtl() != Vtl::Vtl2 {
850        // If we're not using guest VSM, then the guest won't communicate
851        // directly with the hypervisor, so we can choose the VP indexes
852        // ourselves.
853        return;
854    }
855
856    // Ensure the host and hypervisor agree on VP index ordering.
857
858    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
859    hw_ids.clear();
860    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
861    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
862    vp_indexes.clear();
863    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
864        panic!(
865            "failed to get VP index for hardware ID {:#x}: {}",
866            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
867            err
868        );
869    }
870    if let Some((i, &vp_index)) = vp_indexes
871        .iter()
872        .enumerate()
873        .find(|&(i, vp_index)| i as u32 != *vp_index)
874    {
875        panic!(
876            "CPU hardware ID {:#x} does not correspond to VP index {}",
877            hw_ids[i], vp_index
878        );
879    }
880}
881
// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
//
// This placeholder entry point is only compiled when the `minimal_rt` cfg is
// not set. It panics via `unimplemented!` if it is ever executed.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}
888
889#[cfg(test)]
890mod test {
891    use super::x86_boot::E820Ext;
892    use super::x86_boot::build_e820_map;
893    use crate::cmdline::BootCommandLineOptions;
894    use crate::dt::write_dt;
895    use crate::host_params::MAX_CPU_COUNT;
896    use crate::host_params::PartitionInfo;
897    use crate::host_params::shim_params::IsolationType;
898    use crate::memory::AddressSpaceManager;
899    use crate::memory::AddressSpaceManagerBuilder;
900    use arrayvec::ArrayString;
901    use arrayvec::ArrayVec;
902    use core::ops::Range;
903    use host_fdt_parser::ComInfo;
904    use host_fdt_parser::CpuEntry;
905    use host_fdt_parser::MemoryEntry;
906    use host_fdt_parser::VmbusInfo;
907    use igvm_defs::MemoryMapEntryType;
908    use loader_defs::linux::E820_RAM;
909    use loader_defs::linux::E820_RESERVED;
910    use loader_defs::linux::boot_params;
911    use loader_defs::linux::e820entry;
912    use memory_range::MemoryRange;
913    use memory_range::subtract_ranges;
914    use sidecar_defs::PerCpuState;
915    use zerocopy::FromZeros;
916
917    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
918    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
919    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;
920
921    /// Create partition info with given cpu count enabled and sequential
922    /// apic_ids.
923    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
924        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();
925
926        for id in 0..(cpu_count as u64) {
927            cpus.push(CpuEntry { reg: id, vnode: 0 });
928        }
929
930        let mut mmio = ArrayVec::new();
931        mmio.push(
932            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
933        );
934
935        PartitionInfo {
936            vtl2_ram: ArrayVec::new(),
937            partition_ram: ArrayVec::new(),
938            isolation: IsolationType::None,
939            bsp_reg: cpus[0].reg as u32,
940            cpus,
941            sidecar_cpu_overrides: PerCpuState {
942                per_cpu_state_specified: false,
943                sidecar_starts_cpu: [true; sidecar_defs::NUM_CPUS_SUPPORTED_FOR_PER_CPU_STATE],
944            },
945            cmdline: ArrayString::new(),
946            vmbus_vtl2: VmbusInfo {
947                mmio,
948                connection_id: 0,
949            },
950            vmbus_vtl0: VmbusInfo {
951                mmio: ArrayVec::new(),
952                connection_id: 0,
953            },
954            com3_serial: ComInfo::None,
955            gic: None,
956            pmu_gsiv: None,
957            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
958            entropy: None,
959            vtl0_alias_map: None,
960            nvme_keepalive: false,
961            boot_options: BootCommandLineOptions::new(),
962        }
963    }
964
965    // ensure we can boot with a _lot_ of vcpus
966    #[test]
967    #[cfg_attr(
968        target_arch = "aarch64",
969        ignore = "TODO: investigate why this doesn't always work on ARM"
970    )]
971    fn fdt_cpu_scaling() {
972        const MAX_CPUS: usize = 2048;
973
974        let mut buf = [0; 0x40000];
975        write_dt(
976            &mut buf,
977            &new_partition_info(MAX_CPUS),
978            &AddressSpaceManager::new_const(),
979            [],
980            0..0,
981            &ArrayString::from("test").unwrap_or_default(),
982            None,
983            None,
984            IsolationType::None,
985        )
986        .unwrap();
987    }
988
    // Must match the DeviceTree blob generated with the standard tooling
    // to ensure being compliant to the standards (or, at least, compatibility
    // with a widely used implementation).
    // For details on regenerating the test content, see `fdt_dtc_decompile`
    // below.
    //
    // The reference blob is described as a list of (offset, bytes) spans; any
    // byte not covered by a span is expected to be zero.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        // Assemble the expected blob: start from all zeroes and copy each span
        // in at its offset.
        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        // Generate the actual blob into an equally sized, zero-filled buffer.
        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        // The generated blob must match the reference byte for byte.
        assert!(sample_buf == buf);
    }
1064
1065    // This test should be manually enabled when need to regenerate
1066    // the sample content above and validate spec compliance with `dtc`.
1067    // Before running the test, please install the DeviceTree compiler:
1068    // ```shell
1069    // sudo apt-get update && sudo apt-get install device-tree-compiler
1070    // ```
1071    #[test]
1072    #[ignore = "enabling the test requires installing additional software, \
1073                and developers will experience a break."]
1074    fn fdt_dtc_decompile() {
1075        const MAX_CPUS: usize = 2048;
1076
1077        let mut buf = [0; 0x40000];
1078        write_dt(
1079            &mut buf,
1080            &new_partition_info(MAX_CPUS),
1081            &AddressSpaceManager::new_const(),
1082            [],
1083            0..0,
1084            &ArrayString::from("test").unwrap_or_default(),
1085            None,
1086            None,
1087            IsolationType::None,
1088        )
1089        .unwrap();
1090
1091        let input_dtb_file_name = "openhcl_boot.dtb";
1092        let output_dts_file_name = "openhcl_boot.dts";
1093        std::fs::write(input_dtb_file_name, buf).unwrap();
1094        let success = std::process::Command::new("dtc")
1095            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
1096            .status()
1097            .unwrap()
1098            .success();
1099        assert!(success);
1100    }
1101
1102    fn new_address_space_manager(
1103        ram: &[MemoryRange],
1104        bootshim_used: MemoryRange,
1105        persisted_range: MemoryRange,
1106        parameter_range: MemoryRange,
1107        reclaim: Option<MemoryRange>,
1108    ) -> AddressSpaceManager {
1109        let ram = ram
1110            .iter()
1111            .cloned()
1112            .map(|range| MemoryEntry {
1113                range,
1114                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
1115                vnode: 0,
1116            })
1117            .collect::<Vec<_>>();
1118        let mut address_space = AddressSpaceManager::new_const();
1119        AddressSpaceManagerBuilder::new(
1120            &mut address_space,
1121            &ram,
1122            bootshim_used,
1123            persisted_range,
1124            subtract_ranges([parameter_range], reclaim),
1125        )
1126        .init()
1127        .unwrap();
1128        address_space
1129    }
1130
1131    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
1132        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
1133            .iter()
1134            .chain(
1135                ext.entries
1136                    .iter()
1137                    .take((ext.header.len as usize) / size_of::<e820entry>()),
1138            );
1139
1140        assert_eq!(actual.clone().count(), expected.len());
1141
1142        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
1143            let addr: u64 = actual.addr.into();
1144            let size: u64 = actual.size.into();
1145            let typ: u32 = actual.typ.into();
1146            assert_eq!(addr, expected_range.start);
1147            assert_eq!(size, expected_range.end - expected_range.start);
1148            assert_eq!(typ, *expected_type);
1149        }
1150    }
1151
1152    const PAGE_SIZE: u64 = 0x1000;
1153    const ONE_MB: u64 = 0x10_0000;
1154
    // Build the e820 map for several small address-space layouts and compare
    // it against a hand-written list of (range, type) entries. Each scenario
    // starts from freshly zeroed `boot_params` and extension structures.
    #[test]
    fn test_e820_basic() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..3 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..4 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            None,
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // Expected: the persisted region shows up as two reserved entries
        // (first page, then the rest), the parameter range is reserved, and
        // everything else is RAM.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
            ],
        );

        // memmap with reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..6 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // Expected: the reclaimed slice in the middle of the parameter range
        // is reported as RAM, splitting the reserved parameter area in two.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
            ],
        );

        // two mem ranges
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..10 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // Expected: same layout as the reclaim case; the boundary between the
        // two input RAM ranges does not appear as a separate entry.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
            ],
        );

        // memmap in 1 mb chunks
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..2 * ONE_MB),
                MemoryRange::new(2 * ONE_MB..3 * ONE_MB),
                MemoryRange::new(3 * ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..5 * ONE_MB),
                MemoryRange::new(5 * ONE_MB..6 * ONE_MB),
                MemoryRange::new(6 * ONE_MB..7 * ONE_MB),
                MemoryRange::new(7 * ONE_MB..8 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // Expected: the trailing 1 MB chunks are reported as one contiguous
        // RAM entry rather than one entry per input range.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..8 * ONE_MB, E820_RAM),
            ],
        );
    }
1297
1298    // test e820 with spillover into ext
1299    #[test]
1300    fn test_e820_huge() {
1301        use crate::memory::AllocationPolicy;
1302        use crate::memory::AllocationType;
1303
1304        // Create 64 RAM ranges, then allocate 256 ranges to test spillover
1305        // boot_params.e820_map has E820_MAX_ENTRIES_ZEROPAGE (128) entries
1306        const E820_MAX_ENTRIES_ZEROPAGE: usize = 128;
1307        const RAM_RANGES: usize = 64;
1308        const TOTAL_ALLOCATIONS: usize = 256;
1309
1310        // Create 64 large RAM ranges (64MB each = 64 * 1MB pages per range)
1311        let mut ranges = Vec::new();
1312        for i in 0..RAM_RANGES {
1313            let start = (i as u64) * 64 * ONE_MB;
1314            let end = start + 64 * ONE_MB;
1315            ranges.push(MemoryRange::new(start..end));
1316        }
1317
1318        let bootshim_used = MemoryRange::try_new(0..ONE_MB * 2).unwrap();
1319        let persisted_range = MemoryRange::try_new(0..ONE_MB).unwrap();
1320        let parameter_range = MemoryRange::try_new(ONE_MB..2 * ONE_MB).unwrap();
1321
1322        let mut address_space = {
1323            let ram = ranges
1324                .iter()
1325                .cloned()
1326                .map(|range| MemoryEntry {
1327                    range,
1328                    mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
1329                    vnode: 0,
1330                })
1331                .collect::<Vec<_>>();
1332            let mut address_space = AddressSpaceManager::new_const();
1333            AddressSpaceManagerBuilder::new(
1334                &mut address_space,
1335                &ram,
1336                bootshim_used,
1337                persisted_range,
1338                core::iter::once(parameter_range),
1339            )
1340            .init()
1341            .unwrap();
1342            address_space
1343        };
1344
1345        for i in 0..TOTAL_ALLOCATIONS {
1346            // Intersperse sidecar node allocations with gpa pool allocations,
1347            // as otherwise the address space manager will collapse adjacent
1348            // ranges of the same type.
1349            let _allocated = address_space
1350                .allocate(
1351                    None,
1352                    ONE_MB,
1353                    if i % 2 == 0 {
1354                        AllocationType::GpaPool
1355                    } else {
1356                        AllocationType::SidecarNode
1357                    },
1358                    AllocationPolicy::LowMemory,
1359                )
1360                .expect("should be able to allocate sidecar node");
1361        }
1362
1363        let mut boot_params: boot_params = FromZeros::new_zeroed();
1364        let mut ext = FromZeros::new_zeroed();
1365        let total_ranges = address_space.vtl2_ranges().count();
1366
1367        let used_ext = build_e820_map(&mut boot_params, &mut ext, &address_space).unwrap();
1368
1369        // Verify that we used the extension
1370        assert!(used_ext, "should use extension when there are many ranges");
1371
1372        // Verify the standard e820_map is full
1373        assert_eq!(boot_params.e820_entries, E820_MAX_ENTRIES_ZEROPAGE as u8);
1374
1375        // Verify the extension has the overflow entries
1376        let ext_entries = (ext.header.len as usize) / size_of::<e820entry>();
1377        assert_eq!(ext_entries, total_ranges - E820_MAX_ENTRIES_ZEROPAGE);
1378
1379        // Verify we have the expected number of total ranges
1380        let total_e820_entries = boot_params.e820_entries as usize + ext_entries;
1381        assert_eq!(total_e820_entries, total_ranges);
1382    }
1383}