// openhcl_boot/main.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The openhcl boot loader, which loads before the kernel to set up the
5//! kernel's boot parameters.
6
7// See build.rs.
8#![cfg_attr(minimal_rt, no_std, no_main)]
9// UNSAFETY: Interacting with low level hardware and bootloader primitives.
10#![expect(unsafe_code)]
11// Allow the allocator api when compiling with `RUSTFLAGS="--cfg nightly"`. This
12// is used for some miri tests for testing the bump allocator.
13//
14// Do not use a normal feature, as that shows errors with rust-analyzer since
15// most people are using stable and enable all features. We could remove this
16// once the allocator_api feature is stable.
17#![cfg_attr(nightly, feature(allocator_api))]
18
19mod arch;
20mod boot_logger;
21mod cmdline;
22mod dt;
23mod host_params;
24mod hypercall;
25mod memory;
26mod rt;
27mod sidecar;
28mod single_threaded;
29
30use crate::arch::setup_vtl2_memory;
31use crate::arch::setup_vtl2_vp;
32#[cfg(target_arch = "x86_64")]
33use crate::arch::tdx::get_tdx_tsc_reftime;
34use crate::arch::verify_imported_regions_hash;
35use crate::boot_logger::boot_logger_memory_init;
36use crate::boot_logger::boot_logger_runtime_init;
37use crate::boot_logger::boot_logger_write_memory_log_to_runtime;
38use crate::boot_logger::log;
39use crate::hypercall::hvcall;
40use crate::memory::AddressSpaceManager;
41use crate::single_threaded::OffStackRef;
42use crate::single_threaded::off_stack;
43use arrayvec::ArrayString;
44use arrayvec::ArrayVec;
45use cmdline::BootCommandLineOptions;
46use core::fmt::Write;
47use dt::BootTimes;
48use dt::write_dt;
49use host_params::COMMAND_LINE_SIZE;
50use host_params::PartitionInfo;
51use host_params::shim_params::IsolationType;
52use host_params::shim_params::ShimParams;
53use hvdef::Vtl;
54use loader_defs::linux::SETUP_DTB;
55use loader_defs::linux::setup_data;
56use loader_defs::shim::ShimParamsRaw;
57use memory_range::RangeWalkResult;
58use memory_range::walk_ranges;
59use minimal_rt::enlightened_panic::enable_enlightened_panic;
60use sidecar::SidecarConfig;
61use sidecar_defs::SidecarOutput;
62use sidecar_defs::SidecarParams;
63use zerocopy::FromBytes;
64use zerocopy::FromZeros;
65use zerocopy::Immutable;
66use zerocopy::IntoBytes;
67use zerocopy::KnownLayout;
68
/// Error returned when the kernel command line exceeds the fixed-capacity
/// buffer it is being written into.
#[derive(Debug)]
struct CommandLineTooLong;

impl From<core::fmt::Error> for CommandLineTooLong {
    fn from(_err: core::fmt::Error) -> Self {
        // Writing into a fixed-capacity string fails only on overflow, so a
        // formatter error here always means the command line is too long.
        CommandLineTooLong
    }
}
77
/// Bundled inputs for [`build_kernel_command_line`], grouped into one struct
/// to keep the function signature manageable.
struct BuildKernelCommandLineParams<'a> {
    // Shim parameters, used for the isolation type.
    params: &'a ShimParams,
    // Output buffer the command line is written into.
    cmdline: &'a mut ArrayString<COMMAND_LINE_SIZE>,
    // Host-provided partition description (serial availability, vmbus ids,
    // host command line, boot options).
    partition_info: &'a PartitionInfo,
    // True when the host-provided command line may be appended (not isolated,
    // or confidential debug is enabled).
    can_trust_host: bool,
    // True when confidential debug was requested via static or dynamic
    // command line.
    is_confidential_debug: bool,
    // Sidecar configuration, if the sidecar kernel was started.
    sidecar: Option<&'a SidecarConfig<'a>>,
    // True when a VTL2 private pool is available (required for NVMe keepalive).
    vtl2_pool_supported: bool,
    // True when NVMe keepalive was explicitly disabled via boot options.
    disable_keep_alive: bool,
}
88
89/// Read and setup the underhill kernel command line into the specified buffer.
90fn build_kernel_command_line(
91    fn_params: BuildKernelCommandLineParams<'_>,
92) -> Result<(), CommandLineTooLong> {
93    let BuildKernelCommandLineParams {
94        params,
95        cmdline,
96        partition_info,
97        can_trust_host,
98        is_confidential_debug,
99        sidecar,
100        vtl2_pool_supported,
101        disable_keep_alive,
102    } = fn_params;
103
104    // For reference:
105    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
106    const KERNEL_PARAMETERS: &[&str] = &[
107        // If a console is specified, then write everything to it.
108        "loglevel=8",
109        // Use a fixed 128KB log buffer by default.
110        "log_buf_len=128K",
111        // Enable time output on console for ohcldiag-dev.
112        "printk.time=1",
113        // Enable facility and level output on console for ohcldiag-dev.
114        "console_msg_format=syslog",
115        // Set uio parameter to configure vmbus ring buffer behavior.
116        "uio_hv_generic.no_mask=1",
117        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
118        // huge pages and the shared pages.
119        "coredump_filter=0x33",
120        // PERF: No processor frequency governing.
121        "cpufreq.off=1",
122        // PERF: Disable the CPU idle time management entirely. It does not
123        // prevent the idle loop from running on idle CPUs, but it prevents
124        // the CPU idle time governors and drivers from being invoked.
125        "cpuidle.off=1",
126        // PERF: No perf checks for crypto algorithms to boot faster.
127        // Would have to evaluate the perf wins on the crypto manager vs
128        // delaying the boot up.
129        "cryptomgr.notests",
130        // PERF: Idle threads use HLT on x64 if there is no work.
131        // Believed to be a compromise between waking up the processor
132        // and the power consumption.
133        "idle=halt",
134        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
135        // Boot Flag) or allocate the real-mode trampoline for APs.
136        "initcall_blacklist=init_real_mode,sbf_init",
137        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
138        "lpj=3000000",
139        // PERF: No broken timer check to boot faster.
140        "no_timer_check",
141        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions being
142        // much slower. The xsave state is shared between VTLs, and we don't
143        // context switch it in the kernel when leaving/entering VTL2.
144        // Removing this will lead to corrupting register state and the
145        // undefined behaviour.
146        "noxsave",
147        // RELIABILITY: Panic on MCEs and faults in the kernel.
148        "oops=panic",
149        // RELIABILITY: Don't panic on kernel warnings.
150        "panic_on_warn=0",
151        // PERF, RELIABILITY: Don't print detailed information about the failing
152        // processes (memory maps, threads).
153        "panic_print=0",
154        // RELIABILITY: Reboot immediately on panic, no timeout.
155        "panic=-1",
156        // RELIABILITY: Don't print processor context information on a fatal
157        // signal. Our crash dump collection infrastructure seems reliable, and
158        // this information doesn't seem useful without a dump anyways.
159        // Additionally it may push important logs off the end of the kmsg
160        // page logged by the host.
161        //"print_fatal_signals=0",
162        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
163        "printk.devkmsg=on",
164        // RELIABILITY: Reboot using a triple fault as the fastest method.
165        // That is also the method used for compatibility with earlier versions
166        // of the Microsoft HCL.
167        "reboot=t",
168        // CONFIG-STATIC: Type of the root file system.
169        "rootfstype=tmpfs",
170        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
171        // scheduler timer periodically, which introduces jitters for VTL0.
172        "sysctl.vm.compaction_proactiveness=0",
173        // PERF: No TSC stability check when booting up to boot faster,
174        // also no validation during runtime.
175        "tsc=reliable",
176        // RELIABILITY: Panic on receiving an NMI.
177        "unknown_nmi_panic=1",
178        // Use vfio for MANA devices.
179        "vfio_pci.ids=1414:00ba",
180        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
181        // and no DMA translation.
182        "vfio.enable_unsafe_noiommu_mode=1",
183        // Specify the init path.
184        "rdinit=/underhill-init",
185        // Default to user-mode NVMe driver.
186        "OPENHCL_NVME_VFIO=1",
187        // The next three items reduce the memory overhead of the storvsc driver.
188        // Since it is only used for DVD, performance is not critical.
189        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
190        // Fix number of hardware queues at 2.
191        "hv_storvsc.storvsc_max_hw_queues=2",
192        // Reduce the ring buffer size to 32K.
193        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
194        // Disable eager mimalloc commit to prevent core dumps from being overly large
195        "MIMALLOC_ARENA_EAGER_COMMIT=0",
196    ];
197
198    const X86_KERNEL_PARAMETERS: &[&str] = &[
199        // Disable pcid support. This is a temporary fix to allow
200        // Underhill to run nested inside AMD VMs. Otherwise, the
201        // Underhill kernel tries to start APs with PCID bits set in CR3
202        // without the PCIDE bit set in CR4, which is an invalid
203        // VP state (according to the mshv nested implementation).
204        //
205        // TODO: remove this once we figure out the root cause and apply
206        // a workaround/fix elsewhere.
207        "clearcpuid=pcid",
208        // Disable all attempts to use an IOMMU, including swiotlb.
209        "iommu=off",
210        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
211        // this changes, we will explicitly enumerate a PCI bus via devicetree.
212        "pci=off",
213    ];
214
215    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];
216
217    for p in KERNEL_PARAMETERS {
218        write!(cmdline, "{p} ")?;
219    }
220
221    let arch_parameters = if cfg!(target_arch = "x86_64") {
222        X86_KERNEL_PARAMETERS
223    } else {
224        AARCH64_KERNEL_PARAMETERS
225    };
226    for p in arch_parameters {
227        write!(cmdline, "{p} ")?;
228    }
229
230    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
231        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
232        // (iommu=off ignored entirely), and CVMs (memory encryption forces it
233        // on). Set it to a single area in 8MB. The first parameter controls the
234        // area size in slabs (2KB per slab), the second controls the number of
235        // areas (default is # of CPUs).
236        //
237        // This is set to 8MB on hardware isolated VMs since there are some
238        // scenarios, such as provisioning over DVD, which require a larger size
239        // since the buffer is being used.
240        "swiotlb=4096,1",
241    ];
242
243    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
244        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
245        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
246        // The first parameter controls the area size, the second controls the
247        // number of areas (default is # of CPUs). Set them both to the minimum.
248        "swiotlb=1,1",
249    ];
250
251    if params.isolation_type.is_hardware_isolated() {
252        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
253            write!(cmdline, "{p} ")?;
254        }
255    } else {
256        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
257            write!(cmdline, "{p} ")?;
258        }
259    }
260
261    // Enable the com3 console by default if it's available and we're not
262    // isolated, or if we are isolated but also have debugging enabled.
263    //
264    // Otherwise, set the console to ttynull so the kernel does not default to
265    // com1. This is overridden by any user customizations in the static or
266    // dynamic command line, as this console argument provided by the bootloader
267    // comes first.
268    let console = if partition_info.com3_serial_available && can_trust_host {
269        "ttyS2,115200"
270    } else {
271        "ttynull"
272    };
273    write!(cmdline, "console={console} ")?;
274
275    if params.isolation_type != IsolationType::None {
276        write!(
277            cmdline,
278            "{}=1 ",
279            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
280        )?;
281    }
282
283    if is_confidential_debug {
284        write!(
285            cmdline,
286            "{}=1 ",
287            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
288        )?;
289    }
290
291    // Only when explicitly supported by Host.
292    // TODO: Move from command line to device tree when stabilized.
293    if partition_info.nvme_keepalive && vtl2_pool_supported && !disable_keep_alive {
294        write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=1 ")?;
295    }
296
297    if let Some(sidecar) = sidecar {
298        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
299    }
300
301    // HACK: Set the vmbus connection id via kernel commandline.
302    //
303    // This code will be removed when the kernel supports setting connection id
304    // via device tree.
305    write!(
306        cmdline,
307        "hv_vmbus.message_connection_id=0x{:x} ",
308        partition_info.vmbus_vtl2.connection_id
309    )?;
310
311    // If we're isolated we can't trust the host-provided cmdline
312    if can_trust_host {
313        // Prepend the computed parameters to the original command line.
314        cmdline.write_str(&partition_info.cmdline)?;
315    }
316
317    Ok(())
318}
319
// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
// that is the maximum size the kernel can use during its early boot processes.
// We also want our FDT to be as large as possible to support as many vCPUs as
// possible. We set it to 256KB, but it must also be page-aligned, as leaving it
// unaligned runs the possibility of it taking up 1 too many pages, resulting in
// a 260KB mapping, which will fail.
const FDT_SIZE: usize = 256 * 1024;

/// Page-aligned buffer holding the flattened device tree, prefixed with a
/// Linux `setup_data` header so it can be chained into the kernel's boot
/// parameter list.
#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    // `setup_data` header; `ty`/`len` are filled in before use.
    header: setup_data,
    // FDT payload, sized so that header + data total exactly FDT_SIZE.
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
334
/// Raw shim parameters are provided via a relative offset from the base of
/// where the shim is loaded. Return a ShimParams structure based on the raw
/// offset based RawShimParams.
fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
    // `__ehdr_start` is a linker-provided symbol marking the start of the
    // loaded ELF image; its address is the shim's runtime base.
    unsafe extern "C" {
        static __ehdr_start: u8;
    }

    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;

    // SAFETY: The host is required to relocate everything by the same bias, so
    //         the shim parameters should be at the build time specified offset
    //         from the base address of the image.
    let raw_shim_params = unsafe {
        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
    };

    ShimParams::new(shim_base as u64, raw_shim_params)
}
354
// x86-specific construction of the Linux `boot_params` (zero page) and e820
// memory map handed to the kernel.
#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
mod x86_boot {
    use crate::PageAlign;
    use crate::memory::AddressSpaceManager;
    use crate::single_threaded::OffStackRef;
    use crate::single_threaded::off_stack;
    use crate::zeroed;
    use core::mem::size_of;
    use core::ops::Range;
    use core::ptr;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::SETUP_E820_EXT;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use loader_defs::linux::setup_data;
    use loader_defs::shim::MemoryVtlType;
    use memory_range::MemoryRange;
    use zerocopy::FromZeros;
    use zerocopy::Immutable;
    use zerocopy::KnownLayout;

    /// Extended e820 table, chained via `setup_data` (type `SETUP_E820_EXT`)
    /// for entries that do not fit in the fixed-size `boot_params.e820_map`.
    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        // setup_data header; `ty`/`len`/`next` are filled in by callers.
        pub header: setup_data,
        // Overflow e820 entries.
        pub entries: [e820entry; 512],
    }

    /// Writes a single e820 entry describing `range` with type `typ` into
    /// `entry`, failing if the table (base + extension) is exhausted.
    fn add_e820_entry(
        entry: Option<&mut e820entry>,
        range: MemoryRange,
        typ: u32,
    ) -> Result<(), BuildE820MapError> {
        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
            addr: range.start().into(),
            size: range.len().into(),
            typ: typ.into(),
        };
        Ok(())
    }

    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Out of e820 entries.
        OutOfE820Entries,
    }

    /// Build the e820 map for the kernel representing usable VTL2 ram.
    ///
    /// Entries overflow from the fixed table in `boot_params` into `ext`.
    /// Returns `Ok(true)` when the extension table was used (so the caller
    /// must chain `ext` into the `setup_data` list), `Ok(false)` otherwise.
    pub fn build_e820_map(
        boot_params: &mut boot_params,
        ext: &mut E820Ext,
        address_space: &AddressSpaceManager,
    ) -> Result<bool, BuildE820MapError> {
        boot_params.e820_entries = 0;
        // A single iterator over the fixed table followed by the extension,
        // so entries spill over transparently.
        let mut entries = boot_params
            .e820_map
            .iter_mut()
            .chain(ext.entries.iter_mut());

        let mut n = 0;
        for (range, typ) in address_space.vtl2_ranges() {
            match typ {
                MemoryVtlType::VTL2_RAM => {
                    add_e820_entry(entries.next(), range, E820_RAM)?;
                    n += 1;
                }
                MemoryVtlType::VTL2_CONFIG
                | MemoryVtlType::VTL2_SIDECAR_IMAGE
                | MemoryVtlType::VTL2_SIDECAR_NODE
                | MemoryVtlType::VTL2_RESERVED
                | MemoryVtlType::VTL2_GPA_POOL
                | MemoryVtlType::VTL2_TDX_PAGE_TABLES
                | MemoryVtlType::VTL2_BOOTSHIM_LOG_BUFFER
                | MemoryVtlType::VTL2_PERSISTED_STATE_HEADER
                | MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF => {
                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
                    n += 1;
                }

                _ => {
                    panic!("unexpected vtl2 ram type {typ:?} for range {range:#?}");
                }
            }
        }

        // Entries up to the fixed table's capacity are reported in
        // `e820_entries`; the remainder (if any) lives in the extension.
        let base = n.min(boot_params.e820_map.len());
        boot_params.e820_entries = base as u8;

        if base < n {
            // `len` is the byte length of the overflow entries only.
            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
            Ok(true)
        } else {
            Ok(false)
        }
    }

    /// Builds the Linux zero page: loader type, initrd location, e820 map,
    /// command-line pointer, and the head of the `setup_data` chain.
    ///
    /// On e820 overflow the extension table is appended to the `setup_data`
    /// list and `setup_data_tail` is advanced to it.
    pub fn build_boot_params(
        address_space: &AddressSpaceManager,
        initrd: Range<u64>,
        cmdline: &str,
        setup_data_head: *const setup_data,
        setup_data_tail: &mut &mut setup_data,
    ) -> OffStackRef<'static, PageAlign<boot_params>> {
        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
        let boot_params = &mut boot_params_storage.0;
        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type

        // HACK: A kernel change just in the Underhill kernel tree has a workaround
        // to disable probe_roms and reserve_bios_regions when X86_SUBARCH_LGUEST
        // (1) is set by the bootloader. This stops the kernel from reading VTL0
        // memory during kernel boot, which can have catastrophic consequences
        // during a servicing operation when VTL0 has written values to memory, or
        // unaccepted page accesses in an isolated partition.
        //
        // This is only intended as a stopgap until a suitable upstreamable kernel
        // patch is made.
        boot_params.hdr.hardware_subarch = 1.into();

        // The 64-bit initrd address/size are split across the legacy 32-bit
        // header fields and the `ext_*` high halves.
        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
        let initrd_len = initrd.end - initrd.start;
        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;

        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));

        let used_ext = build_e820_map(boot_params, e820_ext, address_space)
            .expect("building e820 map must succeed");

        if used_ext {
            // Chain the extension into the setup_data list and make it the
            // new tail.
            e820_ext.header.ty = SETUP_E820_EXT;
            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
            *setup_data_tail = &mut e820_ext.header;
        }

        // Command-line pointer is likewise split into low/high halves.
        let cmd_line_addr = cmdline.as_ptr() as u64;
        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;

        boot_params.hdr.setup_data = (setup_data_head as u64).into();

        boot_params_storage
    }
}
500
501/// Build the cc_blob containing the location of different parameters associated with SEV.
502#[cfg(target_arch = "x86_64")]
503fn build_cc_blob_sev_info(
504    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
505    shim_params: &ShimParams,
506) {
507    // TODO SNP: Currently only the first CPUID page is passed through.
508    // Consider changing this.
509    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
510    cc_blob.version = 0;
511    cc_blob._reserved = 0;
512    cc_blob.secrets_phys = shim_params.secrets_start();
513    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
514    cc_blob._rsvd1 = 0;
515    cc_blob.cpuid_phys = shim_params.cpuid_start();
516    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
517    cc_blob._rsvd2 = 0;
518}
519
/// Wrapper that forces 4KiB (page) alignment on `T`, for structures whose
/// physical address is handed to the kernel or hypervisor.
#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);
523
524const fn zeroed<T: FromZeros>() -> T {
525    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
526    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
527}
528
/// Returns the current reference time in the hypervisor's units, if available
/// for the given isolation type: TDX derives it from the TSC (see
/// `get_tdx_tsc_reftime`), SNP has no source here, and everything else reads
/// the shared reference time via `minimal_rt`.
fn get_ref_time(isolation: IsolationType) -> Option<u64> {
    match isolation {
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => get_tdx_tsc_reftime(),
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => None,
        _ => Some(minimal_rt::reftime::reference_time()),
    }
}
538
/// Main entry point of the boot shim (called from the arch-specific `rt`
/// startup code): reads host-provided parameters, sets up VTL2 memory and
/// VPs, builds the kernel command line and device tree, and finally jumps to
/// the Linux kernel. Never returns.
fn shim_main(shim_params_raw_offset: isize) -> ! {
    let p = shim_parameters(shim_params_raw_offset);
    if p.isolation_type == IsolationType::None {
        enable_enlightened_panic();
    }

    // Enable the in-memory log.
    boot_logger_memory_init(p.log_buffer);

    // Capture the boot start time as early as possible.
    let boot_reftime = get_ref_time(p.isolation_type);

    // The support code for the fast hypercalls does not set
    // the Guest ID if it is not set yet as opposed to the slow
    // hypercall code path where that is done automatically.
    // Thus the fast hypercalls will fail as the Guest ID has
    // to be set first hence initialize hypercall support
    // explicitly.
    if !p.isolation_type.is_hardware_isolated() {
        hvcall().initialize();
    }

    // Parse boot options from the measured (static) command line.
    let mut static_options = BootCommandLineOptions::new();
    if let Some(cmdline) = p.command_line().command_line() {
        static_options.parse(cmdline);
    }

    let static_confidential_debug = static_options.confidential_debug;
    let can_trust_host = p.isolation_type == IsolationType::None || static_confidential_debug;

    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
    let address_space = OffStackRef::leak(off_stack!(
        AddressSpaceManager,
        AddressSpaceManager::new_const()
    ));
    let partition_info = match PartitionInfo::read_from_dt(
        &p,
        &mut dt_storage,
        address_space,
        static_options,
        can_trust_host,
    ) {
        Ok(val) => val,
        Err(e) => panic!("unable to read device tree params {}", e),
    };

    // Enable logging ASAP. This is fine even when isolated, as we don't have
    // any access to secrets in the boot shim.
    boot_logger_runtime_init(p.isolation_type, partition_info.com3_serial_available);
    log!("openhcl_boot: logging enabled");
    boot_logger_write_memory_log_to_runtime();

    // Confidential debug will show up in boot_options only if included in the
    // static command line, or if can_trust_host is true (so the dynamic command
    // line has been parsed).
    let is_confidential_debug =
        static_confidential_debug || partition_info.boot_options.confidential_debug;

    // Fill out the non-devicetree derived parts of PartitionInfo.
    if !p.isolation_type.is_hardware_isolated()
        && hvcall().vtl() == Vtl::Vtl2
        && hvdef::HvRegisterVsmCapabilities::from(
            hvcall()
                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
                .expect("failed to query vsm capabilities")
                .as_u64(),
        )
        .vtl0_alias_map_available()
    {
        // If the vtl0 alias map was not provided in the devicetree, attempt to
        // derive it from the architectural physical address bits.
        //
        // The value in the ID_AA64MMFR0_EL1 register used to determine the
        // physical address bits can only represent multiples of 4. As a result,
        // the Surface Pro X (and systems with similar CPUs) cannot properly
        // report their address width of 39 bits. This causes the calculated
        // alias map to be incorrect, which results in panics when trying to
        // read memory and getting invalid data.
        if partition_info.vtl0_alias_map.is_none() {
            partition_info.vtl0_alias_map =
                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
        }
    } else {
        // Ignore any devicetree-provided alias map if the conditions above
        // aren't met.
        partition_info.vtl0_alias_map = None;
    }

    // Rebind partition_info as no longer mutable.
    let partition_info: &PartitionInfo = partition_info;

    if partition_info.cpus.is_empty() {
        panic!("no cpus");
    }

    validate_vp_hw_ids(partition_info);

    setup_vtl2_memory(&p, partition_info);
    setup_vtl2_vp(partition_info);

    verify_imported_regions_hash(&p);

    // Launch the sidecar kernel, if configured.
    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
    let sidecar = sidecar::start_sidecar(
        &p,
        partition_info,
        address_space,
        &mut sidecar_params.0,
        &mut sidecar_output.0,
    );

    // Rebind address_space as no longer mutable.
    let address_space: &AddressSpaceManager = address_space;

    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
    build_kernel_command_line(BuildKernelCommandLineParams {
        params: &p,
        cmdline: &mut cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar: sidecar.as_ref(),
        vtl2_pool_supported: address_space.has_vtl2_pool(),
        disable_keep_alive: partition_info.boot_options.disable_nvme_keep_alive,
    })
    .unwrap();

    // Prepare the FDT buffer, wrapped in a setup_data header (x64 only; on
    // aarch64 the FDT is passed directly).
    let mut fdt = off_stack!(Fdt, zeroed());
    fdt.header.len = fdt.data.len() as u32;
    fdt.header.ty = SETUP_DTB;

    // Head/tail of the setup_data linked list handed to the kernel.
    #[cfg(target_arch = "x86_64")]
    let mut setup_data_tail = &mut fdt.header;
    #[cfg(target_arch = "x86_64")]
    let setup_data_head = core::ptr::from_ref(setup_data_tail);

    #[cfg(target_arch = "x86_64")]
    if p.isolation_type == IsolationType::Snp {
        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
        build_cc_blob_sev_info(cc_blob, &p);

        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;

        // Chain in the setup data.
        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
        setup_data_tail = &mut cc_data.header;
    }

    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;

    // Validate the initrd crc matches what was put at file generation time.
    let computed_crc = crc32fast::hash(p.initrd());
    assert_eq!(
        computed_crc, p.initrd_crc,
        "computed initrd crc does not match build time calculated crc"
    );

    #[cfg(target_arch = "x86_64")]
    let boot_params = x86_boot::build_boot_params(
        address_space,
        initrd.clone(),
        &cmdline,
        setup_data_head,
        &mut setup_data_tail,
    );

    // Compute the ending boot time. This has to be before writing to device
    // tree, so this is as late as we can do it.

    let boot_times = boot_reftime.map(|start| BootTimes {
        start,
        end: get_ref_time(p.isolation_type).unwrap_or(0),
    });

    // Validate that no imported regions that are pending are not part of vtl2
    // ram.
    for (range, result) in walk_ranges(
        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
        p.imported_regions(),
    ) {
        match result {
            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
            RangeWalkResult::Right(accepted) => {
                // Ranges that are not a part of VTL2 ram must have been
                // preaccepted, as usermode expect that to be the case.
                assert!(
                    accepted,
                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
                    range
                );
            }
        }
    }

    write_dt(
        &mut fdt.data,
        partition_info,
        address_space,
        p.imported_regions().map(|r| {
            // Discard if the range was previously pending - the bootloader has
            // accepted all pending ranges.
            //
            // NOTE: No VTL0 memory today is marked as pending. The check above
            // validates that, and this code may need to change if this becomes
            // no longer true.
            r.0
        }),
        initrd,
        &cmdline,
        sidecar.as_ref(),
        boot_times,
        p.isolation_type,
    )
    .unwrap();

    rt::verify_stack_cookie();

    log!("uninitializing hypercalls, about to jump to kernel");
    hvcall().uninitialize();

    // Hand off to the kernel. Nothing below returns.
    cfg_if::cfg_if! {
        if #[cfg(target_arch = "x86_64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            kernel_entry(0, &boot_params.0)
        } else if #[cfg(target_arch = "aarch64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
            // Flush (and invalidate) the caches, as that is required for disabling MMU.
            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
            unsafe {
                core::arch::asm!(
                    "
                    mrs     {0}, sctlr_el1
                    bic     {0}, {0}, #0x1
                    msr     sctlr_el1, {0}
                    tlbi    vmalle1
                    dsb     sy
                    isb     sy",
                    lateout(reg) _,
                );
            }
            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
        } else {
            panic!("unsupported arch")
        }
    }
}
793
794/// Ensure that mshv VP indexes for the CPUs listed in the partition info
795/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
796/// this will be the case.
797fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
798    use host_params::MAX_CPU_COUNT;
799    use hypercall::HwId;
800
801    if partition_info.isolation.is_hardware_isolated() {
802        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
803        // with the hypervisor here, so we can't easily perform the check. Since
804        // there is no security impact to this check, we can skip it for now; if
805        // the VM fails to boot, then this is due to a host contract violation.
806        //
807        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
808        // indexes correspond to the APIC IDs in the right order. I am not
809        // certain if there are places where we depend on this mapping today.
810        return;
811    }
812
813    if hvcall().vtl() != Vtl::Vtl2 {
814        // If we're not using guest VSM, then the guest won't communicate
815        // directly with the hypervisor, so we can choose the VP indexes
816        // ourselves.
817        return;
818    }
819
820    // Ensure the host and hypervisor agree on VP index ordering.
821
822    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
823    hw_ids.clear();
824    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
825    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
826    vp_indexes.clear();
827    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
828        panic!(
829            "failed to get VP index for hardware ID {:#x}: {}",
830            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
831            err
832        );
833    }
834    if let Some((i, &vp_index)) = vp_indexes
835        .iter()
836        .enumerate()
837        .find(|&(i, vp_index)| i as u32 != *vp_index)
838    {
839        panic!(
840            "CPU hardware ID {:#x} does not correspond to VP index {}",
841            hw_ids[i], vp_index
842        );
843    }
844}
845
// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
//
// When the `minimal_rt` cfg is not set, the crate is built as a normal std
// binary (see the `cfg_attr(minimal_rt, no_std, no_main)` attribute at the
// top of this file), so a conventional `main` is required. It panics
// immediately to make clear that such a build is not a usable boot loader.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}
852
#[cfg(test)]
mod test {
    use super::x86_boot::E820Ext;
    use super::x86_boot::build_e820_map;
    use crate::cmdline::BootCommandLineOptions;
    use crate::dt::write_dt;
    use crate::host_params::MAX_CPU_COUNT;
    use crate::host_params::PartitionInfo;
    use crate::host_params::shim_params::IsolationType;
    use crate::memory::AddressSpaceManager;
    use crate::memory::AddressSpaceManagerBuilder;
    use arrayvec::ArrayString;
    use arrayvec::ArrayVec;
    use core::ops::Range;
    use host_fdt_parser::CpuEntry;
    use host_fdt_parser::MemoryEntry;
    use host_fdt_parser::VmbusInfo;
    use igvm_defs::MemoryMapEntryType;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use memory_range::MemoryRange;
    use memory_range::subtract_ranges;
    use zerocopy::FromZeros;

    // Single high MMIO gap used as the VTL2 vmbus MMIO range in these tests.
    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;

    /// Create partition info with given cpu count enabled and sequential
    /// apic_ids.
    ///
    /// CPUs are numbered 0..cpu_count with `reg` equal to the index and all
    /// on vnode 0; the BSP is CPU 0. The VTL2 vmbus gets the single high MMIO
    /// gap above, VTL0 vmbus gets none, and all other fields are defaulted.
    /// NOTE: indexes `cpus[0]`, so `cpu_count` must be at least 1.
    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();

        for id in 0..(cpu_count as u64) {
            cpus.push(CpuEntry { reg: id, vnode: 0 });
        }

        let mut mmio = ArrayVec::new();
        mmio.push(
            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
        );

        PartitionInfo {
            vtl2_ram: ArrayVec::new(),
            partition_ram: ArrayVec::new(),
            isolation: IsolationType::None,
            bsp_reg: cpus[0].reg as u32,
            cpus,
            cmdline: ArrayString::new(),
            vmbus_vtl2: VmbusInfo {
                mmio,
                connection_id: 0,
            },
            vmbus_vtl0: VmbusInfo {
                mmio: ArrayVec::new(),
                connection_id: 0,
            },
            com3_serial_available: false,
            gic: None,
            pmu_gsiv: None,
            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
            entropy: None,
            vtl0_alias_map: None,
            nvme_keepalive: false,
            boot_options: BootCommandLineOptions::new(),
        }
    }

    // ensure we can boot with a _lot_ of vcpus
    #[test]
    #[cfg_attr(
        target_arch = "aarch64",
        ignore = "TODO: investigate why this doesn't always work on ARM"
    )]
    fn fdt_cpu_scaling() {
        const MAX_CPUS: usize = 2048;

        // 256 KiB buffer; write_dt returning Ok is the whole assertion here.
        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();
    }

    // Must match the DeviceTree blob generated with the standard tooling
    // to ensure being compliant to the standards (or, at least, compatibility
    // with a widely used implementation).
    // For details on regenerating the test content, see `fdt_dtc_decompile`
    // below.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        //
        // The expected DTB is stored as (offset, bytes) spans: the header +
        // strings block at offset 0, and the structure block at 0x430; the
        // gap between the spans is all zeros.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        // Materialize the expected blob into a flat buffer.
        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        // Plain assert! rather than assert_eq! to avoid dumping 4 KiB arrays
        // on failure.
        assert!(sample_buf == buf);
    }

    // This test should be manually enabled when need to regenerate
    // the sample content above and validate spec compliance with `dtc`.
    // Before running the test, please install the DeviceTree compiler:
    // ```shell
    // sudo apt-get update && sudo apt-get install device-tree-compiler
    // ```
    #[test]
    #[ignore = "enabling the test requires installing additional software, \
                and developers will experience a break."]
    fn fdt_dtc_decompile() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        // Write the generated DTB to disk and have `dtc` decompile it; a
        // zero exit status means dtc accepted the blob.
        let input_dtb_file_name = "openhcl_boot.dtb";
        let output_dts_file_name = "openhcl_boot.dts";
        std::fs::write(input_dtb_file_name, buf).unwrap();
        let success = std::process::Command::new("dtc")
            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
            .status()
            .unwrap()
            .success();
        assert!(success);
    }

    /// Build an [`AddressSpaceManager`] for the tests below.
    ///
    /// Every range in `ram` is treated as `VTL2_PROTECTABLE` memory on vnode
    /// 0. `reclaim`, when present, is subtracted from `parameter_range`
    /// before it is handed to the builder (modelling a reclaimed portion of
    /// the parameter area).
    fn new_address_space_manager(
        ram: &[MemoryRange],
        bootshim_used: MemoryRange,
        persisted_range: MemoryRange,
        parameter_range: MemoryRange,
        reclaim: Option<MemoryRange>,
    ) -> AddressSpaceManager {
        let ram = ram
            .iter()
            .cloned()
            .map(|range| MemoryEntry {
                range,
                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                vnode: 0,
            })
            .collect::<Vec<_>>();
        let mut address_space = AddressSpaceManager::new_const();
        AddressSpaceManagerBuilder::new(
            &mut address_space,
            &ram,
            bootshim_used,
            persisted_range,
            subtract_ranges([parameter_range], reclaim),
        )
        .init()
        .unwrap();
        address_space
    }

    /// Assert that the e820 entries in `boot_params` followed by those in the
    /// extension `ext` exactly match `expected`, in order, where each
    /// expected element is an `(address range, e820 type)` pair.
    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
        // Chain the fixed-size map in boot_params with however many entries
        // the extension header says are valid.
        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
            .iter()
            .chain(
                ext.entries
                    .iter()
                    .take((ext.header.len as usize) / size_of::<e820entry>()),
            );

        assert_eq!(actual.clone().count(), expected.len());

        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
            let addr: u64 = actual.addr.into();
            let size: u64 = actual.size.into();
            let typ: u32 = actual.typ.into();
            assert_eq!(addr, expected_range.start);
            assert_eq!(size, expected_range.end - expected_range.start);
            assert_eq!(typ, *expected_type);
        }
    }

    const PAGE_SIZE: u64 = 0x1000;
    const ONE_MB: u64 = 0x10_0000;

    // Exercise build_e820_map over several address space layouts: no reclaim,
    // reclaim within the parameter range, multiple RAM ranges, and RAM split
    // into 1 MB chunks (which should coalesce in the output).
    #[test]
    fn test_e820_basic() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..3 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..4 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            None,
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
            ],
        );

        // memmap with reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..6 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // The reclaimed MB inside the parameter range shows up as RAM.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
            ],
        );

        // two mem ranges
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..10 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
            ],
        );

        // memmap in 1 mb chunks
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..2 * ONE_MB),
                MemoryRange::new(2 * ONE_MB..3 * ONE_MB),
                MemoryRange::new(3 * ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..5 * ONE_MB),
                MemoryRange::new(5 * ONE_MB..6 * ONE_MB),
                MemoryRange::new(6 * ONE_MB..7 * ONE_MB),
                MemoryRange::new(7 * ONE_MB..8 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        // Adjacent 1 MB chunks of the same type coalesce (e.g. 5..8 MB RAM).
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..8 * ONE_MB, E820_RAM),
            ],
        );
    }

    // test e820 with spillover into ext
    #[test]
    fn test_e820_huge() {
        use crate::memory::AllocationPolicy;
        use crate::memory::AllocationType;

        // Create 64 RAM ranges, then allocate 256 ranges to test spillover
        // boot_params.e820_map has E820_MAX_ENTRIES_ZEROPAGE (128) entries
        const E820_MAX_ENTRIES_ZEROPAGE: usize = 128;
        const RAM_RANGES: usize = 64;
        const TOTAL_ALLOCATIONS: usize = 256;

        // Create 64 large RAM ranges (64MB each = 64 * 1MB pages per range)
        let mut ranges = Vec::new();
        for i in 0..RAM_RANGES {
            let start = (i as u64) * 64 * ONE_MB;
            let end = start + 64 * ONE_MB;
            ranges.push(MemoryRange::new(start..end));
        }

        let bootshim_used = MemoryRange::try_new(0..ONE_MB * 2).unwrap();
        let persisted_range = MemoryRange::try_new(0..ONE_MB).unwrap();
        let parameter_range = MemoryRange::try_new(ONE_MB..2 * ONE_MB).unwrap();

        // Built inline rather than via new_address_space_manager because no
        // reclaim subtraction is needed here.
        let mut address_space = {
            let ram = ranges
                .iter()
                .cloned()
                .map(|range| MemoryEntry {
                    range,
                    mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                    vnode: 0,
                })
                .collect::<Vec<_>>();
            let mut address_space = AddressSpaceManager::new_const();
            AddressSpaceManagerBuilder::new(
                &mut address_space,
                &ram,
                bootshim_used,
                persisted_range,
                core::iter::once(parameter_range),
            )
            .init()
            .unwrap();
            address_space
        };

        for i in 0..TOTAL_ALLOCATIONS {
            // Intersperse sidecar node allocations with gpa pool allocations,
            // as otherwise the address space manager will collapse adjacent
            // ranges of the same type.
            let _allocated = address_space
                .allocate(
                    None,
                    ONE_MB,
                    if i % 2 == 0 {
                        AllocationType::GpaPool
                    } else {
                        AllocationType::SidecarNode
                    },
                    AllocationPolicy::LowMemory,
                )
                .expect("should be able to allocate sidecar node");
        }

        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let total_ranges = address_space.vtl2_ranges().count();

        let used_ext = build_e820_map(&mut boot_params, &mut ext, &address_space).unwrap();

        // Verify that we used the extension
        assert!(used_ext, "should use extension when there are many ranges");

        // Verify the standard e820_map is full
        assert_eq!(boot_params.e820_entries, E820_MAX_ENTRIES_ZEROPAGE as u8);

        // Verify the extension has the overflow entries
        let ext_entries = (ext.header.len as usize) / size_of::<e820entry>();
        assert_eq!(ext_entries, total_ranges - E820_MAX_ENTRIES_ZEROPAGE);

        // Verify we have the expected number of total ranges
        let total_e820_entries = boot_params.e820_entries as usize + ext_entries;
        assert_eq!(total_e820_entries, total_ranges);
    }
}
1341}