openhcl_boot/main.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! The openhcl boot loader, which loads before the kernel to set up the
//! kernel's boot parameters.

// See build.rs.
#![cfg_attr(minimal_rt, no_std, no_main)]
// UNSAFETY: Interacting with low level hardware and bootloader primitives.
#![expect(unsafe_code)]
// Allow the allocator api when compiling with `RUSTFLAGS="--cfg nightly"`. This
// is used for some miri tests for testing the bump allocator.
//
// Do not use a normal feature, as that shows errors with rust-analyzer since
// most people are using stable and enable all features. We could remove this
// once the allocator_api feature is stable.
#![cfg_attr(nightly, feature(allocator_api))]

mod arch;
mod boot_logger;
mod cmdline;
mod dt;
mod host_params;
mod hypercall;
mod memory;
mod rt;
mod sidecar;
mod single_threaded;

use crate::arch::setup_vtl2_memory;
use crate::arch::setup_vtl2_vp;
#[cfg(target_arch = "x86_64")]
use crate::arch::tdx::get_tdx_tsc_reftime;
use crate::arch::verify_imported_regions_hash;
use crate::boot_logger::boot_logger_memory_init;
use crate::boot_logger::boot_logger_runtime_init;
use crate::boot_logger::boot_logger_write_memory_log_to_runtime;
use crate::boot_logger::log;
use crate::hypercall::hvcall;
use crate::memory::AddressSpaceManager;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayString;
use arrayvec::ArrayVec;
use cmdline::BootCommandLineOptions;
use core::fmt::Write;
use dt::BootTimes;
use dt::write_dt;
use host_params::COMMAND_LINE_SIZE;
use host_params::PartitionInfo;
use host_params::shim_params::IsolationType;
use host_params::shim_params::ShimParams;
use hvdef::Vtl;
use loader_defs::linux::SETUP_DTB;
use loader_defs::linux::setup_data;
use loader_defs::shim::ShimParamsRaw;
use memory_range::RangeWalkResult;
use memory_range::walk_ranges;
use minimal_rt::enlightened_panic::enable_enlightened_panic;
use sidecar::SidecarConfig;
use sidecar_defs::SidecarOutput;
use sidecar_defs::SidecarParams;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

#[derive(Debug)]
struct CommandLineTooLong;

impl From<core::fmt::Error> for CommandLineTooLong {
    fn from(_: core::fmt::Error) -> Self {
        Self
    }
}

struct BuildKernelCommandLineParams<'a> {
    params: &'a ShimParams,
    cmdline: &'a mut ArrayString<COMMAND_LINE_SIZE>,
    partition_info: &'a PartitionInfo,
    can_trust_host: bool,
    is_confidential_debug: bool,
    sidecar: Option<&'a SidecarConfig<'a>>,
    vtl2_pool_supported: bool,
}

/// Read and set up the underhill kernel command line into the specified buffer.
fn build_kernel_command_line(
    fn_params: BuildKernelCommandLineParams<'_>,
) -> Result<(), CommandLineTooLong> {
    let BuildKernelCommandLineParams {
        params,
        cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar,
        vtl2_pool_supported,
    } = fn_params;

    // For reference:
    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
    const KERNEL_PARAMETERS: &[&str] = &[
        // If a console is specified, then write everything to it.
        "loglevel=8",
        // Use a fixed 128KB log buffer by default.
        "log_buf_len=128K",
        // Enable time output on console for ohcldiag-dev.
        "printk.time=1",
        // Enable facility and level output on console for ohcldiag-dev.
        "console_msg_format=syslog",
        // Set uio parameter to configure vmbus ring buffer behavior.
        "uio_hv_generic.no_mask=1",
        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
        // huge pages and the shared pages.
        "coredump_filter=0x33",
        // PERF: No processor frequency governing.
        "cpufreq.off=1",
        // PERF: Disable the CPU idle time management entirely. It does not
        // prevent the idle loop from running on idle CPUs, but it prevents
        // the CPU idle time governors and drivers from being invoked.
        "cpuidle.off=1",
        // PERF: No perf checks for crypto algorithms to boot faster.
        // Would have to evaluate the perf wins on the crypto manager vs
        // delaying the boot up.
        "cryptomgr.notests",
        // PERF: Idle threads use HLT on x64 if there is no work. Believed to
        // be a reasonable compromise between processor wakeup latency and
        // power consumption.
        "idle=halt",
        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
        // Boot Flag) or allocate the real-mode trampoline for APs.
        "initcall_blacklist=init_real_mode,sbf_init",
        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
        "lpj=3000000",
        // PERF: No broken timer check to boot faster.
        "no_timer_check",
        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions much
        // slower. The xsave state is shared between VTLs, and we don't
        // context switch it in the kernel when leaving/entering VTL2.
        // Removing this will corrupt register state, resulting in undefined
        // behavior.
        "noxsave",
        // RELIABILITY: Panic on MCEs and faults in the kernel.
        "oops=panic",
        // RELIABILITY: Don't panic on kernel warnings.
        "panic_on_warn=0",
        // PERF, RELIABILITY: Don't print detailed information about the failing
        // processes (memory maps, threads).
        "panic_print=0",
        // RELIABILITY: Reboot immediately on panic, no timeout.
        "panic=-1",
        // RELIABILITY: Don't print processor context information on a fatal
        // signal. Our crash dump collection infrastructure seems reliable, and
        // this information doesn't seem useful without a dump anyways.
        // Additionally it may push important logs off the end of the kmsg
        // page logged by the host.
        //"print_fatal_signals=0",
        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
        "printk.devkmsg=on",
        // RELIABILITY: Reboot using a triple fault as the fastest method.
        // That is also the method used for compatibility with earlier versions
        // of the Microsoft HCL.
        "reboot=t",
        // CONFIG-STATIC: Type of the root file system.
        "rootfstype=tmpfs",
        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
        // scheduler timer periodically, which introduces jitters for VTL0.
        "sysctl.vm.compaction_proactiveness=0",
        // PERF: No TSC stability check when booting up to boot faster,
        // also no validation during runtime.
        "tsc=reliable",
        // RELIABILITY: Panic on receiving an NMI.
        "unknown_nmi_panic=1",
        // Use vfio for MANA devices.
        "vfio_pci.ids=1414:00ba",
        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
        // and no DMA translation.
        "vfio.enable_unsafe_noiommu_mode=1",
        // Specify the init path.
        "rdinit=/underhill-init",
        // Default to user-mode NVMe driver.
        "OPENHCL_NVME_VFIO=1",
        // The next three items reduce the memory overhead of the storvsc driver.
        // Since it is only used for DVD, performance is not critical.
        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
        // Fix number of hardware queues at 2.
        "hv_storvsc.storvsc_max_hw_queues=2",
        // Reduce the ring buffer size to 32K.
        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
        // Disable eager mimalloc commit to prevent core dumps from being
        // overly large.
        "MIMALLOC_ARENA_EAGER_COMMIT=0",
        // Disable acpi runtime support. Unused in underhill, but some support
        // is compiled in for the kernel (i.e. the TDX mailbox protocol).
        "acpi=off",
    ];

    const X86_KERNEL_PARAMETERS: &[&str] = &[
        // Disable pcid support. This is a temporary fix to allow
        // Underhill to run nested inside AMD VMs. Otherwise, the
        // Underhill kernel tries to start APs with PCID bits set in CR3
        // without the PCIDE bit set in CR4, which is an invalid
        // VP state (according to the mshv nested implementation).
        //
        // TODO: remove this once we figure out the root cause and apply
        // a workaround/fix elsewhere.
        "clearcpuid=pcid",
        // Disable all attempts to use an IOMMU, including swiotlb.
        "iommu=off",
        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
        // this changes, we will explicitly enumerate a PCI bus via devicetree.
        "pci=off",
    ];

    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];

    for p in KERNEL_PARAMETERS {
        write!(cmdline, "{p} ")?;
    }

    let arch_parameters = if cfg!(target_arch = "x86_64") {
        X86_KERNEL_PARAMETERS
    } else {
        AARCH64_KERNEL_PARAMETERS
    };
    for p in arch_parameters {
        write!(cmdline, "{p} ")?;
    }

    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off is ignored entirely) and on CVMs (memory encryption
        // forces it on). Set it to a single 8MB area (4096 slabs at 2KB per
        // slab); the first parameter controls the area size in slabs, the
        // second controls the number of areas (default is # of CPUs).
        //
        // This is set to 8MB on hardware isolated VMs since there are some
        // scenarios, such as provisioning over DVD, which require a larger
        // size because the bounce buffer is actively used.
        "swiotlb=4096,1",
    ];

    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
        // The first parameter controls the area size, the second controls the
        // number of areas (default is # of CPUs). Set them both to the minimum.
        "swiotlb=1,1",
    ];

    if params.isolation_type.is_hardware_isolated() {
        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    } else {
        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    }

    // Enable the com3 console by default if it's available and we're not
    // isolated, or if we are isolated but also have debugging enabled.
    //
    // Otherwise, set the console to ttynull so the kernel does not default to
    // com1. This is overridden by any user customizations in the static or
    // dynamic command line, as this console argument provided by the bootloader
    // comes first.
    let console = if partition_info.com3_serial_available && can_trust_host {
        "ttyS2,115200"
    } else {
        "ttynull"
    };
    write!(cmdline, "console={console} ")?;

    if params.isolation_type != IsolationType::None {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
        )?;
    }

    if is_confidential_debug {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
        )?;
    }

    // Generate the NVMe keep-alive command line, which should look something
    // like: OPENHCL_NVME_KEEP_ALIVE=disabled,host,privatepool
    // TODO: Move from command line to device tree when stabilized.
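    //
    // For illustration: a fully enabled configuration below produces
    // `OPENHCL_NVME_KEEP_ALIVE=host,privatepool`, while the most restrictive
    // produces `OPENHCL_NVME_KEEP_ALIVE=disabled,nohost,noprivatepool`.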
    write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=")?;

    if partition_info.boot_options.disable_nvme_keep_alive {
        write!(cmdline, "disabled,")?;
    }

    if partition_info.nvme_keepalive {
        write!(cmdline, "host,")?;
    } else {
        write!(cmdline, "nohost,")?;
    }

    if vtl2_pool_supported {
        write!(cmdline, "privatepool ")?;
    } else {
        write!(cmdline, "noprivatepool ")?;
    }

    if let Some(sidecar) = sidecar {
        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
    }

    // HACK: Set the vmbus connection id via kernel commandline.
    //
    // This code will be removed when the kernel supports setting connection id
    // via device tree.
    write!(
        cmdline,
        "hv_vmbus.message_connection_id=0x{:x} ",
        partition_info.vmbus_vtl2.connection_id
    )?;

    // If we're isolated, we can't trust the host-provided cmdline.
    if can_trust_host {
        // Prepend the computed parameters to the original command line.
        cmdline.write_str(&partition_info.cmdline)?;
    }

    Ok(())
}
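
// For illustration only (this exact string is not emitted anywhere): on a
// non-isolated partition with defaults, the computed portion of the command
// line looks roughly like
//   loglevel=8 log_buf_len=128K ... console=ttynull
//   OPENHCL_NVME_KEEP_ALIVE=nohost,noprivatepool
//   hv_vmbus.message_connection_id=0x... <host-provided command line>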

// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
// that is the maximum size the kernel can use during its early boot processes.
// We also want our FDT to be as large as possible to support as many vCPUs as
// possible. We set it to 256KB, and it must also be page-aligned: an unaligned
// buffer could spill onto one extra page, producing a 260KB mapping, which
// would fail.
const FDT_SIZE: usize = 256 * 1024;

#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    header: setup_data,
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
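
// Added sanity check (not in the original source): `Fdt` must occupy exactly
// the 256KB mapping described above, with page alignment.
const _: () = assert!(size_of::<Fdt>() == FDT_SIZE && core::mem::align_of::<Fdt>() == 4096);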

/// Raw shim parameters are provided via a relative offset from the base of
/// where the shim is loaded. Return a `ShimParams` structure built from the
/// raw, offset-based `ShimParamsRaw`.
fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
    unsafe extern "C" {
        static __ehdr_start: u8;
    }

    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;

    // SAFETY: The host is required to relocate everything by the same bias, so
    //         the shim parameters should be at the build time specified offset
    //         from the base address of the image.
    let raw_shim_params = unsafe {
        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
    };

    ShimParams::new(shim_base as u64, raw_shim_params)
}
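
// For example (illustrative numbers only): if the image is relocated to
// 0x1000_0000 and the build recorded a raw-params offset of 0x2000, then
// `ShimParamsRaw` is read from 0x1000_2000.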

#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
mod x86_boot {
    use crate::PageAlign;
    use crate::memory::AddressSpaceManager;
    use crate::single_threaded::OffStackRef;
    use crate::single_threaded::off_stack;
    use crate::zeroed;
    use core::mem::size_of;
    use core::ops::Range;
    use core::ptr;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::SETUP_E820_EXT;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use loader_defs::linux::setup_data;
    use loader_defs::shim::MemoryVtlType;
    use memory_range::MemoryRange;
    use zerocopy::FromZeros;
    use zerocopy::Immutable;
    use zerocopy::KnownLayout;

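    /// Overflow e820 entries, chained to the kernel via a `SETUP_E820_EXT`
    /// `setup_data` node when the fixed-size map in `boot_params` fills up.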
    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        pub header: setup_data,
        pub entries: [e820entry; 512],
    }

    fn add_e820_entry(
        entry: Option<&mut e820entry>,
        range: MemoryRange,
        typ: u32,
    ) -> Result<(), BuildE820MapError> {
        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
            addr: range.start().into(),
            size: range.len().into(),
            typ: typ.into(),
        };
        Ok(())
    }

    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Out of e820 entries.
        OutOfE820Entries,
    }

    /// Build the e820 map for the kernel representing usable VTL2 ram.
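    ///
    /// Returns `Ok(true)` when the entries spill past the fixed-size
    /// `boot_params.e820_map` into the extension `ext`.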
    pub fn build_e820_map(
        boot_params: &mut boot_params,
        ext: &mut E820Ext,
        address_space: &AddressSpaceManager,
    ) -> Result<bool, BuildE820MapError> {
        boot_params.e820_entries = 0;
        let mut entries = boot_params
            .e820_map
            .iter_mut()
            .chain(ext.entries.iter_mut());

        let mut n = 0;
        for (range, typ) in address_space.vtl2_ranges() {
            match typ {
                MemoryVtlType::VTL2_RAM => {
                    add_e820_entry(entries.next(), range, E820_RAM)?;
                    n += 1;
                }
                MemoryVtlType::VTL2_CONFIG
                | MemoryVtlType::VTL2_SIDECAR_IMAGE
                | MemoryVtlType::VTL2_SIDECAR_NODE
                | MemoryVtlType::VTL2_RESERVED
                | MemoryVtlType::VTL2_GPA_POOL
                | MemoryVtlType::VTL2_TDX_PAGE_TABLES
                | MemoryVtlType::VTL2_BOOTSHIM_LOG_BUFFER
                | MemoryVtlType::VTL2_PERSISTED_STATE_HEADER
                | MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF => {
                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
                    n += 1;
                }

                _ => {
                    panic!("unexpected vtl2 ram type {typ:?} for range {range:#?}");
                }
            }
        }

        let base = n.min(boot_params.e820_map.len());
        boot_params.e820_entries = base as u8;

        if base < n {
            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
            Ok(true)
        } else {
            Ok(false)
        }
    }

    pub fn build_boot_params(
        address_space: &AddressSpaceManager,
        initrd: Range<u64>,
        cmdline: &str,
        setup_data_head: *const setup_data,
        setup_data_tail: &mut &mut setup_data,
    ) -> OffStackRef<'static, PageAlign<boot_params>> {
        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
        let boot_params = &mut boot_params_storage.0;
        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type

        // HACK: A kernel change just in the Underhill kernel tree has a workaround
        // to disable probe_roms and reserve_bios_regions when X86_SUBARCH_LGUEST
        // (1) is set by the bootloader. This stops the kernel from reading VTL0
        // memory during kernel boot, which can have catastrophic consequences
        // during a servicing operation when VTL0 has written values to memory, or
        // unaccepted page accesses in an isolated partition.
        //
        // This is only intended as a stopgap until a suitable upstreamable kernel
        // patch is made.
        boot_params.hdr.hardware_subarch = 1.into();

        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
        let initrd_len = initrd.end - initrd.start;
        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;

        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));

        let used_ext = build_e820_map(boot_params, e820_ext, address_space)
            .expect("building e820 map must succeed");

        if used_ext {
            e820_ext.header.ty = SETUP_E820_EXT;
            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
            *setup_data_tail = &mut e820_ext.header;
        }

        let cmd_line_addr = cmdline.as_ptr() as u64;
        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;

        boot_params.hdr.setup_data = (setup_data_head as u64).into();

        boot_params_storage
    }
}

/// Build the cc_blob containing the location of different parameters associated with SEV.
#[cfg(target_arch = "x86_64")]
fn build_cc_blob_sev_info(
    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
    shim_params: &ShimParams,
) {
    // TODO SNP: Currently only the first CPUID page is passed through.
    // Consider changing this.
    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
    cc_blob.version = 0;
    cc_blob._reserved = 0;
    cc_blob.secrets_phys = shim_params.secrets_start();
    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd1 = 0;
    cc_blob.cpuid_phys = shim_params.cpuid_start();
    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd2 = 0;
}

#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);

const fn zeroed<T: FromZeros>() -> T {
    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
}
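
// `zeroed` pairs with `off_stack!` throughout this file, e.g.
// `off_stack!(PageAlign<boot_params>, zeroed())`, to construct large
// zero-initialized values without placing them on the stack.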
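
/// Read the current reference time, if available. Returns `None` on SNP,
/// where no reference time is read in the boot shim, in which case boot-time
/// reporting is skipped.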
fn get_ref_time(isolation: IsolationType) -> Option<u64> {
    match isolation {
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => get_tdx_tsc_reftime(),
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => None,
        _ => Some(minimal_rt::reftime::reference_time()),
    }
}

fn shim_main(shim_params_raw_offset: isize) -> ! {
    let p = shim_parameters(shim_params_raw_offset);
    if p.isolation_type == IsolationType::None {
        enable_enlightened_panic();
    }

    // Enable the in-memory log.
    boot_logger_memory_init(p.log_buffer);

    let boot_reftime = get_ref_time(p.isolation_type);

    // The support code for the fast hypercalls does not set the Guest ID if
    // it is not already set, unlike the slow hypercall code path, where that
    // is done automatically. Fast hypercalls would therefore fail, since the
    // Guest ID must be set first; initialize hypercall support explicitly.
    if !p.isolation_type.is_hardware_isolated() {
        hvcall().initialize();
    }

    let mut static_options = BootCommandLineOptions::new();
    if let Some(cmdline) = p.command_line().command_line() {
        static_options.parse(cmdline);
    }

    let static_confidential_debug = static_options.confidential_debug;
    let can_trust_host = p.isolation_type == IsolationType::None || static_confidential_debug;

    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
    let address_space = OffStackRef::leak(off_stack!(
        AddressSpaceManager,
        AddressSpaceManager::new_const()
    ));
    let partition_info = match PartitionInfo::read_from_dt(
        &p,
        &mut dt_storage,
        address_space,
        static_options,
        can_trust_host,
    ) {
        Ok(val) => val,
        Err(e) => panic!("unable to read device tree params {}", e),
    };

    // Enable logging ASAP. This is fine even when isolated, as we don't have
    // any access to secrets in the boot shim.
    boot_logger_runtime_init(p.isolation_type, partition_info.com3_serial_available);
    log!("openhcl_boot: logging enabled");
    boot_logger_write_memory_log_to_runtime();

    // Confidential debug will show up in boot_options only if included in the
    // static command line, or if can_trust_host is true (so the dynamic command
    // line has been parsed).
    let is_confidential_debug =
        static_confidential_debug || partition_info.boot_options.confidential_debug;

    // Fill out the non-devicetree derived parts of PartitionInfo.
    if !p.isolation_type.is_hardware_isolated()
        && hvcall().vtl() == Vtl::Vtl2
        && hvdef::HvRegisterVsmCapabilities::from(
            hvcall()
                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
                .expect("failed to query vsm capabilities")
                .as_u64(),
        )
        .vtl0_alias_map_available()
    {
        // If the vtl0 alias map was not provided in the devicetree, attempt to
        // derive it from the architectural physical address bits.
        //
        // The value in the ID_AA64MMFR0_EL1 register used to determine the
        // physical address bits can only represent multiples of 4. As a result,
        // the Surface Pro X (and systems with similar CPUs) cannot properly
        // report their address width of 39 bits. This causes the calculated
        // alias map to be incorrect, which results in panics when trying to
        // read memory and getting invalid data.
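        //
        // For example (illustrative): with 48 physical address bits, the
        // derived alias map bit is 1 << 47, i.e. 0x8000_0000_0000.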
        if partition_info.vtl0_alias_map.is_none() {
            partition_info.vtl0_alias_map =
                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
        }
    } else {
        // Ignore any devicetree-provided alias map if the conditions above
        // aren't met.
        partition_info.vtl0_alias_map = None;
    }

    // Rebind partition_info as no longer mutable.
    let partition_info: &PartitionInfo = partition_info;

    if partition_info.cpus.is_empty() {
        panic!("no cpus");
    }

    validate_vp_hw_ids(partition_info);

    setup_vtl2_memory(&p, partition_info, address_space);
    setup_vtl2_vp(partition_info);

    verify_imported_regions_hash(&p);

    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
    let sidecar = sidecar::start_sidecar(
        &p,
        partition_info,
        address_space,
        &mut sidecar_params.0,
        &mut sidecar_output.0,
    );

    // Rebind address_space as no longer mutable.
    let address_space: &AddressSpaceManager = address_space;

    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
    build_kernel_command_line(BuildKernelCommandLineParams {
        params: &p,
        cmdline: &mut cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar: sidecar.as_ref(),
        vtl2_pool_supported: address_space.has_vtl2_pool(),
    })
    .unwrap();

    let mut fdt = off_stack!(Fdt, zeroed());
    fdt.header.len = fdt.data.len() as u32;
    fdt.header.ty = SETUP_DTB;

    #[cfg(target_arch = "x86_64")]
    let mut setup_data_tail = &mut fdt.header;
    #[cfg(target_arch = "x86_64")]
    let setup_data_head = core::ptr::from_ref(setup_data_tail);

    #[cfg(target_arch = "x86_64")]
    if p.isolation_type == IsolationType::Snp {
        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
        build_cc_blob_sev_info(cc_blob, &p);

        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;

        // Chain in the setup data.
        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
        setup_data_tail = &mut cc_data.header;
    }

    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;

    // Validate the initrd crc matches what was put at file generation time.
    let computed_crc = crc32fast::hash(p.initrd());
    assert_eq!(
        computed_crc, p.initrd_crc,
        "computed initrd crc does not match build time calculated crc"
    );

    #[cfg(target_arch = "x86_64")]
    let boot_params = x86_boot::build_boot_params(
        address_space,
        initrd.clone(),
        &cmdline,
        setup_data_head,
        &mut setup_data_tail,
    );

    // Compute the ending boot time. This has to be before writing to device
    // tree, so this is as late as we can do it.

    let boot_times = boot_reftime.map(|start| BootTimes {
        start,
        end: get_ref_time(p.isolation_type).unwrap_or(0),
    });

    // Validate that any pending imported regions are part of VTL2 ram; an
    // imported region outside of VTL2 ram must have been preaccepted.
    for (range, result) in walk_ranges(
        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
        p.imported_regions(),
    ) {
        match result {
            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
            RangeWalkResult::Right(accepted) => {
                // Ranges that are not a part of VTL2 ram must have been
                // preaccepted, as usermode expects that to be the case.
                assert!(
                    accepted,
                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
                    range
                );
            }
        }
    }

    write_dt(
        &mut fdt.data,
        partition_info,
        address_space,
        p.imported_regions().map(|r| {
            // Discard if the range was previously pending - the bootloader has
            // accepted all pending ranges.
            //
            // NOTE: No VTL0 memory today is marked as pending. The check above
            // validates that, and this code may need to change if this becomes
            // no longer true.
            r.0
        }),
        initrd,
        &cmdline,
        sidecar.as_ref(),
        boot_times,
        p.isolation_type,
    )
    .unwrap();

    rt::verify_stack_cookie();

    log!("uninitializing hypercalls, about to jump to kernel");
    hvcall().uninitialize();

    cfg_if::cfg_if! {
        if #[cfg(target_arch = "x86_64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            kernel_entry(0, &boot_params.0)
        } else if #[cfg(target_arch = "aarch64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
            // Flush (and invalidate) the caches, as that is required for disabling MMU.
            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
            unsafe {
                core::arch::asm!(
                    "
                    mrs     {0}, sctlr_el1
                    bic     {0}, {0}, #0x1
                    msr     sctlr_el1, {0}
                    tlbi    vmalle1
                    dsb     sy
                    isb     sy",
                    lateout(reg) _,
                );
            }
            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
        } else {
            panic!("unsupported arch")
        }
    }
}

/// Ensure that mshv VP indexes for the CPUs listed in the partition info
/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
/// this will be the case.
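/// For example, the CPU described by devicetree node `cpu@3` must have mshv
/// VP index 3.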
fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
    use host_params::MAX_CPU_COUNT;
    use hypercall::HwId;

    if partition_info.isolation.is_hardware_isolated() {
        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
        // with the hypervisor here, so we can't easily perform the check. Since
        // there is no security impact to this check, we can skip it for now; if
        // the VM fails to boot, then this is due to a host contract violation.
        //
        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
        // indexes correspond to the APIC IDs in the right order. I am not
        // certain if there are places where we depend on this mapping today.
        return;
    }

    if hvcall().vtl() != Vtl::Vtl2 {
        // If we're not using guest VSM, then the guest won't communicate
        // directly with the hypervisor, so we can choose the VP indexes
        // ourselves.
        return;
    }

    // Ensure the host and hypervisor agree on VP index ordering.

    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
    hw_ids.clear();
    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
    vp_indexes.clear();
    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
        panic!(
            "failed to get VP index for hardware ID {:#x}: {}",
            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
            err
        );
    }
    if let Some((i, &vp_index)) = vp_indexes
        .iter()
        .enumerate()
        .find(|&(i, vp_index)| i as u32 != *vp_index)
    {
        panic!(
            "CPU hardware ID {:#x} does not correspond to VP index {}",
            hw_ids[i], vp_index
        );
    }
}

// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}

#[cfg(test)]
mod test {
    use super::x86_boot::E820Ext;
    use super::x86_boot::build_e820_map;
    use crate::cmdline::BootCommandLineOptions;
    use crate::dt::write_dt;
    use crate::host_params::MAX_CPU_COUNT;
    use crate::host_params::PartitionInfo;
    use crate::host_params::shim_params::IsolationType;
    use crate::memory::AddressSpaceManager;
    use crate::memory::AddressSpaceManagerBuilder;
    use arrayvec::ArrayString;
    use arrayvec::ArrayVec;
    use core::ops::Range;
    use host_fdt_parser::CpuEntry;
    use host_fdt_parser::MemoryEntry;
    use host_fdt_parser::VmbusInfo;
    use igvm_defs::MemoryMapEntryType;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use memory_range::MemoryRange;
    use memory_range::subtract_ranges;
    use zerocopy::FromZeros;

    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;

    /// Create partition info with the given cpu count and sequential apic ids.
    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();

        for id in 0..(cpu_count as u64) {
            cpus.push(CpuEntry { reg: id, vnode: 0 });
        }

        let mut mmio = ArrayVec::new();
        mmio.push(
            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
        );

        PartitionInfo {
            vtl2_ram: ArrayVec::new(),
            partition_ram: ArrayVec::new(),
            isolation: IsolationType::None,
            bsp_reg: cpus[0].reg as u32,
            cpus,
            cmdline: ArrayString::new(),
            vmbus_vtl2: VmbusInfo {
                mmio,
                connection_id: 0,
            },
            vmbus_vtl0: VmbusInfo {
                mmio: ArrayVec::new(),
                connection_id: 0,
            },
            com3_serial_available: false,
            gic: None,
            pmu_gsiv: None,
            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
            entropy: None,
            vtl0_alias_map: None,
            nvme_keepalive: false,
            boot_options: BootCommandLineOptions::new(),
        }
    }

    // ensure we can boot with a _lot_ of vcpus
    #[test]
    #[cfg_attr(
        target_arch = "aarch64",
        ignore = "TODO: investigate why this doesn't always work on ARM"
    )]
    fn fdt_cpu_scaling() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();
    }

    // Must match the DeviceTree blob generated with the standard tooling to
    // ensure compliance with the standard (or, at least, compatibility with a
    // widely used implementation). For details on regenerating the test
    // content, see `fdt_dtc_decompile` below.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        assert!(sample_buf == buf);
    }

    // This test should be manually enabled when you need to regenerate the
    // sample content above and validate spec compliance with `dtc`.
    // Before running the test, please install the DeviceTree compiler:
    // ```shell
    // sudo apt-get update && sudo apt-get install device-tree-compiler
    // ```
    #[test]
    #[ignore = "enabling the test requires installing additional software, \
                and developers will experience a break."]
    fn fdt_dtc_decompile() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        let input_dtb_file_name = "openhcl_boot.dtb";
        let output_dts_file_name = "openhcl_boot.dts";
        std::fs::write(input_dtb_file_name, buf).unwrap();
        let success = std::process::Command::new("dtc")
            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
            .status()
            .unwrap()
            .success();
        assert!(success);
    }

    fn new_address_space_manager(
        ram: &[MemoryRange],
        bootshim_used: MemoryRange,
        persisted_range: MemoryRange,
        parameter_range: MemoryRange,
        reclaim: Option<MemoryRange>,
    ) -> AddressSpaceManager {
        let ram = ram
            .iter()
            .cloned()
            .map(|range| MemoryEntry {
                range,
                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                vnode: 0,
            })
            .collect::<Vec<_>>();
        let mut address_space = AddressSpaceManager::new_const();
        AddressSpaceManagerBuilder::new(
            &mut address_space,
            &ram,
            bootshim_used,
            persisted_range,
            subtract_ranges([parameter_range], reclaim),
        )
        .init()
        .unwrap();
        address_space
    }

    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
            .iter()
            .chain(
                ext.entries
                    .iter()
                    .take((ext.header.len as usize) / size_of::<e820entry>()),
            );

        assert_eq!(actual.clone().count(), expected.len());

        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
            let addr: u64 = actual.addr.into();
            let size: u64 = actual.size.into();
            let typ: u32 = actual.typ.into();
            assert_eq!(addr, expected_range.start);
            assert_eq!(size, expected_range.end - expected_range.start);
            assert_eq!(typ, *expected_type);
        }
    }

    const PAGE_SIZE: u64 = 0x1000;
    const ONE_MB: u64 = 0x10_0000;

    #[test]
    fn test_e820_basic() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..3 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..4 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            None,
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
            ],
        );

        // memmap with reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..6 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
            ],
        );

        // two mem ranges
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..10 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
            ],
        );

        // memmap in 1 mb chunks
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..2 * ONE_MB),
                MemoryRange::new(2 * ONE_MB..3 * ONE_MB),
                MemoryRange::new(3 * ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..5 * ONE_MB),
                MemoryRange::new(5 * ONE_MB..6 * ONE_MB),
                MemoryRange::new(6 * ONE_MB..7 * ONE_MB),
                MemoryRange::new(7 * ONE_MB..8 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..8 * ONE_MB, E820_RAM),
            ],
        );
    }

    // test e820 with spillover into ext
    #[test]
    fn test_e820_huge() {
        use crate::memory::AllocationPolicy;
        use crate::memory::AllocationType;

        // Create 64 RAM ranges, then allocate 256 ranges to test spillover;
        // boot_params.e820_map has E820_MAX_ENTRIES_ZEROPAGE (128) entries.
        const E820_MAX_ENTRIES_ZEROPAGE: usize = 128;
        const RAM_RANGES: usize = 64;
        const TOTAL_ALLOCATIONS: usize = 256;

        // Create 64 large RAM ranges (64 MB each).
        let mut ranges = Vec::new();
        for i in 0..RAM_RANGES {
            let start = (i as u64) * 64 * ONE_MB;
            let end = start + 64 * ONE_MB;
            ranges.push(MemoryRange::new(start..end));
        }

        let bootshim_used = MemoryRange::try_new(0..ONE_MB * 2).unwrap();
        let persisted_range = MemoryRange::try_new(0..ONE_MB).unwrap();
        let parameter_range = MemoryRange::try_new(ONE_MB..2 * ONE_MB).unwrap();

        let mut address_space = {
            let ram = ranges
                .iter()
                .cloned()
                .map(|range| MemoryEntry {
                    range,
                    mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                    vnode: 0,
                })
                .collect::<Vec<_>>();
            let mut address_space = AddressSpaceManager::new_const();
            AddressSpaceManagerBuilder::new(
                &mut address_space,
                &ram,
                bootshim_used,
                persisted_range,
                core::iter::once(parameter_range),
            )
            .init()
            .unwrap();
            address_space
        };

        for i in 0..TOTAL_ALLOCATIONS {
            // Intersperse sidecar node allocations with gpa pool allocations,
            // as otherwise the address space manager will collapse adjacent
            // ranges of the same type.
            let _allocated = address_space
                .allocate(
                    None,
                    ONE_MB,
                    if i % 2 == 0 {
                        AllocationType::GpaPool
                    } else {
                        AllocationType::SidecarNode
                    },
                    AllocationPolicy::LowMemory,
                )
                .expect("should be able to allocate sidecar node");
        }

        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let total_ranges = address_space.vtl2_ranges().count();

        let used_ext = build_e820_map(&mut boot_params, &mut ext, &address_space).unwrap();

        // Verify that we used the extension
        assert!(used_ext, "should use extension when there are many ranges");

        // Verify the standard e820_map is full
        assert_eq!(boot_params.e820_entries, E820_MAX_ENTRIES_ZEROPAGE as u8);

        // Verify the extension has the overflow entries
        let ext_entries = (ext.header.len as usize) / size_of::<e820entry>();
        assert_eq!(ext_entries, total_ranges - E820_MAX_ENTRIES_ZEROPAGE);

        // Verify we have the expected number of total ranges
        let total_e820_entries = boot_params.e820_entries as usize + ext_entries;
        assert_eq!(total_e820_entries, total_ranges);
    }
}
1356}