openhcl_boot/main.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! The openhcl boot loader, which loads before the kernel to set up the
//! kernel's boot parameters.

// See build.rs.
#![cfg_attr(minimal_rt, no_std, no_main)]
// UNSAFETY: Interacting with low level hardware and bootloader primitives.
#![expect(unsafe_code)]
// Allow the allocator api when compiling with `RUSTFLAGS="--cfg nightly"`. This
// is used for some miri tests for testing the bump allocator.
//
// Do not use a normal feature, as that shows errors with rust-analyzer since
// most people are using stable and enable all features. We could remove this
// once the allocator_api feature is stable.
#![cfg_attr(nightly, feature(allocator_api))]

mod arch;
mod boot_logger;
mod cmdline;
mod dt;
mod host_params;
mod hypercall;
mod memory;
mod rt;
mod sidecar;
mod single_threaded;

use crate::arch::setup_vtl2_memory;
use crate::arch::setup_vtl2_vp;
#[cfg(target_arch = "x86_64")]
use crate::arch::tdx::get_tdx_tsc_reftime;
use crate::arch::verify_imported_regions_hash;
use crate::boot_logger::boot_logger_memory_init;
use crate::boot_logger::boot_logger_runtime_init;
use crate::hypercall::hvcall;
use crate::memory::AddressSpaceManager;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayString;
use arrayvec::ArrayVec;
use cmdline::BootCommandLineOptions;
use core::fmt::Write;
use dt::BootTimes;
use dt::write_dt;
use host_params::COMMAND_LINE_SIZE;
use host_params::PartitionInfo;
use host_params::shim_params::IsolationType;
use host_params::shim_params::ShimParams;
use hvdef::Vtl;
use loader_defs::linux::SETUP_DTB;
use loader_defs::linux::setup_data;
use loader_defs::shim::ShimParamsRaw;
use memory_range::RangeWalkResult;
use memory_range::walk_ranges;
use minimal_rt::enlightened_panic::enable_enlightened_panic;
use sidecar::SidecarConfig;
use sidecar_defs::SidecarOutput;
use sidecar_defs::SidecarParams;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

#[derive(Debug)]
struct CommandLineTooLong;

impl From<core::fmt::Error> for CommandLineTooLong {
    fn from(_: core::fmt::Error) -> Self {
        Self
    }
}

struct BuildKernelCommandLineParams<'a> {
    params: &'a ShimParams,
    cmdline: &'a mut ArrayString<COMMAND_LINE_SIZE>,
    partition_info: &'a PartitionInfo,
    can_trust_host: bool,
    is_confidential_debug: bool,
    sidecar: Option<&'a SidecarConfig<'a>>,
    vtl2_pool_supported: bool,
}

/// Read and set up the underhill kernel command line into the specified buffer.
fn build_kernel_command_line(
    fn_params: BuildKernelCommandLineParams<'_>,
) -> Result<(), CommandLineTooLong> {
    let BuildKernelCommandLineParams {
        params,
        cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar,
        vtl2_pool_supported,
    } = fn_params;

    // For reference:
    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
    const KERNEL_PARAMETERS: &[&str] = &[
        // If a console is specified, then write everything to it.
        "loglevel=8",
        // Use a fixed 128KB log buffer by default.
        "log_buf_len=128K",
        // Enable time output on console for ohcldiag-dev.
        "printk.time=1",
        // Enable facility and level output on console for ohcldiag-dev.
        "console_msg_format=syslog",
        // Set uio parameter to configure vmbus ring buffer behavior.
        "uio_hv_generic.no_mask=1",
        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
        // huge pages and the shared pages.
        "coredump_filter=0x33",
        // PERF: No processor frequency governing.
        "cpufreq.off=1",
        // PERF: Disable the CPU idle time management entirely. It does not
        // prevent the idle loop from running on idle CPUs, but it prevents
        // the CPU idle time governors and drivers from being invoked.
        "cpuidle.off=1",
        // PERF: No perf checks for crypto algorithms to boot faster.
        // Would have to evaluate the perf wins on the crypto manager vs
        // delaying the boot up.
        "cryptomgr.notests",
        // PERF: Idle threads use HLT on x64 if there is no work.
        // Believed to be a compromise between waking up the processor
        // and the power consumption.
        "idle=halt",
        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
        // Boot Flag) or allocate the real-mode trampoline for APs.
        "initcall_blacklist=init_real_mode,sbf_init",
        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
        "lpj=3000000",
        // PERF: No broken timer check to boot faster.
        "no_timer_check",
        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions much
        // slower. The xsave state is shared between VTLs, and we don't
        // context switch it in the kernel when leaving/entering VTL2.
        // Removing this will lead to corrupted register state and
        // undefined behavior.
        "noxsave",
        // RELIABILITY: Panic on MCEs and faults in the kernel.
        "oops=panic",
        // RELIABILITY: Don't panic on kernel warnings.
        "panic_on_warn=0",
        // PERF, RELIABILITY: Don't print detailed information about the failing
        // processes (memory maps, threads).
        "panic_print=0",
        // RELIABILITY: Reboot immediately on panic, no timeout.
        "panic=-1",
        // RELIABILITY: Don't print processor context information on a fatal
        // signal. Our crash dump collection infrastructure seems reliable, and
        // this information doesn't seem useful without a dump anyways.
        // Additionally it may push important logs off the end of the kmsg
        // page logged by the host.
        //"print_fatal_signals=0",
        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
        "printk.devkmsg=on",
        // RELIABILITY: Reboot using a triple fault as the fastest method.
        // That is also the method used for compatibility with earlier versions
        // of the Microsoft HCL.
        "reboot=t",
        // CONFIG-STATIC: Type of the root file system.
        "rootfstype=tmpfs",
        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
        // scheduler timer periodically, which introduces jitters for VTL0.
        "sysctl.vm.compaction_proactiveness=0",
        // PERF: No TSC stability check when booting, so the boot is faster,
        // and no TSC validation during runtime.
        "tsc=reliable",
        // RELIABILITY: Panic on receiving an NMI.
        "unknown_nmi_panic=1",
        // Use vfio for MANA devices.
        "vfio_pci.ids=1414:00ba",
        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
        // and no DMA translation.
        "vfio.enable_unsafe_noiommu_mode=1",
        // Specify the init path.
        "rdinit=/underhill-init",
        // Default to user-mode NVMe driver.
        "OPENHCL_NVME_VFIO=1",
        // The next three items reduce the memory overhead of the storvsc driver.
        // Since it is only used for DVD, performance is not critical.
        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
        // Fix number of hardware queues at 2.
        "hv_storvsc.storvsc_max_hw_queues=2",
        // Reduce the ring buffer size to 32K.
        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
        // Disable eager mimalloc commit to prevent core dumps from being
        // overly large.
        "MIMALLOC_ARENA_EAGER_COMMIT=0",
        // Disable acpi runtime support. Unused in underhill, but some support
        // is compiled in for the kernel (e.g. the TDX mailbox protocol).
        "acpi=off",
    ];

    const X86_KERNEL_PARAMETERS: &[&str] = &[
        // Disable pcid support. This is a temporary fix to allow
        // Underhill to run nested inside AMD VMs. Otherwise, the
        // Underhill kernel tries to start APs with PCID bits set in CR3
        // without the PCIDE bit set in CR4, which is an invalid
        // VP state (according to the mshv nested implementation).
        //
        // TODO: remove this once we figure out the root cause and apply
        // a workaround/fix elsewhere.
        "clearcpuid=pcid",
        // Disable all attempts to use an IOMMU, including swiotlb.
        "iommu=off",
        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
        // this changes, we will explicitly enumerate a PCI bus via devicetree.
        "pci=off",
    ];

    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];

    for p in KERNEL_PARAMETERS {
        write!(cmdline, "{p} ")?;
    }

    let arch_parameters = if cfg!(target_arch = "x86_64") {
        X86_KERNEL_PARAMETERS
    } else {
        AARCH64_KERNEL_PARAMETERS
    };
    for p in arch_parameters {
        write!(cmdline, "{p} ")?;
    }

    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off is ignored entirely) and on CVMs (memory encryption
        // forces it on). Set it to a single 8MB area. The first parameter
        // controls the area size in slabs (2KB per slab), the second controls
        // the number of areas (default is # of CPUs).
        //
        // This is set to 8MB on hardware isolated VMs because some scenarios,
        // such as provisioning over DVD, actively use the buffer and require
        // the larger size.
        "swiotlb=4096,1",
    ];

    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
        // The first parameter controls the area size, the second controls the
        // number of areas (default is # of CPUs). Set them both to the minimum.
        "swiotlb=1,1",
    ];

    if params.isolation_type.is_hardware_isolated() {
        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    } else {
        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    }

    // Enable the com3 console by default if it's available and we're not
    // isolated, or if we are isolated but also have debugging enabled.
    //
    // Otherwise, set the console to ttynull so the kernel does not default to
    // com1. This is overridden by any user customizations in the static or
    // dynamic command line, as this console argument provided by the bootloader
    // comes first.
    let console = if partition_info.com3_serial_available && can_trust_host {
        "ttyS2,115200"
    } else {
        "ttynull"
    };
    write!(cmdline, "console={console} ")?;

    if params.isolation_type != IsolationType::None {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
        )?;
    }

    if is_confidential_debug {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
        )?;
    }

    // Generate the NVMe keep-alive command line, which should look something
    // like: OPENHCL_NVME_KEEP_ALIVE=disabled,host,privatepool
    // TODO: Move from command line to device tree when stabilized.
    write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=")?;

    if partition_info.boot_options.disable_nvme_keep_alive {
        write!(cmdline, "disabled,")?;
    }

    if partition_info.nvme_keepalive {
        write!(cmdline, "host,")?;
    } else {
        write!(cmdline, "nohost,")?;
    }

    if vtl2_pool_supported {
        write!(cmdline, "privatepool ")?;
    } else {
        write!(cmdline, "noprivatepool ")?;
    }

    if let Some(sidecar) = sidecar {
        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
    }

    if !cmdline.contains("hv_vmbus.message_connection_id") {
        // HACK: Set the vmbus connection id via kernel commandline if we haven't
        // gotten one from elsewhere.
        //
        // This code will be removed when the kernel supports setting connection id
        // via device tree.
        write!(
            cmdline,
            "hv_vmbus.message_connection_id=0x{:x} ",
            partition_info.vmbus_vtl2.connection_id
        )?;
    }
    // Prepend the computed parameters to the original command line.
    cmdline.write_str(&partition_info.cmdline)?;

    Ok(())
}
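
// For illustration only (hypothetical values, an editorial addition rather
// than part of the original source): on a non-isolated boot with com3
// available, no sidecar, and no VTL2 pool, the function above produces a
// command line shaped roughly like
//
//   loglevel=8 log_buf_len=128K ... console=ttyS2,115200
//   OPENHCL_NVME_KEEP_ALIVE=nohost,noprivatepool
//   hv_vmbus.message_connection_id=0x... <host-provided command line>
//
// with the computed parameters always preceding the host's own additions.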

// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
// that is the maximum size the kernel can use during its early boot processes.
// We also want our FDT to be as large as possible to support as many vCPUs as
// possible. We set it to 256KB, and it must also be page-aligned: an unaligned
// FDT risks spilling into one extra page, producing a 260KB mapping, which
// will fail.
const FDT_SIZE: usize = 256 * 1024;

#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    header: setup_data,
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
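
// Minimal compile-time sanity checks (an editorial addition, not part of the
// original source): given the layout above, `Fdt` should exactly fill the
// 256KB region described by the comment on FDT_SIZE and be page-aligned.
const _: () = assert!(size_of::<Fdt>() == FDT_SIZE);
const _: () = assert!(align_of::<Fdt>() == 4096);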

/// Raw shim parameters are provided via a relative offset from the base of
/// where the shim is loaded. Return a ShimParams structure based on the raw,
/// offset-based ShimParamsRaw.
fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
    unsafe extern "C" {
        static __ehdr_start: u8;
    }

    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;

    // SAFETY: The host is required to relocate everything by the same bias, so
    //         the shim parameters should be at the build time specified offset
    //         from the base address of the image.
    let raw_shim_params = unsafe {
        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
    };

    ShimParams::new(shim_base as u64, raw_shim_params)
}
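
// Worked example (hypothetical numbers, added for illustration): if the
// image's ELF header is loaded at 0x1000_0000 and the build placed
// ShimParamsRaw 0x2000 bytes past it, shim_params_raw_offset is 0x2000 and
// the struct is read from 0x1000_2000. Relocation moves the image and the
// parameters by the same bias, so the relative offset stays valid.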

#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
mod x86_boot {
    use crate::PageAlign;
    use crate::memory::AddressSpaceManager;
    use crate::single_threaded::OffStackRef;
    use crate::single_threaded::off_stack;
    use crate::zeroed;
    use core::mem::size_of;
    use core::ops::Range;
    use core::ptr;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::SETUP_E820_EXT;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use loader_defs::linux::setup_data;
    use loader_defs::shim::MemoryVtlType;
    use memory_range::MemoryRange;
    use zerocopy::FromZeros;
    use zerocopy::Immutable;
    use zerocopy::KnownLayout;

    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        pub header: setup_data,
        pub entries: [e820entry; 512],
    }

    fn add_e820_entry(
        entry: Option<&mut e820entry>,
        range: MemoryRange,
        typ: u32,
    ) -> Result<(), BuildE820MapError> {
        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
            addr: range.start().into(),
            size: range.len().into(),
            typ: typ.into(),
        };
        Ok(())
    }

    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Out of e820 entries.
        OutOfE820Entries,
    }

    /// Build the e820 map for the kernel representing usable VTL2 ram.
    pub fn build_e820_map(
        boot_params: &mut boot_params,
        ext: &mut E820Ext,
        address_space: &AddressSpaceManager,
    ) -> Result<bool, BuildE820MapError> {
        boot_params.e820_entries = 0;
        let mut entries = boot_params
            .e820_map
            .iter_mut()
            .chain(ext.entries.iter_mut());

        let mut n = 0;
        for (range, typ) in address_space.vtl2_ranges() {
            match typ {
                MemoryVtlType::VTL2_RAM => {
                    add_e820_entry(entries.next(), range, E820_RAM)?;
                    n += 1;
                }
                MemoryVtlType::VTL2_CONFIG
                | MemoryVtlType::VTL2_SIDECAR_IMAGE
                | MemoryVtlType::VTL2_SIDECAR_NODE
                | MemoryVtlType::VTL2_RESERVED
                | MemoryVtlType::VTL2_GPA_POOL
                | MemoryVtlType::VTL2_TDX_PAGE_TABLES
                | MemoryVtlType::VTL2_BOOTSHIM_LOG_BUFFER
                | MemoryVtlType::VTL2_PERSISTED_STATE_HEADER
                | MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF => {
                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
                    n += 1;
                }

                _ => {
                    panic!("unexpected vtl2 ram type {typ:?} for range {range:#?}");
                }
            }
        }

        let base = n.min(boot_params.e820_map.len());
        boot_params.e820_entries = base as u8;

        if base < n {
            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
            Ok(true)
        } else {
            Ok(false)
        }
    }
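
    // Worked example (illustrative numbers, not from the original source): if
    // vtl2_ranges() yields 130 entries and boot_params.e820_map holds 128,
    // then base = 128, e820_entries = 128, and the remaining 2 entries land
    // in the extension with header.len = 2 * size_of::<e820entry>(), so the
    // function returns Ok(true).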

    pub fn build_boot_params(
        address_space: &AddressSpaceManager,
        initrd: Range<u64>,
        cmdline: &str,
        setup_data_head: *const setup_data,
        setup_data_tail: &mut &mut setup_data,
    ) -> OffStackRef<'static, PageAlign<boot_params>> {
        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
        let boot_params = &mut boot_params_storage.0;
        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type

        // HACK: A kernel change present only in the Underhill kernel tree has a
        // workaround to disable probe_roms and reserve_bios_regions when
        // X86_SUBARCH_LGUEST (1) is set by the bootloader. This stops the kernel
        // from reading VTL0 memory during kernel boot, which can have
        // catastrophic consequences during a servicing operation when VTL0 has
        // written values to memory, or can cause unaccepted page accesses in an
        // isolated partition.
        //
        // This is only intended as a stopgap until a suitable upstreamable kernel
        // patch is made.
        boot_params.hdr.hardware_subarch = 1.into();

        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
        let initrd_len = initrd.end - initrd.start;
        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;

        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));

        let used_ext = build_e820_map(boot_params, e820_ext, address_space)
            .expect("building e820 map must succeed");

        if used_ext {
            e820_ext.header.ty = SETUP_E820_EXT;
            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
            *setup_data_tail = &mut e820_ext.header;
        }

        let cmd_line_addr = cmdline.as_ptr() as u64;
        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;

        boot_params.hdr.setup_data = (setup_data_head as u64).into();

        boot_params_storage
    }
}

/// Build the cc_blob containing the location of different parameters associated with SEV.
#[cfg(target_arch = "x86_64")]
fn build_cc_blob_sev_info(
    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
    shim_params: &ShimParams,
) {
    // TODO SNP: Currently only the first CPUID page is passed through.
    // Consider changing this.
    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
    cc_blob.version = 0;
    cc_blob._reserved = 0;
    cc_blob.secrets_phys = shim_params.secrets_start();
    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd1 = 0;
    cc_blob.cpuid_phys = shim_params.cpuid_start();
    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd2 = 0;
}

#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);

const fn zeroed<T: FromZeros>() -> T {
    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
}

fn get_ref_time(isolation: IsolationType) -> Option<u64> {
    match isolation {
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => get_tdx_tsc_reftime(),
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => None,
        _ => Some(minimal_rt::reftime::reference_time()),
    }
}

fn shim_main(shim_params_raw_offset: isize) -> ! {
    let p = shim_parameters(shim_params_raw_offset);
    if p.isolation_type == IsolationType::None {
        enable_enlightened_panic();
    }

    // Enable the in-memory log.
    boot_logger_memory_init(p.log_buffer);

    // Enable global log crate.
    log::set_logger(&boot_logger::BOOT_LOGGER).unwrap();
    // TODO: allow overriding filter at runtime
    log::set_max_level(log::LevelFilter::Info);

    let boot_reftime = get_ref_time(p.isolation_type);

    // The fast hypercall support code does not set the Guest ID if it is not
    // already set, unlike the slow hypercall code path, where that is done
    // automatically. Fast hypercalls fail unless the Guest ID is set first,
    // so initialize hypercall support explicitly.
    if !p.isolation_type.is_hardware_isolated() {
        hvcall().initialize();
    }

    let mut static_options = BootCommandLineOptions::new();
    if let Some(cmdline) = p.command_line().command_line() {
        static_options.parse(cmdline);
    }

    let static_confidential_debug = static_options.confidential_debug;
    let can_trust_host = p.isolation_type == IsolationType::None || static_confidential_debug;

    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
    let address_space = OffStackRef::leak(off_stack!(
        AddressSpaceManager,
        AddressSpaceManager::new_const()
    ));
    let partition_info = match PartitionInfo::read_from_dt(
        &p,
        &mut dt_storage,
        address_space,
        static_options,
        can_trust_host,
    ) {
        Ok(val) => val,
        Err(e) => panic!("unable to read device tree params {}", e),
    };

    // Enable logging ASAP. This is fine even when isolated, as we don't have
    // any access to secrets in the boot shim.
    boot_logger_runtime_init(p.isolation_type, partition_info.com3_serial_available);
    log::info!("openhcl_boot: logging enabled");

    // Confidential debug will show up in boot_options only if included in the
    // static command line, or if can_trust_host is true (so the dynamic command
    // line has been parsed).
    let is_confidential_debug =
        static_confidential_debug || partition_info.boot_options.confidential_debug;

    // Fill out the non-devicetree derived parts of PartitionInfo.
    if !p.isolation_type.is_hardware_isolated()
        && hvcall().vtl() == Vtl::Vtl2
        && hvdef::HvRegisterVsmCapabilities::from(
            hvcall()
                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
                .expect("failed to query vsm capabilities")
                .as_u64(),
        )
        .vtl0_alias_map_available()
    {
        // If the vtl0 alias map was not provided in the devicetree, attempt to
        // derive it from the architectural physical address bits.
        //
        // The value in the ID_AA64MMFR0_EL1 register used to determine the
        // physical address bits can only represent multiples of 4. As a result,
        // the Surface Pro X (and systems with similar CPUs) cannot properly
        // report their address width of 39 bits. This causes the calculated
        // alias map to be incorrect, which results in panics when trying to
        // read memory and getting invalid data.
        if partition_info.vtl0_alias_map.is_none() {
            partition_info.vtl0_alias_map =
                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
        }
    } else {
        // Ignore any devicetree-provided alias map if the conditions above
        // aren't met.
        partition_info.vtl0_alias_map = None;
    }

    // Rebind partition_info as no longer mutable.
    let partition_info: &PartitionInfo = partition_info;

    if partition_info.cpus.is_empty() {
        panic!("no cpus");
    }

    validate_vp_hw_ids(partition_info);

    setup_vtl2_memory(&p, partition_info, address_space);
    setup_vtl2_vp(partition_info);

    verify_imported_regions_hash(&p);

    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
    let sidecar = sidecar::start_sidecar(
        &p,
        partition_info,
        address_space,
        &mut sidecar_params.0,
        &mut sidecar_output.0,
    );

    // Rebind address_space as no longer mutable.
    let address_space: &AddressSpaceManager = address_space;

    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
    build_kernel_command_line(BuildKernelCommandLineParams {
        params: &p,
        cmdline: &mut cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar: sidecar.as_ref(),
        vtl2_pool_supported: address_space.has_vtl2_pool(),
    })
    .unwrap();

    let mut fdt = off_stack!(Fdt, zeroed());
    fdt.header.len = fdt.data.len() as u32;
    fdt.header.ty = SETUP_DTB;

    #[cfg(target_arch = "x86_64")]
    let mut setup_data_tail = &mut fdt.header;
    #[cfg(target_arch = "x86_64")]
    let setup_data_head = core::ptr::from_ref(setup_data_tail);

    #[cfg(target_arch = "x86_64")]
    if p.isolation_type == IsolationType::Snp {
        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
        build_cc_blob_sev_info(cc_blob, &p);

        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;

        // Chain in the setup data.
        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
        setup_data_tail = &mut cc_data.header;
    }

    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;

    // Validate that the initrd crc matches the value computed at file
    // generation time.
    let computed_crc = crc32fast::hash(p.initrd());
    assert_eq!(
        computed_crc, p.initrd_crc,
        "computed initrd crc does not match build time calculated crc"
    );

    #[cfg(target_arch = "x86_64")]
    let boot_params = x86_boot::build_boot_params(
        address_space,
        initrd.clone(),
        &cmdline,
        setup_data_head,
        &mut setup_data_tail,
    );

    // Compute the ending boot time. This has to happen before writing to the
    // device tree, so this is as late as we can do it.
    let boot_times = boot_reftime.map(|start| BootTimes {
        start,
        end: get_ref_time(p.isolation_type).unwrap_or(0),
    });

    // Validate that all pending imported regions are part of VTL2 ram; any
    // imported region outside VTL2 ram must have been preaccepted at launch.
    for (range, result) in walk_ranges(
        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
        p.imported_regions(),
    ) {
        match result {
            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
            RangeWalkResult::Right(accepted) => {
                // Ranges that are not a part of VTL2 ram must have been
                // preaccepted, as usermode expects that to be the case.
                assert!(
                    accepted,
                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
                    range
                );
            }
        }
    }

    write_dt(
        &mut fdt.data,
        partition_info,
        address_space,
        p.imported_regions().map(|r| {
            // Discard if the range was previously pending - the bootloader has
            // accepted all pending ranges.
            //
            // NOTE: No VTL0 memory today is marked as pending. The check above
            // validates that, and this code may need to change if this becomes
            // no longer true.
            r.0
        }),
        initrd,
        &cmdline,
        sidecar.as_ref(),
        boot_times,
        p.isolation_type,
    )
    .unwrap();

    rt::verify_stack_cookie();

    log::info!("uninitializing hypercalls, about to jump to kernel");
    hvcall().uninitialize();

    cfg_if::cfg_if! {
        if #[cfg(target_arch = "x86_64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            kernel_entry(0, &boot_params.0)
        } else if #[cfg(target_arch = "aarch64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
            // Flush (and invalidate) the caches, as that is required for disabling MMU.
            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
            unsafe {
                core::arch::asm!(
                    "
                    mrs     {0}, sctlr_el1
                    bic     {0}, {0}, #0x1
                    msr     sctlr_el1, {0}
                    tlbi    vmalle1
                    dsb     sy
                    isb     sy",
                    lateout(reg) _,
                );
            }
            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
        } else {
            panic!("unsupported arch")
        }
    }
}

/// Ensure that mshv VP indexes for the CPUs listed in the partition info
/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
/// this will be the case.
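///
/// For example (hypothetical values for illustration): if the devicetree
/// lists cpu@0 and cpu@1 with hardware IDs [0, 1], the hypervisor must report
/// VP indexes [0, 1] for them; any permutation triggers the panic below.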
fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
    use host_params::MAX_CPU_COUNT;
    use hypercall::HwId;

    if partition_info.isolation.is_hardware_isolated() {
        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
        // with the hypervisor here, so we can't easily perform the check. Since
        // there is no security impact to this check, we can skip it for now; if
        // the VM fails to boot, then this is due to a host contract violation.
        //
        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
        // indexes correspond to the APIC IDs in the right order. I am not
        // certain if there are places where we depend on this mapping today.
        return;
    }

    if hvcall().vtl() != Vtl::Vtl2 {
        // If we're not using guest VSM, then the guest won't communicate
        // directly with the hypervisor, so we can choose the VP indexes
        // ourselves.
        return;
    }

    // Ensure the host and hypervisor agree on VP index ordering.

    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
    hw_ids.clear();
    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
    vp_indexes.clear();
    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
        panic!(
            "failed to get VP index for hardware ID {:#x}: {}",
            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
            err
        );
    }
    if let Some((i, &vp_index)) = vp_indexes
        .iter()
        .enumerate()
        .find(|&(i, vp_index)| i as u32 != *vp_index)
    {
        panic!(
            "CPU hardware ID {:#x} does not correspond to VP index {}",
            hw_ids[i], vp_index
        );
    }
}

// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}

#[cfg(test)]
mod test {
    use super::x86_boot::E820Ext;
    use super::x86_boot::build_e820_map;
    use crate::cmdline::BootCommandLineOptions;
    use crate::dt::write_dt;
    use crate::host_params::MAX_CPU_COUNT;
    use crate::host_params::PartitionInfo;
    use crate::host_params::shim_params::IsolationType;
    use crate::memory::AddressSpaceManager;
    use crate::memory::AddressSpaceManagerBuilder;
    use arrayvec::ArrayString;
    use arrayvec::ArrayVec;
    use core::ops::Range;
    use host_fdt_parser::CpuEntry;
    use host_fdt_parser::MemoryEntry;
    use host_fdt_parser::VmbusInfo;
    use igvm_defs::MemoryMapEntryType;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use memory_range::MemoryRange;
    use memory_range::subtract_ranges;
    use zerocopy::FromZeros;

    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;

    /// Create partition info with the given cpu count and sequential
    /// apic_ids.
    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();

        for id in 0..(cpu_count as u64) {
            cpus.push(CpuEntry { reg: id, vnode: 0 });
        }

        let mut mmio = ArrayVec::new();
        mmio.push(
            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
        );

        PartitionInfo {
            vtl2_ram: ArrayVec::new(),
            partition_ram: ArrayVec::new(),
            isolation: IsolationType::None,
            bsp_reg: cpus[0].reg as u32,
            cpus,
            cmdline: ArrayString::new(),
            vmbus_vtl2: VmbusInfo {
                mmio,
                connection_id: 0,
            },
            vmbus_vtl0: VmbusInfo {
                mmio: ArrayVec::new(),
                connection_id: 0,
            },
            com3_serial_available: false,
            gic: None,
            pmu_gsiv: None,
            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
            entropy: None,
            vtl0_alias_map: None,
            nvme_keepalive: false,
            boot_options: BootCommandLineOptions::new(),
        }
    }

    // Ensure we can boot with a _lot_ of vcpus.
    #[test]
    #[cfg_attr(
        target_arch = "aarch64",
        ignore = "TODO: investigate why this doesn't always work on ARM"
    )]
    fn fdt_cpu_scaling() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();
    }

    // Must match the DeviceTree blob generated with the standard tooling
    // to ensure compliance with the standard (or, at least, compatibility
    // with a widely used implementation).
    // For details on regenerating the test content, see `fdt_dtc_decompile`
    // below.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        assert!(sample_buf == buf);
    }

    // This test should be manually enabled when you need to regenerate
    // the sample content above and validate spec compliance with `dtc`.
    // Before running the test, please install the DeviceTree compiler:
    // ```shell
    // sudo apt-get update && sudo apt-get install device-tree-compiler
    // ```
    #[test]
    #[ignore = "enabling the test requires installing additional software, \
                and developers will experience a break."]
    fn fdt_dtc_decompile() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        let input_dtb_file_name = "openhcl_boot.dtb";
        let output_dts_file_name = "openhcl_boot.dts";
        std::fs::write(input_dtb_file_name, buf).unwrap();
        let success = std::process::Command::new("dtc")
            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
            .status()
            .unwrap()
            .success();
        assert!(success);
    }

    fn new_address_space_manager(
        ram: &[MemoryRange],
        bootshim_used: MemoryRange,
        persisted_range: MemoryRange,
        parameter_range: MemoryRange,
        reclaim: Option<MemoryRange>,
    ) -> AddressSpaceManager {
        let ram = ram
            .iter()
            .cloned()
            .map(|range| MemoryEntry {
                range,
                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                vnode: 0,
            })
            .collect::<Vec<_>>();
        let mut address_space = AddressSpaceManager::new_const();
        AddressSpaceManagerBuilder::new(
            &mut address_space,
            &ram,
            bootshim_used,
            persisted_range,
            subtract_ranges([parameter_range], reclaim),
        )
        .init()
        .unwrap();
        address_space
    }

    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
            .iter()
            .chain(
                ext.entries
                    .iter()
                    .take((ext.header.len as usize) / size_of::<e820entry>()),
            );

        assert_eq!(actual.clone().count(), expected.len());

        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
            let addr: u64 = actual.addr.into();
            let size: u64 = actual.size.into();
            let typ: u32 = actual.typ.into();
            assert_eq!(addr, expected_range.start);
            assert_eq!(size, expected_range.end - expected_range.start);
            assert_eq!(typ, *expected_type);
        }
    }

    const PAGE_SIZE: u64 = 0x1000;
    const ONE_MB: u64 = 0x10_0000;

    #[test]
    fn test_e820_basic() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..3 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..4 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            None,
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
            ],
        );

        // memmap with reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[MemoryRange::new(ONE_MB..6 * ONE_MB)],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
            ],
        );

        // two mem ranges
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..10 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
            ],
        );

        // memmap in 1 mb chunks
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
        let persisted_header_end = ONE_MB + PAGE_SIZE;
        let persisted_end = ONE_MB + 4 * PAGE_SIZE;
        let persisted_state = MemoryRange::try_new(ONE_MB..persisted_end).unwrap();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
        let address_space = new_address_space_manager(
            &[
                MemoryRange::new(ONE_MB..2 * ONE_MB),
                MemoryRange::new(2 * ONE_MB..3 * ONE_MB),
                MemoryRange::new(3 * ONE_MB..4 * ONE_MB),
                MemoryRange::new(4 * ONE_MB..5 * ONE_MB),
                MemoryRange::new(5 * ONE_MB..6 * ONE_MB),
                MemoryRange::new(6 * ONE_MB..7 * ONE_MB),
                MemoryRange::new(7 * ONE_MB..8 * ONE_MB),
            ],
            bootshim_used,
            persisted_state,
            parameter_range,
            Some(reclaim),
        );

        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..(persisted_header_end), E820_RESERVED),
                (persisted_header_end..persisted_end, E820_RESERVED),
                (persisted_end..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..8 * ONE_MB, E820_RAM),
            ],
        );
    }

    // Test e820 with spillover into the extension.
    #[test]
    fn test_e820_huge() {
        use crate::memory::AllocationPolicy;
        use crate::memory::AllocationType;

        // Create 64 RAM ranges, then allocate 256 ranges to test spillover;
        // boot_params.e820_map has E820_MAX_ENTRIES_ZEROPAGE (128) entries.
        const E820_MAX_ENTRIES_ZEROPAGE: usize = 128;
        const RAM_RANGES: usize = 64;
        const TOTAL_ALLOCATIONS: usize = 256;

        // Create 64 large RAM ranges (64MB each, i.e. 64 one-MB chunks per range).
        let mut ranges = Vec::new();
        for i in 0..RAM_RANGES {
            let start = (i as u64) * 64 * ONE_MB;
            let end = start + 64 * ONE_MB;
            ranges.push(MemoryRange::new(start..end));
        }

        let bootshim_used = MemoryRange::try_new(0..ONE_MB * 2).unwrap();
        let persisted_range = MemoryRange::try_new(0..ONE_MB).unwrap();
        let parameter_range = MemoryRange::try_new(ONE_MB..2 * ONE_MB).unwrap();

        let mut address_space = {
            let ram = ranges
                .iter()
                .cloned()
                .map(|range| MemoryEntry {
                    range,
                    mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                    vnode: 0,
                })
                .collect::<Vec<_>>();
            let mut address_space = AddressSpaceManager::new_const();
            AddressSpaceManagerBuilder::new(
                &mut address_space,
                &ram,
                bootshim_used,
                persisted_range,
                core::iter::once(parameter_range),
            )
            .init()
            .unwrap();
            address_space
        };

        for i in 0..TOTAL_ALLOCATIONS {
            // Intersperse sidecar node allocations with gpa pool allocations,
            // as otherwise the address space manager will collapse adjacent
            // ranges of the same type.
            let _allocated = address_space
                .allocate(
                    None,
                    ONE_MB,
                    if i % 2 == 0 {
                        AllocationType::GpaPool
                    } else {
                        AllocationType::SidecarNode
                    },
                    AllocationPolicy::LowMemory,
                )
                .expect("should be able to allocate sidecar node");
        }

        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let total_ranges = address_space.vtl2_ranges().count();

        let used_ext = build_e820_map(&mut boot_params, &mut ext, &address_space).unwrap();

        // Verify that we used the extension.
        assert!(used_ext, "should use extension when there are many ranges");

        // Verify the standard e820_map is full.
        assert_eq!(boot_params.e820_entries, E820_MAX_ENTRIES_ZEROPAGE as u8);

        // Verify the extension has the overflow entries.
        let ext_entries = (ext.header.len as usize) / size_of::<e820entry>();
        assert_eq!(ext_entries, total_ranges - E820_MAX_ENTRIES_ZEROPAGE);

        // Verify we have the expected number of total ranges.
        let total_e820_entries = boot_params.e820_entries as usize + ext_entries;
        assert_eq!(total_e820_entries, total_ranges);
    }
}
1358}