// openhcl_boot/main.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! The openhcl boot loader, which loads before the kernel to set up the
//! kernel's boot parameters.

// See build.rs.
8#![cfg_attr(minimal_rt, no_std, no_main)]
9// UNSAFETY: Interacting with low level hardware and bootloader primitives.
10#![expect(unsafe_code)]
11
12mod arch;
13mod boot_logger;
14mod cmdline;
15mod dt;
16mod host_params;
17mod hypercall;
18mod memory;
19mod rt;
20mod sidecar;
21mod single_threaded;
22
23use crate::arch::setup_vtl2_memory;
24use crate::arch::setup_vtl2_vp;
25#[cfg(target_arch = "x86_64")]
26use crate::arch::tdx::get_tdx_tsc_reftime;
27use crate::arch::verify_imported_regions_hash;
28use crate::boot_logger::boot_logger_init;
29use crate::boot_logger::log;
30use crate::hypercall::hvcall;
31use crate::memory::AddressSpaceManager;
32use crate::single_threaded::OffStackRef;
33use crate::single_threaded::off_stack;
34use arrayvec::ArrayString;
35use arrayvec::ArrayVec;
36use cmdline::BootCommandLineOptions;
37use core::fmt::Write;
38use dt::BootTimes;
39use dt::write_dt;
40use host_params::COMMAND_LINE_SIZE;
41use host_params::PartitionInfo;
42use host_params::shim_params::IsolationType;
43use host_params::shim_params::ShimParams;
44use hvdef::Vtl;
45use loader_defs::linux::SETUP_DTB;
46use loader_defs::linux::setup_data;
47use loader_defs::shim::ShimParamsRaw;
48use memory_range::RangeWalkResult;
49use memory_range::walk_ranges;
50use minimal_rt::enlightened_panic::enable_enlightened_panic;
51use sidecar::SidecarConfig;
52use sidecar_defs::SidecarOutput;
53use sidecar_defs::SidecarParams;
54use zerocopy::FromBytes;
55use zerocopy::FromZeros;
56use zerocopy::Immutable;
57use zerocopy::IntoBytes;
58use zerocopy::KnownLayout;
59
/// Error indicating that the assembled kernel command line exceeded the
/// fixed-size buffer capacity.
#[derive(Debug)]
struct CommandLineTooLong;

/// Formatting into a fixed-capacity buffer fails with [`core::fmt::Error`]
/// when the buffer is full; map that error onto this type so `?` can be used
/// while building the command line.
impl From<core::fmt::Error> for CommandLineTooLong {
    fn from(_err: core::fmt::Error) -> Self {
        CommandLineTooLong
    }
}
68
/// Read and setup the underhill kernel command line into the specified buffer.
///
/// Parameters are appended in a fixed order: common kernel parameters,
/// architecture-specific parameters, swiotlb sizing (hardware-isolated vs
/// not), console selection, confidentiality flags, optional NVMe keep-alive
/// and sidecar parameters, the VTL2 vmbus connection id, and finally — only
/// when the host can be trusted — the host-provided command line from
/// `partition_info`.
///
/// Returns `Err(CommandLineTooLong)` if the assembled parameters do not fit
/// in `COMMAND_LINE_SIZE` bytes.
fn build_kernel_command_line(
    params: &ShimParams,
    cmdline: &mut ArrayString<COMMAND_LINE_SIZE>,
    partition_info: &PartitionInfo,
    can_trust_host: bool,
    is_confidential_debug: bool,
    sidecar: Option<&SidecarConfig<'_>>,
    vtl2_pool_supported: bool,
) -> Result<(), CommandLineTooLong> {
    // For reference:
    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
    const KERNEL_PARAMETERS: &[&str] = &[
        // If a console is specified, then write everything to it.
        "loglevel=8",
        // Use a fixed 128KB log buffer by default.
        "log_buf_len=128K",
        // Enable time output on console for ohcldiag-dev.
        "printk.time=1",
        // Enable facility and level output on console for ohcldiag-dev.
        "console_msg_format=syslog",
        // Set uio parameter to configure vmbus ring buffer behavior.
        "uio_hv_generic.no_mask=1",
        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
        // huge pages and the shared pages.
        "coredump_filter=0x33",
        // PERF: No processor frequency governing.
        "cpufreq.off=1",
        // PERF: Disable the CPU idle time management entirely. It does not
        // prevent the idle loop from running on idle CPUs, but it prevents
        // the CPU idle time governors and drivers from being invoked.
        "cpuidle.off=1",
        // PERF: No perf checks for crypto algorithms to boot faster.
        // Would have to evaluate the perf wins on the crypto manager vs
        // delaying the boot up.
        "cryptomgr.notests",
        // PERF: Idle threads use HLT on x64 if there is no work.
        // Believed to be a compromise between waking up the processor
        // and the power consumption.
        "idle=halt",
        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
        // Boot Flag) or allocate the real-mode trampoline for APs.
        "initcall_blacklist=init_real_mode,sbf_init",
        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
        "lpj=3000000",
        // PERF: No broken timer check to boot faster.
        "no_timer_check",
        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions being
        // much slower. The xsave state is shared between VTLs, and we don't
        // context switch it in the kernel when leaving/entering VTL2.
        // Removing this will lead to corrupting register state and the
        // undefined behaviour.
        "noxsave",
        // RELIABILITY: Panic on MCEs and faults in the kernel.
        "oops=panic",
        // RELIABILITY: Don't panic on kernel warnings.
        "panic_on_warn=0",
        // PERF, RELIABILITY: Don't print detailed information about the failing
        // processes (memory maps, threads).
        "panic_print=0",
        // RELIABILITY: Reboot immediately on panic, no timeout.
        "panic=-1",
        // RELIABILITY: Don't print processor context information on a fatal
        // signal. Our crash dump collection infrastructure seems reliable, and
        // this information doesn't seem useful without a dump anyways.
        // Additionally it may push important logs off the end of the kmsg
        // page logged by the host.
        // NOTE(review): the parameter below is commented out even though the
        // rationale above describes it as active — confirm whether disabling
        // it was intentional.
        //"print_fatal_signals=0",
        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
        "printk.devkmsg=on",
        // RELIABILITY: Reboot using a triple fault as the fastest method.
        // That is also the method used for compatibility with earlier versions
        // of the Microsoft HCL.
        "reboot=t",
        // CONFIG-STATIC: Type of the root file system.
        "rootfstype=tmpfs",
        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
        // scheduler timer periodically, which introduces jitters for VTL0.
        "sysctl.vm.compaction_proactiveness=0",
        // PERF: No TSC stability check when booting up to boot faster,
        // also no validation during runtime.
        "tsc=reliable",
        // RELIABILITY: Panic on receiving an NMI.
        "unknown_nmi_panic=1",
        // Use vfio for MANA devices.
        "vfio_pci.ids=1414:00ba",
        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
        // and no DMA translation.
        "vfio.enable_unsafe_noiommu_mode=1",
        // Specify the init path.
        "rdinit=/underhill-init",
        // Default to user-mode NVMe driver.
        "OPENHCL_NVME_VFIO=1",
        // The next three items reduce the memory overhead of the storvsc driver.
        // Since it is only used for DVD, performance is not critical.
        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
        // Fix number of hardware queues at 2.
        "hv_storvsc.storvsc_max_hw_queues=2",
        // Reduce the ring buffer size to 32K.
        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
        // Disable eager mimalloc commit to prevent core dumps from being overly large
        "MIMALLOC_ARENA_EAGER_COMMIT=0",
    ];

    const X86_KERNEL_PARAMETERS: &[&str] = &[
        // Disable pcid support. This is a temporary fix to allow
        // Underhill to run nested inside AMD VMs. Otherwise, the
        // Underhill kernel tries to start APs with PCID bits set in CR3
        // without the PCIDE bit set in CR4, which is an invalid
        // VP state (according to the mshv nested implementation).
        //
        // TODO: remove this once we figure out the root cause and apply
        // a workaround/fix elsewhere.
        "clearcpuid=pcid",
        // Disable all attempts to use an IOMMU, including swiotlb.
        "iommu=off",
        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
        // this changes, we will explicitly enumerate a PCI bus via devicetree.
        "pci=off",
    ];

    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];

    for p in KERNEL_PARAMETERS {
        write!(cmdline, "{p} ")?;
    }

    // Select the per-architecture parameter set at compile time.
    let arch_parameters = if cfg!(target_arch = "x86_64") {
        X86_KERNEL_PARAMETERS
    } else {
        AARCH64_KERNEL_PARAMETERS
    };
    for p in arch_parameters {
        write!(cmdline, "{p} ")?;
    }

    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely), and CVMs (memory encryption forces it
        // on). Set it to a single area in 8MB. The first parameter controls the
        // area size in slabs (2KB per slab), the second controls the number of
        // areas (default is # of CPUs).
        //
        // This is set to 8MB on hardware isolated VMs since there are some
        // scenarios, such as provisioning over DVD, which require a larger size
        // since the buffer is being used.
        "swiotlb=4096,1",
    ];

    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
        // The first parameter controls the area size, the second controls the
        // number of areas (default is # of CPUs). Set them both to the minimum.
        "swiotlb=1,1",
    ];

    if params.isolation_type.is_hardware_isolated() {
        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    } else {
        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    }

    // Enable the com3 console by default if it's available and we're not
    // isolated, or if we are isolated but also have debugging enabled.
    //
    // Otherwise, set the console to ttynull so the kernel does not default to
    // com1. This is overridden by any user customizations in the static or
    // dynamic command line, as this console argument provided by the bootloader
    // comes first.
    let console = if partition_info.com3_serial_available && can_trust_host {
        "ttyS2,115200"
    } else {
        "ttynull"
    };
    write!(cmdline, "console={console} ")?;

    // Mark the environment as confidential when running with any isolation.
    if params.isolation_type != IsolationType::None {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
        )?;
    }

    if is_confidential_debug {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
        )?;
    }

    // Only when explicitly supported by Host.
    // TODO: Move from command line to device tree when stabilized.
    if partition_info.nvme_keepalive && vtl2_pool_supported {
        write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=1 ")?;
    }

    if let Some(sidecar) = sidecar {
        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
    }

    // HACK: Set the vmbus connection id via kernel commandline.
    //
    // This code will be removed when the kernel supports setting connection id
    // via device tree.
    write!(
        cmdline,
        "hv_vmbus.message_connection_id=0x{:x} ",
        partition_info.vmbus_vtl2.connection_id
    )?;

    // If we're isolated we can't trust the host-provided cmdline
    if can_trust_host {
        // Prepend the computed parameters to the original command line.
        cmdline.write_str(&partition_info.cmdline)?;
    }

    Ok(())
}
294
// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
// that is the maximum size the kernel can use during its early boot processes.
// We also want our FDT to be as large as possible to support as many vCPUs as
// possible. We set it to 256KB, but it must also be page-aligned, as leaving it
// unaligned runs the possibility of it taking up 1 too many pages, resulting in
// a 260KB mapping, which will fail.
const FDT_SIZE: usize = 256 * 1024;

/// The flattened device tree (FDT) wrapped in a Linux `setup_data` header so
/// it can be chained into the kernel's boot parameters.
///
/// The whole struct is exactly `FDT_SIZE` bytes and page-aligned, per the
/// comment on [`FDT_SIZE`].
#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    // setup_data header; `ty` is set to SETUP_DTB and `len` to the data size
    // before handing off to the kernel.
    header: setup_data,
    // FDT blob storage, sized so header + data total FDT_SIZE bytes.
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
309
310/// Raw shim parameters are provided via a relative offset from the base of
311/// where the shim is loaded. Return a ShimParams structure based on the raw
312/// offset based RawShimParams.
313fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
314    unsafe extern "C" {
315        static __ehdr_start: u8;
316    }
317
318    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;
319
320    // SAFETY: The host is required to relocate everything by the same bias, so
321    //         the shim parameters should be at the build time specified offset
322    //         from the base address of the image.
323    let raw_shim_params = unsafe {
324        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
325    };
326
327    ShimParams::new(shim_base as u64, raw_shim_params)
328}
329
#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
mod x86_boot {
    use crate::PageAlign;
    use crate::memory::AddressSpaceManager;
    use crate::single_threaded::OffStackRef;
    use crate::single_threaded::off_stack;
    use crate::zeroed;
    use core::mem::size_of;
    use core::ops::Range;
    use core::ptr;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::SETUP_E820_EXT;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use loader_defs::linux::setup_data;
    use loader_defs::shim::MemoryVtlType;
    use memory_range::MemoryRange;
    use zerocopy::FromZeros;
    use zerocopy::Immutable;
    use zerocopy::KnownLayout;

    /// Overflow storage for e820 entries that do not fit in the fixed
    /// `boot_params.e820_map` array, chained to the kernel via a
    /// `SETUP_E820_EXT` `setup_data` node.
    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        pub header: setup_data,
        pub entries: [e820entry; 512],
    }

    /// Write `range` as an e820 entry of type `typ` into `entry`, or fail
    /// with [`BuildE820MapError::OutOfE820Entries`] when the slot iterator is
    /// exhausted (`entry` is `None`).
    fn add_e820_entry(
        entry: Option<&mut e820entry>,
        range: MemoryRange,
        typ: u32,
    ) -> Result<(), BuildE820MapError> {
        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
            addr: range.start().into(),
            size: range.len().into(),
            typ: typ.into(),
        };
        Ok(())
    }

    /// Errors that can occur while building the e820 map.
    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Out of e820 entries.
        OutOfE820Entries,
    }

    /// Build the e820 map for the kernel representing usable VTL2 ram.
    ///
    /// Entries fill the fixed `boot_params.e820_map` first, then spill into
    /// `ext`. Returns `Ok(true)` if `ext` was used (the caller must then
    /// chain it into the `setup_data` list), `Ok(false)` otherwise.
    ///
    /// Panics if the address space reports a range type that should never
    /// appear in the VTL2 map.
    pub fn build_e820_map(
        boot_params: &mut boot_params,
        ext: &mut E820Ext,
        address_space: &AddressSpaceManager,
    ) -> Result<bool, BuildE820MapError> {
        boot_params.e820_entries = 0;
        // A single logical sequence of slots: the fixed table followed by the
        // extension table.
        let mut entries = boot_params
            .e820_map
            .iter_mut()
            .chain(ext.entries.iter_mut());

        let mut n = 0;
        for (range, typ) in address_space.vtl2_ranges() {
            match typ {
                MemoryVtlType::VTL2_RAM => {
                    add_e820_entry(entries.next(), range, E820_RAM)?;
                    n += 1;
                }
                MemoryVtlType::VTL2_CONFIG
                | MemoryVtlType::VTL2_SIDECAR_IMAGE
                | MemoryVtlType::VTL2_SIDECAR_NODE
                | MemoryVtlType::VTL2_RESERVED
                | MemoryVtlType::VTL2_GPA_POOL
                | MemoryVtlType::VTL2_TDX_PAGE_TABLES => {
                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
                    n += 1;
                }

                _ => {
                    panic!("unexpected vtl2 ram type {typ:?} for range {range:#?}");
                }
            }
        }

        // Entries that fit the fixed table are reported via `e820_entries`;
        // any remainder is described by the extension header's byte length.
        let base = n.min(boot_params.e820_map.len());
        boot_params.e820_entries = base as u8;

        if base < n {
            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
            Ok(true)
        } else {
            Ok(false)
        }
    }

    /// Build the Linux `boot_params` handed to the kernel: loader type,
    /// hardware subarch workaround, initrd location/size, e820 map, command
    /// line pointer, and the head of the `setup_data` chain.
    ///
    /// `setup_data_tail` is advanced to the e820 extension header if the
    /// fixed e820 table overflowed and the extension was chained in.
    pub fn build_boot_params(
        address_space: &AddressSpaceManager,
        initrd: Range<u64>,
        cmdline: &str,
        setup_data_head: *const setup_data,
        setup_data_tail: &mut &mut setup_data,
    ) -> OffStackRef<'static, PageAlign<boot_params>> {
        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
        let boot_params = &mut boot_params_storage.0;
        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type

        // HACK: A kernel change just in the Underhill kernel tree has a workaround
        // to disable probe_roms and reserve_bios_regions when X86_SUBARCH_LGUEST
        // (1) is set by the bootloader. This stops the kernel from reading VTL0
        // memory during kernel boot, which can have catastrophic consequences
        // during a servicing operation when VTL0 has written values to memory, or
        // unaccepted page accesses in an isolated partition.
        //
        // This is only intended as a stopgap until a suitable upstreamable kernel
        // patch is made.
        boot_params.hdr.hardware_subarch = 1.into();

        // Split the 64-bit initrd address and size across the legacy 32-bit
        // header fields and their `ext_` extension fields.
        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
        let initrd_len = initrd.end - initrd.start;
        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;

        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));

        let used_ext = build_e820_map(boot_params, e820_ext, address_space)
            .expect("building e820 map must succeed");

        if used_ext {
            // The fixed table overflowed: publish the extension table as a
            // SETUP_E820_EXT node and make it the new chain tail.
            e820_ext.header.ty = SETUP_E820_EXT;
            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
            *setup_data_tail = &mut e820_ext.header;
        }

        // Split the command line pointer across the legacy and extension
        // fields, like the initrd fields above.
        let cmd_line_addr = cmdline.as_ptr() as u64;
        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;

        boot_params.hdr.setup_data = (setup_data_head as u64).into();

        boot_params_storage
    }
}
472
473/// Build the cc_blob containing the location of different parameters associated with SEV.
474#[cfg(target_arch = "x86_64")]
475fn build_cc_blob_sev_info(
476    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
477    shim_params: &ShimParams,
478) {
479    // TODO SNP: Currently only the first CPUID page is passed through.
480    // Consider changing this.
481    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
482    cc_blob.version = 0;
483    cc_blob._reserved = 0;
484    cc_blob.secrets_phys = shim_params.secrets_start();
485    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
486    cc_blob._rsvd1 = 0;
487    cc_blob.cpuid_phys = shim_params.cpuid_start();
488    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
489    cc_blob._rsvd2 = 0;
490}
491
/// Wrapper forcing 4096-byte (page) alignment of `T`.
#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);

/// Const-construct an all-zero `T`.
const fn zeroed<T: FromZeros>() -> T {
    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
}
500
/// Return the current reference time for boot-time measurement, if a source
/// is available for the given isolation type.
fn get_ref_time(isolation: IsolationType) -> Option<u64> {
    match isolation {
        // TDX: derive the reference time from the TSC (see arch::tdx).
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => get_tdx_tsc_reftime(),
        // SNP: no reference time is reported here.
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => None,
        // Everything else reads the reference time via minimal_rt.
        _ => Some(minimal_rt::reftime::reference_time()),
    }
}
510
/// Main entry point of the boot shim, invoked by the architecture-specific
/// bootstrap in `mod rt`. Reads the host-provided parameters, sets up VTL2
/// memory and VPs, builds the kernel command line, device tree, and (on
/// x86_64) `boot_params`, then jumps to the kernel. Never returns.
fn shim_main(shim_params_raw_offset: isize) -> ! {
    let p = shim_parameters(shim_params_raw_offset);
    if p.isolation_type == IsolationType::None {
        enable_enlightened_panic();
    }

    let boot_reftime = get_ref_time(p.isolation_type);

    // The support code for the fast hypercalls does not set
    // the Guest ID if it is not set yet as opposed to the slow
    // hypercall code path where that is done automatically.
    // Thus the fast hypercalls will fail as the Guest ID has
    // to be set first; hence initialize hypercall support
    // explicitly.
    if !p.isolation_type.is_hardware_isolated() {
        hvcall().initialize();
    }

    // Parse boot options from the static (build-time) command line.
    let mut static_options = BootCommandLineOptions::new();
    if let Some(cmdline) = p.command_line().command_line() {
        static_options.parse(cmdline);
    }

    let static_confidential_debug = static_options.confidential_debug;
    let can_trust_host = p.isolation_type == IsolationType::None || static_confidential_debug;

    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
    let address_space = OffStackRef::leak(off_stack!(
        AddressSpaceManager,
        AddressSpaceManager::new_const()
    ));
    let partition_info = match PartitionInfo::read_from_dt(
        &p,
        &mut dt_storage,
        address_space,
        static_options,
        can_trust_host,
    ) {
        Ok(val) => val,
        Err(e) => panic!("unable to read device tree params {}", e),
    };

    // Enable logging ASAP. This is fine even when isolated, as we don't have
    // any access to secrets in the boot shim.
    boot_logger_init(p.isolation_type, partition_info.com3_serial_available);
    log!("openhcl_boot: logging enabled");

    // Confidential debug will show up in boot_options only if included in the
    // static command line, or if can_trust_host is true (so the dynamic command
    // line has been parsed).
    let is_confidential_debug =
        static_confidential_debug || partition_info.boot_options.confidential_debug;

    // Fill out the non-devicetree derived parts of PartitionInfo.
    if !p.isolation_type.is_hardware_isolated()
        && hvcall().vtl() == Vtl::Vtl2
        && hvdef::HvRegisterVsmCapabilities::from(
            hvcall()
                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
                .expect("failed to query vsm capabilities")
                .as_u64(),
        )
        .vtl0_alias_map_available()
    {
        // If the vtl0 alias map was not provided in the devicetree, attempt to
        // derive it from the architectural physical address bits.
        //
        // The value in the ID_AA64MMFR0_EL1 register used to determine the
        // physical address bits can only represent multiples of 4. As a result,
        // the Surface Pro X (and systems with similar CPUs) cannot properly
        // report their address width of 39 bits. This causes the calculated
        // alias map to be incorrect, which results in panics when trying to
        // read memory and getting invalid data.
        if partition_info.vtl0_alias_map.is_none() {
            partition_info.vtl0_alias_map =
                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
        }
    } else {
        // Ignore any devicetree-provided alias map if the conditions above
        // aren't met.
        partition_info.vtl0_alias_map = None;
    }

    // Rebind partition_info as no longer mutable.
    let partition_info: &PartitionInfo = partition_info;

    if partition_info.cpus.is_empty() {
        panic!("no cpus");
    }

    validate_vp_hw_ids(partition_info);

    setup_vtl2_memory(&p, partition_info);
    setup_vtl2_vp(partition_info);

    verify_imported_regions_hash(&p);

    // Start the sidecar (if enabled) before assembling the main kernel's
    // boot data.
    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
    let sidecar = sidecar::start_sidecar(
        &p,
        partition_info,
        address_space,
        &mut sidecar_params.0,
        &mut sidecar_output.0,
    );

    // Rebind address_space as no longer mutable.
    let address_space: &AddressSpaceManager = address_space;

    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
    build_kernel_command_line(
        &p,
        &mut cmdline,
        partition_info,
        can_trust_host,
        is_confidential_debug,
        sidecar.as_ref(),
        address_space.has_vtl2_pool(),
    )
    .unwrap();

    // Prepare the FDT setup_data blob; the device tree contents are written
    // later (write_dt), after boot times are known.
    let mut fdt = off_stack!(Fdt, zeroed());
    fdt.header.len = fdt.data.len() as u32;
    fdt.header.ty = SETUP_DTB;

    #[cfg(target_arch = "x86_64")]
    let mut setup_data_tail = &mut fdt.header;
    #[cfg(target_arch = "x86_64")]
    let setup_data_head = core::ptr::from_ref(setup_data_tail);

    #[cfg(target_arch = "x86_64")]
    if p.isolation_type == IsolationType::Snp {
        // SNP: describe the secrets and cpuid pages to the kernel via the
        // cc_blob, linked in through a SETUP_CC_BLOB setup_data node.
        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
        build_cc_blob_sev_info(cc_blob, &p);

        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;

        // Chain in the setup data.
        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
        setup_data_tail = &mut cc_data.header;
    }

    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;

    // Validate the initrd crc matches what was put at file generation time.
    let computed_crc = crc32fast::hash(p.initrd());
    assert_eq!(
        computed_crc, p.initrd_crc,
        "computed initrd crc does not match build time calculated crc"
    );

    #[cfg(target_arch = "x86_64")]
    let boot_params = x86_boot::build_boot_params(
        address_space,
        initrd.clone(),
        &cmdline,
        setup_data_head,
        &mut setup_data_tail,
    );

    // Compute the ending boot time. This has to be before writing to device
    // tree, so this is as late as we can do it.

    let boot_times = boot_reftime.map(|start| BootTimes {
        start,
        end: get_ref_time(p.isolation_type).unwrap_or(0),
    });

    // Validate that imported regions which are not part of VTL2 ram were
    // already accepted at launch.
    for (range, result) in walk_ranges(
        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
        p.imported_regions(),
    ) {
        match result {
            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
            RangeWalkResult::Right(accepted) => {
                // Ranges that are not a part of VTL2 ram must have been
                // preaccepted, as usermode expect that to be the case.
                assert!(
                    accepted,
                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
                    range
                );
            }
        }
    }

    write_dt(
        &mut fdt.data,
        partition_info,
        address_space,
        p.imported_regions().map(|r| {
            // Discard if the range was previously pending - the bootloader has
            // accepted all pending ranges.
            //
            // NOTE: No VTL0 memory today is marked as pending. The check above
            // validates that, and this code may need to change if this becomes
            // no longer true.
            r.0
        }),
        initrd,
        &cmdline,
        sidecar.as_ref(),
        boot_times,
        p.isolation_type,
    )
    .unwrap();

    rt::verify_stack_cookie();

    log!("uninitializing hypercalls, about to jump to kernel");
    hvcall().uninitialize();

    cfg_if::cfg_if! {
        if #[cfg(target_arch = "x86_64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            kernel_entry(0, &boot_params.0)
        } else if #[cfg(target_arch = "aarch64")] {
            // SAFETY: the parameter blob is trusted.
            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
                unsafe { core::mem::transmute(p.kernel_entry_address) };
            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
            // Flush (and invalidate) the caches, as that is required for disabling MMU.
            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
            unsafe {
                core::arch::asm!(
                    "
                    mrs     {0}, sctlr_el1
                    bic     {0}, {0}, #0x1
                    msr     sctlr_el1, {0}
                    tlbi    vmalle1
                    dsb     sy
                    isb     sy",
                    lateout(reg) _,
                );
            }
            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
        } else {
            panic!("unsupported arch")
        }
    }
}
760
/// Ensure that mshv VP indexes for the CPUs listed in the partition info
/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
/// this will be the case.
///
/// Panics if the hypervisor reports a VP index ordering different from the
/// host-provided CPU list, or if querying the indexes fails.
fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
    use host_params::MAX_CPU_COUNT;
    use hypercall::HwId;

    if partition_info.isolation.is_hardware_isolated() {
        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
        // with the hypervisor here, so we can't easily perform the check. Since
        // there is no security impact to this check, we can skip it for now; if
        // the VM fails to boot, then this is due to a host contract violation.
        //
        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
        // indexes correspond to the APIC IDs in the right order. I am not
        // certain if there are places where we depend on this mapping today.
        return;
    }

    if hvcall().vtl() != Vtl::Vtl2 {
        // If we're not using guest VSM, then the guest won't communicate
        // directly with the hypervisor, so we can choose the VP indexes
        // ourselves.
        return;
    }

    // Ensure the host and hypervisor agree on VP index ordering.

    // NOTE(review): `off_stack!` presumably hands out statically-allocated
    // storage rather than a freshly-constructed value, which would explain the
    // explicit `clear()` before each use — confirm against the macro's
    // definition in `single_threaded`.
    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
    hw_ids.clear();
    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
    vp_indexes.clear();
    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
        // `vp_indexes.len()` counts the IDs translated before the failure, so
        // it indexes the first hardware ID that failed; the `min` clamps the
        // index in bounds (assumes `hw_ids` is non-empty — TODO confirm the
        // host always reports at least one CPU).
        panic!(
            "failed to get VP index for hardware ID {:#x}: {}",
            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
            err
        );
    }
    // The hypervisor must report VP index N for the Nth CPU in the host's
    // list; any mismatch violates the assumption documented above.
    if let Some((i, &vp_index)) = vp_indexes
        .iter()
        .enumerate()
        .find(|&(i, vp_index)| i as u32 != *vp_index)
    {
        panic!(
            "CPU hardware ID {:#x} does not correspond to VP index {}",
            hw_ids[i], vp_index
        );
    }
}
812
// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
//
// This stub exists only so the crate builds as an ordinary binary when the
// `minimal_rt` cfg is not set; it is not a functional boot loader.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}
819
820#[cfg(test)]
821mod test {
822    use super::x86_boot::E820Ext;
823    use super::x86_boot::build_e820_map;
824    use crate::cmdline::BootCommandLineOptions;
825    use crate::dt::write_dt;
826    use crate::host_params::MAX_CPU_COUNT;
827    use crate::host_params::PartitionInfo;
828    use crate::host_params::shim_params::IsolationType;
829    use crate::memory::AddressSpaceManager;
830    use crate::memory::AddressSpaceManagerBuilder;
831    use arrayvec::ArrayString;
832    use arrayvec::ArrayVec;
833    use core::ops::Range;
834    use host_fdt_parser::CpuEntry;
835    use host_fdt_parser::MemoryEntry;
836    use host_fdt_parser::VmbusInfo;
837    use igvm_defs::MemoryMapEntryType;
838    use loader_defs::linux::E820_RAM;
839    use loader_defs::linux::E820_RESERVED;
840    use loader_defs::linux::boot_params;
841    use loader_defs::linux::e820entry;
842    use memory_range::MemoryRange;
843    use memory_range::subtract_ranges;
844    use zerocopy::FromZeros;
845
    // Fixed MMIO layout used by the test partition info: a single 256 MiB
    // vmbus gap ending at the 64 GiB boundary.
    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;
849
850    /// Create partition info with given cpu count enabled and sequential
851    /// apic_ids.
852    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
853        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();
854
855        for id in 0..(cpu_count as u64) {
856            cpus.push(CpuEntry { reg: id, vnode: 0 });
857        }
858
859        let mut mmio = ArrayVec::new();
860        mmio.push(
861            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
862        );
863
864        PartitionInfo {
865            vtl2_ram: ArrayVec::new(),
866            partition_ram: ArrayVec::new(),
867            isolation: IsolationType::None,
868            bsp_reg: cpus[0].reg as u32,
869            cpus,
870            cmdline: ArrayString::new(),
871            vmbus_vtl2: VmbusInfo {
872                mmio,
873                connection_id: 0,
874            },
875            vmbus_vtl0: VmbusInfo {
876                mmio: ArrayVec::new(),
877                connection_id: 0,
878            },
879            com3_serial_available: false,
880            gic: None,
881            pmu_gsiv: None,
882            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
883            entropy: None,
884            vtl0_alias_map: None,
885            nvme_keepalive: false,
886            boot_options: BootCommandLineOptions::new(),
887        }
888    }
889
890    // ensure we can boot with a _lot_ of vcpus
891    #[test]
892    #[cfg_attr(
893        target_arch = "aarch64",
894        ignore = "TODO: investigate why this doesn't always work on ARM"
895    )]
896    fn fdt_cpu_scaling() {
897        const MAX_CPUS: usize = 2048;
898
899        let mut buf = [0; 0x40000];
900        write_dt(
901            &mut buf,
902            &new_partition_info(MAX_CPUS),
903            &AddressSpaceManager::new_const(),
904            [],
905            0..0,
906            &ArrayString::from("test").unwrap_or_default(),
907            None,
908            None,
909            IsolationType::None,
910        )
911        .unwrap();
912    }
913
    // Must match the DeviceTree blob generated with the standard tooling
    // to ensure being compliant to the standards (or, at least, compatibility
    // with a widely used implementation).
    // For details on regenerating the test content, see `fdt_dtc_decompile`
    // below.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        // Expected DTB content expressed as sparse (offset, bytes) spans; all
        // bytes between the spans are zero.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        // Materialize the expected buffer from the sparse spans.
        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &AddressSpaceManager::new_const(),
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        // Byte-for-byte comparison against the dtc-generated reference.
        assert!(sample_buf == buf);
    }
989
990    // This test should be manually enabled when need to regenerate
991    // the sample content above and validate spec compliance with `dtc`.
992    // Before running the test, please install the DeviceTree compiler:
993    // ```shell
994    // sudo apt-get update && sudo apt-get install device-tree-compiler
995    // ```
996    #[test]
997    #[ignore = "enabling the test requires installing additional software, \
998                and developers will experience a break."]
999    fn fdt_dtc_decompile() {
1000        const MAX_CPUS: usize = 2048;
1001
1002        let mut buf = [0; 0x40000];
1003        write_dt(
1004            &mut buf,
1005            &new_partition_info(MAX_CPUS),
1006            &AddressSpaceManager::new_const(),
1007            [],
1008            0..0,
1009            &ArrayString::from("test").unwrap_or_default(),
1010            None,
1011            None,
1012            IsolationType::None,
1013        )
1014        .unwrap();
1015
1016        let input_dtb_file_name = "openhcl_boot.dtb";
1017        let output_dts_file_name = "openhcl_boot.dts";
1018        std::fs::write(input_dtb_file_name, buf).unwrap();
1019        let success = std::process::Command::new("dtc")
1020            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
1021            .status()
1022            .unwrap()
1023            .success();
1024        assert!(success);
1025    }
1026
1027    fn new_address_space_manager(
1028        ram: &[MemoryRange],
1029        bootshim_used: MemoryRange,
1030        parameter_range: MemoryRange,
1031        reclaim: Option<MemoryRange>,
1032    ) -> AddressSpaceManager {
1033        let ram = ram
1034            .iter()
1035            .cloned()
1036            .map(|range| MemoryEntry {
1037                range,
1038                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
1039                vnode: 0,
1040            })
1041            .collect::<Vec<_>>();
1042        let mut address_space = AddressSpaceManager::new_const();
1043        AddressSpaceManagerBuilder::new(
1044            &mut address_space,
1045            &ram,
1046            bootshim_used,
1047            subtract_ranges([parameter_range], reclaim),
1048        )
1049        .init()
1050        .unwrap();
1051        address_space
1052    }
1053
1054    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
1055        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
1056            .iter()
1057            .chain(
1058                ext.entries
1059                    .iter()
1060                    .take((ext.header.len as usize) / size_of::<e820entry>()),
1061            );
1062
1063        assert_eq!(actual.clone().count(), expected.len());
1064
1065        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
1066            let addr: u64 = actual.addr.into();
1067            let size: u64 = actual.size.into();
1068            let typ: u32 = actual.typ.into();
1069            assert_eq!(addr, expected_range.start);
1070            assert_eq!(size, expected_range.end - expected_range.start);
1071            assert_eq!(typ, *expected_type);
1072        }
1073    }
1074
    // 1 MiB, the granularity used by the e820 tests below.
    const ONE_MB: u64 = 0x10_0000;
1076
1077    #[test]
1078    fn test_e820_basic() {
1079        // memmap with no param reclaim
1080        let mut boot_params: boot_params = FromZeros::new_zeroed();
1081        let mut ext = FromZeros::new_zeroed();
1082        let bootshim_used = MemoryRange::try_new(ONE_MB..3 * ONE_MB).unwrap();
1083        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
1084        let address_space = new_address_space_manager(
1085            &[MemoryRange::new(ONE_MB..4 * ONE_MB)],
1086            bootshim_used,
1087            parameter_range,
1088            None,
1089        );
1090
1091        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());
1092
1093        check_e820(
1094            &boot_params,
1095            &ext,
1096            &[
1097                (ONE_MB..2 * ONE_MB, E820_RAM),
1098                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
1099                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
1100            ],
1101        );
1102
1103        // memmap with reclaim
1104        let mut boot_params: boot_params = FromZeros::new_zeroed();
1105        let mut ext = FromZeros::new_zeroed();
1106        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
1107        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
1108        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
1109        let address_space = new_address_space_manager(
1110            &[MemoryRange::new(ONE_MB..6 * ONE_MB)],
1111            bootshim_used,
1112            parameter_range,
1113            Some(reclaim),
1114        );
1115
1116        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());
1117
1118        check_e820(
1119            &boot_params,
1120            &ext,
1121            &[
1122                (ONE_MB..2 * ONE_MB, E820_RAM),
1123                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
1124                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
1125                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
1126                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
1127            ],
1128        );
1129
1130        // two mem ranges
1131        let mut boot_params: boot_params = FromZeros::new_zeroed();
1132        let mut ext = FromZeros::new_zeroed();
1133        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
1134        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
1135        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
1136        let address_space = new_address_space_manager(
1137            &[
1138                MemoryRange::new(ONE_MB..4 * ONE_MB),
1139                MemoryRange::new(4 * ONE_MB..10 * ONE_MB),
1140            ],
1141            bootshim_used,
1142            parameter_range,
1143            Some(reclaim),
1144        );
1145
1146        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());
1147
1148        check_e820(
1149            &boot_params,
1150            &ext,
1151            &[
1152                (ONE_MB..2 * ONE_MB, E820_RAM),
1153                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
1154                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
1155                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
1156                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
1157            ],
1158        );
1159
1160        // memmap in 1 mb chunks
1161        let mut boot_params: boot_params = FromZeros::new_zeroed();
1162        let mut ext = FromZeros::new_zeroed();
1163        let bootshim_used = MemoryRange::try_new(ONE_MB..5 * ONE_MB).unwrap();
1164        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
1165        let reclaim = MemoryRange::try_new(3 * ONE_MB..4 * ONE_MB).unwrap();
1166        let address_space = new_address_space_manager(
1167            &[
1168                MemoryRange::new(ONE_MB..2 * ONE_MB),
1169                MemoryRange::new(2 * ONE_MB..3 * ONE_MB),
1170                MemoryRange::new(3 * ONE_MB..4 * ONE_MB),
1171                MemoryRange::new(4 * ONE_MB..5 * ONE_MB),
1172                MemoryRange::new(5 * ONE_MB..6 * ONE_MB),
1173                MemoryRange::new(6 * ONE_MB..7 * ONE_MB),
1174                MemoryRange::new(7 * ONE_MB..8 * ONE_MB),
1175            ],
1176            bootshim_used,
1177            parameter_range,
1178            Some(reclaim),
1179        );
1180
1181        assert!(build_e820_map(&mut boot_params, &mut ext, &address_space).is_ok());
1182
1183        check_e820(
1184            &boot_params,
1185            &ext,
1186            &[
1187                (ONE_MB..2 * ONE_MB, E820_RAM),
1188                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
1189                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
1190                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
1191                (5 * ONE_MB..8 * ONE_MB, E820_RAM),
1192            ],
1193        );
1194    }
1195
1196    // test e820 with spillover into ext
1197    #[test]
1198    fn test_e820_huge() {
1199        use crate::memory::AllocationPolicy;
1200        use crate::memory::AllocationType;
1201
1202        // Create 64 RAM ranges, then allocate 256 ranges to test spillover
1203        // boot_params.e820_map has E820_MAX_ENTRIES_ZEROPAGE (128) entries
1204        const E820_MAX_ENTRIES_ZEROPAGE: usize = 128;
1205        const RAM_RANGES: usize = 64;
1206        const TOTAL_ALLOCATIONS: usize = 256;
1207
1208        // Create 64 large RAM ranges (64MB each = 64 * 1MB pages per range)
1209        let mut ranges = Vec::new();
1210        for i in 0..RAM_RANGES {
1211            let start = (i as u64) * 64 * ONE_MB;
1212            let end = start + 64 * ONE_MB;
1213            ranges.push(MemoryRange::new(start..end));
1214        }
1215
1216        let bootshim_used = MemoryRange::try_new(0..ONE_MB).unwrap();
1217        let parameter_range = MemoryRange::try_new(0..ONE_MB).unwrap();
1218
1219        let mut address_space = {
1220            let ram = ranges
1221                .iter()
1222                .cloned()
1223                .map(|range| MemoryEntry {
1224                    range,
1225                    mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
1226                    vnode: 0,
1227                })
1228                .collect::<Vec<_>>();
1229            let mut address_space = AddressSpaceManager::new_const();
1230            AddressSpaceManagerBuilder::new(
1231                &mut address_space,
1232                &ram,
1233                bootshim_used,
1234                core::iter::once(parameter_range),
1235            )
1236            .init()
1237            .unwrap();
1238            address_space
1239        };
1240
1241        for i in 0..TOTAL_ALLOCATIONS {
1242            // Intersperse sidecar node allocations with gpa pool allocations,
1243            // as otherwise the address space manager will collapse adjacent
1244            // ranges of the same type.
1245            let _allocated = address_space
1246                .allocate(
1247                    None,
1248                    ONE_MB,
1249                    if i % 2 == 0 {
1250                        AllocationType::GpaPool
1251                    } else {
1252                        AllocationType::SidecarNode
1253                    },
1254                    AllocationPolicy::LowMemory,
1255                )
1256                .expect("should be able to allocate sidecar node");
1257        }
1258
1259        let mut boot_params: boot_params = FromZeros::new_zeroed();
1260        let mut ext = FromZeros::new_zeroed();
1261        let total_ranges = address_space.vtl2_ranges().count();
1262
1263        let used_ext = build_e820_map(&mut boot_params, &mut ext, &address_space).unwrap();
1264
1265        // Verify that we used the extension
1266        assert!(used_ext, "should use extension when there are many ranges");
1267
1268        // Verify the standard e820_map is full
1269        assert_eq!(boot_params.e820_entries, E820_MAX_ENTRIES_ZEROPAGE as u8);
1270
1271        // Verify the extension has the overflow entries
1272        let ext_entries = (ext.header.len as usize) / size_of::<e820entry>();
1273        assert_eq!(ext_entries, total_ranges - E820_MAX_ENTRIES_ZEROPAGE);
1274
1275        // Verify we have the expected number of total ranges
1276        let total_e820_entries = boot_params.e820_entries as usize + ext_entries;
1277        assert_eq!(total_e820_entries, total_ranges);
1278    }
1279}