// openhcl_boot/main.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The openhcl boot loader, which loads before the kernel to set up the
5//! kernel's boot parameters.
6
7// See build.rs.
8#![cfg_attr(minimal_rt, no_std, no_main)]
9// UNSAFETY: Interacting with low level hardware and bootloader primitives.
10#![expect(unsafe_code)]
11
12mod arch;
13mod boot_logger;
14mod cmdline;
15mod dt;
16mod host_params;
17mod hypercall;
18mod rt;
19mod sidecar;
20mod single_threaded;
21
22use crate::arch::setup_vtl2_memory;
23use crate::arch::setup_vtl2_vp;
24#[cfg(target_arch = "x86_64")]
25use crate::arch::tdx::get_tdx_tsc_reftime;
26use crate::arch::verify_imported_regions_hash;
27use crate::boot_logger::boot_logger_init;
28use crate::boot_logger::log;
29use crate::hypercall::hvcall;
30use crate::single_threaded::off_stack;
31use arrayvec::ArrayString;
32use arrayvec::ArrayVec;
33use boot_logger::LoggerType;
34use cmdline::BootCommandLineOptions;
35use core::fmt::Write;
36use dt::BootTimes;
37use dt::write_dt;
38use host_params::COMMAND_LINE_SIZE;
39use host_params::PartitionInfo;
40use host_params::shim_params::IsolationType;
41use host_params::shim_params::ShimParams;
42use hvdef::Vtl;
43use loader_defs::linux::SETUP_DTB;
44use loader_defs::linux::setup_data;
45use loader_defs::shim::ShimParamsRaw;
46use memory_range::MemoryRange;
47use memory_range::RangeWalkResult;
48use memory_range::merge_adjacent_ranges;
49use memory_range::walk_ranges;
50use minimal_rt::enlightened_panic::enable_enlightened_panic;
51use sidecar::SidecarConfig;
52use sidecar_defs::SidecarOutput;
53use sidecar_defs::SidecarParams;
54use single_threaded::OffStackRef;
55use zerocopy::FromBytes;
56use zerocopy::FromZeros;
57use zerocopy::Immutable;
58use zerocopy::IntoBytes;
59use zerocopy::KnownLayout;
60
/// Error indicating the assembled kernel command line did not fit in the
/// fixed-size `ArrayString` buffer.
#[derive(Debug)]
struct CommandLineTooLong;

impl From<core::fmt::Error> for CommandLineTooLong {
    // Formatting into an `ArrayString` only fails when the backing buffer is
    // out of capacity, so a `fmt::Error` maps directly to "too long".
    fn from(_err: core::fmt::Error) -> Self {
        CommandLineTooLong
    }
}
69
/// Read and setup the underhill kernel command line into the specified buffer.
///
/// Arguments are appended in a fixed order: common kernel parameters,
/// arch-specific parameters, isolation-dependent swiotlb sizing, console
/// selection, confidentiality markers, optional NVMe keep-alive and sidecar
/// arguments, and finally — only when `can_trust_host` — the host-provided
/// command line. The host's portion goes last so its settings take precedence
/// over the bootloader-provided defaults.
///
/// Returns `Err(CommandLineTooLong)` if the buffer runs out of space.
fn build_kernel_command_line(
    params: &ShimParams,
    cmdline: &mut ArrayString<COMMAND_LINE_SIZE>,
    partition_info: &PartitionInfo,
    can_trust_host: bool,
    is_confidential_debug: bool,
    sidecar: Option<&SidecarConfig<'_>>,
) -> Result<(), CommandLineTooLong> {
    // For reference:
    // https://www.kernel.org/doc/html/v5.15/admin-guide/kernel-parameters.html
    const KERNEL_PARAMETERS: &[&str] = &[
        // If a console is specified, then write everything to it.
        "loglevel=8",
        // Use a fixed 128KB log buffer by default.
        "log_buf_len=128K",
        // Enable time output on console for ohcldiag-dev.
        "printk.time=1",
        // Enable facility and level output on console for ohcldiag-dev.
        "console_msg_format=syslog",
        // Set uio parameter to configure vmbus ring buffer behavior.
        "uio_hv_generic.no_mask=1",
        // RELIABILITY: Dump anonymous pages and ELF headers only. Skip over
        // huge pages and the shared pages.
        "coredump_filter=0x33",
        // PERF: No processor frequency governing.
        "cpufreq.off=1",
        // PERF: Disable the CPU idle time management entirely. It does not
        // prevent the idle loop from running on idle CPUs, but it prevents
        // the CPU idle time governors and drivers from being invoked.
        "cpuidle.off=1",
        // PERF: No perf checks for crypto algorithms to boot faster.
        // Would have to evaluate the perf wins on the crypto manager vs
        // delaying the boot up.
        "cryptomgr.notests",
        // PERF: Idle threads use HLT on x64 if there is no work.
        // Believed to be a compromise between waking up the processor
        // and the power consumption.
        "idle=halt",
        // WORKAROUND: Avoid init calls that assume presence of CMOS (Simple
        // Boot Flag) or allocate the real-mode trampoline for APs.
        "initcall_blacklist=init_real_mode,sbf_init",
        // CONFIG-STATIC, PERF: Static loops-per-jiffy value to save time on boot.
        "lpj=3000000",
        // PERF: No broken timer check to boot faster.
        "no_timer_check",
        // CONFIG-STATIC, PERF: Using xsave makes VTL transitions being
        // much slower. The xsave state is shared between VTLs, and we don't
        // context switch it in the kernel when leaving/entering VTL2.
        // Removing this will lead to corrupting register state and the
        // undefined behaviour.
        "noxsave",
        // RELIABILITY: Panic on MCEs and faults in the kernel.
        "oops=panic",
        // RELIABILITY: Don't panic on kernel warnings.
        "panic_on_warn=0",
        // PERF, RELIABILITY: Don't print detailed information about the failing
        // processes (memory maps, threads).
        "panic_print=0",
        // RELIABILITY: Reboot immediately on panic, no timeout.
        "panic=-1",
        // RELIABILITY: Don't print processor context information on a fatal
        // signal. Our crash dump collection infrastructure seems reliable, and
        // this information doesn't seem useful without a dump anyways.
        // Additionally it may push important logs off the end of the kmsg
        // page logged by the host.
        // NOTE(review): this parameter is currently commented out — confirm
        // whether disabling it is intentional.
        //"print_fatal_signals=0",
        // RELIABILITY: Unlimited logging to /dev/kmsg from userspace.
        "printk.devkmsg=on",
        // RELIABILITY: Reboot using a triple fault as the fastest method.
        // That is also the method used for compatibility with earlier versions
        // of the Microsoft HCL.
        "reboot=t",
        // CONFIG-STATIC: Type of the root file system.
        "rootfstype=tmpfs",
        // PERF: Deactivate kcompactd kernel thread, otherwise it will queue a
        // scheduler timer periodically, which introduces jitters for VTL0.
        "sysctl.vm.compaction_proactiveness=0",
        // PERF: No TSC stability check when booting up to boot faster,
        // also no validation during runtime.
        "tsc=reliable",
        // RELIABILITY: Panic on receiving an NMI.
        "unknown_nmi_panic=1",
        // Use vfio for MANA devices.
        "vfio_pci.ids=1414:00ba",
        // WORKAROUND: Enable no-IOMMU mode. This mode provides no device isolation,
        // and no DMA translation.
        "vfio.enable_unsafe_noiommu_mode=1",
        // Specify the init path.
        "rdinit=/underhill-init",
        // Default to user-mode NVMe driver.
        "OPENHCL_NVME_VFIO=1",
        // The next three items reduce the memory overhead of the storvsc driver.
        // Since it is only used for DVD, performance is not critical.
        "hv_storvsc.storvsc_vcpus_per_sub_channel=2048",
        // Fix number of hardware queues at 2.
        "hv_storvsc.storvsc_max_hw_queues=2",
        // Reduce the ring buffer size to 32K.
        "hv_storvsc.storvsc_ringbuffer_size=0x8000",
        // Disable eager mimalloc commit to prevent core dumps from being overly large
        "MIMALLOC_ARENA_EAGER_COMMIT=0",
    ];

    const X86_KERNEL_PARAMETERS: &[&str] = &[
        // Disable pcid support. This is a temporary fix to allow
        // Underhill to run nested inside AMD VMs. Otherwise, the
        // Underhill kernel tries to start APs with PCID bits set in CR3
        // without the PCIDE bit set in CR4, which is an invalid
        // VP state (according to the mshv nested implementation).
        //
        // TODO: remove this once we figure out the root cause and apply
        // a workaround/fix elsewhere.
        "clearcpuid=pcid",
        // Disable all attempts to use an IOMMU, including swiotlb.
        "iommu=off",
        // Don't probe for a PCI bus. PCI devices currently come from VPCI. When
        // this changes, we will explicitly enumerate a PCI bus via devicetree.
        "pci=off",
    ];

    const AARCH64_KERNEL_PARAMETERS: &[&str] = &[];

    // Common parameters first, each followed by a separating space.
    for p in KERNEL_PARAMETERS {
        write!(cmdline, "{p} ")?;
    }

    // Then the architecture-specific set.
    let arch_parameters = if cfg!(target_arch = "x86_64") {
        X86_KERNEL_PARAMETERS
    } else {
        AARCH64_KERNEL_PARAMETERS
    };
    for p in arch_parameters {
        write!(cmdline, "{p} ")?;
    }

    const HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely), and CVMs (memory encryption forces it
        // on). Set it to a single area in 8MB. The first parameter controls the
        // area size in slabs (2KB per slab), the second controls the number of
        // areas (default is # of CPUs).
        //
        // This is set to 8MB on hardware isolated VMs since there are some
        // scenarios, such as provisioning over DVD, which require a larger size
        // since the buffer is being used.
        "swiotlb=4096,1",
    ];

    const NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS: &[&str] = &[
        // Even with iommu=off, the SWIOTLB is still allocated on AARCH64
        // (iommu=off ignored entirely). Set it to the minimum, saving ~63 MiB.
        // The first parameter controls the area size, the second controls the
        // number of areas (default is # of CPUs). Set them both to the minimum.
        "swiotlb=1,1",
    ];

    // swiotlb sizing depends on whether the partition is hardware isolated.
    if params.isolation_type.is_hardware_isolated() {
        for p in HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    } else {
        for p in NON_HARDWARE_ISOLATED_KERNEL_PARAMETERS {
            write!(cmdline, "{p} ")?;
        }
    }

    // Enable the com3 console by default if it's available and we're not
    // isolated, or if we are isolated but also have debugging enabled.
    //
    // Otherwise, set the console to ttynull so the kernel does not default to
    // com1. This is overridden by any user customizations in the static or
    // dynamic command line, as this console argument provided by the bootloader
    // comes first.
    let console = if partition_info.com3_serial_available && can_trust_host {
        "ttyS2,115200"
    } else {
        "ttynull"
    };
    write!(cmdline, "console={console} ")?;

    // Mark the environment as confidential for any isolated partition.
    if params.isolation_type != IsolationType::None {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_ENV_VAR_NAME
        )?;
    }

    if is_confidential_debug {
        write!(
            cmdline,
            "{}=1 ",
            underhill_confidentiality::OPENHCL_CONFIDENTIAL_DEBUG_ENV_VAR_NAME
        )?;
    }

    // Only when explicitly supported by Host.
    // TODO: Move from command line to device tree when stabilized.
    if partition_info.nvme_keepalive && !partition_info.vtl2_pool_memory.is_empty() {
        write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=1 ")?;
    }

    // Sidecar contributes its own kernel arguments when enabled.
    if let Some(sidecar) = sidecar {
        write!(cmdline, "{} ", sidecar.kernel_command_line())?;
    }

    // If we're isolated we can't trust the host-provided cmdline
    if can_trust_host {
        let old_cmdline = &partition_info.cmdline;

        // HACK: See if we should set the vmbus connection id via kernel
        // commandline. It may already be set, and we don't want to set it again.
        //
        // This code will be removed when the kernel supports setting connection id
        // via device tree.
        if !old_cmdline.contains("hv_vmbus.message_connection_id=") {
            write!(
                cmdline,
                "hv_vmbus.message_connection_id=0x{:x} ",
                partition_info.vmbus_vtl2.connection_id
            )?;
        }

        // Prepend the computed parameters to the original command line.
        cmdline.write_str(old_cmdline)?;
    }

    Ok(())
}
299
// The Linux kernel requires that the FDT fit within a single 256KB mapping, as
// that is the maximum size the kernel can use during its early boot processes.
// We also want our FDT to be as large as possible to support as many vCPUs as
// possible. We set it to 256KB, but it must also be page-aligned, as leaving it
// unaligned runs the possibility of it taking up 1 too many pages, resulting in
// a 260KB mapping, which will fail.
const FDT_SIZE: usize = 256 * 1024;

/// Page-aligned buffer for the flattened device tree, laid out as a
/// `setup_data` header immediately followed by the DT blob so the whole
/// struct is exactly `FDT_SIZE` bytes and can be chained into the kernel's
/// `setup_data` list.
#[repr(C, align(4096))]
#[derive(FromBytes, IntoBytes, Immutable, KnownLayout)]
struct Fdt {
    // setup_data header; the caller sets `ty` (SETUP_DTB) and `len`.
    header: setup_data,
    // DT blob storage, sized so header + data == FDT_SIZE.
    data: [u8; FDT_SIZE - size_of::<setup_data>()],
}
314
/// Raw shim parameters are provided via a relative offset from the base of
/// where the shim is loaded. Return a ShimParams structure based on the raw
/// offset based RawShimParams.
fn shim_parameters(shim_params_raw_offset: isize) -> ShimParams {
    // `__ehdr_start` is the linker-provided symbol marking the start of the
    // loaded image; its runtime address is the post-relocation image base.
    unsafe extern "C" {
        static __ehdr_start: u8;
    }

    let shim_base = core::ptr::addr_of!(__ehdr_start) as usize;

    // SAFETY: The host is required to relocate everything by the same bias, so
    //         the shim parameters should be at the build time specified offset
    //         from the base address of the image.
    let raw_shim_params = unsafe {
        &*(shim_base.wrapping_add_signed(shim_params_raw_offset) as *const ShimParamsRaw)
    };

    ShimParams::new(shim_base as u64, raw_shim_params)
}
334
/// The maximum number of reserved memory ranges that we might use.
/// See ReservedMemoryType definition for details.
/// (5 fixed ranges — up to 2 config, 1 reserved, 1 sidecar image, 1 GPA pool —
/// plus one per sidecar node.)
pub const MAX_RESERVED_MEM_RANGES: usize = 5 + sidecar_defs::MAX_NODES;

/// Classification of a memory range reported to VTL2 as reserved (i.e. not
/// usable as general-purpose RAM by the kernel).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ReservedMemoryType {
    /// VTL2 parameter regions (could be up to 2).
    Vtl2Config,
    /// Reserved memory that should not be used by the kernel or usermode. There
    /// should only be one.
    Vtl2Reserved,
    /// Sidecar image. There should only be one.
    SidecarImage,
    /// A reserved range per sidecar node.
    SidecarNode,
    /// Persistent VTL2 memory used for page allocations in usermode. This
    /// memory is persisted, both location and contents, across servicing.
    /// Today, we only support a single range.
    Vtl2GpaPool,
}
355
/// Construct a slice representing the reserved memory ranges to be reported to
/// VTL2.
///
/// Ranges are collected from the partition info (config regions, optional
/// reserved region and GPA pool) and the optional sidecar (image plus one
/// range per node), then sorted by start address and merged so adjacent
/// ranges don't consume extra entries.
fn reserved_memory_regions(
    partition_info: &PartitionInfo,
    sidecar: Option<&SidecarConfig<'_>>,
) -> OffStackRef<'static, impl AsRef<[(MemoryRange, ReservedMemoryType)]> + use<>> {
    let mut reserved = off_stack!(ArrayVec<(MemoryRange, ReservedMemoryType), MAX_RESERVED_MEM_RANGES>, ArrayVec::new_const());
    reserved.clear();
    // VTL2 parameter/config regions are always reserved.
    reserved.extend(
        partition_info
            .vtl2_config_regions()
            .map(|r| (r, ReservedMemoryType::Vtl2Config)),
    );
    // Reserve the sidecar image and one range per sidecar node.
    if let Some(sidecar) = sidecar {
        reserved.push((sidecar.image, ReservedMemoryType::SidecarImage));
        reserved.extend(sidecar.node_params.iter().map(|x| {
            (
                MemoryRange::new(x.memory_base..x.memory_base + x.memory_size),
                ReservedMemoryType::SidecarNode,
            )
        }));
    }

    // Add the VTL2 reserved region, if it exists.
    if !partition_info.vtl2_reserved_region.is_empty() {
        reserved.push((
            partition_info.vtl2_reserved_region,
            ReservedMemoryType::Vtl2Reserved,
        ));
    }

    // Add any VTL2 private pool.
    if partition_info.vtl2_pool_memory != MemoryRange::EMPTY {
        reserved.push((
            partition_info.vtl2_pool_memory,
            ReservedMemoryType::Vtl2GpaPool,
        ));
    }

    // Sort by start address so adjacent ranges can be merged below.
    reserved
        .as_mut()
        .sort_unstable_by_key(|(r, _typ)| r.start());

    // Now flatten the ranges to avoid having more reserved ranges than
    // necessary.
    //
    // You can also imagine doing this with `dedup_by`, but `ArrayVec` doesn't
    // implement that.
    let mut flattened = off_stack!(ArrayVec<(MemoryRange, ReservedMemoryType), MAX_RESERVED_MEM_RANGES>, ArrayVec::new_const());
    flattened.clear();
    flattened.extend(merge_adjacent_ranges(reserved.iter().copied()));
    flattened
}
409
/// x86-specific construction of the Linux `boot_params` ("zero page") and the
/// e820 memory map reported to the kernel.
#[cfg_attr(not(target_arch = "x86_64"), expect(dead_code))]
mod x86_boot {
    use crate::PageAlign;
    use crate::ReservedMemoryType;
    use crate::host_params::PartitionInfo;
    use crate::host_params::shim_params::IsolationType;
    use crate::single_threaded::OffStackRef;
    use crate::single_threaded::off_stack;
    use crate::zeroed;
    use core::mem::size_of;
    use core::ops::Range;
    use core::ptr;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::SETUP_E820_EXT;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use loader_defs::linux::setup_data;
    use memory_range::MemoryRange;
    use memory_range::RangeWalkResult;
    use memory_range::walk_ranges;
    use zerocopy::FromZeros;
    use zerocopy::Immutable;
    use zerocopy::KnownLayout;

    /// Overflow storage for e820 entries that do not fit in the fixed-size
    /// table embedded in `boot_params`; chained into the kernel's
    /// `setup_data` list with type `SETUP_E820_EXT` when used.
    #[repr(C)]
    #[derive(FromZeros, Immutable, KnownLayout)]
    pub struct E820Ext {
        pub header: setup_data,
        pub entries: [e820entry; 512],
    }

    /// Write one e820 entry into the next free slot, failing when the slots
    /// (embedded table plus extension) are exhausted.
    fn add_e820_entry(
        entry: Option<&mut e820entry>,
        range: MemoryRange,
        typ: u32,
    ) -> Result<(), BuildE820MapError> {
        *entry.ok_or(BuildE820MapError::OutOfE820Entries)? = e820entry {
            addr: range.start().into(),
            size: range.len().into(),
            typ: typ.into(),
        };
        Ok(())
    }

    /// Errors from [`build_e820_map`].
    #[derive(Debug)]
    pub enum BuildE820MapError {
        /// Parameter region not fully covered by VTL2 ram.
        ReservedRegionNotCovered,
        /// Out of e820 entries.
        OutOfE820Entries,
    }

    /// Build the e820 map for the kernel representing usable VTL2 ram.
    ///
    /// Returns `Ok(true)` when entries spilled into `ext` (the caller must
    /// then chain `ext.header` into the `setup_data` list), `Ok(false)` when
    /// everything fit in `boot_params.e820_map`.
    pub fn build_e820_map(
        boot_params: &mut boot_params,
        ext: &mut E820Ext,
        partition_info: &PartitionInfo,
        reserved: &[(MemoryRange, ReservedMemoryType)],
        // The following params are only used when TDX Isolated
        #[cfg_attr(target_arch = "aarch64", expect(unused_variables))]
        isolation_type: IsolationType,
        #[cfg_attr(target_arch = "aarch64", expect(unused_variables))] //
        page_tables: Option<MemoryRange>,
    ) -> Result<bool, BuildE820MapError> {
        boot_params.e820_entries = 0;
        // Free entry slots: the embedded table first, then the extension.
        let mut entries = boot_params
            .e820_map
            .iter_mut()
            .chain(ext.entries.iter_mut());

        // Walk VTL2 ram against the reserved ranges, emitting RAM for
        // uncovered ram and RESERVED for the overlap.
        let mut n = 0;
        for (range, r) in walk_ranges(
            partition_info.vtl2_ram.iter().map(|e| (e.range, ())),
            reserved.iter().map(|&(r, _)| (r, ())),
        ) {
            match r {
                // In neither list: nothing to report.
                RangeWalkResult::Neither => {}
                // VTL2 ram with no reservation: usable RAM.
                RangeWalkResult::Left(_) => {
                    add_e820_entry(entries.next(), range, E820_RAM)?;
                    n += 1;
                }
                // A reserved range outside VTL2 ram is a config error.
                RangeWalkResult::Right(_) => {
                    return Err(BuildE820MapError::ReservedRegionNotCovered);
                }
                // Reserved range inside VTL2 ram: report as reserved.
                RangeWalkResult::Both(_, _) => {
                    add_e820_entry(entries.next(), range, E820_RESERVED)?;
                    n += 1;
                }
            }
        }

        // If TDX-isolated, APs start up in the shim, and then are held in a wait
        // loop as part of AP mailbox protocol used with the kernel. Mark the page
        // tables and mailbox/reset-vector region of openhcl_boot as E820-reserved,
        // otherwise the L1 kernel can use the pages while APs are in the reset vector
        //
        // TODO: address space management in the shim is getting centralized in
        // a refactor, this should be moved somewhere more appropriate when possible
        #[cfg(target_arch = "x86_64")]
        if IsolationType::Tdx == isolation_type {
            add_e820_entry(entries.next(), page_tables.unwrap(), E820_RESERVED)?;
            n += 1;
            add_e820_entry(
                entries.next(),
                MemoryRange::new(
                    x86defs::tdx::RESET_VECTOR_PAGE..x86defs::tdx::RESET_VECTOR_PAGE + 0x1000,
                ),
                E820_RESERVED,
            )?;
            n += 1;
        }

        // `base` entries live in the embedded table; the rest went to `ext`.
        let base = n.min(boot_params.e820_map.len());
        boot_params.e820_entries = base as u8;

        if base < n {
            ext.header.len = ((n - base) * size_of::<e820entry>()) as u32;
            Ok(true)
        } else {
            Ok(false)
        }
    }

    /// Build the Linux `boot_params` page describing the initrd, command
    /// line, e820 map, and `setup_data` chain.
    ///
    /// On return `setup_data_tail` points at the last element of the chain so
    /// the caller can keep appending entries.
    pub fn build_boot_params(
        partition_info: &PartitionInfo,
        reserved_memory: &[(MemoryRange, ReservedMemoryType)],
        initrd: Range<u64>,
        cmdline: &str,
        setup_data_head: *const setup_data,
        setup_data_tail: &mut &mut setup_data,
        isolation_type: IsolationType,
        page_tables: Option<MemoryRange>,
    ) -> OffStackRef<'static, PageAlign<boot_params>> {
        let mut boot_params_storage = off_stack!(PageAlign<boot_params>, zeroed());
        let boot_params = &mut boot_params_storage.0;
        boot_params.hdr.type_of_loader = 0xff; // Unknown loader type

        // HACK: A kernel change just in the Underhill kernel tree has a workaround
        // to disable probe_roms and reserve_bios_regions when X86_SUBARCH_LGUEST
        // (1) is set by the bootloader. This stops the kernel from reading VTL0
        // memory during kernel boot, which can have catastrophic consequences
        // during a servicing operation when VTL0 has written values to memory, or
        // unaccepted page accesses in an isolated partition.
        //
        // This is only intended as a stopgap until a suitable upstreamable kernel
        // patch is made.
        boot_params.hdr.hardware_subarch = 1.into();

        // The initrd address/size are split into low and high 32-bit halves.
        boot_params.hdr.ramdisk_image = (initrd.start as u32).into();
        boot_params.ext_ramdisk_image = (initrd.start >> 32) as u32;
        let initrd_len = initrd.end - initrd.start;
        boot_params.hdr.ramdisk_size = (initrd_len as u32).into();
        boot_params.ext_ramdisk_size = (initrd_len >> 32) as u32;

        let e820_ext = OffStackRef::leak(off_stack!(E820Ext, zeroed()));

        let used_ext = build_e820_map(
            boot_params,
            e820_ext,
            partition_info,
            reserved_memory,
            isolation_type,
            page_tables,
        )
        .expect("building e820 map must succeed");

        // Chain the e820 extension into setup_data only if it was used.
        if used_ext {
            e820_ext.header.ty = SETUP_E820_EXT;
            setup_data_tail.next = ptr::from_ref(&e820_ext.header) as u64;
            *setup_data_tail = &mut e820_ext.header;
        }

        // Command line pointer is also split into low/high halves.
        let cmd_line_addr = cmdline.as_ptr() as u64;
        boot_params.hdr.cmd_line_ptr = (cmd_line_addr as u32).into();
        boot_params.ext_cmd_line_ptr = (cmd_line_addr >> 32) as u32;

        boot_params.hdr.setup_data = (setup_data_head as u64).into();

        boot_params_storage
    }
}
592
/// Build the cc_blob containing the location of different parameters associated with SEV.
///
/// Fills in the magic/version header plus the physical addresses and lengths
/// of the secrets page and CPUID page taken from the shim parameters.
#[cfg(target_arch = "x86_64")]
fn build_cc_blob_sev_info(
    cc_blob: &mut loader_defs::linux::cc_blob_sev_info,
    shim_params: &ShimParams,
) {
    // TODO SNP: Currently only the first CPUID page is passed through.
    // Consider changing this.
    cc_blob.magic = loader_defs::linux::CC_BLOB_SEV_INFO_MAGIC;
    cc_blob.version = 0;
    cc_blob._reserved = 0;
    cc_blob.secrets_phys = shim_params.secrets_start();
    // Each of the secrets and CPUID regions is a single hypervisor page.
    cc_blob.secrets_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd1 = 0;
    cc_blob.cpuid_phys = shim_params.cpuid_start();
    cc_blob.cpuid_len = hvdef::HV_PAGE_SIZE as u32;
    cc_blob._rsvd2 = 0;
}
611
/// Wrapper forcing 4KiB (page) alignment of `T`, for structures that must
/// start on a page boundary.
#[repr(C, align(4096))]
#[derive(FromZeros, Immutable, KnownLayout)]
struct PageAlign<T>(T);

/// Const-construct an all-zero `T`.
const fn zeroed<T: FromZeros>() -> T {
    // SAFETY: `T` implements `FromZeros`, so this is a safe initialization of `T`.
    unsafe { core::mem::MaybeUninit::<T>::zeroed().assume_init() }
}
620
/// Read the current reference time for boot-time measurement, if a source is
/// available for this isolation type.
///
/// TDX derives it from the TSC; SNP has no implemented source here and
/// returns `None`; all other types use the hypervisor reference time.
fn get_ref_time(isolation: IsolationType) -> Option<u64> {
    match isolation {
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => get_tdx_tsc_reftime(),
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => None,
        _ => Some(minimal_rt::reftime::reference_time()),
    }
}
630
/// Query whether the hardware reports that debugging is enabled for this
/// guest.
///
/// For TDX this reads the TD report and returns the TD `debug` attribute,
/// falling back to `false` if the report cannot be fetched. SNP support is
/// not implemented yet; all other isolation types report `false`.
fn get_hw_debug_bit(isolation: IsolationType) -> bool {
    match isolation {
        #[cfg(target_arch = "x86_64")]
        IsolationType::Tdx => {
            use tdx_guest_device::protocol::TdReport;

            use crate::arch::tdx::get_tdreport;

            let mut report = off_stack!(PageAlign<TdReport>, zeroed());
            match get_tdreport(&mut report.0) {
                Ok(()) => report.0.td_info.td_info_base.attributes.debug(),
                Err(_) => false,
            }
        }
        #[cfg(target_arch = "x86_64")]
        IsolationType::Snp => {
            // Not implemented yet for SNP.
            false
        }
        _ => false,
    }
}
653
654fn shim_main(shim_params_raw_offset: isize) -> ! {
655    let p = shim_parameters(shim_params_raw_offset);
656    if p.isolation_type == IsolationType::None {
657        enable_enlightened_panic();
658    }
659
660    // The support code for the fast hypercalls does not set
661    // the Guest ID if it is not set yet as opposed to the slow
662    // hypercall code path where that is done automatically.
663    // Thus the fast hypercalls will fail as the the Guest ID has
664    // to be set first hence initialize hypercall support
665    // explicitly.
666    if !p.isolation_type.is_hardware_isolated() {
667        hvcall().initialize();
668    }
669
670    // Enable early log output if requested in the static command line.
671    // Also check for confidential debug mode if we're isolated.
672    let mut static_options = BootCommandLineOptions::new();
673    if let Some(cmdline) = p.command_line().command_line() {
674        static_options.parse(cmdline);
675    }
676    if let Some(typ) = static_options.logger {
677        boot_logger_init(p.isolation_type, typ);
678        log!("openhcl_boot: early debugging enabled");
679    }
680
681    let hw_debug_bit = get_hw_debug_bit(p.isolation_type);
682    let can_trust_host = p.isolation_type == IsolationType::None
683        || static_options.confidential_debug
684        || hw_debug_bit;
685
686    let boot_reftime = get_ref_time(p.isolation_type);
687
688    let mut dt_storage = off_stack!(PartitionInfo, PartitionInfo::new());
689    let partition_info =
690        match PartitionInfo::read_from_dt(&p, &mut dt_storage, static_options, can_trust_host) {
691            Ok(Some(val)) => val,
692            Ok(None) => panic!("host did not provide a device tree"),
693            Err(e) => panic!("unable to read device tree params {}", e),
694        };
695
696    // Confidential debug will show up in boot_options only if included in the
697    // static command line, or if can_trust_host is true (so the dynamic command
698    // line has been parsed).
699    let is_confidential_debug = (can_trust_host && p.isolation_type != IsolationType::None)
700        || partition_info.boot_options.confidential_debug;
701
702    // Fill out the non-devicetree derived parts of PartitionInfo.
703    if !p.isolation_type.is_hardware_isolated()
704        && hvcall().vtl() == Vtl::Vtl2
705        && hvdef::HvRegisterVsmCapabilities::from(
706            hvcall()
707                .get_register(hvdef::HvAllArchRegisterName::VsmCapabilities.into())
708                .expect("failed to query vsm capabilities")
709                .as_u64(),
710        )
711        .vtl0_alias_map_available()
712    {
713        // If the vtl0 alias map was not provided in the devicetree, attempt to
714        // derive it from the architectural physical address bits.
715        //
716        // The value in the ID_AA64MMFR0_EL1 register used to determine the
717        // physical address bits can only represent multiples of 4. As a result,
718        // the Surface Pro X (and systems with similar CPUs) cannot properly
719        // report their address width of 39 bits. This causes the calculated
720        // alias map to be incorrect, which results in panics when trying to
721        // read memory and getting invalid data.
722        if partition_info.vtl0_alias_map.is_none() {
723            partition_info.vtl0_alias_map =
724                Some(1 << (arch::physical_address_bits(p.isolation_type) - 1));
725        }
726    } else {
727        // Ignore any devicetree-provided alias map if the conditions above
728        // aren't met.
729        partition_info.vtl0_alias_map = None;
730    }
731
732    if can_trust_host {
733        // Enable late log output if requested in the dynamic command line.
734        // Confidential debug is only allowed in the static command line.
735        if let Some(typ) = partition_info.boot_options.logger {
736            boot_logger_init(p.isolation_type, typ);
737        } else if partition_info.com3_serial_available && cfg!(target_arch = "x86_64") {
738            // If COM3 is available and we can trust the host, enable log output even
739            // if it wasn't otherwise requested.
740            boot_logger_init(p.isolation_type, LoggerType::Serial);
741        }
742    }
743
744    log!("openhcl_boot: entered shim_main");
745
746    if partition_info.cpus.is_empty() {
747        panic!("no cpus");
748    }
749
750    validate_vp_hw_ids(partition_info);
751
752    setup_vtl2_memory(&p, partition_info);
753    setup_vtl2_vp(partition_info);
754
755    verify_imported_regions_hash(&p);
756
757    let mut sidecar_params = off_stack!(PageAlign<SidecarParams>, zeroed());
758    let mut sidecar_output = off_stack!(PageAlign<SidecarOutput>, zeroed());
759    let sidecar = sidecar::start_sidecar(
760        &p,
761        partition_info,
762        &mut sidecar_params.0,
763        &mut sidecar_output.0,
764    );
765
766    let mut cmdline = off_stack!(ArrayString<COMMAND_LINE_SIZE>, ArrayString::new_const());
767    build_kernel_command_line(
768        &p,
769        &mut cmdline,
770        partition_info,
771        can_trust_host,
772        is_confidential_debug,
773        sidecar.as_ref(),
774    )
775    .unwrap();
776
777    let mut fdt = off_stack!(Fdt, zeroed());
778    fdt.header.len = fdt.data.len() as u32;
779    fdt.header.ty = SETUP_DTB;
780
781    #[cfg(target_arch = "x86_64")]
782    let mut setup_data_tail = &mut fdt.header;
783    #[cfg(target_arch = "x86_64")]
784    let setup_data_head = core::ptr::from_ref(setup_data_tail);
785
786    #[cfg(target_arch = "x86_64")]
787    if p.isolation_type == IsolationType::Snp {
788        let cc_blob = OffStackRef::leak(off_stack!(loader_defs::linux::cc_blob_sev_info, zeroed()));
789        build_cc_blob_sev_info(cc_blob, &p);
790
791        let cc_data = OffStackRef::leak(off_stack!(loader_defs::linux::cc_setup_data, zeroed()));
792        cc_data.header.len = size_of::<loader_defs::linux::cc_setup_data>() as u32;
793        cc_data.header.ty = loader_defs::linux::SETUP_CC_BLOB;
794        cc_data.cc_blob_address = core::ptr::from_ref(&*cc_blob) as u32;
795
796        // Chain in the setup data.
797        setup_data_tail.next = core::ptr::from_ref(&*cc_data) as u64;
798        setup_data_tail = &mut cc_data.header;
799    }
800
801    let reserved_memory = reserved_memory_regions(partition_info, sidecar.as_ref());
802    let initrd = p.initrd_base..p.initrd_base + p.initrd_size;
803
804    // Validate the initrd crc matches what was put at file generation time.
805    let computed_crc = crc32fast::hash(p.initrd());
806    assert_eq!(
807        computed_crc, p.initrd_crc,
808        "computed initrd crc does not match build time calculated crc"
809    );
810
811    #[cfg(target_arch = "x86_64")]
812    let boot_params = x86_boot::build_boot_params(
813        partition_info,
814        reserved_memory.as_ref(),
815        initrd.clone(),
816        &cmdline,
817        setup_data_head,
818        &mut setup_data_tail,
819        p.isolation_type,
820        p.page_tables,
821    );
822
823    // Compute the ending boot time. This has to be before writing to device
824    // tree, so this is as late as we can do it.
825
826    let boot_times = boot_reftime.map(|start| BootTimes {
827        start,
828        end: get_ref_time(p.isolation_type).unwrap_or(0),
829    });
830
831    // Validate that no imported regions that are pending are not part of vtl2
832    // ram.
833    for (range, result) in walk_ranges(
834        partition_info.vtl2_ram.iter().map(|r| (r.range, ())),
835        p.imported_regions(),
836    ) {
837        match result {
838            RangeWalkResult::Neither | RangeWalkResult::Left(_) | RangeWalkResult::Both(_, _) => {}
839            RangeWalkResult::Right(accepted) => {
840                // Ranges that are not a part of VTL2 ram must have been
841                // preaccepted, as usermode expect that to be the case.
842                assert!(
843                    accepted,
844                    "range {:#x?} not in vtl2 ram was not preaccepted at launch",
845                    range
846                );
847            }
848        }
849    }
850
851    write_dt(
852        &mut fdt.data,
853        partition_info,
854        reserved_memory.as_ref(),
855        p.imported_regions().map(|r| {
856            // Discard if the range was previously pending - the bootloader has
857            // accepted all pending ranges.
858            //
859            // NOTE: No VTL0 memory today is marked as pending. The check above
860            // validates that, and this code may need to change if this becomes
861            // no longer true.
862            r.0
863        }),
864        initrd,
865        &cmdline,
866        sidecar.as_ref(),
867        boot_times,
868        p.isolation_type,
869    )
870    .unwrap();
871
872    rt::verify_stack_cookie();
873
874    log!("uninitializing hypercalls, about to jump to kernel");
875    hvcall().uninitialize();
876
877    cfg_if::cfg_if! {
878        if #[cfg(target_arch = "x86_64")] {
879            // SAFETY: the parameter blob is trusted.
880            let kernel_entry: extern "C" fn(u64, &loader_defs::linux::boot_params) -> ! =
881                unsafe { core::mem::transmute(p.kernel_entry_address) };
882            kernel_entry(0, &boot_params.0)
883        } else if #[cfg(target_arch = "aarch64")] {
884            // SAFETY: the parameter blob is trusted.
885            let kernel_entry: extern "C" fn(fdt_data: *const u8, mbz0: u64, mbz1: u64, mbz2: u64) -> ! =
886                unsafe { core::mem::transmute(p.kernel_entry_address) };
887            // Disable MMU for kernel boot without EFI, as required by the boot protocol.
888            // Flush (and invalidate) the caches, as that is required for disabling MMU.
889            // SAFETY: Just changing a bit in the register and then jumping to the kernel.
890            unsafe {
891                core::arch::asm!(
892                    "
893                    mrs     {0}, sctlr_el1
894                    bic     {0}, {0}, #0x1
895                    msr     sctlr_el1, {0}
896                    tlbi    vmalle1
897                    dsb     sy
898                    isb     sy",
899                    lateout(reg) _,
900                );
901            }
902            kernel_entry(fdt.data.as_ptr(), 0, 0, 0)
903        } else {
904            panic!("unsupported arch")
905        }
906    }
907}
908
909/// Ensure that mshv VP indexes for the CPUs listed in the partition info
910/// correspond to the N in the cpu@N devicetree node name. OpenVMM assumes that
911/// this will be the case.
912fn validate_vp_hw_ids(partition_info: &PartitionInfo) {
913    use host_params::MAX_CPU_COUNT;
914    use hypercall::HwId;
915
916    if partition_info.isolation.is_hardware_isolated() {
917        // TODO TDX SNP: we don't have a GHCB/GHCI page set up to communicate
918        // with the hypervisor here, so we can't easily perform the check. Since
919        // there is no security impact to this check, we can skip it for now; if
920        // the VM fails to boot, then this is due to a host contract violation.
921        //
922        // For TDX, we could use ENUM TOPOLOGY to validate that the TD VCPU
923        // indexes correspond to the APIC IDs in the right order. I am not
924        // certain if there are places where we depend on this mapping today.
925        return;
926    }
927
928    if hvcall().vtl() != Vtl::Vtl2 {
929        // If we're not using guest VSM, then the guest won't communicate
930        // directly with the hypervisor, so we can choose the VP indexes
931        // ourselves.
932        return;
933    }
934
935    // Ensure the host and hypervisor agree on VP index ordering.
936
937    let mut hw_ids = off_stack!(ArrayVec<HwId, MAX_CPU_COUNT>, ArrayVec::new_const());
938    hw_ids.clear();
939    hw_ids.extend(partition_info.cpus.iter().map(|c| c.reg as _));
940    let mut vp_indexes = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
941    vp_indexes.clear();
942    if let Err(err) = hvcall().get_vp_index_from_hw_id(&hw_ids, &mut vp_indexes) {
943        panic!(
944            "failed to get VP index for hardware ID {:#x}: {}",
945            hw_ids[vp_indexes.len().min(hw_ids.len() - 1)],
946            err
947        );
948    }
949    if let Some((i, &vp_index)) = vp_indexes
950        .iter()
951        .enumerate()
952        .find(|&(i, vp_index)| i as u32 != *vp_index)
953    {
954        panic!(
955            "CPU hardware ID {:#x} does not correspond to VP index {}",
956            hw_ids[i], vp_index
957        );
958    }
959}
960
// See build.rs. See `mod rt` for the actual bootstrap code required to invoke
// shim_main.
/// Placeholder entry point for non-`minimal_rt` builds (e.g. host-side unit
/// test builds): the real loader is entered via `mod rt`, so running this
/// binary directly is intentionally unsupported.
#[cfg(not(minimal_rt))]
fn main() {
    unimplemented!("build with MINIMAL_RT_BUILD to produce a working boot loader");
}
967
#[cfg(test)]
mod test {
    use super::x86_boot::E820Ext;
    use super::x86_boot::build_e820_map;
    use crate::ReservedMemoryType;
    use crate::cmdline::BootCommandLineOptions;
    use crate::dt::write_dt;
    use crate::host_params::MAX_CPU_COUNT;
    use crate::host_params::PartitionInfo;
    use crate::host_params::shim_params::IsolationType;
    use crate::reserved_memory_regions;
    use arrayvec::ArrayString;
    use arrayvec::ArrayVec;
    use core::ops::Range;
    use host_fdt_parser::CpuEntry;
    use host_fdt_parser::MemoryEntry;
    use host_fdt_parser::VmbusInfo;
    use igvm_defs::MemoryMapEntryType;
    use loader_defs::linux::E820_RAM;
    use loader_defs::linux::E820_RESERVED;
    use loader_defs::linux::boot_params;
    use loader_defs::linux::e820entry;
    use memory_range::MemoryRange;
    use memory_range::RangeWalkResult;
    use memory_range::walk_ranges;
    use zerocopy::FromZeros;

    const HIGH_MMIO_GAP_END: u64 = 0x1000000000; //  64 GiB
    const VMBUS_MMIO_GAP_SIZE: u64 = 0x10000000; // 256 MiB
    const HIGH_MMIO_GAP_START: u64 = HIGH_MMIO_GAP_END - VMBUS_MMIO_GAP_SIZE;

    /// Create partition info with given cpu count enabled and sequential
    /// apic_ids.
    fn new_partition_info(cpu_count: usize) -> PartitionInfo {
        let mut cpus: ArrayVec<CpuEntry, MAX_CPU_COUNT> = ArrayVec::new();

        for id in 0..(cpu_count as u64) {
            cpus.push(CpuEntry { reg: id, vnode: 0 });
        }

        // A single high MMIO gap is enough for the devicetree tests below.
        let mut mmio = ArrayVec::new();
        mmio.push(
            MemoryRange::try_new(HIGH_MMIO_GAP_START..HIGH_MMIO_GAP_END).expect("valid range"),
        );

        PartitionInfo {
            vtl2_ram: ArrayVec::new(),
            vtl2_full_config_region: MemoryRange::EMPTY,
            vtl2_config_region_reclaim: MemoryRange::EMPTY,
            vtl2_reserved_region: MemoryRange::EMPTY,
            vtl2_pool_memory: MemoryRange::EMPTY,
            vtl2_used_ranges: ArrayVec::new(),
            partition_ram: ArrayVec::new(),
            isolation: IsolationType::None,
            bsp_reg: cpus[0].reg as u32,
            cpus,
            cmdline: ArrayString::new(),
            vmbus_vtl2: VmbusInfo {
                mmio,
                connection_id: 0,
            },
            vmbus_vtl0: VmbusInfo {
                mmio: ArrayVec::new(),
                connection_id: 0,
            },
            com3_serial_available: false,
            gic: None,
            memory_allocation_mode: host_fdt_parser::MemoryAllocationMode::Host,
            entropy: None,
            vtl0_alias_map: None,
            nvme_keepalive: false,
            boot_options: BootCommandLineOptions::new(),
        }
    }

    // ensure we can boot with a _lot_ of vcpus
    #[test]
    #[cfg_attr(
        target_arch = "aarch64",
        ignore = "TODO: investigate why this doesn't always work on ARM"
    )]
    fn fdt_cpu_scaling() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &[],
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();
    }

    // Must match the DeviceTree blob generated with the standard tooling
    // to ensure being compliant to the standards (or, at least, compatibility
    // with a widely used implementation).
    // For details on regenerating the test content, see `fdt_dtc_decompile`
    // below.
    #[test]
    #[ignore = "TODO: temporarily broken"]
    fn fdt_dtc_check_content() {
        const MAX_CPUS: usize = 2;
        const BUF_SIZE: usize = 0x1000;

        // Rust cannot infer the type.
        // Each span is (offset into the blob, expected bytes at that offset);
        // everything between spans is expected to be zero.
        let dtb_data_spans: [(usize, &[u8]); 2] = [
            (
                /* Span starts at offset */ 0,
                b"\xd0\x0d\xfe\xed\x00\x00\x10\x00\x00\x00\x04\x38\x00\x00\x00\x38\
                \x00\x00\x00\x28\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x00\
                \x00\x00\x00\x4a\x00\x00\x01\x6c\x00\x00\x00\x00\x00\x00\x00\x00\
                \x00\x00\x00\x00\x00\x00\x00\x00\x23\x61\x64\x64\x72\x65\x73\x73\
                \x2d\x63\x65\x6c\x6c\x73\x00\x23\x73\x69\x7a\x65\x2d\x63\x65\x6c\
                \x6c\x73\x00\x6d\x6f\x64\x65\x6c\x00\x72\x65\x67\x00\x64\x65\x76\
                \x69\x63\x65\x5f\x74\x79\x70\x65\x00\x73\x74\x61\x74\x75\x73\x00\
                \x63\x6f\x6d\x70\x61\x74\x69\x62\x6c\x65\x00\x72\x61\x6e\x67\x65\
                \x73",
            ),
            (
                /* Span starts at offset */ 0x430,
                b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x0f\x00\x00\x00\x1b\x6d\x73\x66\x74\
                \x2c\x75\x6e\x64\x65\x72\x68\x69\x6c\x6c\x00\x00\x00\x00\x00\x01\
                \x63\x70\x75\x73\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x01\x63\x70\x75\x40\
                \x30\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x25\
                \x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x21\
                \x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x05\x00\x00\x00\x31\
                \x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x01\
                \x63\x70\x75\x40\x31\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x25\x63\x70\x75\x00\x00\x00\x00\x03\x00\x00\x00\x04\
                \x00\x00\x00\x21\x00\x00\x00\x01\x00\x00\x00\x03\x00\x00\x00\x05\
                \x00\x00\x00\x31\x6f\x6b\x61\x79\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x02\x00\x00\x00\x01\x76\x6d\x62\x75\x73\x00\x00\x00\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x02\
                \x00\x00\x00\x03\x00\x00\x00\x04\x00\x00\x00\x0f\x00\x00\x00\x01\
                \x00\x00\x00\x03\x00\x00\x00\x0b\x00\x00\x00\x38\x6d\x73\x66\x74\
                \x2c\x76\x6d\x62\x75\x73\x00\x00\x00\x00\x00\x03\x00\x00\x00\x14\
                \x00\x00\x00\x43\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\
                \xf0\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x02\
                \x00\x00\x00\x09",
            ),
        ];

        let mut sample_buf = [0u8; BUF_SIZE];
        for (span_start, bytes) in dtb_data_spans {
            sample_buf[span_start..span_start + bytes.len()].copy_from_slice(bytes);
        }

        let mut buf = [0u8; BUF_SIZE];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &[],
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        assert!(sample_buf == buf);
    }

    // This test should be manually enabled when need to regenerate
    // the sample content above and validate spec compliance with `dtc`.
    // Before running the test, please install the DeviceTree compiler:
    // ```shell
    // sudo apt-get update && sudo apt-get install device-tree-compiler
    // ```
    #[test]
    #[ignore = "enabling the test requires installing additional software, \
                and developers will experience a break."]
    fn fdt_dtc_decompile() {
        const MAX_CPUS: usize = 2048;

        let mut buf = [0; 0x40000];
        write_dt(
            &mut buf,
            &new_partition_info(MAX_CPUS),
            &[],
            [],
            0..0,
            &ArrayString::from("test").unwrap_or_default(),
            None,
            None,
            IsolationType::None,
        )
        .unwrap();

        let input_dtb_file_name = "openhcl_boot.dtb";
        let output_dts_file_name = "openhcl_boot.dts";
        std::fs::write(input_dtb_file_name, buf).unwrap();
        let success = std::process::Command::new("dtc")
            .args([input_dtb_file_name, "-I", "dtb", "-o", output_dts_file_name])
            .status()
            .unwrap()
            .success();
        assert!(success);
    }

    /// Build a minimal `PartitionInfo` whose VTL2 ram covers `ram`, with the
    /// given parameter (config) region and optional reclaim subregion.
    fn partition_info_ram_ranges(
        ram: &[Range<u64>],
        parameter_range: MemoryRange,
        reclaim: Option<Range<u64>>,
    ) -> PartitionInfo {
        let mut info = PartitionInfo::new();

        info.vtl2_ram = ram
            .iter()
            .map(|r| MemoryEntry {
                range: MemoryRange::try_new(r.clone()).unwrap(),
                mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
                vnode: 0,
            })
            .collect();

        info.vtl2_full_config_region = parameter_range;

        info.vtl2_config_region_reclaim = reclaim
            .map(|r| MemoryRange::try_new(r).unwrap())
            .unwrap_or(MemoryRange::EMPTY);

        info
    }

    /// Assert that the base e820 map plus the extension (`ext`) contains
    /// exactly the `expected` (range, type) entries, in order.
    fn check_e820(boot_params: &boot_params, ext: &E820Ext, expected: &[(Range<u64>, u32)]) {
        // Chain the fixed-size in-struct entries with the extension entries
        // that are actually populated (ext.header.len is in bytes).
        let actual = boot_params.e820_map[..boot_params.e820_entries as usize]
            .iter()
            .chain(
                ext.entries
                    .iter()
                    .take((ext.header.len as usize) / size_of::<e820entry>()),
            );

        assert_eq!(actual.clone().count(), expected.len());

        for (actual, (expected_range, expected_type)) in actual.zip(expected.iter()) {
            let addr: u64 = actual.addr.into();
            let size: u64 = actual.size.into();
            let typ: u32 = actual.typ.into();
            assert_eq!(addr, expected_range.start);
            assert_eq!(size, expected_range.end - expected_range.start);
            assert_eq!(typ, *expected_type);
        }
    }

    const ONE_MB: u64 = 0x10_0000;

    #[test]
    fn test_e820_basic() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..3 * ONE_MB).unwrap();
        let partition_info =
            partition_info_ram_ranges(&[ONE_MB..4 * ONE_MB], parameter_range, None);

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_ok()
        );

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
            ],
        );

        // memmap with reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let partition_info = partition_info_ram_ranges(
            &[ONE_MB..6 * ONE_MB],
            parameter_range,
            Some(3 * ONE_MB..4 * ONE_MB),
        );

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_ok()
        );

        // The reclaim range inside the parameter region must come back as RAM.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
            ],
        );

        // two mem ranges
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let partition_info = partition_info_ram_ranges(
            &[ONE_MB..4 * ONE_MB, 4 * ONE_MB..10 * ONE_MB],
            parameter_range,
            Some(3 * ONE_MB..4 * ONE_MB),
        );

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_ok()
        );

        // Adjacent ram ranges are expected to be merged in the output.
        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..10 * ONE_MB, E820_RAM),
            ],
        );

        // memmap in 1 mb chunks
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let partition_info = partition_info_ram_ranges(
            &[
                ONE_MB..2 * ONE_MB,
                2 * ONE_MB..3 * ONE_MB,
                3 * ONE_MB..4 * ONE_MB,
                4 * ONE_MB..5 * ONE_MB,
                5 * ONE_MB..6 * ONE_MB,
                6 * ONE_MB..7 * ONE_MB,
                7 * ONE_MB..8 * ONE_MB,
            ],
            parameter_range,
            Some(3 * ONE_MB..4 * ONE_MB),
        );

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_ok()
        );

        check_e820(
            &boot_params,
            &ext,
            &[
                (ONE_MB..2 * ONE_MB, E820_RAM),
                (2 * ONE_MB..3 * ONE_MB, E820_RESERVED),
                (3 * ONE_MB..4 * ONE_MB, E820_RAM),
                (4 * ONE_MB..5 * ONE_MB, E820_RESERVED),
                (5 * ONE_MB..6 * ONE_MB, E820_RAM),
                (6 * ONE_MB..7 * ONE_MB, E820_RAM),
                (7 * ONE_MB..8 * ONE_MB, E820_RAM),
            ],
        );
    }

    /// `build_e820_map` must fail whenever the parameter region is not fully
    /// covered by VTL2 ram.
    #[test]
    fn test_e820_param_not_covered() {
        // parameter range not covered by ram at all
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(5 * ONE_MB..6 * ONE_MB).unwrap();
        let partition_info =
            partition_info_ram_ranges(&[ONE_MB..4 * ONE_MB], parameter_range, None);

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_err()
        );

        // parameter range start partial coverage
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(3 * ONE_MB..6 * ONE_MB).unwrap();
        let partition_info =
            partition_info_ram_ranges(&[ONE_MB..4 * ONE_MB], parameter_range, None);

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_err()
        );

        // parameter range end partial coverage
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..5 * ONE_MB).unwrap();
        let partition_info =
            partition_info_ram_ranges(&[4 * ONE_MB..6 * ONE_MB], parameter_range, None);

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_err()
        );

        // parameter range larger than ram
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..8 * ONE_MB).unwrap();
        let partition_info =
            partition_info_ram_ranges(&[4 * ONE_MB..6 * ONE_MB], parameter_range, None);

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_err()
        );

        // ram has gap inside param range
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let parameter_range = MemoryRange::try_new(2 * ONE_MB..8 * ONE_MB).unwrap();
        let partition_info = partition_info_ram_ranges(
            &[ONE_MB..6 * ONE_MB, 7 * ONE_MB..10 * ONE_MB],
            parameter_range,
            None,
        );

        assert!(
            build_e820_map(
                &mut boot_params,
                &mut ext,
                &partition_info,
                reserved_memory_regions(&partition_info, None).as_ref(),
                partition_info.isolation,
                None
            )
            .is_err()
        );
    }

    #[test]
    fn test_e820_huge() {
        // memmap with no param reclaim
        let mut boot_params: boot_params = FromZeros::new_zeroed();
        let mut ext = FromZeros::new_zeroed();
        let ram = MemoryRange::new(0..32 * ONE_MB);
        let partition_info = partition_info_ram_ranges(&[ram.into()], MemoryRange::EMPTY, None);
        // 256 small reserved holes split ram into far more entries than the
        // fixed-size in-struct e820 map holds, forcing use of the extension.
        let reserved = (0..256)
            .map(|i| {
                (
                    MemoryRange::from_4k_gpn_range(i * 8 + 1..i * 8 + 3),
                    ReservedMemoryType::Vtl2Config,
                )
            })
            .collect::<Vec<_>>();

        build_e820_map(
            &mut boot_params,
            &mut ext,
            &partition_info,
            &reserved,
            partition_info.isolation,
            None,
        )
        .unwrap();

        // The extension must actually have been used.
        assert!(ext.header.len > 0);

        // Independently derive the expected map by walking ram against the
        // reserved ranges: overlap => RESERVED, ram-only => RAM.
        let expected = walk_ranges([(ram, ())], reserved.iter().map(|&(r, _)| (r, ())))
            .flat_map(|(range, r)| match r {
                RangeWalkResult::Neither => None,
                RangeWalkResult::Left(_) => Some((range.into(), E820_RAM)),
                RangeWalkResult::Right(_) => unreachable!(),
                RangeWalkResult::Both(_, _) => Some((range.into(), E820_RESERVED)),
            })
            .collect::<Vec<_>>();

        check_e820(&boot_params, &ext, &expected);
    }
}