openhcl_boot/host_params/dt/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

extern crate alloc;

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::cmdline::BootCommandLineOptions;
use crate::cmdline::SidecarOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_RAM_RANGES;
use crate::host_params::dt::dma_hint::pick_private_pool_size;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use alloc::vec::Vec;
use arrayvec::ArrayString;
use arrayvec::ArrayVec;
use bump_alloc::ALLOCATOR;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use host_fdt_parser::VmbusInfo;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use loader_defs::shim::MemoryVtlType;
use loader_defs::shim::PersistedStateHeader;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;
use zerocopy::FromBytes;

mod bump_alloc;
mod dma_hint;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughVtl0Mmio,
    /// Host provided MMIO range is insufficient to cover VTL2.
    #[error("host provided MMIO range is insufficient to cover VTL2")]
    NotEnoughVtl2Mmio,
}

/// Allocate the private pool across NUMA nodes.
///
/// By default, tries to allocate the entire pool on NUMA node 0 (preserving
/// previous behavior). If that fails, or if `force_numa_split` is true, the
/// pool is split evenly across all available NUMA nodes (one range per node).
fn allocate_private_pool(
    address_space: &mut AddressSpaceManager,
    vtl2_ram: &[MemoryEntry],
    pool_size_bytes: u64,
    force_numa_split: bool,
    enable_vtl2_gpa_pool: crate::cmdline::Vtl2GpaPoolConfig,
    device_dma_page_count: Option<u64>,
    vp_count: usize,
    mem_size: u64,
) {
    // Try allocating the entire pool on node 0 first. We do this to maintain
    // compatibility with older openhcl_boot images that do not understand how
    // to handle a split private pool, and to preserve the previous behavior
    // where the pool was allocated entirely on numa node 0.
    //
    // Allocate from high memory downward so that, if openhcl's low-memory
    // usage grows, the range used by the bootshim cannot overlap a pool range
    // that was chosen before servicing to a new image.
    if !force_numa_split {
        if let Some(pool) = address_space.allocate(
            Some(0),
            pool_size_bytes,
            AllocationType::GpaPool,
            AllocationPolicy::HighMemory,
        ) {
            log::info!("allocated VTL2 pool at {:#x?}", pool.range);
            return;
        }
        log::info!("node 0 cannot fit full pool, splitting across NUMA nodes");
    } else {
        log::info!("forcing VTL2 pool NUMA split across nodes");
    }

    // Enumerate unique NUMA nodes from VTL2 RAM.
    //
    // FUTURE: Handle cases where there are CPU-only or RAM-only numa nodes.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());
    for entry in vtl2_ram.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }
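    // (Using `binary_search` + `insert` keeps `numa_nodes` sorted and
    // duplicate-free: a missing key returns `Err(insert_pos)`, the position
    // that preserves the sort order.)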

    let num_nodes = numa_nodes.len() as u64;
    // Split the pool into page-size-aligned per-node chunks, and give the
    // remainder to the last node.
    let per_node_size = (pool_size_bytes / num_nodes) & !(HV_PAGE_SIZE - 1);
    let last_node_size = pool_size_bytes - per_node_size * (num_nodes - 1);
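    // Worked example (illustrative): splitting a 9-page pool across 4 nodes
    // gives per_node_size = 2 pages (the page-aligned floor of 9/4) for the
    // first three nodes and last_node_size = 3 pages for the final node, so
    // every page is placed.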
    let mut remaining = pool_size_bytes;

    // If per_node_size is zero, the number of nodes exceeds the number of
    // pages requested for the pool. Such a small pool should have been
    // satisfiable from a single node, so this is a strange configuration;
    // fail explicitly.
    if per_node_size == 0 {
        panic!(
            "cannot split VTL2 pool of size {pool_size_bytes:#x} bytes across \
            {num_nodes} nodes, per node size {per_node_size:#x} bytes; \
            enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, \
            device_dma_page_count={device_dma_page_count:#x?}, \
            vp_count={vp_count}, mem_size={mem_size:#x}"
        );
    }

    for (i, vnode) in numa_nodes.iter().enumerate() {
        if remaining == 0 {
            break;
        }

        let is_last = i == numa_nodes.len() - 1;
        let alloc_size = if is_last {
            last_node_size
        } else {
            per_node_size
        };

        // Make sure to allocate high memory downward, for the same reason as
        // described in the numa 0 case.
        match address_space.allocate(
            Some(*vnode),
            alloc_size,
            AllocationType::GpaPool,
            AllocationPolicy::HighMemory,
        ) {
            Some(pool) => {
                remaining -= pool.range.len();
                log::info!(
                    "allocated VTL2 pool on node {} at {:#x?}",
                    vnode,
                    pool.range
                );
            }
            None => {
                let mut free_ranges = off_stack!(ArrayString<2048>, ArrayString::new_const());
                for node in numa_nodes.iter() {
                    for range in address_space.free_ranges(*node) {
                        if write!(
                            free_ranges,
                            "n{}:[{:#x?}, {:#x?}) ",
                            node,
                            range.start(),
                            range.end()
                        )
                        .is_err()
                        {
                            let _ = write!(free_ranges, "...");
                            break;
                        }
                    }
                }
                let highest_numa_node = vtl2_ram.iter().map(|e| e.vnode).max().unwrap_or(0);
                panic!(
                    "failed to allocate VTL2 pool on node {vnode}: \
                     need {alloc_size:#x} bytes, pool total {pool_size_bytes:#x} bytes \
                     (enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, \
                     device_dma_page_count={device_dma_page_count:#x?}, \
                     vp_count={vp_count}, mem_size={mem_size:#x}), \
                     highest_numa_node={highest_numa_node}, \
                     free_ranges=[ {}]",
                    free_ranges.as_str()
                );
            }
        }
    }

    assert_eq!(
        remaining, 0,
        "pool allocation arithmetic error: {remaining:#x} bytes unallocated"
    );
}

/// Allocate VTL2 ram from the partition's memory map.
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }
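    // Illustrative example: an 8 GiB vtl2_size across two nodes seeds each
    // vnode present in the memory map with 4 GiB; the steps below first
    // subtract the already-used file range, then allocate whatever remains
    // from free memory.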

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and subtract it from the
    // free ranges.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

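    // `walk_ranges` yields each maximal subrange tagged by which input covers
    // it: Left (file range only), Right (host memmap only), Both, or Neither.
    // Illustrative example: walking [(2..6, ())] against [(0..4, e)] yields
    // (0..2, Right), (2..4, Both), and (4..6, Left).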
    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial
    // launch context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no partial overlap - either the imported
                    // range is fully contained within the preaccepted VTL2
                    // range or it is entirely outside it.
                    assert!(
                        !file_memory_range.overlaps(&range),
                        "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range"
                    );
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the used ranges from vtl2_ram.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today, if a used range is larger than the memory required
            // for its node, we just clamp that node's requirement to zero.
            // Should we instead subtract the overage from the other numa nodes
            // equally?
            log::warn!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate the remaining memory required per node.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocating from the end of the free list, i.e. from high
        // memory downward.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);
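                // `split_at_offset` returns (range[..offset], range[offset..]);
                // keeping the low part free and taking the high part allocates
                // top-down from this entry.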

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint from the host, but use the
        // base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log::info!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log::info!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}

type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;

/// Add common ranges to [`AddressSpaceManagerBuilder`], regardless of whether
/// the topology is created from the host or from saved state.
fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
    params: &ShimParams,
    mut builder: AddressSpaceManagerBuilder<'a, I>,
) -> AddressSpaceManagerBuilder<'a, I> {
    // Add the log buffer which is always present.
    builder = builder.with_log_buffer(params.log_buffer);

    if params.vtl2_reserved_region_size != 0 {
        builder = builder.with_reserved_range(MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        ));
    }

    if params.sidecar_size != 0 {
        builder = builder.with_sidecar_image(MemoryRange::new(
            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
        ));
    }

    builder
}

#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    vtl2_ram: &'static [MemoryEntry],
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    memory_allocation_mode: MemoryAllocationMode,
}

/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    topology: PartitionTopology,
    cpus_with_mapped_interrupts_no_io: Vec<u32>,
    cpus_with_outstanding_io: Vec<u32>,
}

// Calculate the default mmio size for VTL2 when not specified by the host.
//
// This is half of the high mmio gap size, rounded down, with a minimum of 128
// MB and a maximum of 1 GB.
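//
// Worked example (illustrative): a 3 GiB high gap halves to 1.5 GiB and is
// clamped down to the 1 GB maximum; a 100 MB gap halves to 50 MB and is
// raised to the 128 MB minimum.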
fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
}

/// Read topology from the host provided device tree.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log::info!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, first check if the host provided a VTL2 mmio range. If so, the
    // mmio range must be large enough. Otherwise, choose to carve out a range
    // from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate, which is different
        // depending on the heuristic used.
        //
        // On a newer host where a vtl2 mmio range is provided inside the
        // vmbus_vtl2 device tree node, use the size provided by the host inside
        // the openhcl node for memory allocation mode.
        //
        // If the host did not provide a vtl2 mmio range, then use the maximum
        // of the host provided value inside the openhcl node and the calculated
        // default.
        let host_provided_size = match parsed.memory_allocation_mode {
            MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
            _ => 0,
        };
        let vmbus_vtl2 = parsed.vmbus_vtl2.as_ref().ok_or(DtError::Vtl2Vmbus)?;
        let vmbus_vtl2_mmio_size = vmbus_vtl2.mmio.iter().map(|r| r.len()).sum::<u64>();
        let mmio_size = if vmbus_vtl2_mmio_size != 0 {
            host_provided_size
        } else {
            max(host_provided_size, calculate_default_mmio_size(parsed)?)
        };
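
        // Illustrative example: if the openhcl node asks for 256 MiB and the
        // host supplied no vmbus_vtl2 mmio ranges, mmio_size becomes
        // max(256 MiB, calculated default); if the host did supply vtl2 mmio
        // ranges, the openhcl-node size alone is used and checked against the
        // supplied ranges below.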

        log::info!("allocating vtl2 mmio size {mmio_size:#x} bytes");
        log::info!("host provided vtl2 mmio ranges are {vmbus_vtl2_mmio_size:#x} bytes");

        let vmbus_vtl0 = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?;
        if vmbus_vtl2_mmio_size != 0 {
            // Verify the host provided mmio is large enough.
            if vmbus_vtl2_mmio_size < mmio_size {
                return Err(DtError::NotEnoughVtl2Mmio);
            }

            log::info!("using host provided vtl2 mmio: {:x?}", vmbus_vtl2.mmio);
            (vmbus_vtl0.mmio.clone(), vmbus_vtl2.mmio.clone())
        } else {
            // Allocate vtl2 mmio from vtl0 mmio.
            log::info!("no vtl2 mmio provided by host, allocating from vtl0 mmio");
            let selected_vtl2_mmio = select_vtl2_mmio_range(&vmbus_vtl0.mmio, mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(vmbus_vtl0.mmio.iter().cloned(), [selected_vtl2_mmio])
                .collect::<ArrayVec<MemoryRange, 2>>();
            let vtl2_mmio = [selected_vtl2_mmio]
                .into_iter()
                .collect::<ArrayVec<MemoryRange, 2>>();

            // TODO: For now, if we have only a single vtl0_mmio range left,
            // panic. In the future decide if we want to report this as a start
            // failure in usermode, change allocation strategy, or something
            // else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "vtl0 mmio ranges are not 2 {:#x?}",
                vtl0_mmio
            );

            log::info!("vtl0 mmio: {vtl0_mmio:x?}, vtl2 mmio: {vtl2_mmio:x?}");

            (vtl0_mmio, vtl2_mmio)
        }
    } else {
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with fixed at build time ranges.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be plenty for the worst
    // case encoded size (about 50 bytes worst case per memory entry, with the
    // max number of ram ranges), and is small enough that we can reserve it on
    // all VM sizes. Revisit this calculation if we persist more state in the
    // future.
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
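    // (20 pages * 4 KiB = 80 KiB; at ~50 bytes per encoded entry, that leaves
    // room for well over a thousand ram ranges.)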
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log::info!(
        "persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}"
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    if params.isolation_type == IsolationType::None {
        let enable_vtl2_gpa_pool = options.enable_vtl2_gpa_pool;
        let device_dma_page_count = parsed.device_dma_page_count;
        let vp_count = parsed.cpu_count();
        let mem_size = vtl2_ram.iter().map(|e| e.range.len()).sum();
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            enable_vtl2_gpa_pool,
            device_dma_page_count,
            vp_count,
            mem_size,
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            allocate_private_pool(
                address_space,
                &vtl2_ram,
                pool_size_bytes,
                options.vtl2_gpa_pool_numa_split,
                enable_vtl2_gpa_pool,
                device_dma_page_count,
                vp_count,
                mem_size,
            );
        }
    }

    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read topology from the persisted state region and protobuf payload.
fn topology_from_persisted_state(
    header: PersistedStateHeader,
    params: &ShimParams,
    parsed: &ParsedDt,
    address_space: &mut AddressSpaceManager,
) -> Result<PersistedPartitionTopology, DtError> {
    log::info!("reading topology from persisted state");

    // Verify the header describes a protobuf region within the bootshim
    // persisted region. We expect it to live there as today we rely on the
    // build time generated pagetable to identity map the protobuf region.
    let protobuf_region =
        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
    assert!(
        params.persisted_state.contains(&protobuf_region),
        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
        params.persisted_state
    );

    // Verify the protobuf payload length is no larger than the region.
    assert!(
        header.protobuf_payload_len <= header.protobuf_region_len,
        "protobuf payload len {} is larger than region len {}",
        header.protobuf_payload_len,
        header.protobuf_region_len
    );

    // SAFETY: The region lies within the persisted state region, which is
    // identity mapped via the build time generated pagetable.
    let protobuf_raw = unsafe {
        core::slice::from_raw_parts(
            header.protobuf_base as *const u8,
            header.protobuf_payload_len as usize,
        )
    };

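    // Decoding with mesh_protobuf requires heap allocation; `with_global_alloc`
    // scopes use of the bump allocator initialized in `init_heap` to this
    // closure.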
    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
        bump_alloc::with_global_alloc(|| {
            log::info!("decoding protobuf of size {}", protobuf_raw.len());
            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
        });

    let loader_defs::shim::save_restore::SavedState {
        partition_memory,
        partition_mmio,
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    } = parsed_protobuf;

    log::info!(
        "persisted state: cpus_with_mapped_interrupts_no_io={:?}, cpus_with_outstanding_io={:?}",
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    );

    // FUTURE: should memory allocation mode be persisted in saved state so we
    // can verify the host did not change it?
    let memory_allocation_mode = parsed.memory_allocation_mode;

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // Determine which ranges are memory ranges used by VTL2.
    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
            Some(MemoryEntry {
                range: entry.range,
                mem_type: entry.igvm_type.clone().into(),
                vnode: entry.vnode,
            })
        } else {
            None
        }
    });

    // Merge adjacent ranges, since saved state reports the final usage of ram,
    // with reserved carve-outs in separate ranges. Here we want the whole
    // underlying ram ranges, merging adjacent entries that share the same igvm
    // type and vnode.
    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
    );
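
    // Illustrative example: adjacent entries [0..1 MiB] and [1..2 MiB] that
    // share the same (mem_type, vnode) key merge into a single 0..2 MiB entry;
    // a change in either key starts a new range.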

    vtl2_ram.extend(
        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
            range,
            mem_type,
            vnode,
        }),
    );

    // If the host was responsible for allocating VTL2 ram, verify the ram
    // parsed from the previous instance matches.
    //
    // FUTURE: When VTL2 itself did the allocation, we should verify that all
    // ranges are still within the provided memory map.
    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
        assert_eq!(
            vtl2_ram.as_slice(),
            host_vtl2_ram.as_ref(),
            "vtl2 ram from persisted state does not match host provided ram"
        );
    }

    // Merge the persisted state header and protobuf region, and report that as
    // the persisted region.
    //
    // NOTE: We could choose to resize the persisted region at this point, which
    // we would need to do if we expect the saved state to grow larger.
    let persisted_header = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
        .expect("persisted state header missing");
    let persisted_protobuf = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
        .expect("persisted state protobuf region missing");
    assert_eq!(persisted_header.range.end(), protobuf_region.start());
    let persisted_state_region =
        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    // NOTE: The only other region we take from the previous instance is any
    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
    // command line arguments or host device tree changed, as that's not
    // something we expect to happen in practice.
    let pool_ranges = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
            Some(entry.range)
        } else {
            None
        }
    });

    address_space_builder = address_space_builder.with_pool_ranges(pool_ranges);

    // As described above, other ranges come from this current boot.
    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Read previous mmio for VTL0 and VTL2.
    let vtl0_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();
    let vtl2_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();

    Ok(PersistedPartitionTopology {
        topology: PartitionTopology {
            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
            vtl0_mmio,
            vtl2_mmio,
            memory_allocation_mode,
        },
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    })
}

/// Read the persisted header from the start of the persisted state region
/// described at file build time. If the magic value is not set, `None` is
/// returned.
fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
    // to rethink how this will work in order to handle this correctly, as on a
    // first boot we'd need to accept them early, but subsequent boots should
    // not accept any pages.
    //
    // This may require some value passed in via a register or something early
    // that indicates this is a servicing boot, which we could set if OpenHCL
    // itself launches the next instance.
    if params.isolation_type != IsolationType::None {
        return None;
    }

    // SAFETY: The header lies at the start of the shim described persisted state
    // region. This range is guaranteed to be identity mapped at file build
    // time.
    let buf = unsafe {
        core::slice::from_raw_parts(
            params.persisted_state.start() as *const u8,
            size_of::<PersistedStateHeader>(),
        )
    };

    let header = PersistedStateHeader::read_from_bytes(buf)
        .expect("region is page aligned and the correct size");

    if header.magic == PersistedStateHeader::MAGIC {
        Some(header)
    } else {
        None
    }
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

        if dt[0] == 0 {
            log::error!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        match command_line.policy {
            CommandLinePolicy::STATIC => {
                // Nothing to do, we already wrote the measured command line.
            }
            CommandLinePolicy::APPEND_CHOSEN if can_trust_host => {
                // Check the host-provided command line for options meant for
                // us, and pass it along to the kernel.
                options.parse(&parsed.command_line);
                write!(storage.cmdline, " {}", &parsed.command_line)
                    .map_err(|_| DtError::CommandLineSize)?;
            }
            CommandLinePolicy::APPEND_CHOSEN if !can_trust_host => {
                // Nothing to do, we ignore the host provided command line.
            }
            _ => unreachable!(),
        }

        init_heap(params);

        let persisted_state_header = read_persisted_region_header(params);
        log::info!(
            "read_from_dt: persisted_state_header present={}, sidecar={:?}",
            persisted_state_header.is_some(),
            options.sidecar,
        );
        let (topology, cpus_with_outstanding_io) = if let Some(header) = persisted_state_header {
            log::info!("found persisted state header");
            let persisted_topology =
                topology_from_persisted_state(header, params, parsed, address_space)?;
            (
                persisted_topology.topology,
                persisted_topology.cpus_with_outstanding_io,
            )
        } else {
            (
                topology_from_host_dt(params, parsed, &options, address_space)?,
                Vec::new(),
            )
        };

        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            sidecar_cpu_overrides,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        // During servicing restore, selectively exclude CPUs with outstanding IO
        // from sidecar startup. These CPUs need immediate kernel access to handle
        // device interrupts. All other CPUs still benefit from sidecar's parallel
        // startup. Falls back to disabling sidecar entirely if CPU IDs exceed the
        // per-CPU state array capacity (>400 CPUs).
        //
        // Sidecar is automatically disabled when: all NUMA nodes have exactly
        // one CPU (nothing to parallelize), x2apic is unavailable, the VM is
        // isolated (CVM), or the sidecar image is not present (sidecar_size == 0).
        // It is also disabled via command line with OPENHCL_SIDECAR=off. In all
        // other cases sidecar is active and uses a fan-out pattern to bring up
        // APs in parallel across NUMA nodes.
        //
        // TODO: the `cpu_threshold` field in `SidecarOptions::Enabled` is
        // not used at present. Based on production performance data, either
        // remove `cpu_threshold` from `SidecarOptions` in cmdline.rs, or
        // add a VP-count cutoff here to disable sidecar for small VMs.
        if let (SidecarOptions::Enabled { .. }, true) =
            (&boot_options.sidecar, !cpus_with_outstanding_io.is_empty())
        {
            let max_cpu_id = *cpus_with_outstanding_io.iter().max().unwrap() as usize;
            if parsed.cpu_count() <= sidecar_cpu_overrides.sidecar_starts_cpu.len()
                && max_cpu_id < sidecar_cpu_overrides.sidecar_starts_cpu.len()
            {
                // Mark specific CPUs as kernel-started instead of sidecar-started.
                sidecar_cpu_overrides.per_cpu_state_specified = true;
                for &cpu_id in &cpus_with_outstanding_io {
                    sidecar_cpu_overrides.sidecar_starts_cpu[cpu_id as usize] = false;
                }
                log::info!(
                    "sidecar: excluding CPUs {:?} due to outstanding IO",
                    cpus_with_outstanding_io,
                );
            } else {
                // CPU IDs exceed per-cpu array capacity; disable sidecar entirely.
                log::info!(
                    "sidecar: disabling, too many CPUs for per-CPU state (max id {max_cpu_id})"
                );
                boot_options.sidecar = SidecarOptions::DisabledServicing;
                options.sidecar = SidecarOptions::DisabledServicing;
            }
        }

        // Set ram and memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}