Skip to main content

openhcl_boot/host_params/dt/
mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Parse partition info using the IGVM device tree parameter.
5
6use super::PartitionInfo;
7use super::shim_params::ShimParams;
8use crate::cmdline::BootCommandLineOptions;
9use crate::cmdline::SidecarOptions;
10use crate::host_params::COMMAND_LINE_SIZE;
11use crate::host_params::MAX_CPU_COUNT;
12use crate::host_params::MAX_ENTROPY_SIZE;
13use crate::host_params::MAX_NUMA_NODES;
14use crate::host_params::MAX_PARTITION_RAM_RANGES;
15use crate::host_params::MAX_VTL2_RAM_RANGES;
16use crate::host_params::dt::dma_hint::pick_private_pool_size;
17use crate::host_params::mmio::select_vtl2_mmio_range;
18use crate::host_params::shim_params::IsolationType;
19use crate::memory::AddressSpaceManager;
20use crate::memory::AddressSpaceManagerBuilder;
21use crate::memory::AllocationPolicy;
22use crate::memory::AllocationType;
23use crate::single_threaded::OffStackRef;
24use crate::single_threaded::off_stack;
25use arrayvec::ArrayString;
26use arrayvec::ArrayVec;
27use bump_alloc::ALLOCATOR;
28use core::cmp::max;
29use core::fmt::Write;
30use host_fdt_parser::MemoryAllocationMode;
31use host_fdt_parser::MemoryEntry;
32use host_fdt_parser::ParsedDeviceTree;
33use host_fdt_parser::VmbusInfo;
34use hvdef::HV_PAGE_SIZE;
35use igvm_defs::MemoryMapEntryType;
36use loader_defs::paravisor::CommandLinePolicy;
37use loader_defs::shim::MemoryVtlType;
38use loader_defs::shim::PersistedStateHeader;
39use memory_range::MemoryRange;
40use memory_range::subtract_ranges;
41use memory_range::walk_ranges;
42use thiserror::Error;
43use zerocopy::FromBytes;
44
45mod bump_alloc;
46mod dma_hint;
47
/// Errors when reading the host device tree.
///
/// This is the error type returned by the topology construction routines in
/// this module (for example `topology_from_host_dt`).
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree. The underlying [`host_fdt_parser::Error`] is
    /// carried as the error source.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughVtl0Mmio,
    /// Host provided MMIO range is insufficient to cover VTL2.
    #[error("host provided MMIO range is insufficient to cover VTL2")]
    NotEnoughVtl2Mmio,
}
74
75/// Allocate the private pool across NUMA nodes.
76///
77/// By default, tries to allocate the entire pool on NUMA node 0 (preserving
78/// previous behavior). If that fails, or if `force_numa_split` is true, the
79/// pool is split evenly across all available NUMA nodes (one range per node).
80fn allocate_private_pool(
81    address_space: &mut AddressSpaceManager,
82    vtl2_ram: &[MemoryEntry],
83    pool_size_bytes: u64,
84    force_numa_split: bool,
85    enable_vtl2_gpa_pool: crate::cmdline::Vtl2GpaPoolConfig,
86    device_dma_page_count: Option<u64>,
87    vp_count: usize,
88    mem_size: u64,
89) {
90    // Try allocating the entire pool on node 0 first. We do this to maintain
91    // compatibility with older openhcl_boot images that do not understand how
92    // to handle a split private pool, and to maintain previous behavior where
93    // the pool was completely allocated on numa node 0.
94    //
95    // Allocate from high memory downward to avoid overlapping any used ranges
96    // in low memory when openhcl's usage gets bigger, as otherwise the
97    // used_range by the bootshim could overlap the pool range chosen when
98    // servicing to a new image.
99    if !force_numa_split {
100        if let Some(pool) = address_space.allocate(
101            Some(0),
102            pool_size_bytes,
103            AllocationType::GpaPool,
104            AllocationPolicy::HighMemory,
105        ) {
106            log::info!("allocated VTL2 pool at {:#x?}", pool.range);
107            return;
108        }
109        log::info!("node 0 cannot fit full pool, splitting across NUMA nodes");
110    } else {
111        log::info!("forcing VTL2 pool NUMA split across nodes");
112    }
113
114    // Enumerate unique NUMA nodes from VTL2 RAM.
115    //
116    // FUTURE: Handle cases where the are CPU only or RAM only numa nodes.
117    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());
118    for entry in vtl2_ram.iter() {
119        match numa_nodes.binary_search(&entry.vnode) {
120            Ok(_) => {}
121            Err(index) => {
122                numa_nodes.insert(index, entry.vnode);
123            }
124        }
125    }
126
127    let num_nodes = numa_nodes.len() as u64;
128    // Split the per node size to page size aligned chunks, and give the
129    // remainder to the last node.
130    let per_node_size = (pool_size_bytes / num_nodes) & !(HV_PAGE_SIZE - 1);
131    let last_node_size = pool_size_bytes - per_node_size * (num_nodes - 1);
132    let mut remaining = pool_size_bytes;
133
134    // If per-node-size is zero, we're in some strange configuration. We should
135    // have been able to allocate this from a single node, as this would mean
136    // the number of nodes is larger than the number of pages requested for the
137    // pool, so fail explicitly.
138    if per_node_size == 0 {
139        panic!(
140            "cannot split VTL2 pool of size {pool_size_bytes:#x} bytes across \
141            {num_nodes} nodes, per node size {per_node_size:#x} bytes; \
142            enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, \
143            device_dma_page_count={device_dma_page_count:#x?}, \
144            vp_count={vp_count}, mem_size={mem_size:#x}"
145        );
146    }
147
148    for (i, vnode) in numa_nodes.iter().enumerate() {
149        if remaining == 0 {
150            break;
151        }
152
153        let is_last = i == numa_nodes.len() - 1;
154        let alloc_size = if is_last {
155            last_node_size
156        } else {
157            per_node_size
158        };
159
160        // Make sure to allocate high memory downward, for the same reason as
161        // described in the numa 0 case.
162        match address_space.allocate(
163            Some(*vnode),
164            alloc_size,
165            AllocationType::GpaPool,
166            AllocationPolicy::HighMemory,
167        ) {
168            Some(pool) => {
169                remaining -= pool.range.len();
170                log::info!(
171                    "allocated VTL2 pool on node {} at {:#x?}",
172                    vnode,
173                    pool.range
174                );
175            }
176            None => {
177                let mut free_ranges = off_stack!(ArrayString<2048>, ArrayString::new_const());
178                for node in numa_nodes.iter() {
179                    for range in address_space.free_ranges(*node) {
180                        if write!(
181                            free_ranges,
182                            "n{}:[{:#x?}, {:#x?}) ",
183                            node,
184                            range.start(),
185                            range.end()
186                        )
187                        .is_err()
188                        {
189                            let _ = write!(free_ranges, "...");
190                            break;
191                        }
192                    }
193                }
194                let highest_numa_node = vtl2_ram.iter().map(|e| e.vnode).max().unwrap_or(0);
195                panic!(
196                    "failed to allocate VTL2 pool on node {vnode}: \
197                     need {alloc_size:#x} bytes, pool total {pool_size_bytes:#x} bytes \
198                     (enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, \
199                     device_dma_page_count={device_dma_page_count:#x?}, \
200                     vp_count={vp_count}, mem_size={mem_size:#x}), \
201                     highest_numa_node={highest_numa_node}, \
202                     free_ranges=[ {}]",
203                    free_ranges.as_str()
204                );
205            }
206        }
207    }
208
209    assert_eq!(
210        remaining, 0,
211        "pool allocation arithmetic error: {remaining:#x} bytes unallocated"
212    );
213}
214
215/// Allocate VTL2 ram from the partition's memory map.
216fn allocate_vtl2_ram(
217    params: &ShimParams,
218    partition_memory_map: &[MemoryEntry],
219    ram_size: Option<u64>,
220) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
221    // First, calculate how many numa nodes there are by looking at unique numa
222    // nodes in the memory map.
223    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());
224
225    for entry in partition_memory_map.iter() {
226        match numa_nodes.binary_search(&entry.vnode) {
227            Ok(_) => {}
228            Err(index) => {
229                numa_nodes.insert(index, entry.vnode);
230            }
231        }
232    }
233
234    let numa_node_count = numa_nodes.len();
235
236    let vtl2_size = if let Some(ram_size) = ram_size {
237        if ram_size < params.memory_size {
238            panic!(
239                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
240                ram_size, params.memory_size
241            );
242        }
243        max(ram_size, params.memory_size)
244    } else {
245        params.memory_size
246    };
247
248    // Next, calculate the amount of memory that needs to be allocated per numa
249    // node.
250    let ram_per_node = vtl2_size / numa_node_count as u64;
251
252    // Seed the remaining allocation list with the memory required per node.
253    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
254    memory_per_node.extend((0..numa_node_count).map(|_| 0));
255    for entry in partition_memory_map.iter() {
256        memory_per_node[entry.vnode as usize] = ram_per_node;
257    }
258
259    // The range the IGVM file was loaded into is special - it is already
260    // counted as "allocated". This may have been split across different numa
261    // nodes. Walk the used range, add it to vtl2 ram, and subtract it from the
262    // used ranges.
263    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
264    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
265    let file_memory_range = MemoryRange::new(
266        params.memory_start_address..(params.memory_start_address + params.memory_size),
267    );
268
269    for (range, result) in walk_ranges(
270        [(file_memory_range, ())],
271        partition_memory_map.iter().map(|e| (e.range, e)),
272    ) {
273        match result {
274            memory_range::RangeWalkResult::Right(entry) => {
275                // Add this entry to the free list.
276                free_memory_after_vtl2.push(MemoryEntry {
277                    range,
278                    mem_type: entry.mem_type,
279                    vnode: entry.vnode,
280                });
281            }
282            memory_range::RangeWalkResult::Both(_, entry) => {
283                // Add this entry to the vtl2 ram list.
284                vtl2_ram.push(MemoryEntry {
285                    range,
286                    mem_type: entry.mem_type,
287                    vnode: entry.vnode,
288                });
289            }
290            memory_range::RangeWalkResult::Left(_) => {
291                panic!("used file range {range:#x?} is not reported as ram by host memmap")
292            }
293            // Ranges in neither are ignored.
294            memory_range::RangeWalkResult::Neither => {}
295        }
296    }
297
298    // Now remove ranges from the free list that were part of the initial launch
299    // context.
300    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
301    for (range, result) in walk_ranges(
302        params
303            .imported_regions()
304            .filter_map(|(range, _preaccepted)| {
305                if !file_memory_range.contains(&range) {
306                     // There should be no overlap - either the preaccepted range
307                    // is exclusively covered by the preaccpted VTL2 range or it
308                    // is not.
309                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
310                    Some((range, ()))
311                } else {
312                    None
313                }
314            }),
315        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
316    ) {
317        match result {
318            memory_range::RangeWalkResult::Right(entry) => {
319                free_memory.push(MemoryEntry {
320                    range,
321                    mem_type: entry.mem_type,
322                    vnode: entry.vnode,
323                });
324            }
325            memory_range::RangeWalkResult::Left(_) => {
326                // On TDX, the reset vector page is not reported as ram by the
327                // host, but is preaccepted. Ignore it.
328                #[cfg(target_arch = "x86_64")]
329                if params.isolation_type == IsolationType::Tdx && range.start_4k_gpn() == 0xFFFFF && range.len() == 0x1000 {
330                    continue;
331                }
332
333                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
334            }
335            memory_range::RangeWalkResult::Both(_, _) => {
336                // Range was part of the preaccepted import, is not free to
337                // allocate additional VTL2 ram from.
338            }
339            // Ranges in neither are ignored.
340            memory_range::RangeWalkResult::Neither => {}
341        }
342    }
343
344    // Subtract the used ranges from vtl2_ram
345    for entry in vtl2_ram.iter() {
346        let mem_req = &mut memory_per_node[entry.vnode as usize];
347
348        if entry.range.len() > *mem_req {
349            // TODO: Today if a used range is larger than the mem required, we
350            // just subtract that numa range to zero. Should we instead subtract
351            // from other numa nodes equally for over allocation?
352            log::warn!(
353                "entry {entry:?} is larger than required {mem_req} for vnode {}",
354                entry.vnode
355            );
356            *mem_req = 0;
357        } else {
358            *mem_req -= entry.range.len();
359        }
360    }
361
362    // Allocate remaining memory per node required.
363    for (node, required_mem) in memory_per_node.iter().enumerate() {
364        let mut required_mem = *required_mem;
365        if required_mem == 0 {
366            continue;
367        }
368
369        // Start allocation from the top of the free list, which is high memory
370        // in reverse order.
371        for entry in free_memory.iter_mut().rev() {
372            if entry.vnode == node as u32 && !entry.range.is_empty() {
373                assert!(required_mem != 0);
374                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);
375
376                // Allocate top down from the range.
377                let offset = entry.range.len() - bytes_to_allocate;
378                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);
379
380                entry.range = remaining;
381                vtl2_ram.push(MemoryEntry {
382                    range: alloc,
383                    mem_type: entry.mem_type,
384                    vnode: node as u32,
385                });
386
387                required_mem -= bytes_to_allocate;
388
389                // Stop allocating if we're done allocating.
390                if required_mem == 0 {
391                    break;
392                }
393            }
394        }
395
396        if required_mem != 0 {
397            // TODO: Handle fallback allocations on other numa nodes when a node
398            // is exhausted.
399            panic!(
400                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
401            );
402        }
403    }
404
405    // Sort VTL2 ram as we may have allocated from different places.
406    vtl2_ram.sort_unstable_by_key(|e| e.range.start());
407
408    vtl2_ram
409}
410
411/// Parse VTL2 ram from host provided ranges.
412fn parse_host_vtl2_ram(
413    params: &ShimParams,
414    memory: &[MemoryEntry],
415) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
416    // If no VTL2 protectable ram was provided by the host, use the build time
417    // value encoded in ShimParams.
418    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
419    if params.isolation_type.is_hardware_isolated() {
420        // Hardware isolated VMs use the size hint by the host, but use the base
421        // address encoded in the file.
422        let vtl2_size = memory.iter().fold(0, |acc, entry| {
423            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
424                acc + entry.range.len()
425            } else {
426                acc
427            }
428        });
429
430        log::info!(
431            "host provided vtl2 ram size is {:x}, measured size is {:x}",
432            vtl2_size,
433            params.memory_size
434        );
435
436        let vtl2_size = max(vtl2_size, params.memory_size);
437        vtl2_ram.push(MemoryEntry {
438            range: MemoryRange::new(
439                params.memory_start_address..(params.memory_start_address + vtl2_size),
440            ),
441            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
442            vnode: 0,
443        });
444    } else {
445        for &entry in memory
446            .iter()
447            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
448        {
449            vtl2_ram.push(entry);
450        }
451    }
452
453    if vtl2_ram.is_empty() {
454        log::info!("using measured vtl2 ram");
455        vtl2_ram.push(MemoryEntry {
456            range: MemoryRange::try_new(
457                params.memory_start_address..(params.memory_start_address + params.memory_size),
458            )
459            .expect("range is valid"),
460            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
461            vnode: 0,
462        });
463    }
464
465    vtl2_ram
466}
467
468fn init_heap(params: &ShimParams) {
469    // Initialize the temporary heap.
470    //
471    // This is only to be enabled for mesh decode.
472    //
473    // SAFETY: The heap range is reserved at file build time, and is
474    // guaranteed to be unused by anything else.
475    unsafe {
476        ALLOCATOR.init(params.heap);
477    }
478}
479
/// The host device tree as parsed by `host_fdt_parser`, instantiated with the
/// capacity limits defined in `crate::host_params`.
type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;
482
483/// Add common ranges to [`AddressSpaceManagerBuilder`] regardless if creating
484/// topology from the host or from saved state.
485fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
486    params: &ShimParams,
487    mut builder: AddressSpaceManagerBuilder<'a, I>,
488) -> AddressSpaceManagerBuilder<'a, I> {
489    // Add the log buffer which is always present.
490    builder = builder.with_log_buffer(params.log_buffer);
491
492    if params.vtl2_reserved_region_size != 0 {
493        builder = builder.with_reserved_range(MemoryRange::new(
494            params.vtl2_reserved_region_start
495                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
496        ));
497    }
498
499    if params.sidecar_size != 0 {
500        builder = builder.with_sidecar_image(MemoryRange::new(
501            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
502        ));
503    }
504
505    builder
506}
507
/// The memory and MMIO layout of the partition, as produced by the topology
/// construction routines in this module.
#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    /// The ram ranges used by VTL2.
    vtl2_ram: &'static [MemoryEntry],
    /// The MMIO ranges for VTL0 (at most two).
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    /// The MMIO ranges for VTL2 (at most two).
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    /// The memory allocation mode taken from the parsed device tree.
    memory_allocation_mode: MemoryAllocationMode,
}
515
/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    /// The partition topology itself.
    topology: PartitionTopology,
    /// CPUs to exclude from sidecar.
    // NOTE(review): presumably the sorted, deduplicated list built from the
    // persisted `cpus_with_outstanding_io` and
    // `cpus_with_mapped_interrupts_no_io` sets - confirm against
    // `topology_from_persisted_state`.
    sidecar_excluded_cpus: &'static [u32],
}
523
524// Calculate the default mmio size for VTL2 when not specified by the host.
525//
526// This is half of the high mmio gap size, rounded down, with a minimum of 128
527// MB and a maximum of 1 GB.
528fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
529    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
530    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
531    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
532    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
533}
534
535/// Read topology from the host provided device tree.
536fn topology_from_host_dt(
537    params: &ShimParams,
538    parsed: &ParsedDt,
539    options: &BootCommandLineOptions,
540    address_space: &mut AddressSpaceManager,
541) -> Result<PartitionTopology, DtError> {
542    log::info!("reading topology from host device tree");
543
544    let mut vtl2_ram =
545        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());
546
547    // TODO: Decide if isolated guests always use VTL2 allocation mode.
548
549    let memory_allocation_mode = parsed.memory_allocation_mode;
550    match memory_allocation_mode {
551        MemoryAllocationMode::Host => {
552            vtl2_ram
553                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
554                .expect("vtl2 ram should only be 64 big");
555        }
556        MemoryAllocationMode::Vtl2 {
557            memory_size,
558            mmio_size: _,
559        } => {
560            vtl2_ram
561                .try_extend_from_slice(
562                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
563                )
564                .expect("vtl2 ram should only be 64 big");
565        }
566    }
567
568    // The host is responsible for allocating MMIO ranges for non-isolated
569    // guests when it also provides the ram VTL2 should use.
570    //
571    // For isolated guests, or when VTL2 has been asked to carve out its own
572    // memory, first check if the host provided a VTL2 mmio range. If so, the
573    // mmio range must be large enough. Otherwise, choose to carve out a range
574    // from the VTL0 allotment.
575    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
576        || matches!(
577            parsed.memory_allocation_mode,
578            MemoryAllocationMode::Vtl2 { .. }
579        ) {
580        // Decide the amount of mmio VTL2 should allocate, which is different
581        // depending on the heuristic used.
582        //
583        // On a newer host where a vtl2 mmio range is provided inside the
584        // vmbus_vtl2 device tree node, use the size provided by the host inside
585        // the openhcl node for memory allocation mode.
586        //
587        // If the host did not provide a vtl2 mmio range, then use the maximum
588        // of the host provided value inside the openhcl node and the calculated
589        // default.
590        let host_provided_size = match parsed.memory_allocation_mode {
591            MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
592            _ => 0,
593        };
594        let vmbus_vtl2 = parsed.vmbus_vtl2.as_ref().ok_or(DtError::Vtl2Vmbus)?;
595        let vmbus_vtl2_mmio_size = vmbus_vtl2.mmio.iter().map(|r| r.len()).sum::<u64>();
596        let mmio_size = if vmbus_vtl2_mmio_size != 0 {
597            host_provided_size
598        } else {
599            max(host_provided_size, calculate_default_mmio_size(parsed)?)
600        };
601
602        log::info!("allocating vtl2 mmio size {mmio_size:#x} bytes");
603        log::info!("host provided vtl2 mmio ranges are {vmbus_vtl2_mmio_size:#x} bytes");
604
605        let vmbus_vtl0 = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?;
606        if vmbus_vtl2_mmio_size != 0 {
607            // Verify the host provided mmio is large enough.
608            if vmbus_vtl2_mmio_size < mmio_size {
609                return Err(DtError::NotEnoughVtl2Mmio);
610            }
611
612            log::info!("using host provided vtl2 mmio: {:x?}", vmbus_vtl2.mmio);
613            (vmbus_vtl0.mmio.clone(), vmbus_vtl2.mmio.clone())
614        } else {
615            // Allocate vtl2 mmio from vtl0 mmio.
616            log::info!("no vtl2 mmio provided by host, allocating from vtl0 mmio");
617            let selected_vtl2_mmio = select_vtl2_mmio_range(&vmbus_vtl0.mmio, mmio_size)?;
618
619            // Update vtl0 mmio to exclude vtl2 mmio.
620            let vtl0_mmio = subtract_ranges(vmbus_vtl0.mmio.iter().cloned(), [selected_vtl2_mmio])
621                .collect::<ArrayVec<MemoryRange, 2>>();
622            let vtl2_mmio = [selected_vtl2_mmio]
623                .into_iter()
624                .collect::<ArrayVec<MemoryRange, 2>>();
625
626            // TODO: For now, if we have only a single vtl0_mmio range left,
627            // panic. In the future decide if we want to report this as a start
628            // failure in usermode, change allocation strategy, or something
629            // else.
630            assert_eq!(
631                vtl0_mmio.len(),
632                2,
633                "vtl0 mmio ranges are not 2 {:#x?}",
634                vtl0_mmio
635            );
636
637            log::info!("vtl0 mmio: {vtl0_mmio:x?}, vtl2 mmio: {vtl2_mmio:x?}");
638
639            (vtl0_mmio, vtl2_mmio)
640        }
641    } else {
642        (
643            parsed
644                .vmbus_vtl0
645                .as_ref()
646                .ok_or(DtError::Vtl0Vmbus)?
647                .mmio
648                .clone(),
649            parsed
650                .vmbus_vtl2
651                .as_ref()
652                .ok_or(DtError::Vtl2Vmbus)?
653                .mmio
654                .clone(),
655        )
656    };
657
658    // The host provided device tree is marked as normal ram, as the
659    // bootshim is responsible for constructing anything usermode needs from
660    // it, and passing it via the device tree provided to the kernel.
661    let reclaim_base = params.dt_start();
662    let reclaim_end = params.dt_start() + params.dt_size();
663    let vtl2_config_region_reclaim =
664        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");
665
666    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");
667
668    // Initialize the address space manager with fixed at build time ranges.
669    let vtl2_config_region = MemoryRange::new(
670        params.parameter_region_start
671            ..(params.parameter_region_start + params.parameter_region_size),
672    );
673
674    // NOTE: Size the region as 20 pages. This should be plenty enough for the
675    // worst case encoded size (about 50 bytes worst case per memory entry, with
676    // the max number of ram ranges), and is small enough that we can reserve it
677    // on all sizes. Revisit this calculation if we persist more state in the
678    // future.
679    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
680    let (persisted_state_region, remainder) = params
681        .persisted_state
682        .split_at_offset(PERSISTED_REGION_SIZE);
683    log::info!(
684        "persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}"
685    );
686
687    let mut address_space_builder = AddressSpaceManagerBuilder::new(
688        address_space,
689        &vtl2_ram,
690        params.used,
691        persisted_state_region,
692        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
693    );
694
695    address_space_builder = add_common_ranges(params, address_space_builder);
696
697    address_space_builder
698        .init()
699        .expect("failed to initialize address space manager");
700
701    if params.isolation_type == IsolationType::None {
702        let enable_vtl2_gpa_pool = options.enable_vtl2_gpa_pool;
703        let device_dma_page_count = parsed.device_dma_page_count;
704        let vp_count = parsed.cpu_count();
705        let mem_size = vtl2_ram.iter().map(|e| e.range.len()).sum();
706        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
707            enable_vtl2_gpa_pool,
708            device_dma_page_count,
709            vp_count,
710            mem_size,
711        ) {
712            // Reserve the specified number of pages for the pool. Use the used
713            // ranges to figure out which VTL2 memory is free to allocate from.
714            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;
715
716            allocate_private_pool(
717                address_space,
718                &vtl2_ram,
719                pool_size_bytes,
720                options.vtl2_gpa_pool_numa_split,
721                enable_vtl2_gpa_pool,
722                device_dma_page_count,
723                vp_count,
724                mem_size,
725            );
726        }
727    }
728
729    Ok(PartitionTopology {
730        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
731        vtl0_mmio,
732        vtl2_mmio,
733        memory_allocation_mode,
734    })
735}
736
737/// Read topology from the persisted state region and protobuf payload.
738fn topology_from_persisted_state(
739    header: PersistedStateHeader,
740    params: &ShimParams,
741    parsed: &ParsedDt,
742    address_space: &mut AddressSpaceManager,
743) -> Result<PersistedPartitionTopology, DtError> {
744    log::info!("reading topology from persisted state");
745
746    // Verify the header describes a protobuf region within the bootshim
747    // persisted region. We expect it to live there as today we rely on the
748    // build time generated pagetable to identity map the protobuf region.
749    let protobuf_region =
750        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
751    assert!(
752        params.persisted_state.contains(&protobuf_region),
753        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
754        params.persisted_state
755    );
756
757    // Verify protobuf payload len is smaller than region.
758    assert!(
759        header.protobuf_payload_len <= header.protobuf_region_len,
760        "protobuf payload len {} is larger than region len {}",
761        header.protobuf_payload_len,
762        header.protobuf_region_len
763    );
764
765    // SAFETY: The region lies within the persisted state region, which is
766    // identity mapped via the build time generated pagetable.
767    let protobuf_raw = unsafe {
768        core::slice::from_raw_parts(
769            header.protobuf_base as *const u8,
770            header.protobuf_payload_len as usize,
771        )
772    };
773
774    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
775        bump_alloc::with_global_alloc(|| {
776            log::info!("decoding protobuf of size {}", protobuf_raw.len());
777            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
778        });
779
780    let loader_defs::shim::save_restore::SavedState {
781        partition_memory,
782        partition_mmio,
783        cpus_with_mapped_interrupts_no_io,
784        cpus_with_outstanding_io,
785    } = parsed_protobuf;
786
787    log::info!(
788        "persisted state: cpus_with_mapped_interrupts_no_io={:?}, cpus_with_outstanding_io={:?}",
789        cpus_with_mapped_interrupts_no_io,
790        cpus_with_outstanding_io,
791    );
792
793    let mut sidecar_excluded_cpus = off_stack!(ArrayVec<u32, MAX_CPU_COUNT>, ArrayVec::new_const());
794    sidecar_excluded_cpus.clear();
795    // Keep the list sorted and deduplicated as we insert, so it's ready for
796    // binary search lookups later.
797    for c in cpus_with_outstanding_io
798        .iter()
799        .chain(cpus_with_mapped_interrupts_no_io.iter())
800        .copied()
801    {
802        if let Err(i) = sidecar_excluded_cpus.binary_search(&c) {
803            sidecar_excluded_cpus.insert(i, c);
804        }
805    }
806
807    // FUTURE: should memory allocation mode should persist in saved state and
808    // verify the host did not change it?
809    let memory_allocation_mode = parsed.memory_allocation_mode;
810
811    let mut vtl2_ram =
812        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());
813
814    // Determine which ranges are memory ranges used by VTL2.
815    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
816        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
817            Some(MemoryEntry {
818                range: entry.range,
819                mem_type: entry.igvm_type.clone().into(),
820                vnode: entry.vnode,
821            })
822        } else {
823            None
824        }
825    });
826
827    // Merge adjacent ranges as saved state reports the final usage of ram which
828    // includes reserved in separate ranges. Here we want the whole underlying
829    // ram ranges, merged with adjacent types if they share the same igvm types.
830    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
831        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
832    );
833
834    vtl2_ram.extend(
835        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
836            range,
837            mem_type,
838            vnode,
839        }),
840    );
841
842    // If the host was responsible for allocating VTL2 ram, verify the ram
843    // parsed from the previous instance matches.
844    //
845    // FUTURE: When VTL2 itself did allocation, we should verify that all ranges
846    // are still within the provided memory map.
847    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
848        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
849        assert_eq!(
850            vtl2_ram.as_slice(),
851            host_vtl2_ram.as_ref(),
852            "vtl2 ram from persisted state does not match host provided ram"
853        );
854    }
855
856    // Merge the persisted state header and protobuf region, and report that as
857    // the persisted region.
858    //
859    // NOTE: We could choose to resize the persisted region at this point, which
860    // we would need to do if we expect the saved state to grow larger.
861    let persisted_header = partition_memory
862        .iter()
863        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
864        .expect("persisted state header missing");
865    let persisted_protobuf = partition_memory
866        .iter()
867        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
868        .expect("persisted state protobuf region missing");
869    assert_eq!(persisted_header.range.end(), protobuf_region.start());
870    let persisted_state_region =
871        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());
872
873    // The host provided device tree is marked as normal ram, as the
874    // bootshim is responsible for constructing anything usermode needs from
875    // it, and passing it via the device tree provided to the kernel.
876    let reclaim_base = params.dt_start();
877    let reclaim_end = params.dt_start() + params.dt_size();
878    let vtl2_config_region_reclaim =
879        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");
880
881    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");
882
883    let vtl2_config_region = MemoryRange::new(
884        params.parameter_region_start
885            ..(params.parameter_region_start + params.parameter_region_size),
886    );
887
888    let mut address_space_builder = AddressSpaceManagerBuilder::new(
889        address_space,
890        &vtl2_ram,
891        params.used,
892        persisted_state_region,
893        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
894    );
895
896    // NOTE: The only other region we take from the previous instance is any
897    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
898    // command line arguments or host device tree changed, as that's not
899    // something we expect to happen in practice.
900    let pool_ranges = partition_memory.iter().filter_map(|entry| {
901        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
902            Some(entry.range)
903        } else {
904            None
905        }
906    });
907
908    address_space_builder = address_space_builder.with_pool_ranges(pool_ranges);
909
910    // As described above, other ranges come from this current boot.
911    address_space_builder = add_common_ranges(params, address_space_builder);
912
913    address_space_builder
914        .init()
915        .expect("failed to initialize address space manager");
916
917    // Read previous mmio for VTL0 and VTL2.
918    let vtl0_mmio = partition_mmio
919        .iter()
920        .filter_map(|entry| {
921            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
922                Some(entry.range)
923            } else {
924                None
925            }
926        })
927        .collect::<ArrayVec<MemoryRange, 2>>();
928    let vtl2_mmio = partition_mmio
929        .iter()
930        .filter_map(|entry| {
931            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
932                Some(entry.range)
933            } else {
934                None
935            }
936        })
937        .collect::<ArrayVec<MemoryRange, 2>>();
938
939    Ok(PersistedPartitionTopology {
940        topology: PartitionTopology {
941            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
942            vtl0_mmio,
943            vtl2_mmio,
944            memory_allocation_mode,
945        },
946        sidecar_excluded_cpus: OffStackRef::leak(sidecar_excluded_cpus),
947    })
948}
949
950/// Read the persisted header from the start of the persisted state region
951/// described at file build time. If the magic value is not set, `None` is
952/// returned.
953fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
954    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
955    // to rethink how this will work in order to handle this correctly, as on a
956    // first boot we'd need to accept them early, but subsequent boots should
957    // not accept any pages.
958    //
959    // This may require some value passed in via a register or something early
960    // that indicates this is a servicing boot, which we could set if OpenHCL
961    // itself launches the next instance.
962    if params.isolation_type != IsolationType::None {
963        return None;
964    }
965
966    // SAFETY: The header lies at the start of the shim described persisted state
967    // region. This range is guaranteed to be identity mapped at file build
968    // time.
969    let buf = unsafe {
970        core::slice::from_raw_parts(
971            params.persisted_state.start() as *const u8,
972            size_of::<PersistedStateHeader>(),
973        )
974    };
975
976    let header = PersistedStateHeader::read_from_bytes(buf)
977        .expect("region is page aligned and the correct size");
978
979    if header.magic == PersistedStateHeader::MAGIC {
980        Some(header)
981    } else {
982        None
983    }
984}
985
986impl PartitionInfo {
987    // Read the IGVM provided DT for the vtl2 partition info.
988    pub fn read_from_dt<'a>(
989        params: &'a ShimParams,
990        storage: &'a mut Self,
991        address_space: &'_ mut AddressSpaceManager,
992        mut options: BootCommandLineOptions,
993        can_trust_host: bool,
994    ) -> Result<&'a mut Self, DtError> {
995        let dt = params.device_tree();
996
997        if dt[0] == 0 {
998            log::error!("host did not provide a device tree");
999            return Err(DtError::NoDeviceTree);
1000        }
1001
1002        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());
1003
1004        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;
1005
1006        let command_line = params.command_line();
1007
1008        // Always write the measured command line.
1009        write!(
1010            storage.cmdline,
1011            "{}",
1012            command_line
1013                .command_line()
1014                .expect("measured command line should be valid")
1015        )
1016        .map_err(|_| DtError::CommandLineSize)?;
1017
1018        match command_line.policy {
1019            CommandLinePolicy::STATIC => {
1020                // Nothing to do, we already wrote the measured command line.
1021            }
1022            CommandLinePolicy::APPEND_CHOSEN if can_trust_host => {
1023                // Check the host-provided command line for options for ourself,
1024                // and pass it along to the kernel.
1025                options.parse(&parsed.command_line);
1026                write!(storage.cmdline, " {}", &parsed.command_line)
1027                    .map_err(|_| DtError::CommandLineSize)?;
1028            }
1029            CommandLinePolicy::APPEND_CHOSEN if !can_trust_host => {
1030                // Nothing to do, we ignore the host provided command line.
1031            }
1032            _ => unreachable!(),
1033        }
1034
1035        init_heap(params);
1036
1037        let persisted_state_header = read_persisted_region_header(params);
1038        log::info!(
1039            "read_from_dt: persisted_state_header present={}, sidecar={:?}",
1040            persisted_state_header.is_some(),
1041            options.sidecar,
1042        );
1043        let (topology, sidecar_excluded_cpus) = if let Some(header) = persisted_state_header {
1044            log::info!("found persisted state header");
1045            let persisted_topology =
1046                topology_from_persisted_state(header, params, parsed, address_space)?;
1047            (
1048                persisted_topology.topology,
1049                persisted_topology.sidecar_excluded_cpus,
1050            )
1051        } else {
1052            (
1053                topology_from_host_dt(params, parsed, &options, address_space)?,
1054                &[][..],
1055            )
1056        };
1057
1058        let Self {
1059            vtl2_ram,
1060            partition_ram,
1061            isolation,
1062            bsp_reg,
1063            cpus,
1064            sidecar_cpu_overrides,
1065            vmbus_vtl0,
1066            vmbus_vtl2,
1067            cmdline: _,
1068            com3_serial,
1069            gic,
1070            pmu_gsiv,
1071            memory_allocation_mode,
1072            entropy,
1073            vtl0_alias_map,
1074            nvme_keepalive,
1075            boot_options,
1076        } = storage;
1077
1078        // During servicing restore, selectively exclude CPUs that had
1079        // restored device state (outstanding NVMe I/O or just a mapped NVMe
1080        // interrupt) from sidecar startup. These CPUs need immediate kernel
1081        // access to handle device interrupts and complete the keepalive
1082        // restore. All other CPUs still benefit from sidecar's parallel
1083        // startup. Falls back to disabling sidecar entirely if CPU IDs exceed
1084        // the per-CPU state array capacity (>400 CPUs).
1085        //
1086        // Sidecar is automatically disabled when: all NUMA nodes have exactly
1087        // one CPU (nothing to parallelize), x2apic is unavailable, the VM is
1088        // isolated (CVM), or the sidecar image is not present (sidecar_size == 0).
1089        // It is also disabled via command line with OPENHCL_SIDECAR=off. In all
1090        // other cases sidecar is active and uses a fan-out pattern to bring up
1091        // APs in parallel across NUMA nodes.
1092        //
1093        // TODO: the `cpu_threshold` field in `SidecarOptions::Enabled` is
1094        // not used at present. Based on production performance data, either
1095        // remove `cpu_threshold` from `SidecarOptions` in cmdline.rs, or
1096        // add a VP-count cutoff here to disable sidecar for small VMs.
1097        if let (SidecarOptions::Enabled { .. }, true) =
1098            (&boot_options.sidecar, !sidecar_excluded_cpus.is_empty())
1099        {
1100            let max_cpu_id = *sidecar_excluded_cpus.iter().max().unwrap() as usize;
1101            if parsed.cpu_count() <= sidecar_cpu_overrides.sidecar_starts_cpu.len()
1102                && max_cpu_id < sidecar_cpu_overrides.sidecar_starts_cpu.len()
1103            {
1104                // Mark specific CPUs as kernel-started instead of sidecar-started.
1105                sidecar_cpu_overrides.per_cpu_state_specified = true;
1106                for &cpu_id in sidecar_excluded_cpus {
1107                    sidecar_cpu_overrides.sidecar_starts_cpu[cpu_id as usize] = false;
1108                }
1109                log::info!(
1110                    "sidecar: excluding CPUs {:?} due to restored NVMe device state",
1111                    sidecar_excluded_cpus,
1112                );
1113            } else {
1114                // CPU IDs exceed per-cpu array capacity; disable sidecar entirely.
1115                log::info!(
1116                    "sidecar: disabling, too many CPUs for per-CPU state (max id {max_cpu_id})"
1117                );
1118                boot_options.sidecar = SidecarOptions::DisabledServicing;
1119                options.sidecar = SidecarOptions::DisabledServicing;
1120            }
1121        }
1122
1123        // Set ram and memory alloction mode.
1124        vtl2_ram.clear();
1125        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
1126        partition_ram.clear();
1127        partition_ram.extend(parsed.memory.iter().copied());
1128        *memory_allocation_mode = topology.memory_allocation_mode;
1129
1130        // Set vmbus fields. The connection ID comes from the host, but mmio
1131        // comes from topology.
1132        *vmbus_vtl0 = VmbusInfo {
1133            connection_id: parsed
1134                .vmbus_vtl0
1135                .as_ref()
1136                .ok_or(DtError::Vtl0Vmbus)?
1137                .connection_id,
1138            mmio: topology.vtl0_mmio,
1139        };
1140        *vmbus_vtl2 = VmbusInfo {
1141            connection_id: parsed
1142                .vmbus_vtl2
1143                .as_ref()
1144                .ok_or(DtError::Vtl2Vmbus)?
1145                .connection_id,
1146            mmio: topology.vtl2_mmio,
1147        };
1148
1149        // If we can trust the host, use the provided alias map
1150        if can_trust_host {
1151            *vtl0_alias_map = parsed.vtl0_alias_map;
1152        }
1153
1154        *isolation = params.isolation_type;
1155
1156        *bsp_reg = parsed.boot_cpuid_phys;
1157        cpus.extend(parsed.cpus.iter().copied());
1158        *com3_serial = parsed.com3_serial.clone();
1159        *gic = parsed.gic.clone();
1160        *pmu_gsiv = parsed.pmu_gsiv;
1161        *entropy = parsed.entropy.clone();
1162        *nvme_keepalive = parsed.nvme_keepalive;
1163        *boot_options = options;
1164
1165        Ok(storage)
1166    }
1167}