openhcl_boot/host_params/dt/
mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Parse partition info using the IGVM device tree parameter.
5
6extern crate alloc;
7
8use super::PartitionInfo;
9use super::shim_params::ShimParams;
10use crate::cmdline::BootCommandLineOptions;
11use crate::cmdline::SidecarOptions;
12use crate::host_params::COMMAND_LINE_SIZE;
13use crate::host_params::MAX_CPU_COUNT;
14use crate::host_params::MAX_ENTROPY_SIZE;
15use crate::host_params::MAX_NUMA_NODES;
16use crate::host_params::MAX_PARTITION_RAM_RANGES;
17use crate::host_params::MAX_VTL2_RAM_RANGES;
18use crate::host_params::dt::dma_hint::pick_private_pool_size;
19use crate::host_params::mmio::select_vtl2_mmio_range;
20use crate::host_params::shim_params::IsolationType;
21use crate::memory::AddressSpaceManager;
22use crate::memory::AddressSpaceManagerBuilder;
23use crate::memory::AllocationPolicy;
24use crate::memory::AllocationType;
25use crate::single_threaded::OffStackRef;
26use crate::single_threaded::off_stack;
27use alloc::vec::Vec;
28use arrayvec::ArrayString;
29use arrayvec::ArrayVec;
30use bump_alloc::ALLOCATOR;
31use core::cmp::max;
32use core::fmt::Write;
33use host_fdt_parser::MemoryAllocationMode;
34use host_fdt_parser::MemoryEntry;
35use host_fdt_parser::ParsedDeviceTree;
36use host_fdt_parser::VmbusInfo;
37use hvdef::HV_PAGE_SIZE;
38use igvm_defs::MemoryMapEntryType;
39use loader_defs::paravisor::CommandLinePolicy;
40use loader_defs::shim::MemoryVtlType;
41use loader_defs::shim::PersistedStateHeader;
42use memory_range::MemoryRange;
43use memory_range::subtract_ranges;
44use memory_range::walk_ranges;
45use thiserror::Error;
46use zerocopy::FromBytes;
47
48mod bump_alloc;
49mod dma_hint;
50
/// Errors when reading the host device tree.
///
/// Returned by the topology-construction paths in this module; most other
/// failure modes in this file are treated as fatal invariant violations and
/// panic instead.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughVtl0Mmio,
    /// Host provided MMIO range is insufficient to cover VTL2.
    #[error("host provided MMIO range is insufficient to cover VTL2")]
    NotEnoughVtl2Mmio,
}
77
/// Allocate VTL2 ram from the partition's memory map.
///
/// `ram_size` is the host's requested VTL2 ram size, if any. The resulting
/// allocation is never smaller than the measured file size
/// (`params.memory_size`); the requested size being smaller than the measured
/// size is a fatal error.
///
/// Memory is spread evenly across numa nodes, allocated top-down from each
/// node's free ranges. Panics if a node's free memory is exhausted (no
/// cross-node fallback today).
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map. binary_search keeps the list sorted as entries
    // are inserted, so duplicates show up as Ok(_) and are skipped.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    // Total VTL2 size: the host hint when present (validated to be at least
    // the measured size), otherwise the measured size from the IGVM file.
    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    //
    // NOTE(review): integer division drops any remainder, so up to
    // numa_node_count - 1 bytes of the requested size are not distributed —
    // confirm this rounding is acceptable.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    //
    // NOTE(review): this indexes by raw vnode id while the vec holds
    // numa_node_count entries — assumes vnode ids are dense starting at 0;
    // a sparse vnode id would panic here. Confirm against the host contract.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and subtract it from the
    // used ranges.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Ram reported by the host but not covered by the file image:
                // add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Covered by both the file image and host ram: add this entry
                // to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the preaccepted range
                    // is exclusively covered by the preaccepted VTL2 range or it
                    // is not.
                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Free host ram not touched by any imported region stays free.
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx && range.start_4k_gpn() == 0xFFFFF && range.len() == 0x1000 {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, is not free to
                // allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the used ranges from vtl2_ram, so the per-node remainder only
    // reflects what still needs to be allocated.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the mem required, we
            // just subtract that numa range to zero. Should we instead subtract
            // from other numa nodes equally for over allocation?
            log::warn!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocation from the top of the free list, which is high memory
        // in reverse order.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop allocating if we're done allocating.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}
273
274/// Parse VTL2 ram from host provided ranges.
275fn parse_host_vtl2_ram(
276    params: &ShimParams,
277    memory: &[MemoryEntry],
278) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
279    // If no VTL2 protectable ram was provided by the host, use the build time
280    // value encoded in ShimParams.
281    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
282    if params.isolation_type.is_hardware_isolated() {
283        // Hardware isolated VMs use the size hint by the host, but use the base
284        // address encoded in the file.
285        let vtl2_size = memory.iter().fold(0, |acc, entry| {
286            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
287                acc + entry.range.len()
288            } else {
289                acc
290            }
291        });
292
293        log::info!(
294            "host provided vtl2 ram size is {:x}, measured size is {:x}",
295            vtl2_size,
296            params.memory_size
297        );
298
299        let vtl2_size = max(vtl2_size, params.memory_size);
300        vtl2_ram.push(MemoryEntry {
301            range: MemoryRange::new(
302                params.memory_start_address..(params.memory_start_address + vtl2_size),
303            ),
304            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
305            vnode: 0,
306        });
307    } else {
308        for &entry in memory
309            .iter()
310            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
311        {
312            vtl2_ram.push(entry);
313        }
314    }
315
316    if vtl2_ram.is_empty() {
317        log::info!("using measured vtl2 ram");
318        vtl2_ram.push(MemoryEntry {
319            range: MemoryRange::try_new(
320                params.memory_start_address..(params.memory_start_address + params.memory_size),
321            )
322            .expect("range is valid"),
323            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
324            vnode: 0,
325        });
326    }
327
328    vtl2_ram
329}
330
/// Initialize the temporary bump allocator backing the global heap.
fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}
342
/// Host device tree parser instantiated with the boot shim's compile-time
/// limits for ram ranges, cpus, command line length, and entropy size.
type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;
345
346/// Add common ranges to [`AddressSpaceManagerBuilder`] regardless if creating
347/// topology from the host or from saved state.
348fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
349    params: &ShimParams,
350    mut builder: AddressSpaceManagerBuilder<'a, I>,
351) -> AddressSpaceManagerBuilder<'a, I> {
352    // Add the log buffer which is always present.
353    builder = builder.with_log_buffer(params.log_buffer);
354
355    if params.vtl2_reserved_region_size != 0 {
356        builder = builder.with_reserved_range(MemoryRange::new(
357            params.vtl2_reserved_region_start
358                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
359        ));
360    }
361
362    if params.sidecar_size != 0 {
363        builder = builder.with_sidecar_image(MemoryRange::new(
364            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
365        ));
366    }
367
368    builder
369}
370
/// The memory and mmio topology selected for this boot of VTL2.
#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    // Ram ranges assigned to VTL2 (leaked from off-stack storage, hence
    // 'static).
    vtl2_ram: &'static [MemoryEntry],
    // Mmio ranges left to VTL0.
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    // Mmio ranges owned by VTL2.
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    // How VTL2 ram was chosen: by the host, or carved out by VTL2 itself.
    memory_allocation_mode: MemoryAllocationMode,
}
378
/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    // The reconstructed topology.
    topology: PartitionTopology,
    // CPU lists decoded from the previous instance's saved state protobuf.
    cpus_with_mapped_interrupts_no_io: Vec<u32>,
    cpus_with_outstanding_io: Vec<u32>,
}
387
388// Calculate the default mmio size for VTL2 when not specified by the host.
389//
390// This is half of the high mmio gap size, rounded down, with a minimum of 128
391// MB and a maximum of 1 GB.
392fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
393    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
394    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
395    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
396    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
397}
398
/// Read topology from the host provided device tree.
///
/// Determines VTL2 ram (host-provided or self-allocated), splits mmio between
/// VTL0 and VTL2, initializes `address_space`, and (for non-isolated guests)
/// optionally reserves a VTL2 private GPA pool. Panics on invariant
/// violations; returns [`DtError`] for malformed host input.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log::info!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            // The host chose VTL2's ram; take its ranges (or the measured
            // fallback) directly.
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should only be 64 big");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            // VTL2 carves its own ram out of the partition memory map.
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should only be 64 big");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, first check if the host provided a VTL2 mmio range. If so, the
    // mmio range must be large enough. Otherwise, choose to carve out a range
    // from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate, which is different
        // depending on the heuristic used.
        //
        // On a newer host where a vtl2 mmio range is provided inside the
        // vmbus_vtl2 device tree node, use the size provided by the host inside
        // the openhcl node for memory allocation mode.
        //
        // If the host did not provide a vtl2 mmio range, then use the maximum
        // of the host provided value inside the openhcl node and the calculated
        // default.
        let host_provided_size = match parsed.memory_allocation_mode {
            MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
            _ => 0,
        };
        let vmbus_vtl2 = parsed.vmbus_vtl2.as_ref().ok_or(DtError::Vtl2Vmbus)?;
        let vmbus_vtl2_mmio_size = vmbus_vtl2.mmio.iter().map(|r| r.len()).sum::<u64>();
        let mmio_size = if vmbus_vtl2_mmio_size != 0 {
            host_provided_size
        } else {
            max(host_provided_size, calculate_default_mmio_size(parsed)?)
        };

        log::info!("allocating vtl2 mmio size {mmio_size:#x} bytes");
        log::info!("host provided vtl2 mmio ranges are {vmbus_vtl2_mmio_size:#x} bytes");

        let vmbus_vtl0 = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?;
        if vmbus_vtl2_mmio_size != 0 {
            // Verify the host provided mmio is large enough.
            if vmbus_vtl2_mmio_size < mmio_size {
                return Err(DtError::NotEnoughVtl2Mmio);
            }

            log::info!("using host provided vtl2 mmio: {:x?}", vmbus_vtl2.mmio);
            (vmbus_vtl0.mmio.clone(), vmbus_vtl2.mmio.clone())
        } else {
            // Allocate vtl2 mmio from vtl0 mmio.
            log::info!("no vtl2 mmio provided by host, allocating from vtl0 mmio");
            let selected_vtl2_mmio = select_vtl2_mmio_range(&vmbus_vtl0.mmio, mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(vmbus_vtl0.mmio.iter().cloned(), [selected_vtl2_mmio])
                .collect::<ArrayVec<MemoryRange, 2>>();
            let vtl2_mmio = [selected_vtl2_mmio]
                .into_iter()
                .collect::<ArrayVec<MemoryRange, 2>>();

            // TODO: For now, if we have only a single vtl0_mmio range left,
            // panic. In the future decide if we want to report this as a start
            // failure in usermode, change allocation strategy, or something
            // else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "vtl0 mmio ranges are not 2 {:#x?}",
                vtl0_mmio
            );

            log::info!("vtl0 mmio: {vtl0_mmio:x?}, vtl2 mmio: {vtl2_mmio:x?}");

            (vtl0_mmio, vtl2_mmio)
        }
    } else {
        // Host allocation mode on a non-isolated guest: both mmio splits come
        // straight from the device tree.
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with fixed at build time ranges.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be plenty enough for the
    // worst case encoded size (about 50 bytes worst case per memory entry, with
    // the max number of ram ranges), and is small enough that we can reserve it
    // on all sizes. Revisit this calculation if we persist more state in the
    // future.
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log::info!(
        "persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}"
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        // The device tree portion of the config region is reclaimed as ram.
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Non-isolated guests may reserve a private GPA pool for device DMA.
    if params.isolation_type == IsolationType::None {
        let enable_vtl2_gpa_pool = options.enable_vtl2_gpa_pool;
        let device_dma_page_count = parsed.device_dma_page_count;
        let vp_count = parsed.cpu_count();
        let mem_size = vtl2_ram.iter().map(|e| e.range.len()).sum();
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            enable_vtl2_gpa_pool,
            device_dma_page_count,
            vp_count,
            mem_size,
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            // NOTE: For now, allocate all the private pool on NUMA node 0 to
            // match previous behavior. Allocate from high memory downward to
            // avoid overlapping any used ranges in low memory when openhcl's
            // usage gets bigger, as otherwise the used_range by the bootshim
            // could overlap the pool range chosen, when servicing to a new
            // image.
            let vnode = 0;
            match address_space.allocate(
                Some(vnode),
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::HighMemory,
            ) {
                Some(pool) => {
                    log::info!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    // Build a compact string representation of the free ranges
                    // for diagnostics. Keep the string relatively small, as the
                    // enlightened panic message can only contain 1 page (4096)
                    // bytes of output.
                    let mut free_ranges = off_stack!(ArrayString<2048>, ArrayString::new_const());
                    for range in address_space.free_ranges(vnode) {
                        if write!(free_ranges, "[{:#x?}, {:#x?}) ", range.start(), range.end())
                            .is_err()
                        {
                            // Out of string space; truncate with an ellipsis.
                            let _ = write!(free_ranges, "...");
                            break;
                        }
                    }
                    let highest_numa_node = vtl2_ram.iter().map(|e| e.vnode).max().unwrap_or(0);
                    panic!(
                        "failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes (enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, device_dma_page_count={device_dma_page_count:#x?}, vp_count={vp_count}, mem_size={mem_size:#x}), highest_numa_node={highest_numa_node}, free_ranges=[ {}]",
                        free_ranges.as_str()
                    );
                }
            };
        }
    }

    Ok(PartitionTopology {
        // Leak the off-stack storage so the ram slice lives for 'static.
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}
627
628/// Read topology from the persisted state region and protobuf payload.
629fn topology_from_persisted_state(
630    header: PersistedStateHeader,
631    params: &ShimParams,
632    parsed: &ParsedDt,
633    address_space: &mut AddressSpaceManager,
634) -> Result<PersistedPartitionTopology, DtError> {
635    log::info!("reading topology from persisted state");
636
637    // Verify the header describes a protobuf region within the bootshim
638    // persisted region. We expect it to live there as today we rely on the
639    // build time generated pagetable to identity map the protobuf region.
640    let protobuf_region =
641        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
642    assert!(
643        params.persisted_state.contains(&protobuf_region),
644        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
645        params.persisted_state
646    );
647
648    // Verify protobuf payload len is smaller than region.
649    assert!(
650        header.protobuf_payload_len <= header.protobuf_region_len,
651        "protobuf payload len {} is larger than region len {}",
652        header.protobuf_payload_len,
653        header.protobuf_region_len
654    );
655
656    // SAFETY: The region lies within the persisted state region, which is
657    // identity mapped via the build time generated pagetable.
658    let protobuf_raw = unsafe {
659        core::slice::from_raw_parts(
660            header.protobuf_base as *const u8,
661            header.protobuf_payload_len as usize,
662        )
663    };
664
665    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
666        bump_alloc::with_global_alloc(|| {
667            log::info!("decoding protobuf of size {}", protobuf_raw.len());
668            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
669        });
670
671    let loader_defs::shim::save_restore::SavedState {
672        partition_memory,
673        partition_mmio,
674        cpus_with_mapped_interrupts_no_io,
675        cpus_with_outstanding_io,
676    } = parsed_protobuf;
677
678    // FUTURE: should memory allocation mode should persist in saved state and
679    // verify the host did not change it?
680    let memory_allocation_mode = parsed.memory_allocation_mode;
681
682    let mut vtl2_ram =
683        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());
684
685    // Determine which ranges are memory ranges used by VTL2.
686    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
687        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
688            Some(MemoryEntry {
689                range: entry.range,
690                mem_type: entry.igvm_type.clone().into(),
691                vnode: entry.vnode,
692            })
693        } else {
694            None
695        }
696    });
697
698    // Merge adjacent ranges as saved state reports the final usage of ram which
699    // includes reserved in separate ranges. Here we want the whole underlying
700    // ram ranges, merged with adjacent types if they share the same igvm types.
701    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
702        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
703    );
704
705    vtl2_ram.extend(
706        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
707            range,
708            mem_type,
709            vnode,
710        }),
711    );
712
713    // If the host was responsible for allocating VTL2 ram, verify the ram
714    // parsed from the previous instance matches.
715    //
716    // FUTURE: When VTL2 itself did allocation, we should verify that all ranges
717    // are still within the provided memory map.
718    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
719        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
720        assert_eq!(
721            vtl2_ram.as_slice(),
722            host_vtl2_ram.as_ref(),
723            "vtl2 ram from persisted state does not match host provided ram"
724        );
725    }
726
727    // Merge the persisted state header and protobuf region, and report that as
728    // the persisted region.
729    //
730    // NOTE: We could choose to resize the persisted region at this point, which
731    // we would need to do if we expect the saved state to grow larger.
732    let persisted_header = partition_memory
733        .iter()
734        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
735        .expect("persisted state header missing");
736    let persisted_protobuf = partition_memory
737        .iter()
738        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
739        .expect("persisted state protobuf region missing");
740    assert_eq!(persisted_header.range.end(), protobuf_region.start());
741    let persisted_state_region =
742        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());
743
744    // The host provided device tree is marked as normal ram, as the
745    // bootshim is responsible for constructing anything usermode needs from
746    // it, and passing it via the device tree provided to the kernel.
747    let reclaim_base = params.dt_start();
748    let reclaim_end = params.dt_start() + params.dt_size();
749    let vtl2_config_region_reclaim =
750        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");
751
752    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");
753
754    let vtl2_config_region = MemoryRange::new(
755        params.parameter_region_start
756            ..(params.parameter_region_start + params.parameter_region_size),
757    );
758
759    let mut address_space_builder = AddressSpaceManagerBuilder::new(
760        address_space,
761        &vtl2_ram,
762        params.used,
763        persisted_state_region,
764        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
765    );
766
767    // NOTE: The only other region we take from the previous instance is any
768    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
769    // command line arguments or host device tree changed, as that's not
770    // something we expect to happen in practice.
771    let mut pool_ranges = partition_memory.iter().filter_map(|entry| {
772        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
773            Some(entry.range)
774        } else {
775            None
776        }
777    });
778    let pool_range = pool_ranges.next();
779    assert!(
780        pool_ranges.next().is_none(),
781        "previous instance had multiple pool ranges"
782    );
783
784    if let Some(pool_range) = pool_range {
785        address_space_builder = address_space_builder.with_pool_range(pool_range);
786    }
787
788    // As described above, other ranges come from this current boot.
789    address_space_builder = add_common_ranges(params, address_space_builder);
790
791    address_space_builder
792        .init()
793        .expect("failed to initialize address space manager");
794
795    // Read previous mmio for VTL0 and VTL2.
796    let vtl0_mmio = partition_mmio
797        .iter()
798        .filter_map(|entry| {
799            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
800                Some(entry.range)
801            } else {
802                None
803            }
804        })
805        .collect::<ArrayVec<MemoryRange, 2>>();
806    let vtl2_mmio = partition_mmio
807        .iter()
808        .filter_map(|entry| {
809            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
810                Some(entry.range)
811            } else {
812                None
813            }
814        })
815        .collect::<ArrayVec<MemoryRange, 2>>();
816
817    Ok(PersistedPartitionTopology {
818        topology: PartitionTopology {
819            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
820            vtl0_mmio,
821            vtl2_mmio,
822            memory_allocation_mode,
823        },
824        cpus_with_mapped_interrupts_no_io,
825        cpus_with_outstanding_io,
826    })
827}
828
829/// Read the persisted header from the start of the persisted state region
830/// described at file build time. If the magic value is not set, `None` is
831/// returned.
832fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
833    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
834    // to rethink how this will work in order to handle this correctly, as on a
835    // first boot we'd need to accept them early, but subsequent boots should
836    // not accept any pages.
837    //
838    // This may require some value passed in via a register or something early
839    // that indicates this is a servicing boot, which we could set if OpenHCL
840    // itself launches the next instance.
841    if params.isolation_type != IsolationType::None {
842        return None;
843    }
844
845    // SAFETY: The header lies at the start of the shim described persisted state
846    // region. This range is guaranteed to be identity mapped at file build
847    // time.
848    let buf = unsafe {
849        core::slice::from_raw_parts(
850            params.persisted_state.start() as *const u8,
851            size_of::<PersistedStateHeader>(),
852        )
853    };
854
855    let header = PersistedStateHeader::read_from_bytes(buf)
856        .expect("region is page aligned and the correct size");
857
858    if header.magic == PersistedStateHeader::MAGIC {
859        Some(header)
860    } else {
861        None
862    }
863}
864
impl PartitionInfo {
    /// Read the IGVM provided device tree and fill in `storage` with the VTL2
    /// partition info.
    ///
    /// `params` supplies the shim-measured parameters (device tree location,
    /// measured command line, isolation type). `options` holds command line
    /// options already parsed by the caller; host-provided options are merged
    /// in only when the measured policy is `APPEND_CHOSEN` and
    /// `can_trust_host` is set. The memory topology is taken from persisted
    /// state when a valid persisted state header exists (servicing boot), and
    /// from the host device tree otherwise (first boot).
    ///
    /// Returns the filled-in `storage` on success, or a [`DtError`] if the
    /// host device tree is missing or invalid, required vmbus nodes are
    /// absent, or the command line overflows its fixed-size buffer.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

        // A valid FDT blob begins with a nonzero magic value, so a zero first
        // byte means the host never populated the parameter region.
        if dt[0] == 0 {
            log::error!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        // The parsed device tree is large; keep it off the boot stack.
        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        match command_line.policy {
            CommandLinePolicy::STATIC => {
                // Nothing to do, we already wrote the measured command line.
            }
            CommandLinePolicy::APPEND_CHOSEN if can_trust_host => {
                // Check the host-provided command line for options for ourself,
                // and pass it along to the kernel.
                options.parse(&parsed.command_line);
                write!(storage.cmdline, " {}", &parsed.command_line)
                    .map_err(|_| DtError::CommandLineSize)?;
            }
            CommandLinePolicy::APPEND_CHOSEN if !can_trust_host => {
                // Nothing to do, we ignore the host provided command line.
            }
            // The two guards above cover APPEND_CHOSEN exhaustively, but the
            // compiler cannot prove that. Any other policy value would
            // presumably indicate a corrupt measured config — panic.
            _ => unreachable!(),
        }

        // Bring up the heap before building the topology below, which may
        // allocate. NOTE(review): assumes nothing earlier in this function
        // allocates — confirm before reordering.
        init_heap(params);

        // A valid persisted state header means this is a servicing boot and
        // the previous instance's topology should be restored; otherwise
        // derive the topology from the host device tree.
        let persisted_state_header = read_persisted_region_header(params);
        let (topology, has_devices_that_should_disable_sidecar) =
            if let Some(header) = persisted_state_header {
                log::info!("found persisted state header");
                let persisted_topology =
                    topology_from_persisted_state(header, params, parsed, address_space)?;

                // Devices were kept alive across servicing if any CPU still
                // has mapped interrupts or outstanding IO.
                (
                    persisted_topology.topology,
                    !(persisted_topology
                        .cpus_with_mapped_interrupts_no_io
                        .is_empty()
                        && persisted_topology.cpus_with_outstanding_io.is_empty()),
                )
            } else {
                (
                    topology_from_host_dt(params, parsed, &options, address_space)?,
                    false,
                )
            };

        // Destructure storage so every field is either written below or
        // explicitly ignored (cmdline was already written above).
        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        // Disable sidecar when restoring with kept-alive devices, either
        // unconditionally (no threshold configured) or when the VM's CPU count
        // is below the configured threshold.
        if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
            &boot_options.sidecar,
            has_devices_that_should_disable_sidecar,
        ) {
            if cpu_threshold.is_none()
                || cpu_threshold
                    .and_then(|threshold| threshold.try_into().ok())
                    .is_some_and(|threshold| parsed.cpu_count() < threshold)
            {
                // If we are in the restore path, disable sidecar for small VMs, as the amortization
                // benefits don't apply when devices are kept alive; the CPUs need to be powered on anyway
                // to check for interrupts.
                log::info!("disabling sidecar, as we are restoring from persisted state");
                boot_options.sidecar = SidecarOptions::DisabledServicing;
                options.sidecar = SidecarOptions::DisabledServicing;
            }
        }

        // Set ram and memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map; otherwise
        // leave the caller-provided default in place.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        // Remaining fields are copied straight from the parsed device tree.
        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}