openhcl_boot/host_params/dt/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

extern crate alloc;

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::cmdline::BootCommandLineOptions;
use crate::cmdline::SidecarOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_RAM_RANGES;
use crate::host_params::dt::dma_hint::pick_private_pool_size;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use alloc::vec::Vec;
use arrayvec::ArrayString;
use arrayvec::ArrayVec;
use bump_alloc::ALLOCATOR;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use host_fdt_parser::VmbusInfo;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use loader_defs::shim::MemoryVtlType;
use loader_defs::shim::PersistedStateHeader;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;
use zerocopy::FromBytes;

mod bump_alloc;
mod dma_hint;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}

/// Allocate VTL2 ram from the partition's memory map.
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
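    // Only nodes that actually contain memory get a nonzero requirement. Note
    // that vnode values are used directly as indices here, so they are assumed
    // to be contiguous starting at 0.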
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and exclude it from the
    // free ranges.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the preaccepted range
                    // is exclusively covered by the preaccepted VTL2 range or it
                    // is not.
                    assert!(
                        !file_memory_range.overlaps(&range),
                        "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range"
                    );
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the already-allocated vtl2 ram from each node's remaining
    // requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the memory required,
            // we just clamp that node's requirement to zero. Should we instead
            // subtract the overallocation from other numa nodes equally?
            log::warn!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Allocate starting from the end of the free list, i.e. from high
        // memory downward.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // use the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log::info!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log::info!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

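/// Initialize the temporary heap used for mesh decoding of persisted state.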
fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}

type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;

/// Add common ranges to [`AddressSpaceManagerBuilder`], regardless of whether
/// the topology is being created from the host or from saved state.
fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
    params: &ShimParams,
    mut builder: AddressSpaceManagerBuilder<'a, I>,
) -> AddressSpaceManagerBuilder<'a, I> {
    // Add the log buffer which is always present.
    builder = builder.with_log_buffer(params.log_buffer);

    if params.vtl2_reserved_region_size != 0 {
        builder = builder.with_reserved_range(MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        ));
    }

    if params.sidecar_size != 0 {
        builder = builder.with_sidecar_image(MemoryRange::new(
            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
        ));
    }

    builder
}

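/// Partition topology (VTL2 ram, VTL0/VTL2 mmio, and the memory allocation
/// mode) computed either from the host device tree or from persisted state.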
#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    vtl2_ram: &'static [MemoryEntry],
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    memory_allocation_mode: MemoryAllocationMode,
}

/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    topology: PartitionTopology,
    cpus_with_mapped_interrupts_no_io: Vec<u32>,
    cpus_with_outstanding_io: Vec<u32>,
}

/// Calculate the default mmio size for VTL2 when not specified by the host.
///
/// This is half of the high mmio gap size, rounded down, with a minimum of 128
/// MB and a maximum of 1 GB.
fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
}

/// Read topology from the host provided device tree.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log::info!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should fit in 64 ranges");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should fit in 64 ranges");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, carve out a range from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate.
        let mmio_size = max(
            match parsed.memory_allocation_mode {
                MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                _ => 0,
            },
            calculate_default_mmio_size(parsed)?,
        );

        log::info!("allocating vtl2 mmio size {mmio_size:#x} bytes");

        // Decide what mmio vtl2 should use.
        let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
        let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

        // Update vtl0 mmio to exclude vtl2 mmio.
        let vtl0_mmio = subtract_ranges(mmio.iter().cloned(), [selected_vtl2_mmio])
            .collect::<ArrayVec<MemoryRange, 2>>();
        let vtl2_mmio = [selected_vtl2_mmio]
            .into_iter()
            .collect::<ArrayVec<MemoryRange, 2>>();

        // TODO: For now, if we have only a single vtl0_mmio range left,
        // panic. In the future decide if we want to report this as a start
        // failure in usermode, change allocation strategy, or something
        // else.
        assert_eq!(
            vtl0_mmio.len(),
            2,
            "vtl0 mmio ranges are not 2 {:#x?}",
            vtl0_mmio
        );

        (vtl0_mmio, vtl2_mmio)
    } else {
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with ranges fixed at build time.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be plenty for the worst
    // case encoded size (about 50 bytes worst case per memory entry, with the
    // max number of ram ranges), and is small enough that we can reserve it on
    // all VM sizes. Revisit this calculation if we persist more state in the
    // future.
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log::info!(
        "persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}"
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    if params.isolation_type == IsolationType::None {
        let enable_vtl2_gpa_pool = options.enable_vtl2_gpa_pool;
        let device_dma_page_count = parsed.device_dma_page_count;
        let vp_count = parsed.cpu_count();
        let mem_size = vtl2_ram.iter().map(|e| e.range.len()).sum();
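        // Ask the dma hint logic whether a private pool should be reserved,
        // and how many pages it needs.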
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            enable_vtl2_gpa_pool,
            device_dma_page_count,
            vp_count,
            mem_size,
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            // NOTE: For now, allocate the whole private pool on NUMA node 0 to
            // match previous behavior. Allocate from high memory downward so
            // that, if openhcl's usage grows, the bootshim's used range in low
            // memory cannot overlap the chosen pool range when servicing to a
            // new image.
            let vnode = 0;
            match address_space.allocate(
                Some(vnode),
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::HighMemory,
            ) {
                Some(pool) => {
                    log::info!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    // Build a compact string representation of the free ranges
                    // for diagnostics. Keep the string relatively small, as the
                    // enlightened panic message can only contain 1 page (4096)
                    // bytes of output.
                    let mut free_ranges = off_stack!(ArrayString<2048>, ArrayString::new_const());
                    for range in address_space.free_ranges(vnode) {
                        if write!(free_ranges, "[{:#x?}, {:#x?}) ", range.start(), range.end())
                            .is_err()
                        {
                            let _ = write!(free_ranges, "...");
                            break;
                        }
                    }
                    let highest_numa_node = vtl2_ram.iter().map(|e| e.vnode).max().unwrap_or(0);
                    panic!(
                        "failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes (enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, device_dma_page_count={device_dma_page_count:#x?}, vp_count={vp_count}, mem_size={mem_size:#x}), highest_numa_node={highest_numa_node}, free_ranges=[ {}]",
                        free_ranges.as_str()
                    );
                }
            };
        }
    }

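    // Leak the off-stack ram list so the returned topology can hold a 'static
    // slice.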
    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read topology from the persisted state region and protobuf payload.
fn topology_from_persisted_state(
    header: PersistedStateHeader,
    params: &ShimParams,
    parsed: &ParsedDt,
    address_space: &mut AddressSpaceManager,
) -> Result<PersistedPartitionTopology, DtError> {
    log::info!("reading topology from persisted state");

    // Verify the header describes a protobuf region within the bootshim
    // persisted region. We expect it to live there as today we rely on the
    // build time generated pagetable to identity map the protobuf region.
    let protobuf_region =
        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
    assert!(
        params.persisted_state.contains(&protobuf_region),
        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
        params.persisted_state
    );

    // Verify the protobuf payload length fits within the region.
    assert!(
        header.protobuf_payload_len <= header.protobuf_region_len,
        "protobuf payload len {} is larger than region len {}",
        header.protobuf_payload_len,
        header.protobuf_region_len
    );

    // SAFETY: The region lies within the persisted state region, which is
    // identity mapped via the build time generated pagetable.
    let protobuf_raw = unsafe {
        core::slice::from_raw_parts(
            header.protobuf_base as *const u8,
            header.protobuf_payload_len as usize,
        )
    };

    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
        bump_alloc::with_global_alloc(|| {
            log::info!("decoding protobuf of size {}", protobuf_raw.len());
            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
        });

    let loader_defs::shim::save_restore::SavedState {
        partition_memory,
        partition_mmio,
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    } = parsed_protobuf;

    // FUTURE: should the memory allocation mode persist in saved state so we
    // can verify the host did not change it?
    let memory_allocation_mode = parsed.memory_allocation_mode;

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // Determine which ranges are memory ranges used by VTL2.
    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
            Some(MemoryEntry {
                range: entry.range,
                mem_type: entry.igvm_type.clone().into(),
                vnode: entry.vnode,
            })
        } else {
            None
        }
    });

    // Merge adjacent ranges, as the saved state reports the final usage of ram,
    // which splits out reserved regions into separate ranges. Here we want the
    // whole underlying ram ranges, merging adjacent entries that share the same
    // igvm type.
    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
    );

    vtl2_ram.extend(
        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
            range,
            mem_type,
            vnode,
        }),
    );

    // If the host was responsible for allocating VTL2 ram, verify the ram
    // parsed from the previous instance matches.
    //
    // FUTURE: When VTL2 itself did allocation, we should verify that all ranges
    // are still within the provided memory map.
    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
        assert_eq!(
            vtl2_ram.as_slice(),
            host_vtl2_ram.as_ref(),
            "vtl2 ram from persisted state does not match host provided ram"
        );
    }

    // Merge the persisted state header and protobuf region, and report that as
    // the persisted region.
    //
    // NOTE: We could choose to resize the persisted region at this point, which
    // we would need to do if we expect the saved state to grow larger.
    let persisted_header = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
        .expect("persisted state header missing");
    let persisted_protobuf = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
        .expect("persisted state protobuf region missing");
    assert_eq!(persisted_header.range.end(), protobuf_region.start());
    let persisted_state_region =
        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log::info!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    // NOTE: The only other region we take from the previous instance is any
    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
    // command line arguments or host device tree changed, as that's not
    // something we expect to happen in practice.
    let mut pool_ranges = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
            Some(entry.range)
        } else {
            None
        }
    });
    let pool_range = pool_ranges.next();
    assert!(
        pool_ranges.next().is_none(),
        "previous instance had multiple pool ranges"
    );

    if let Some(pool_range) = pool_range {
        address_space_builder = address_space_builder.with_pool_range(pool_range);
    }

    // As described above, other ranges come from this current boot.
    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Read previous mmio for VTL0 and VTL2.
    let vtl0_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();
    let vtl2_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();

    Ok(PersistedPartitionTopology {
        topology: PartitionTopology {
            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
            vtl0_mmio,
            vtl2_mmio,
            memory_allocation_mode,
        },
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    })
}

/// Read the persisted header from the start of the persisted state region
/// described at file build time. If the magic value is not set, `None` is
/// returned.
fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
    // to rethink how this will work in order to handle this correctly, as on a
    // first boot we'd need to accept them early, but subsequent boots should
    // not accept any pages.
    //
    // This may require some value passed in via a register or something early
    // that indicates this is a servicing boot, which we could set if OpenHCL
    // itself launches the next instance.
    if params.isolation_type != IsolationType::None {
        return None;
    }

    // SAFETY: The header lies at the start of the shim described persisted state
    // region. This range is guaranteed to be identity mapped at file build
    // time.
    let buf = unsafe {
        core::slice::from_raw_parts(
            params.persisted_state.start() as *const u8,
            size_of::<PersistedStateHeader>(),
        )
    };

    let header = PersistedStateHeader::read_from_bytes(buf)
        .expect("region is page aligned and the correct size");

    if header.magic == PersistedStateHeader::MAGIC {
        Some(header)
    } else {
        None
    }
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

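        // A flattened device tree always starts with a nonzero magic value, so
        // a zero first byte means the host never wrote a device tree into the
        // parameter region.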
        if dt[0] == 0 {
            log::error!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        match command_line.policy {
            CommandLinePolicy::STATIC => {
                // Nothing to do, we already wrote the measured command line.
            }
            CommandLinePolicy::APPEND_CHOSEN if can_trust_host => {
                // Check the host-provided command line for options for ourselves,
                // and pass it along to the kernel.
                options.parse(&parsed.command_line);
                write!(storage.cmdline, " {}", &parsed.command_line)
                    .map_err(|_| DtError::CommandLineSize)?;
            }
            CommandLinePolicy::APPEND_CHOSEN if !can_trust_host => {
                // Nothing to do, we ignore the host provided command line.
            }
            _ => unreachable!(),
        }

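        // Set up the temporary heap before attempting to decode any persisted
        // state.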
        init_heap(params);

        let persisted_state_header = read_persisted_region_header(params);
        let (topology, has_devices_that_should_disable_sidecar) =
            if let Some(header) = persisted_state_header {
                log::info!("found persisted state header");
                let persisted_topology =
                    topology_from_persisted_state(header, params, parsed, address_space)?;

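                // Devices kept alive across servicing leave CPUs with mapped
                // interrupts or outstanding IO; if any exist, sidecar may be
                // disabled below.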
                (
                    persisted_topology.topology,
                    !(persisted_topology
                        .cpus_with_mapped_interrupts_no_io
                        .is_empty()
                        && persisted_topology.cpus_with_outstanding_io.is_empty()),
                )
            } else {
                (
                    topology_from_host_dt(params, parsed, &options, address_space)?,
                    false,
                )
            };

        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
            &boot_options.sidecar,
            has_devices_that_should_disable_sidecar,
        ) {
            if cpu_threshold.is_none()
                || cpu_threshold
                    .and_then(|threshold| threshold.try_into().ok())
                    .is_some_and(|threshold| parsed.cpu_count() < threshold)
            {
                // If we are in the restore path, disable sidecar for small VMs,
                // as the amortization benefits don't apply when devices are
                // kept alive; the CPUs need to be powered on anyway to check
                // for interrupts.
                log::info!("disabling sidecar, as we are restoring from persisted state");
                boot_options.sidecar = SidecarOptions::DisabledServicing;
                options.sidecar = SidecarOptions::DisabledServicing;
            }
        }

        // Set ram and memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}