openhcl_boot/host_params/dt/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

extern crate alloc;

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::cmdline::BootCommandLineOptions;
use crate::cmdline::SidecarOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_RAM_RANGES;
use crate::host_params::dt::dma_hint::pick_private_pool_size;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use alloc::vec::Vec;
use arrayvec::ArrayVec;
use bump_alloc::ALLOCATOR;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use host_fdt_parser::VmbusInfo;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use loader_defs::shim::MemoryVtlType;
use loader_defs::shim::PersistedStateHeader;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;
use zerocopy::FromBytes;

mod bump_alloc;
mod dma_hint;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}

/// Allocate VTL2 ram from the partition's memory map.
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
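    // (binary_search keeps the ArrayVec sorted and deduplicated as we insert,
    // e.g. vnodes seen in order [1, 0, 2, 1] collect to [0, 1, 2].)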
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and exclude it from the
    // free ranges.
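    // (walk_ranges visits each subrange once, reporting whether it appears
    // only in the left list (the file range), only in the right list (the
    // host memory map), or in both.)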
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the imported range
                    // is fully contained within the preaccepted VTL2 range or
                    // it does not overlap it at all.
                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the ram already placed in vtl2_ram from each node's remaining
    // requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the mem required, we
            // just clamp that node's requirement to zero. Should we instead
            // subtract the overage from other numa nodes equally?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocation from the top of the free list, which is high memory
        // in reverse order.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
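                // (e.g. a free range 0x0..0x1_0000 with 0x4000 still required
                // splits at offset 0xC000 into remaining 0x0..0xC000 and an
                // allocation of 0xC000..0x1_0000.)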
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // use the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}

type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;

/// Add common ranges to the [`AddressSpaceManagerBuilder`], regardless of
/// whether the topology is created from the host or from saved state.
fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
    params: &ShimParams,
    mut builder: AddressSpaceManagerBuilder<'a, I>,
) -> AddressSpaceManagerBuilder<'a, I> {
    // Add the log buffer which is always present.
    builder = builder.with_log_buffer(params.log_buffer);

    if params.vtl2_reserved_region_size != 0 {
        builder = builder.with_reserved_range(MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        ));
    }

    if params.sidecar_size != 0 {
        builder = builder.with_sidecar_image(MemoryRange::new(
            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
        ));
    }

    builder
}

#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    vtl2_ram: &'static [MemoryEntry],
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    memory_allocation_mode: MemoryAllocationMode,
}

/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    topology: PartitionTopology,
    cpus_with_mapped_interrupts_no_io: Vec<u32>,
    cpus_with_outstanding_io: Vec<u32>,
}

/// Calculate the default mmio size for VTL2 when not specified by the host.
///
/// This is half of the high mmio gap size, rounded down, with a minimum of 128
/// MB and a maximum of 1 GB.
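///
/// For example, a 6 GiB high gap halves to 3 GiB, which clamps down to 1 GiB,
/// while a 200 MiB high gap halves to 100 MiB, which clamps up to 128 MiB.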
fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
}

/// Read topology from the host provided device tree.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES entries");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES entries");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, carve out a range from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate.
        let mmio_size = max(
            match parsed.memory_allocation_mode {
                MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                _ => 0,
            },
            calculate_default_mmio_size(parsed)?,
        );

        log!("allocating vtl2 mmio size {mmio_size:#x} bytes");

        // Decide what mmio vtl2 should use.
        let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
        let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

        // Update vtl0 mmio to exclude vtl2 mmio.
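        // (e.g. if the vtl0 mmio consists of a low gap and a high gap, carving
        // vtl2's range out of one end of the high gap leaves the low gap plus
        // the remainder of the high gap: the two ranges asserted below.)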
        let vtl0_mmio = subtract_ranges(mmio.iter().cloned(), [selected_vtl2_mmio])
            .collect::<ArrayVec<MemoryRange, 2>>();
        let vtl2_mmio = [selected_vtl2_mmio]
            .into_iter()
            .collect::<ArrayVec<MemoryRange, 2>>();

        // TODO: For now, if we do not have exactly two vtl0_mmio ranges left,
        // panic. In the future decide if we want to report this as a start
        // failure in usermode, change allocation strategy, or something
        // else.
        assert_eq!(
            vtl0_mmio.len(),
            2,
            "vtl0 mmio ranges are not 2 {:#x?}",
            vtl0_mmio
        );

        (vtl0_mmio, vtl2_mmio)
    } else {
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with ranges fixed at build time.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be plenty for the worst
    // case encoded size (about 50 bytes worst case per memory entry, with the
    // max number of ram ranges), and is small enough that we can reserve it
    // on all VM sizes. Revisit this calculation if we persist more state in
    // the future.
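    // (20 pages * 4096 bytes per page = 81920 bytes of protobuf space.)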
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log!("persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}");

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    if params.isolation_type == IsolationType::None {
        let enable_vtl2_gpa_pool = options.enable_vtl2_gpa_pool;
        let device_dma_page_count = parsed.device_dma_page_count;
        let vp_count = parsed.cpu_count();
        let mem_size = vtl2_ram.iter().map(|e| e.range.len()).sum();
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            enable_vtl2_gpa_pool,
            device_dma_page_count,
            vp_count,
            mem_size,
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            // NOTE: For now, allocate all the private pool on NUMA node 0 to
            // match previous behavior. Allocate from high memory downward so
            // that, if openhcl's low memory usage grows, the bootshim's used
            // range cannot overlap the chosen pool range when servicing to a
            // new image.
            match address_space.allocate(
                Some(0),
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::HighMemory,
            ) {
                Some(pool) => {
                    log!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    panic!(
                        "failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes (enable_vtl2_gpa_pool={enable_vtl2_gpa_pool:?}, device_dma_page_count={device_dma_page_count:#x?}, vp_count={vp_count}, mem_size={mem_size:#x})"
                    );
                }
            };
        }
    }

    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read topology from the persisted state region and protobuf payload.
fn topology_from_persisted_state(
    header: PersistedStateHeader,
    params: &ShimParams,
    parsed: &ParsedDt,
    address_space: &mut AddressSpaceManager,
) -> Result<PersistedPartitionTopology, DtError> {
    log!("reading topology from persisted state");

    // Verify the header describes a protobuf region within the bootshim
    // persisted region. We expect it to live there because today we rely on
    // the build time generated pagetable to identity map the protobuf region.
    let protobuf_region =
        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
    assert!(
        params.persisted_state.contains(&protobuf_region),
        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
        params.persisted_state
    );

    // Verify the protobuf payload len is no larger than the region len.
    assert!(
        header.protobuf_payload_len <= header.protobuf_region_len,
        "protobuf payload len {} is larger than region len {}",
        header.protobuf_payload_len,
        header.protobuf_region_len
    );

    // SAFETY: The region lies within the persisted state region, which is
    // identity mapped via the build time generated pagetable.
    let protobuf_raw = unsafe {
        core::slice::from_raw_parts(
            header.protobuf_base as *const u8,
            header.protobuf_payload_len as usize,
        )
    };

    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
        bump_alloc::with_global_alloc(|| {
            log!("decoding protobuf of size {}", protobuf_raw.len());
            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
        });

    let loader_defs::shim::save_restore::SavedState {
        partition_memory,
        partition_mmio,
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    } = parsed_protobuf;

    // FUTURE: should memory allocation mode be persisted in saved state so we
    // can verify the host did not change it?
    let memory_allocation_mode = parsed.memory_allocation_mode;

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // Determine which ranges are memory ranges used by VTL2.
    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
            Some(MemoryEntry {
                range: entry.range,
                mem_type: entry.igvm_type.clone().into(),
                vnode: entry.vnode,
            })
        } else {
            None
        }
    });

    // Merge adjacent ranges, as the saved state reports the final usage of
    // ram, which splits reserved regions into separate ranges. Here we want
    // the whole underlying ram ranges, merging adjacent entries that share
    // the same igvm type and vnode.
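    // (e.g. adjacent entries 0x0..0x1000 and 0x1000..0x3000 with the same
    // igvm type and vnode merge into a single 0x0..0x3000 entry.)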
    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
    );

    vtl2_ram.extend(
        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
            range,
            mem_type,
            vnode,
        }),
    );

    // If the host was responsible for allocating VTL2 ram, verify the ram
    // parsed from the previous instance matches.
    //
    // FUTURE: When VTL2 itself did the allocation, we should verify that all
    // ranges are still within the provided memory map.
    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
        assert_eq!(
            vtl2_ram.as_slice(),
            host_vtl2_ram.as_ref(),
            "vtl2 ram from persisted state does not match host provided ram"
        );
    }

    // Merge the persisted state header and protobuf region, and report that as
    // the persisted region.
    //
    // NOTE: We could choose to resize the persisted region at this point, which
    // we would need to do if we expect the saved state to grow larger.
    let persisted_header = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
        .expect("persisted state header missing");
    let persisted_protobuf = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
        .expect("persisted state protobuf region missing");
    assert_eq!(persisted_header.range.end(), protobuf_region.start());
    let persisted_state_region =
        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    // NOTE: The only other region we take from the previous instance is any
    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
    // command line arguments or host device tree changed, as that's not
    // something we expect to happen in practice.
    let mut pool_ranges = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
            Some(entry.range)
        } else {
            None
        }
    });
    let pool_range = pool_ranges.next();
    assert!(
        pool_ranges.next().is_none(),
        "previous instance had multiple pool ranges"
    );

    if let Some(pool_range) = pool_range {
        address_space_builder = address_space_builder.with_pool_range(pool_range);
    }

    // As described above, other ranges come from this current boot.
    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Read previous mmio for VTL0 and VTL2.
    let vtl0_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();
    let vtl2_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();

    Ok(PersistedPartitionTopology {
        topology: PartitionTopology {
            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
            vtl0_mmio,
            vtl2_mmio,
            memory_allocation_mode,
        },
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    })
}

/// Read the persisted header from the start of the persisted state region
/// described at file build time. If the magic value is not set, `None` is
/// returned.
fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
    // to rethink how this will work in order to handle this correctly, as on a
    // first boot we'd need to accept them early, but subsequent boots should
    // not accept any pages.
    //
    // This may require some value passed in via a register or something early
    // that indicates this is a servicing boot, which we could set if OpenHCL
    // itself launches the next instance.
    if params.isolation_type != IsolationType::None {
        return None;
    }

    // SAFETY: The header lies at the start of the shim described persisted
    // state region. This range is guaranteed to be identity mapped at file
    // build time.
    let buf = unsafe {
        core::slice::from_raw_parts(
            params.persisted_state.start() as *const u8,
            size_of::<PersistedStateHeader>(),
        )
    };

    let header = PersistedStateHeader::read_from_bytes(buf)
        .expect("region is page aligned and the correct size");

    if header.magic == PersistedStateHeader::MAGIC {
        Some(header)
    } else {
        None
    }
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

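        // (A flattened device tree starts with the big-endian magic
        // 0xd00dfeed, so a zero first byte means the host never wrote one.)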
        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        match command_line.policy {
            CommandLinePolicy::STATIC => {
                // Nothing to do, we already wrote the measured command line.
            }
            CommandLinePolicy::APPEND_CHOSEN if can_trust_host => {
                // Check the host-provided command line for options for
                // ourselves, and pass it along to the kernel.
                options.parse(&parsed.command_line);
                write!(storage.cmdline, " {}", &parsed.command_line)
                    .map_err(|_| DtError::CommandLineSize)?;
            }
            CommandLinePolicy::APPEND_CHOSEN if !can_trust_host => {
                // Nothing to do, we ignore the host provided command line.
            }
            _ => unreachable!(),
        }

        init_heap(params);

        let persisted_state_header = read_persisted_region_header(params);
        let (topology, has_devices_that_should_disable_sidecar) =
            if let Some(header) = persisted_state_header {
                log!("found persisted state header");
                let persisted_topology =
                    topology_from_persisted_state(header, params, parsed, address_space)?;

                (
                    persisted_topology.topology,
                    !(persisted_topology
                        .cpus_with_mapped_interrupts_no_io
                        .is_empty()
                        && persisted_topology.cpus_with_outstanding_io.is_empty()),
                )
            } else {
                (
                    topology_from_host_dt(params, parsed, &options, address_space)?,
                    false,
                )
            };

        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
            &boot_options.sidecar,
            has_devices_that_should_disable_sidecar,
        ) {
            if cpu_threshold.is_none()
                || cpu_threshold
                    .and_then(|threshold| threshold.try_into().ok())
                    .is_some_and(|threshold| parsed.cpu_count() < threshold)
            {
                // If we are in the restore path, disable sidecar for small VMs,
                // as the amortization benefits don't apply when devices are
                // kept alive; the CPUs need to be powered on anyway to check
                // for interrupts.
                log!("disabling sidecar, as we are restoring from persisted state");
                boot_options.sidecar = SidecarOptions::DisabledServicing;
                options.sidecar = SidecarOptions::DisabledServicing;
            }
        }

        // Set ram and memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}
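
// A minimal host-side test sketch for the clamp math used by
// calculate_default_mmio_size. The helper below duplicates the clamp so it
// can be exercised without constructing a full ParsedDt; it is illustrative
// and not part of the original module.
#[cfg(test)]
mod tests {
    // Hypothetical helper mirroring the clamp in calculate_default_mmio_size.
    fn clamp_default_mmio(half_high_gap: u64) -> u64 {
        const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
        const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
        half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE)
    }

    #[test]
    fn default_mmio_clamps_to_bounds() {
        // Below the minimum: clamps up to 128 MiB.
        assert_eq!(clamp_default_mmio(64 << 20), 128 << 20);
        // In range: passes through unchanged.
        assert_eq!(clamp_default_mmio(512 << 20), 512 << 20);
        // Above the maximum: clamps down to 1 GiB.
        assert_eq!(clamp_default_mmio(4 << 30), 1 << 30);
    }
}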