openhcl_boot/host_params/dt/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::cmdline::BootCommandLineOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_RAM_RANGES;
use crate::host_params::dt::dma_hint::pick_private_pool_size;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayVec;
use bump_alloc::ALLOCATOR;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use host_fdt_parser::VmbusInfo;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use loader_defs::shim::MemoryVtlType;
use loader_defs::shim::PersistedStateHeader;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;
use zerocopy::FromBytes;

mod bump_alloc;
mod dma_hint;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}

/// Allocate VTL2 ram from the partition's memory map.
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

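    // Determine the VTL2 ram size: use the host provided size when present
    // (it must be at least the measured size of the IGVM file), otherwise fall
    // back to the measured size alone.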
    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". It may have been split across different numa
    // nodes. Walk the file range, add the overlapping portions to vtl2 ram,
    // and exclude them from the ranges that are free for further allocation.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no partial overlap - an imported range is
                    // either fully contained within the preaccepted VTL2 range
                    // or entirely outside of it.
                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx && range.start_4k_gpn() == 0xFFFFF && range.len() == 0x1000 {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the ranges already placed in vtl2_ram from the per-node requirements.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the memory required,
            // we just clamp that node's requirement to zero. Should we instead
            // subtract the overallocation from other numa nodes equally?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate the remaining memory required for each node.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocating from the end of the free list so that high memory
        // is consumed first.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement has been satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // use the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

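        // Never use less than the measured size encoded in the IGVM file.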
        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}

type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;

/// Add common ranges to the [`AddressSpaceManagerBuilder`], regardless of
/// whether the topology is created from the host or from saved state.
fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
    params: &ShimParams,
    mut builder: AddressSpaceManagerBuilder<'a, I>,
) -> AddressSpaceManagerBuilder<'a, I> {
    // Add the log buffer which is always present.
    builder = builder.with_log_buffer(params.log_buffer);

    if params.vtl2_reserved_region_size != 0 {
        builder = builder.with_reserved_range(MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        ));
    }

    if params.sidecar_size != 0 {
        builder = builder.with_sidecar_image(MemoryRange::new(
            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
        ));
    }

    // Only specify pagetables as a reserved region on TDX, as they are used
    // for AP startup via the mailbox protocol. On other platforms, the
    // memory is free to be reclaimed.
    if params.isolation_type == IsolationType::Tdx {
        assert!(params.page_tables.is_some());
        builder = builder.with_page_tables(params.page_tables.expect("always present on tdx"));
    }

    builder
}

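/// Memory and mmio topology for the partition, computed either from the host
/// provided device tree or from the persisted state of a previous instance.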
#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    vtl2_ram: &'static [MemoryEntry],
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    memory_allocation_mode: MemoryAllocationMode,
}

// Calculate the default mmio size for VTL2 when not specified by the host.
//
// This is half of the high mmio gap size, rounded down, with a minimum of 128
// MB and a maximum of 1 GB.
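//
// For example, a 4 GiB high gap yields 1 GiB (clamped down from 2 GiB), a
// 512 MiB gap yields 256 MiB, and a 128 MiB gap yields 128 MiB (clamped up
// from 64 MiB).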
fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
}

/// Read topology from the host provided device tree.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should only be 64 big");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should only be 64 big");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, carve out a range from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate.
        let mmio_size = max(
            match parsed.memory_allocation_mode {
                MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                _ => 0,
            },
            calculate_default_mmio_size(parsed)?,
        );

        log!("allocating vtl2 mmio size {mmio_size:#x} bytes");

        // Decide what mmio vtl2 should use.
        let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
        let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

        // Update vtl0 mmio to exclude vtl2 mmio.
        let vtl0_mmio = subtract_ranges(mmio.iter().cloned(), [selected_vtl2_mmio])
            .collect::<ArrayVec<MemoryRange, 2>>();
        let vtl2_mmio = [selected_vtl2_mmio]
            .into_iter()
            .collect::<ArrayVec<MemoryRange, 2>>();

        // TODO: For now, if we have only a single vtl0_mmio range left,
        // panic. In the future decide if we want to report this as a start
        // failure in usermode, change allocation strategy, or something
        // else.
        assert_eq!(
            vtl0_mmio.len(),
            2,
            "vtl0 mmio ranges are not 2 {:#x?}",
            vtl0_mmio
        );

        (vtl0_mmio, vtl2_mmio)
    } else {
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with ranges fixed at build time.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be more than enough for
    // the worst case encoded size (about 50 bytes per memory entry, with the
    // max number of ram ranges), and is small enough that we can reserve it
    // regardless of the VTL2 ram size. Revisit this calculation if we persist
    // more state in the future.
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log!("persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}");

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    if params.isolation_type == IsolationType::None {
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            options.enable_vtl2_gpa_pool,
            parsed.device_dma_page_count,
            parsed.cpu_count(),
            vtl2_ram.iter().map(|e| e.range.len()).sum(),
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            match address_space.allocate(
                None,
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::LowMemory,
            ) {
                Some(pool) => {
                    log!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    panic!("failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes");
                }
            };
        }
    }

    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read topology from the persisted state region and protobuf payload.
fn topology_from_persisted_state(
    header: PersistedStateHeader,
    params: &ShimParams,
    parsed: &ParsedDt,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log!("reading topology from persisted state");

    // Verify the header describes a protobuf region within the bootshim
    // persisted region. We expect it to live there as today we rely on the
    // build time generated pagetable to identity map the protobuf region.
    let protobuf_region =
        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
    assert!(
        params.persisted_state.contains(&protobuf_region),
        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
        params.persisted_state
    );

    // Verify the protobuf payload length does not exceed the region length.
    assert!(
        header.protobuf_payload_len <= header.protobuf_region_len,
        "protobuf payload len {} is larger than region len {}",
        header.protobuf_payload_len,
        header.protobuf_region_len
    );

    // SAFETY: The region lies within the persisted state region, which is
    // identity mapped via the build time generated pagetable.
    let protobuf_raw = unsafe {
        core::slice::from_raw_parts(
            header.protobuf_base as *const u8,
            header.protobuf_payload_len as usize,
        )
    };

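    // Mesh decode requires a global allocator, so run it under the temporary
    // bump allocator that was initialized in init_heap.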
    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
        bump_alloc::with_global_alloc(|| {
            log!("decoding protobuf of size {}", protobuf_raw.len());
            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
        });

    let loader_defs::shim::save_restore::SavedState {
        partition_memory,
        partition_mmio,
    } = parsed_protobuf;

    // FUTURE: should the memory allocation mode be persisted in saved state so
    // we can verify the host did not change it?
    let memory_allocation_mode = parsed.memory_allocation_mode;

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // Determine which of the persisted ranges were ram used by VTL2.
    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
            Some(MemoryEntry {
                range: entry.range,
                mem_type: entry.igvm_type.clone().into(),
                vnode: entry.vnode,
            })
        } else {
            None
        }
    });

    // Merge adjacent ranges, as the saved state reports the final usage of ram,
    // which splits reserved regions into separate ranges. Here we want the
    // whole underlying ram ranges, merging adjacent entries that share the same
    // igvm type and vnode.
    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
    );

    vtl2_ram.extend(
        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
            range,
            mem_type,
            vnode,
        }),
    );

    // If the host was responsible for allocating VTL2 ram, verify the ram
    // parsed from the previous instance matches.
    //
    // FUTURE: When VTL2 itself did allocation, we should verify that all ranges
    // are still within the provided memory map.
    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
        assert_eq!(
            vtl2_ram.as_slice(),
            host_vtl2_ram.as_ref(),
            "vtl2 ram from persisted state does not match host provided ram"
        );
    }

    // Merge the persisted state header and protobuf region, and report that as
    // the persisted region.
    //
    // NOTE: We could choose to resize the persisted region at this point, which
    // we would need to do if we expect the saved state to grow larger.
    let persisted_header = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
        .expect("persisted state header missing");
    let persisted_protobuf = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
        .expect("persisted state protobuf region missing");
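    // The header page must be immediately followed by the protobuf region so
    // that the two can be reported as a single contiguous persisted range.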
    assert_eq!(persisted_header.range.end(), protobuf_region.start());
    let persisted_state_region =
        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    // NOTE: The only other region we take from the previous instance is any
    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
    // command line arguments or host device tree changed, as that's not
    // something we expect to happen in practice.
    let mut pool_ranges = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
            Some(entry.range)
        } else {
            None
        }
    });
    let pool_range = pool_ranges.next();
    assert!(
        pool_ranges.next().is_none(),
        "previous instance had multiple pool ranges"
    );

    if let Some(pool_range) = pool_range {
        address_space_builder = address_space_builder.with_pool_range(pool_range);
    }

    // As described above, other ranges come from this current boot.
    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Read previous mmio for VTL0 and VTL2.
    let vtl0_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();
    let vtl2_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();

    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read the persisted header from the start of the persisted state region
/// described at file build time. If the magic value is not set, `None` is
/// returned.
fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
    // to rethink how this will work in order to handle this correctly, as on a
    // first boot we'd need to accept them early, but subsequent boots should
    // not accept any pages.
    //
    // This may require some value passed in via a register or something early
    // that indicates this is a servicing boot, which we could set if OpenHCL
    // itself launches the next instance.
    if params.isolation_type != IsolationType::None {
        return None;
    }

    // SAFETY: The header lies at the start of the shim described persisted state
    // region. This range is guaranteed to be identity mapped at file build
    // time.
    let buf = unsafe {
        core::slice::from_raw_parts(
            params.persisted_state.start() as *const u8,
            size_of::<PersistedStateHeader>(),
        )
    };

    let header = PersistedStateHeader::read_from_bytes(buf)
        .expect("region is page aligned and the correct size");

    if header.magic == PersistedStateHeader::MAGIC {
        Some(header)
    } else {
        None
    }
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

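        // A flattened device tree starts with a nonzero magic value
        // (0xd00dfeed), so a leading zero byte means the host did not supply
        // one.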
        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            // Parse in extra options from the host provided command line.
            options.parse(&parsed.command_line);
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        init_heap(params);

        let persisted_state_header = read_persisted_region_header(params);
        let topology = if let Some(header) = persisted_state_header {
            log!("found persisted state header");
            topology_from_persisted_state(header, params, parsed, address_space)?
        } else {
            topology_from_host_dt(params, parsed, &options, address_space)?
        };

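        // Destructure storage so every field is explicitly either populated
        // below or deliberately left alone (cmdline was already written
        // above).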
        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        // Set ram and the memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}