openhcl_boot/host_params/dt/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

extern crate alloc;

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::cmdline::BootCommandLineOptions;
use crate::cmdline::SidecarOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_RAM_RANGES;
use crate::host_params::dt::dma_hint::pick_private_pool_size;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use alloc::vec::Vec;
use arrayvec::ArrayVec;
use bump_alloc::ALLOCATOR;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use host_fdt_parser::VmbusInfo;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use loader_defs::shim::MemoryVtlType;
use loader_defs::shim::PersistedStateHeader;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;
use zerocopy::FromBytes;

mod bump_alloc;
mod dma_hint;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}

/// Allocate VTL2 ram from the partition's memory map.
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }
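    // For example (illustrative): if the memory map reports vnodes in the
    // order 1, 0, 1, the sorted unique set built here is [0, 1], so the node
    // count below is 2.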

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }
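    // For example (illustrative): with vtl2_size = 4 GiB spread across 2 numa
    // nodes, each node is seeded with a 2 GiB requirement.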

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and subtract it from the
    // free ranges.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

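    // For example (illustrative): if the IGVM file occupies [1 GiB, 1.5 GiB)
    // and the host memmap reports [0, 4 GiB) as ram, walk_ranges yields Both
    // for [1 GiB, 1.5 GiB) (added to vtl2_ram) and Right for [0, 1 GiB) and
    // [1.5 GiB, 4 GiB) (added to the free list).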
    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the preaccepted
                    // range is exclusively covered by the preaccepted VTL2
                    // range or it is not.
                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx && range.start_4k_gpn() == 0xFFFFF && range.len() == 0x1000 {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the ranges already in vtl2_ram from the per-node requirements.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the memory required,
            // we just clamp that node's remaining requirement to zero. Should
            // we instead subtract the overallocation from other numa nodes
            // equally?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }
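    // For example (illustrative): if vnode 0 requires 2 GiB and the file range
    // already contributed a 512 MiB entry on vnode 0, its remaining
    // requirement drops to 1.5 GiB.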

    // Allocate the remaining memory required per node.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocating from the top of the free list, i.e. from high
        // memory downward.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);
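                // For example (illustrative): for a free entry [2 GiB, 6 GiB)
                // with 1 GiB still required, offset is 3 GiB, so remaining is
                // [2 GiB, 5 GiB) and alloc is the top [5 GiB, 6 GiB).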

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is fully satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but use
        // the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        ALLOCATOR.init(params.heap);
    }
}

type ParsedDt =
    ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>;

/// Add common ranges to [`AddressSpaceManagerBuilder`] regardless of whether
/// the topology is created from the host or from saved state.
fn add_common_ranges<'a, I: Iterator<Item = MemoryRange>>(
    params: &ShimParams,
    mut builder: AddressSpaceManagerBuilder<'a, I>,
) -> AddressSpaceManagerBuilder<'a, I> {
    // Add the log buffer, which is always present.
    builder = builder.with_log_buffer(params.log_buffer);

    if params.vtl2_reserved_region_size != 0 {
        builder = builder.with_reserved_range(MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        ));
    }

    if params.sidecar_size != 0 {
        builder = builder.with_sidecar_image(MemoryRange::new(
            params.sidecar_base..(params.sidecar_base + params.sidecar_size),
        ));
    }

    builder
}

#[derive(Debug, PartialEq, Eq)]
struct PartitionTopology {
    vtl2_ram: &'static [MemoryEntry],
    vtl0_mmio: ArrayVec<MemoryRange, 2>,
    vtl2_mmio: ArrayVec<MemoryRange, 2>,
    memory_allocation_mode: MemoryAllocationMode,
}

/// State derived while constructing the partition topology
/// from persisted state.
#[derive(Debug, PartialEq, Eq)]
struct PersistedPartitionTopology {
    topology: PartitionTopology,
    cpus_with_mapped_interrupts_no_io: Vec<u32>,
    cpus_with_outstanding_io: Vec<u32>,
}

// Calculate the default mmio size for VTL2 when not specified by the host.
//
// This is half of the high mmio gap size, rounded down, with a minimum of 128
// MB and a maximum of 1 GB.
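// For example (illustrative): a 1 GB high gap yields 512 MB, a 100 MB gap
// clamps up to 128 MB, and a 16 GB gap clamps down to 1 GB.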
fn calculate_default_mmio_size(parsed: &ParsedDt) -> Result<u64, DtError> {
    const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
    const MAXIMUM_MMIO_SIZE: u64 = 1 << 30;
    let half_high_gap = parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio[1].len() / 2;
    Ok(half_high_gap.clamp(MINIMUM_MMIO_SIZE, MAXIMUM_MMIO_SIZE))
}

/// Read topology from the host provided device tree.
fn topology_from_host_dt(
    params: &ShimParams,
    parsed: &ParsedDt,
    options: &BootCommandLineOptions,
    address_space: &mut AddressSpaceManager,
) -> Result<PartitionTopology, DtError> {
    log!("reading topology from host device tree");

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // TODO: Decide if isolated guests always use VTL2 allocation mode.

    let memory_allocation_mode = parsed.memory_allocation_mode;
    match memory_allocation_mode {
        MemoryAllocationMode::Host => {
            vtl2_ram
                .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES entries");
        }
        MemoryAllocationMode::Vtl2 {
            memory_size,
            mmio_size: _,
        } => {
            vtl2_ram
                .try_extend_from_slice(
                    allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                )
                .expect("vtl2 ram should fit in MAX_VTL2_RAM_RANGES entries");
        }
    }

    // The host is responsible for allocating MMIO ranges for non-isolated
    // guests when it also provides the ram VTL2 should use.
    //
    // For isolated guests, or when VTL2 has been asked to carve out its own
    // memory, carve out a range from the VTL0 allotment.
    let (vtl0_mmio, vtl2_mmio) = if params.isolation_type != IsolationType::None
        || matches!(
            parsed.memory_allocation_mode,
            MemoryAllocationMode::Vtl2 { .. }
        ) {
        // Decide the amount of mmio VTL2 should allocate.
        let mmio_size = max(
            match parsed.memory_allocation_mode {
                MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                _ => 0,
            },
            calculate_default_mmio_size(parsed)?,
        );

        log!("allocating vtl2 mmio size {mmio_size:#x} bytes");

        // Decide what mmio vtl2 should use.
        let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
        let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

        // Update vtl0 mmio to exclude vtl2 mmio.
        let vtl0_mmio = subtract_ranges(mmio.iter().cloned(), [selected_vtl2_mmio])
            .collect::<ArrayVec<MemoryRange, 2>>();
        let vtl2_mmio = [selected_vtl2_mmio]
            .into_iter()
            .collect::<ArrayVec<MemoryRange, 2>>();
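        // For example (illustrative): if the host reports a low gap and a high
        // gap and select_vtl2_mmio_range carves the VTL2 range out of one end
        // of the high gap, subtract_ranges leaves the low gap untouched plus
        // the remainder of the high gap, keeping vtl0_mmio at two ranges.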

        // TODO: For now, if we have only a single vtl0_mmio range left,
        // panic. In the future decide if we want to report this as a start
        // failure in usermode, change allocation strategy, or something
        // else.
        assert_eq!(
            vtl0_mmio.len(),
            2,
            "vtl0 mmio ranges are not 2 {:#x?}",
            vtl0_mmio
        );

        (vtl0_mmio, vtl2_mmio)
    } else {
        (
            parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .mmio
                .clone(),
            parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .mmio
                .clone(),
        )
    };

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    // Initialize the address space manager with ranges fixed at build time.
    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    // NOTE: Size the region as 20 pages. This should be plenty for the worst
    // case encoded size (about 50 bytes worst case per memory entry, with the
    // max number of ram ranges), and is small enough that we can reserve it
    // for all VM sizes. Revisit this calculation if we persist more state in
    // the future.
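    // For example (illustrative): 20 pages is 80 KiB, which at ~50 bytes per
    // entry covers over 1600 memory entries.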
    const PERSISTED_REGION_SIZE: u64 = 20 * 4096;
    let (persisted_state_region, remainder) = params
        .persisted_state
        .split_at_offset(PERSISTED_REGION_SIZE);
    log!("persisted state region sized to {persisted_state_region:#x?}, remainder {remainder:#x?}");

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    if params.isolation_type == IsolationType::None {
        if let Some(vtl2_gpa_pool_size) = pick_private_pool_size(
            options.enable_vtl2_gpa_pool,
            parsed.device_dma_page_count,
            parsed.cpu_count(),
            vtl2_ram.iter().map(|e| e.range.len()).sum(),
        ) {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;
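            // For example (illustrative): a hint of 1024 pages yields a 4 MiB
            // pool (1024 * 4 KiB pages).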

            // NOTE: For now, allocate all of the private pool on NUMA node 0
            // to match previous behavior. Allocate from high memory downward
            // so that, as openhcl's usage grows, the bootshim's used range in
            // low memory cannot overlap the chosen pool range when servicing
            // to a new image.
            match address_space.allocate(
                Some(0),
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::HighMemory,
            ) {
                Some(pool) => {
                    log!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    panic!("failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes");
                }
            };
        }
    }

    Ok(PartitionTopology {
        vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
        vtl0_mmio,
        vtl2_mmio,
        memory_allocation_mode,
    })
}

/// Read topology from the persisted state region and protobuf payload.
fn topology_from_persisted_state(
    header: PersistedStateHeader,
    params: &ShimParams,
    parsed: &ParsedDt,
    address_space: &mut AddressSpaceManager,
) -> Result<PersistedPartitionTopology, DtError> {
    log!("reading topology from persisted state");

    // Verify the header describes a protobuf region within the bootshim
    // persisted region. We expect it to live there as today we rely on the
    // build time generated pagetable to identity map the protobuf region.
    let protobuf_region =
        MemoryRange::new(header.protobuf_base..(header.protobuf_base + header.protobuf_region_len));
    assert!(
        params.persisted_state.contains(&protobuf_region),
        "protobuf region {protobuf_region:#x?} is not contained within the persisted state region {:#x?}",
        params.persisted_state
    );

    // Verify the protobuf payload length fits within the region.
    assert!(
        header.protobuf_payload_len <= header.protobuf_region_len,
        "protobuf payload len {} is larger than region len {}",
        header.protobuf_payload_len,
        header.protobuf_region_len
    );

    // SAFETY: The region lies within the persisted state region, which is
    // identity mapped via the build time generated pagetable.
    let protobuf_raw = unsafe {
        core::slice::from_raw_parts(
            header.protobuf_base as *const u8,
            header.protobuf_payload_len as usize,
        )
    };

    let parsed_protobuf: loader_defs::shim::save_restore::SavedState =
        bump_alloc::with_global_alloc(|| {
            log!("decoding protobuf of size {}", protobuf_raw.len());
            mesh_protobuf::decode(protobuf_raw).expect("failed to decode protobuf")
        });

    let loader_defs::shim::save_restore::SavedState {
        partition_memory,
        partition_mmio,
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    } = parsed_protobuf;

    // FUTURE: should memory allocation mode be persisted in saved state so we
    // can verify the host did not change it?
    let memory_allocation_mode = parsed.memory_allocation_mode;

    let mut vtl2_ram =
        off_stack!(ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>, ArrayVec::new_const());

    // Determine which ranges were ram used by VTL2.
    let previous_vtl2_ram = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type.ram() && entry.vtl_type.vtl2() {
            Some(MemoryEntry {
                range: entry.range,
                mem_type: entry.igvm_type.clone().into(),
                vnode: entry.vnode,
            })
        } else {
            None
        }
    });

    // Merge adjacent ranges, as the saved state reports the final usage of
    // ram, which splits reserved regions into separate entries. Here we want
    // the whole underlying ram ranges, merging adjacent entries that share the
    // same igvm type and vnode.
    let previous_vtl2_ram = memory_range::merge_adjacent_ranges(
        previous_vtl2_ram.map(|entry| (entry.range, (entry.mem_type, entry.vnode))),
    );
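    // For example (illustrative): a VTL2 ram range that was reported as
    // [1 GiB, 1.5 GiB) reserved plus [1.5 GiB, 2 GiB) regular ram, both with
    // the same igvm type and vnode, merges back into a single [1 GiB, 2 GiB)
    // entry.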

    vtl2_ram.extend(
        previous_vtl2_ram.map(|(range, (mem_type, vnode))| MemoryEntry {
            range,
            mem_type,
            vnode,
        }),
    );

    // If the host was responsible for allocating VTL2 ram, verify the ram
    // parsed from the previous instance matches.
    //
    // FUTURE: When VTL2 itself did allocation, we should verify that all ranges
    // are still within the provided memory map.
    if matches!(memory_allocation_mode, MemoryAllocationMode::Host) {
        let host_vtl2_ram = parse_host_vtl2_ram(params, &parsed.memory);
        assert_eq!(
            vtl2_ram.as_slice(),
            host_vtl2_ram.as_ref(),
            "vtl2 ram from persisted state does not match host provided ram"
        );
    }

    // Merge the persisted state header and protobuf region, and report that as
    // the persisted region.
    //
    // NOTE: We could choose to resize the persisted region at this point, which
    // we would need to do if we expect the saved state to grow larger.
    let persisted_header = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_HEADER)
        .expect("persisted state header missing");
    let persisted_protobuf = partition_memory
        .iter()
        .find(|entry| entry.vtl_type == MemoryVtlType::VTL2_PERSISTED_STATE_PROTOBUF)
        .expect("persisted state protobuf region missing");
    assert_eq!(persisted_header.range.end(), protobuf_region.start());
    let persisted_state_region =
        MemoryRange::new(persisted_header.range.start()..persisted_protobuf.range.end());
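    // For example (illustrative): a one-page header at [base, base + 4 KiB)
    // followed by the protobuf region at [base + 4 KiB, base + 80 KiB) is
    // reported as a single persisted region [base, base + 80 KiB).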

    // The host provided device tree is marked as normal ram, as the
    // bootshim is responsible for constructing anything usermode needs from
    // it, and passing it via the device tree provided to the kernel.
    let reclaim_base = params.dt_start();
    let reclaim_end = params.dt_start() + params.dt_size();
    let vtl2_config_region_reclaim =
        MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

    log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

    let vtl2_config_region = MemoryRange::new(
        params.parameter_region_start
            ..(params.parameter_region_start + params.parameter_region_size),
    );

    let mut address_space_builder = AddressSpaceManagerBuilder::new(
        address_space,
        &vtl2_ram,
        params.used,
        persisted_state_region,
        subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
    );

    // NOTE: The only other region we take from the previous instance is any
    // allocated vtl2 pool. Today, we do not allocate a new/larger pool if the
    // command line arguments or host device tree changed, as that's not
    // something we expect to happen in practice.
    let mut pool_ranges = partition_memory.iter().filter_map(|entry| {
        if entry.vtl_type == MemoryVtlType::VTL2_GPA_POOL {
            Some(entry.range)
        } else {
            None
        }
    });
    let pool_range = pool_ranges.next();
    assert!(
        pool_ranges.next().is_none(),
        "previous instance had multiple pool ranges"
    );

    if let Some(pool_range) = pool_range {
        address_space_builder = address_space_builder.with_pool_range(pool_range);
    }

    // As described above, other ranges come from this current boot.
    address_space_builder = add_common_ranges(params, address_space_builder);

    address_space_builder
        .init()
        .expect("failed to initialize address space manager");

    // Read previous mmio for VTL0 and VTL2.
    let vtl0_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL0_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();
    let vtl2_mmio = partition_mmio
        .iter()
        .filter_map(|entry| {
            if entry.vtl_type == MemoryVtlType::VTL2_MMIO {
                Some(entry.range)
            } else {
                None
            }
        })
        .collect::<ArrayVec<MemoryRange, 2>>();

    Ok(PersistedPartitionTopology {
        topology: PartitionTopology {
            vtl2_ram: OffStackRef::<'_, ArrayVec<MemoryEntry, MAX_VTL2_RAM_RANGES>>::leak(vtl2_ram),
            vtl0_mmio,
            vtl2_mmio,
            memory_allocation_mode,
        },
        cpus_with_mapped_interrupts_no_io,
        cpus_with_outstanding_io,
    })
}

/// Read the persisted header from the start of the persisted state region
/// described at file build time. If the magic value is not set, `None` is
/// returned.
fn read_persisted_region_header(params: &ShimParams) -> Option<PersistedStateHeader> {
    // TODO CVM: On an isolated guest, these pages may not be accepted. We need
    // to rethink how this will work in order to handle this correctly, as on a
    // first boot we'd need to accept them early, but subsequent boots should
    // not accept any pages.
    //
    // This may require some value passed in via a register or something early
    // that indicates this is a servicing boot, which we could set if OpenHCL
    // itself launches the next instance.
    if params.isolation_type != IsolationType::None {
        return None;
    }

    // SAFETY: The header lies at the start of the shim-described persisted
    // state region. This range is guaranteed to be identity mapped at file
    // build time.
    let buf = unsafe {
        core::slice::from_raw_parts(
            params.persisted_state.start() as *const u8,
            size_of::<PersistedStateHeader>(),
        )
    };

    let header = PersistedStateHeader::read_from_bytes(buf)
        .expect("region is page aligned and the correct size");

    if header.magic == PersistedStateHeader::MAGIC {
        Some(header)
    } else {
        None
    }
}

impl PartitionInfo {
    // Read the IGVM provided DT for the vtl2 partition info.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(ParsedDt, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            // Parse in extra options from the host provided command line.
            options.parse(&parsed.command_line);
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        init_heap(params);

        let persisted_state_header = read_persisted_region_header(params);
        let (topology, has_devices_that_should_disable_sidecar) =
            if let Some(header) = persisted_state_header {
                log!("found persisted state header");
                let persisted_topology =
                    topology_from_persisted_state(header, params, parsed, address_space)?;

                (
                    persisted_topology.topology,
                    !(persisted_topology
                        .cpus_with_mapped_interrupts_no_io
                        .is_empty()
                        && persisted_topology.cpus_with_outstanding_io.is_empty()),
                )
            } else {
                (
                    topology_from_host_dt(params, parsed, &options, address_space)?,
                    false,
                )
            };

        let Self {
            vtl2_ram,
            partition_ram,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0,
            vmbus_vtl2,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode,
            entropy,
            vtl0_alias_map,
            nvme_keepalive,
            boot_options,
        } = storage;

        if let (SidecarOptions::Enabled { cpu_threshold, .. }, true) = (
            &boot_options.sidecar,
            has_devices_that_should_disable_sidecar,
        ) {
            if cpu_threshold.is_none()
                || cpu_threshold
                    .and_then(|threshold| threshold.try_into().ok())
                    .is_some_and(|threshold| parsed.cpu_count() < threshold)
            {
                // If we are in the restore path, disable sidecar for small
                // VMs, as the amortization benefits don't apply when devices
                // are kept alive; the CPUs need to be powered on anyway to
                // check for interrupts.
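                // For example (illustrative): with a cpu threshold of 64, a
                // 32-CPU VM restoring with outstanding I/O disables sidecar,
                // while a 128-CPU VM keeps it enabled.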
                log!("disabling sidecar, as we are restoring from persisted state");
                boot_options.sidecar = SidecarOptions::DisabledServicing;
                options.sidecar = SidecarOptions::DisabledServicing;
            }
        }

        // Set ram and memory allocation mode.
        vtl2_ram.clear();
        vtl2_ram.extend(topology.vtl2_ram.iter().copied());
        partition_ram.clear();
        partition_ram.extend(parsed.memory.iter().copied());
        *memory_allocation_mode = topology.memory_allocation_mode;

        // Set vmbus fields. The connection ID comes from the host, but mmio
        // comes from the computed topology.
        *vmbus_vtl0 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl0
                .as_ref()
                .ok_or(DtError::Vtl0Vmbus)?
                .connection_id,
            mmio: topology.vtl0_mmio,
        };
        *vmbus_vtl2 = VmbusInfo {
            connection_id: parsed
                .vmbus_vtl2
                .as_ref()
                .ok_or(DtError::Vtl2Vmbus)?
                .connection_id,
            mmio: topology.vtl2_mmio,
        };

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            *vtl0_alias_map = parsed.vtl0_alias_map;
        }

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}