openhcl_boot/host_params/dt/mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Parse partition info using the IGVM device tree parameter.
5
6use super::PartitionInfo;
7use super::shim_params::ShimParams;
8use crate::boot_logger::log;
9use crate::cmdline::BootCommandLineOptions;
10use crate::host_params::COMMAND_LINE_SIZE;
11use crate::host_params::MAX_CPU_COUNT;
12use crate::host_params::MAX_ENTROPY_SIZE;
13use crate::host_params::MAX_NUMA_NODES;
14use crate::host_params::MAX_PARTITION_RAM_RANGES;
15use crate::host_params::mmio::select_vtl2_mmio_range;
16use crate::host_params::shim_params::IsolationType;
17use crate::memory::AddressSpaceManager;
18use crate::memory::AddressSpaceManagerBuilder;
19use crate::memory::AllocationPolicy;
20use crate::memory::AllocationType;
21use crate::single_threaded::OffStackRef;
22use crate::single_threaded::off_stack;
23use arrayvec::ArrayVec;
24use core::cmp::max;
25use core::fmt::Write;
26use host_fdt_parser::MemoryAllocationMode;
27use host_fdt_parser::MemoryEntry;
28use host_fdt_parser::ParsedDeviceTree;
29use hvdef::HV_PAGE_SIZE;
30use igvm_defs::MemoryMapEntryType;
31use loader_defs::paravisor::CommandLinePolicy;
32use memory_range::MemoryRange;
33use memory_range::subtract_ranges;
34use memory_range::walk_ranges;
35use thiserror::Error;
36
37mod bump_alloc;
38
/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree (the parameter region was left
    /// zeroed).
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree; wraps the underlying parser error as the source.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line storage is too small to hold the measured
    /// command line, or the measured plus host-appended command line.
    #[error("commandline storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}
62
/// Allocate VTL2 ram from the partition's memory map.
///
/// The total VTL2 size is the host provided `ram_size` when present (which
/// must be at least the measured size in `params`), otherwise the measured
/// size. That total is split evenly across the numa nodes present in
/// `partition_memory_map`. The range the IGVM file was loaded into counts as
/// already allocated; the remainder for each node is then carved top-down
/// from that node's free memory, excluding imported (launch context) ranges.
///
/// # Panics
///
/// Panics if `ram_size` is smaller than the measured size, if the file load
/// range or an imported launch-context range is not reported as ram by the
/// host memory map, or if a numa node's free memory is exhausted before its
/// share is satisfied (no cross-node fallback yet — see TODO below).
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map. `numa_nodes` is kept sorted by inserting at the
    // binary-search insertion point, so duplicates are detected via Ok(_).
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    // Determine the total VTL2 size: the host hint if present (validated
    // against the measured size), otherwise the measured size alone.
    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        // After the check above, ram_size >= memory_size, so this max is a
        // belt-and-braces no-op.
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    //
    // NOTE(review): `memory_per_node` has `numa_node_count` slots but is
    // indexed by raw vnode id here and below — this assumes vnode ids are
    // dense in 0..numa_node_count; a sparse id would panic the index. Confirm
    // the host guarantees density.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the used range, add it to vtl2 ram, and subtract it from the
    // used ranges.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Host ram outside the file range: add this entry to the free
                // list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Host ram covered by the file range: add this entry to the
                // vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the preaccepted range
                    // is exclusively covered by the preaccepted VTL2 range or it
                    // is not.
                    assert!(!file_memory_range.overlaps(&range), "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range");
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Free memory not touched by any import stays free.
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx && range.start_4k_gpn() == 0xFFFFF && range.len() == 0x1000 {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, is not free to
                // allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the used ranges from vtl2_ram: the file-load portion already in
    // vtl2_ram counts against each node's remaining requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the mem required, we
            // just subtract that numa range to zero. Should we instead subtract
            // from other numa nodes equally for over allocation?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocation from the top of the free list, which is high memory
        // in reverse order.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range, shrinking the free entry
                // in place so leftover space stays available.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop allocating if we're done allocating.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}
258
259/// Parse VTL2 ram from host provided ranges.
260fn parse_host_vtl2_ram(
261    params: &ShimParams,
262    memory: &[MemoryEntry],
263) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
264    // If no VTL2 protectable ram was provided by the host, use the build time
265    // value encoded in ShimParams.
266    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
267    if params.isolation_type.is_hardware_isolated() {
268        // Hardware isolated VMs use the size hint by the host, but use the base
269        // address encoded in the file.
270        let vtl2_size = memory.iter().fold(0, |acc, entry| {
271            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
272                acc + entry.range.len()
273            } else {
274                acc
275            }
276        });
277
278        log!(
279            "host provided vtl2 ram size is {:x}, measured size is {:x}",
280            vtl2_size,
281            params.memory_size
282        );
283
284        let vtl2_size = max(vtl2_size, params.memory_size);
285        vtl2_ram.push(MemoryEntry {
286            range: MemoryRange::new(
287                params.memory_start_address..(params.memory_start_address + vtl2_size),
288            ),
289            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
290            vnode: 0,
291        });
292    } else {
293        for &entry in memory
294            .iter()
295            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
296        {
297            vtl2_ram.push(entry);
298        }
299    }
300
301    if vtl2_ram.is_empty() {
302        log!("using measured vtl2 ram");
303        vtl2_ram.push(MemoryEntry {
304            range: MemoryRange::try_new(
305                params.memory_start_address..(params.memory_start_address + params.memory_size),
306            )
307            .expect("range is valid"),
308            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
309            vnode: 0,
310        });
311    }
312
313    vtl2_ram
314}
315
/// Initialize the bootshim's temporary bump allocator from the heap range in
/// `params`, then exercise it once as a smoke test.
fn init_heap(params: &ShimParams) {
    // Initialize the temporary heap.
    //
    // This is only to be enabled for mesh decode.
    //
    // SAFETY: The heap range is reserved at file build time, and is
    // guaranteed to be unused by anything else.
    unsafe {
        bump_alloc::ALLOCATOR.init(params.heap);
    }

    // TODO: test using heap, as no mesh decode yet.
    //
    // Smoke test: enable allocation, allocate and log a boxed value, free it,
    // then disable allocation again and log allocator stats. Allocation is
    // only enabled inside this window.
    {
        use alloc::boxed::Box;
        bump_alloc::ALLOCATOR.enable_alloc();

        let box_int = Box::new(42);
        log!("box int {box_int}");
        drop(box_int);
        bump_alloc::ALLOCATOR.disable_alloc();
        bump_alloc::ALLOCATOR.log_stats();
    }
}
339
impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info, filling out
    /// `storage` and returning it.
    ///
    /// `can_trust_host` gates use of host-controlled values: the host command
    /// line is only appended (under the `APPEND_CHOSEN` policy) and the VTL0
    /// alias map is only honored when it is true.
    ///
    /// Side effects: initializes the temporary boot heap and the address
    /// space manager, and may allocate a VTL2 GPA pool from it.
    ///
    /// Returns a [`DtError`] if the host provided no device tree, the tree is
    /// invalid or missing required vmbus nodes, the command line storage is
    /// too small, or VTL2 mmio selection fails.
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

        // NOTE(review): a zero first byte is treated as "no device tree
        // provided" — presumably relying on the FDT magic being nonzero;
        // confirm against the host contract.
        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        // Parse into off-stack storage; the parsed view borrows dt_storage.
        let mut dt_storage = off_stack!(ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
        // Only honored when the host is trusted.
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            // Parse in extra options from the host provided command line.
            options.parse(&parsed.command_line);
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        // TODO: Decide if isolated guests always use VTL2 allocation mode.

        // Populate VTL2 ram: either take the host's VTL2_PROTECTABLE ranges,
        // or carve VTL2's share out of the partition memory map ourselves.
        match parsed.memory_allocation_mode {
            MemoryAllocationMode::Host => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                    .expect("vtl2 ram should only be 64 big");
                storage.memory_allocation_mode = MemoryAllocationMode::Host;
            }
            MemoryAllocationMode::Vtl2 {
                memory_size,
                mmio_size,
            } => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(
                        allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                    )
                    .expect("vtl2 ram should only be 64 big");
                storage.memory_allocation_mode = MemoryAllocationMode::Vtl2 {
                    memory_size,
                    mmio_size,
                };
            }
        }

        // Both vmbus nodes are required.
        storage.vmbus_vtl2 = parsed.vmbus_vtl2.clone().ok_or(DtError::Vtl2Vmbus)?;
        storage.vmbus_vtl0 = parsed.vmbus_vtl0.clone().ok_or(DtError::Vtl0Vmbus)?;

        init_heap(params);

        // The host is responsible for allocating MMIO ranges for non-isolated
        // guests when it also provides the ram VTL2 should use.
        //
        // For isolated guests, or when VTL2 has been asked to carve out its own
        // memory, carve out a range from the VTL0 allotment.
        if params.isolation_type != IsolationType::None
            || matches!(
                parsed.memory_allocation_mode,
                MemoryAllocationMode::Vtl2 { .. }
            )
        {
            // Decide the amount of mmio VTL2 should allocate. Enforce a minimum
            // of 128 MB mmio for VTL2.
            const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
            let mmio_size = max(
                match parsed.memory_allocation_mode {
                    MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                    _ => 0,
                },
                MINIMUM_MMIO_SIZE,
            );

            // Decide what mmio vtl2 should use, taken from VTL0's ranges.
            let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
            let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(
                storage.vmbus_vtl0.mmio.iter().cloned(),
                [selected_vtl2_mmio],
            )
            .collect::<ArrayVec<MemoryRange, 2>>();

            // TODO: For now, if we have only a single vtl0_mmio range left,
            // panic. In the future decide if we want to report this as a start
            // failure in usermode, change allocation strategy, or something
            // else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "vtl0 mmio ranges are not 2 {:#x?}",
                vtl0_mmio
            );

            storage.vmbus_vtl2.mmio.clear();
            storage.vmbus_vtl2.mmio.push(selected_vtl2_mmio);
            storage.vmbus_vtl0.mmio = vtl0_mmio;
        }

        // The host provided device tree is marked as normal ram, as the
        // bootshim is responsible for constructing anything usermode needs from
        // it, and passing it via the device tree provided to the kernel.
        let reclaim_base = params.dt_start();
        let reclaim_end = params.dt_start() + params.dt_size();
        let vtl2_config_region_reclaim =
            MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

        log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

        // Record the full host-reported memory map for later consumers.
        for entry in &parsed.memory {
            storage.partition_ram.push(*entry);
        }

        // Initialize the address space manager with fixed at build time ranges.
        let vtl2_config_region = MemoryRange::new(
            params.parameter_region_start
                ..(params.parameter_region_start + params.parameter_region_size),
        );

        // The reclaimable device tree span is subtracted so it is not treated
        // as a permanently-used config region.
        let mut address_space_builder = AddressSpaceManagerBuilder::new(
            address_space,
            &storage.vtl2_ram,
            params.used,
            subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
        )
        .with_log_buffer(params.log_buffer);

        // Optional build-time regions: reserved range and sidecar image.
        if params.vtl2_reserved_region_size != 0 {
            address_space_builder = address_space_builder.with_reserved_range(MemoryRange::new(
                params.vtl2_reserved_region_start
                    ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
            ));
        }

        if params.sidecar_size != 0 {
            address_space_builder = address_space_builder.with_sidecar_image(MemoryRange::new(
                params.sidecar_base..(params.sidecar_base + params.sidecar_size),
            ));
        }

        // Only specify pagetables as a reserved region on TDX, as they are used
        // for AP startup via the mailbox protocol. On other platforms, the
        // memory is free to be reclaimed.
        if params.isolation_type == IsolationType::Tdx {
            assert!(params.page_tables.is_some());
            address_space_builder = address_space_builder
                .with_page_tables(params.page_tables.expect("always present on tdx"));
        }

        address_space_builder
            .init()
            .expect("failed to initialize address space manager");

        // Decide if we will reserve memory for a VTL2 private pool. Parse this
        // from the final command line, or the host provided device tree value.
        // The larger of the two page counts wins.
        let vtl2_gpa_pool_size = {
            let dt_page_count = parsed.device_dma_page_count;
            let cmdline_page_count = options.enable_vtl2_gpa_pool;
            max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0))
        };
        if vtl2_gpa_pool_size != 0 {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            match address_space.allocate(
                None,
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::LowMemory,
            ) {
                Some(pool) => {
                    log!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    panic!("failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes");
                }
            };
        }

        // If we can trust the host, use the provided alias map
        if can_trust_host {
            storage.vtl0_alias_map = parsed.vtl0_alias_map;
        }

        // Set remaining struct fields before returning. Exhaustive destructure
        // so adding a field to Self forces this function to handle it.
        let Self {
            vtl2_ram: _,
            partition_ram: _,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0: _,
            vmbus_vtl2: _,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode: _,
            entropy,
            vtl0_alias_map: _,
            nvme_keepalive,
            boot_options,
        } = storage;

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}