openhcl_boot/host_params/dt.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::cmdline::BootCommandLineOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::mmio::select_vtl2_mmio_range;
use crate::host_params::shim_params::IsolationType;
use crate::memory::AddressSpaceManager;
use crate::memory::AddressSpaceManagerBuilder;
use crate::memory::AllocationPolicy;
use crate::memory::AllocationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayVec;
use core::cmp::max;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use memory_range::MemoryRange;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;
use thiserror::Error;

/// Errors when reading the host device tree.
#[derive(Debug, Error)]
pub enum DtError {
    /// Host did not provide a device tree.
    #[error("no device tree provided by host")]
    NoDeviceTree,
    /// Invalid device tree.
    #[error("host provided device tree is invalid")]
    DeviceTree(#[source] host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    #[error("command line storage is too small to write the parsed command line")]
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    #[error("device tree did not contain a vmbus node for VTL2")]
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    #[error("device tree did not contain a vmbus node for VTL0")]
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    #[error("host provided high MMIO range is insufficient to cover VTL0 and VTL2")]
    NotEnoughMmio,
}

/// Allocate VTL2 ram from the partition's memory map.
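///
/// The requested size is split evenly across the numa nodes present in the
/// host memory map. The range the IGVM file was loaded into counts against
/// that budget, with any remainder allocated top down from each node's free
/// memory.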
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
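    // NOTE: indexing by vnode assumes the host numbers numa nodes densely as
    // 0..numa_node_count; a sparse numbering would panic on the index below.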
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". This may have been split across different numa
    // nodes. Walk the host memory map against it, adding overlapping ranges
    // to vtl2 ram and everything else to the free list.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

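    // `walk_ranges` yields each maximal subrange tagged with which input side
    // covers it: Left = only the loaded file range, Right = only the host
    // memory map, Both = covered by both, Neither = a gap in both.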
    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no partial overlap - either the
                    // preaccepted range is entirely contained within the
                    // preaccepted VTL2 range or it is entirely outside it.
                    assert!(
                        !file_memory_range.overlaps(&range),
                        "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range"
                    );
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // The range was part of the preaccepted import, so it is not
                // free to allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the ranges already in vtl2_ram from each node's remaining
    // requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today if a used range is larger than the memory required,
            // we just clamp that node's requirement to zero. Should we instead
            // subtract the overage from other numa nodes equally?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
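    // For example, if a node still requires 2 GiB and its highest free range
    // is [4 GiB, 7 GiB), the top [5 GiB, 7 GiB) is carved off for vtl2 ram and
    // [4 GiB, 5 GiB) stays on the free list.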
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Walk the free list from the top, which is high memory in reverse
        // order, and allocate top down.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // use the base address encoded in the file.
        let vtl2_size = memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
            .map(|entry| entry.range.len())
            .sum::<u64>();

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info.
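    ///
    /// This parses the device tree, builds the command line according to the
    /// measured policy, selects VTL2 ram based on the memory allocation mode,
    /// carves VTL2 mmio out of the VTL0 allotment when needed, initializes the
    /// address space manager, and optionally reserves a VTL2 private GPA pool.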
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        address_space: &'_ mut AddressSpaceManager,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<&'a mut Self, DtError> {
        let dt = params.device_tree();

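        // A flattened device tree always begins with the magic number
        // 0xd00dfeed, so a zero first byte means the host never provided one.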
        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Err(DtError::NoDeviceTree);
        }

        let mut dt_storage = off_stack!(
            ParsedDeviceTree<
                MAX_PARTITION_RAM_RANGES,
                MAX_CPU_COUNT,
                COMMAND_LINE_SIZE,
                MAX_ENTROPY_SIZE,
            >,
            ParsedDeviceTree::new()
        );

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
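        // APPEND_CHOSEN appends the host's command line from the device tree's
        // /chosen node after the measured one; it is honored only when the
        // host is trusted.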
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            // Parse in extra options from the host provided command line.
            options.parse(&parsed.command_line);
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        // TODO: Decide if isolated guests always use VTL2 allocation mode.

        match parsed.memory_allocation_mode {
            MemoryAllocationMode::Host => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                    .expect("vtl2 ram should be at most 64 ranges");
                storage.memory_allocation_mode = MemoryAllocationMode::Host;
            }
            MemoryAllocationMode::Vtl2 {
                memory_size,
                mmio_size,
            } => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(
                        allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                    )
                    .expect("vtl2 ram should be at most 64 ranges");
                storage.memory_allocation_mode = MemoryAllocationMode::Vtl2 {
                    memory_size,
                    mmio_size,
                };
            }
        }

        storage.vmbus_vtl2 = parsed.vmbus_vtl2.clone().ok_or(DtError::Vtl2Vmbus)?;
        storage.vmbus_vtl0 = parsed.vmbus_vtl0.clone().ok_or(DtError::Vtl0Vmbus)?;

        // The host is responsible for allocating MMIO ranges for non-isolated
        // guests when it also provides the ram VTL2 should use.
        //
        // For isolated guests, or when VTL2 has been asked to carve out its own
        // memory, carve out a range from the VTL0 allotment.
        if params.isolation_type != IsolationType::None
            || matches!(
                parsed.memory_allocation_mode,
                MemoryAllocationMode::Vtl2 { .. }
            )
        {
            // Decide the amount of mmio VTL2 should allocate. Enforce a minimum
            // of 128 MiB mmio for VTL2.
            const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
            let mmio_size = max(
                match parsed.memory_allocation_mode {
                    MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                    _ => 0,
                },
                MINIMUM_MMIO_SIZE,
            );

            // Decide what mmio vtl2 should use.
            let mmio = &parsed.vmbus_vtl0.as_ref().ok_or(DtError::Vtl0Vmbus)?.mmio;
            let selected_vtl2_mmio = select_vtl2_mmio_range(mmio, mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(
                storage.vmbus_vtl0.mmio.iter().cloned(),
                [selected_vtl2_mmio],
            )
            .collect::<ArrayVec<MemoryRange, 2>>();

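            // For example, if VTL0 was given a low range and a high range,
            // carving the VTL2 range out of one end of the high range leaves
            // exactly two VTL0 ranges: the untouched low range plus the
            // remainder of the high range.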
            // TODO: For now, panic unless exactly two vtl0 mmio ranges remain.
            // In the future decide if we want to report this as a start
            // failure in usermode, change allocation strategy, or something
            // else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "expected exactly 2 vtl0 mmio ranges, got {:#x?}",
                vtl0_mmio
            );

            storage.vmbus_vtl2.mmio.clear();
            storage.vmbus_vtl2.mmio.push(selected_vtl2_mmio);
            storage.vmbus_vtl0.mmio = vtl0_mmio;
        }

        // The host provided device tree is marked as normal ram, as the
        // bootshim is responsible for constructing anything usermode needs
        // from it and passing that along via the device tree provided to the
        // kernel.
        let reclaim_base = params.dt_start();
        let reclaim_end = params.dt_start() + params.dt_size();
        let vtl2_config_region_reclaim =
            MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

        log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

        for entry in &parsed.memory {
            storage.partition_ram.push(*entry);
        }

        // Initialize the address space manager with the ranges fixed at build
        // time.
        let vtl2_config_region = MemoryRange::new(
            params.parameter_region_start
                ..(params.parameter_region_start + params.parameter_region_size),
        );

        let mut address_space_builder = AddressSpaceManagerBuilder::new(
            address_space,
            &storage.vtl2_ram,
            params.used,
            subtract_ranges([vtl2_config_region], [vtl2_config_region_reclaim]),
        );

        if params.vtl2_reserved_region_size != 0 {
            address_space_builder = address_space_builder.with_reserved_range(MemoryRange::new(
                params.vtl2_reserved_region_start
                    ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
            ));
        }

        if params.sidecar_size != 0 {
            address_space_builder = address_space_builder.with_sidecar_image(MemoryRange::new(
                params.sidecar_base..(params.sidecar_base + params.sidecar_size),
            ));
        }

        // Only specify page tables as a reserved region on TDX, as they are
        // used for AP startup via the mailbox protocol. On other platforms, the
        // memory is free to be reclaimed.
        if params.isolation_type == IsolationType::Tdx {
            assert!(params.page_tables.is_some());
            address_space_builder = address_space_builder
                .with_page_tables(params.page_tables.expect("always present on tdx"));
        }

        address_space_builder
            .init()
            .expect("failed to initialize address space manager");

        // Decide if we will reserve memory for a VTL2 private pool. Parse this
        // from the final command line, or the host provided device tree value.
        let vtl2_gpa_pool_size = {
            let dt_page_count = parsed.device_dma_page_count;
            let cmdline_page_count = options.enable_vtl2_gpa_pool;
            max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0))
        };
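        // The larger of the two values wins, so the command line can grow the
        // host-requested pool but never shrink it.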
        if vtl2_gpa_pool_size != 0 {
            // Reserve the specified number of pages for the pool from free
            // VTL2 memory tracked by the address space manager.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;

            match address_space.allocate(
                None,
                pool_size_bytes,
                AllocationType::GpaPool,
                AllocationPolicy::LowMemory,
            ) {
                Some(pool) => {
                    log!("allocated VTL2 pool at {:#x?}", pool.range);
                }
                None => {
                    panic!("failed to allocate VTL2 pool of size {pool_size_bytes:#x} bytes");
                }
            };
        }

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            storage.vtl0_alias_map = parsed.vtl0_alias_map;
        }

        // Set remaining struct fields before returning.
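        // Destructure exhaustively so that adding a field to `PartitionInfo`
        // forces this code to be revisited at compile time.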
        let Self {
            vtl2_ram: _,
            partition_ram: _,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0: _,
            vmbus_vtl2: _,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            pmu_gsiv,
            memory_allocation_mode: _,
            entropy,
            vtl0_alias_map: _,
            nvme_keepalive,
            boot_options,
        } = storage;

        *isolation = params.isolation_type;

        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *pmu_gsiv = parsed.pmu_gsiv;
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(storage)
    }
}