openhcl_boot/host_params/dt.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

use super::PartitionInfo;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::cmdline::BootCommandLineOptions;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_USED_RANGES;
use crate::host_params::shim_params::IsolationType;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayVec;
use core::cmp::max;
use core::fmt::Display;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use memory_range::MemoryRange;
use memory_range::flatten_ranges;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;

/// Errors when reading the host device tree.
#[derive(Debug)]
pub enum DtError {
    /// Invalid device tree.
    DeviceTree(host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line is too small to write the parsed legacy
    /// command line.
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    NotEnoughMmio,
}

impl Display for DtError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            DtError::DeviceTree(err) => {
                f.write_fmt(format_args!("host provided device tree is invalid: {err}"))
            }
            DtError::CommandLineSize => {
                f.write_str("commandline storage is too small to write the parsed command line")
            }
            DtError::Vtl2Vmbus => f.write_str("device tree did not contain a vmbus node for VTL2"),
            DtError::Vtl0Vmbus => f.write_str("device tree did not contain a vmbus node for VTL0"),
            DtError::NotEnoughMmio => {
                f.write_str("host provided high MMIO range is insufficient to cover VTL0 and VTL2")
            }
        }
    }
}

/// Allocate VTL2 ram from the partition's memory map.
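///
/// Ram is divided evenly across the numa nodes present in the host memory
/// map, with the region the IGVM file was loaded into counted against its
/// nodes' shares.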
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
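    // The node list is kept sorted, so binary_search both detects duplicates
    // and finds the insertion point for new nodes.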
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
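    // Note this is integer division; any remainder bytes are simply not
    // distributed.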
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated". It may have been split across different numa
    // nodes. Walk the file range, add it to vtl2 ram, and exclude it from the
    // free list.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
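    // Ram reported by the host that lies outside the file range; additional
    // vtl2 allocations are carved out of this list below.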
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no overlap - either the imported range
                    // is entirely contained within the preaccepted VTL2 range
                    // or it is entirely outside it.
                    assert!(
                        !file_memory_range.overlaps(&range),
                        "imported range {range:#x?} overlaps the vtl2 range but is not fully contained within it"
                    );
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, so it is not free
                // for allocating additional VTL2 ram.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the ranges already counted as vtl2 ram from each node's
    // remaining requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today, if a used range is larger than the memory required
            // for its node, we just clamp that node's requirement to zero.
            // Should we instead subtract the overage equally from the other
            // numa nodes?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate the remaining memory required for each node.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Start allocating from the end of the free list, so that high memory
        // is consumed first.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
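///
/// Hardware isolated VMs take only a size hint from the host and keep the
/// base address measured into the IGVM file; otherwise the host's
/// VTL2_PROTECTABLE entries are used directly. If neither produces any
/// ranges, fall back to the measured range.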
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // use the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info. If no device
    /// tree was provided by the host, `None` is returned.
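    ///
    /// On success, `storage` is fully initialized from the parsed device tree
    /// and a mutable reference to it is returned.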
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        mut options: BootCommandLineOptions,
        can_trust_host: bool,
    ) -> Result<Option<&'a mut Self>, DtError> {
        let dt = params.device_tree();

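        // A flattened device tree always begins with a big-endian magic
        // number (0xd00dfeed), so a zero first byte means the host did not
        // supply one.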
        if dt[0] == 0 {
            log!("host did not provide a device tree");
            return Ok(None);
        }

        let mut dt_storage = off_stack!(ParsedDeviceTree<MAX_PARTITION_RAM_RANGES, MAX_CPU_COUNT, COMMAND_LINE_SIZE, MAX_ENTROPY_SIZE>, ParsedDeviceTree::new());

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            // Parse in extra options from the host provided command line.
            options.parse(&parsed.command_line);
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        // TODO: Decide if isolated guests always use VTL2 allocation mode.

        match parsed.memory_allocation_mode {
            MemoryAllocationMode::Host => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                    .expect("vtl2 ram should fit in fixed size storage");
                storage.memory_allocation_mode = MemoryAllocationMode::Host;
            }
            MemoryAllocationMode::Vtl2 {
                memory_size,
                mmio_size,
            } => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(
                        allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                    )
                    .expect("vtl2 ram should fit in fixed size storage");
                storage.memory_allocation_mode = MemoryAllocationMode::Vtl2 {
                    memory_size,
                    mmio_size,
                };
            }
        }

        storage.vmbus_vtl2 = parsed.vmbus_vtl2.clone().ok_or(DtError::Vtl2Vmbus)?;
        storage.vmbus_vtl0 = parsed.vmbus_vtl0.clone().ok_or(DtError::Vtl0Vmbus)?;

        // The host is responsible for allocating MMIO ranges for non-isolated
        // guests when it also provides the ram VTL2 should use.
        //
        // For isolated guests, or when VTL2 has been asked to carve out its own
        // memory, carve out a range from the VTL0 allotment.
        if params.isolation_type != IsolationType::None
            || matches!(
                parsed.memory_allocation_mode,
                MemoryAllocationMode::Vtl2 { .. }
            )
        {
            // Decide the amount of mmio VTL2 should allocate. Enforce a minimum
            // of 128 MB mmio for VTL2.
            const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
            let mmio_size = max(
                match parsed.memory_allocation_mode {
                    MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                    _ => 0,
                },
                MINIMUM_MMIO_SIZE,
            );

            // Decide what mmio vtl2 should use.
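            // This fails with DtError::NotEnoughMmio if the host provided
            // high mmio range cannot fit a vtl2 range of this size.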
            let vtl2_mmio = storage.select_vtl2_mmio_range(mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(storage.vmbus_vtl0.mmio.iter().cloned(), [vtl2_mmio])
                .collect::<ArrayVec<MemoryRange, 2>>();

            // TODO: For now, panic unless exactly two vtl0 mmio ranges remain.
            // In the future decide if we want to report this as a start
            // failure in usermode, change allocation strategy, or something
            // else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "expected exactly 2 vtl0 mmio ranges, got {:#x?}",
                vtl0_mmio
            );

            storage.vmbus_vtl2.mmio.clear();
            storage.vmbus_vtl2.mmio.push(vtl2_mmio);
            storage.vmbus_vtl0.mmio = vtl0_mmio;
        }

        // The host provided device tree is marked as normal ram, as the
        // bootshim is responsible for constructing anything usermode needs from
        // it, and passing it via the device tree provided to the kernel.
        let reclaim_base = params.dt_start();
        let reclaim_end = params.dt_start() + params.dt_size();
        let vtl2_config_region_reclaim =
            MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

        log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

        for entry in &parsed.memory {
            storage.partition_ram.push(*entry);
        }

        // Record all the ranges that are not free for further allocation.
        let mut used_ranges =
            off_stack!(ArrayVec<MemoryRange, MAX_VTL2_USED_RANGES>, ArrayVec::new_const());
        used_ranges.push(params.used);
        used_ranges.sort_unstable_by_key(|r| r.start());
        storage.vtl2_used_ranges.clear();
        storage
            .vtl2_used_ranges
            .extend(flatten_ranges(used_ranges.iter().copied()));

        // Decide if we will reserve memory for a VTL2 private pool. The size
        // comes from the final command line or the host provided device tree
        // value, whichever is larger.
        let vtl2_gpa_pool_size = {
            let dt_page_count = parsed.device_dma_page_count;
            let cmdline_page_count = options.enable_vtl2_gpa_pool;
            max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0))
        };
        if vtl2_gpa_pool_size != 0 {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;
            let free_memory = subtract_ranges(
                storage.vtl2_ram.iter().map(|e| e.range),
                storage.vtl2_used_ranges.iter().copied(),
            );

            let mut pool = MemoryRange::EMPTY;

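            // First-fit search: carve the pool out of the start of the first
            // free range large enough to hold it.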
            for range in free_memory {
                if range.len() >= pool_size_bytes {
                    pool = MemoryRange::new(range.start()..(range.start() + pool_size_bytes));
                    break;
                }
            }

            if pool.is_empty() {
                panic!(
                    "failed to find {pool_size_bytes} bytes of free VTL2 memory for VTL2 GPA pool"
                );
            }

            // Update the used ranges to mark the pool range as used.
            used_ranges.clear();
            used_ranges.extend(storage.vtl2_used_ranges.iter().copied());
            used_ranges.push(pool);
            used_ranges.sort_unstable_by_key(|r| r.start());
            storage.vtl2_used_ranges.clear();
            storage
                .vtl2_used_ranges
                .extend(flatten_ranges(used_ranges.iter().copied()));

            storage.vtl2_pool_memory = pool;
        }

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            storage.vtl0_alias_map = parsed.vtl0_alias_map;
        }

        // Set remaining struct fields before returning.
        let Self {
            vtl2_ram: _,
            vtl2_full_config_region: vtl2_config_region,
            vtl2_config_region_reclaim: vtl2_config_region_reclaim_struct,
            vtl2_reserved_region,
            vtl2_pool_memory: _,
            vtl2_used_ranges,
            partition_ram: _,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0: _,
            vmbus_vtl2: _,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            memory_allocation_mode: _,
            entropy,
            vtl0_alias_map: _,
            nvme_keepalive,
            boot_options,
        } = storage;

        assert!(!vtl2_used_ranges.is_empty());

        *isolation = params.isolation_type;

        *vtl2_config_region = MemoryRange::new(
            params.parameter_region_start
                ..(params.parameter_region_start + params.parameter_region_size),
        );
        *vtl2_config_region_reclaim_struct = vtl2_config_region_reclaim;
        assert!(vtl2_config_region.contains(&vtl2_config_region_reclaim));
        *vtl2_reserved_region = MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        );
        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;
        *boot_options = options;

        Ok(Some(storage))
    }
}