openhcl_boot/host_params/dt.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Parse partition info using the IGVM device tree parameter.

use super::PartitionInfo;
use super::shim_params::IsolationType;
use super::shim_params::ShimParams;
use crate::boot_logger::log;
use crate::host_params::COMMAND_LINE_SIZE;
use crate::host_params::MAX_CPU_COUNT;
use crate::host_params::MAX_ENTROPY_SIZE;
use crate::host_params::MAX_NUMA_NODES;
use crate::host_params::MAX_PARTITION_RAM_RANGES;
use crate::host_params::MAX_VTL2_USED_RANGES;
use crate::single_threaded::OffStackRef;
use crate::single_threaded::off_stack;
use arrayvec::ArrayVec;
use core::cmp::max;
use core::fmt::Display;
use core::fmt::Write;
use host_fdt_parser::MemoryAllocationMode;
use host_fdt_parser::MemoryEntry;
use host_fdt_parser::ParsedDeviceTree;
use hvdef::HV_PAGE_SIZE;
use igvm_defs::MemoryMapEntryType;
use loader_defs::paravisor::CommandLinePolicy;
use memory_range::MemoryRange;
use memory_range::flatten_ranges;
use memory_range::subtract_ranges;
use memory_range::walk_ranges;

/// Errors when reading the host device tree.
#[derive(Debug)]
pub enum DtError {
    /// Invalid device tree.
    DeviceTree(host_fdt_parser::Error<'static>),
    /// PartitionInfo's command line storage is too small to write the parsed
    /// command line.
    CommandLineSize,
    /// Device tree did not contain a vmbus node for VTL2.
    Vtl2Vmbus,
    /// Device tree did not contain a vmbus node for VTL0.
    Vtl0Vmbus,
    /// Host provided high MMIO range is insufficient to cover VTL0 and VTL2.
    NotEnoughMmio,
}

impl Display for DtError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            DtError::DeviceTree(err) => {
                f.write_fmt(format_args!("host provided device tree is invalid: {err}"))
            }
            DtError::CommandLineSize => {
                f.write_str("commandline storage is too small to write the parsed command line")
            }
            DtError::Vtl2Vmbus => f.write_str("device tree did not contain a vmbus node for VTL2"),
            DtError::Vtl0Vmbus => f.write_str("device tree did not contain a vmbus node for VTL0"),
            DtError::NotEnoughMmio => {
                f.write_str("host provided high MMIO range is insufficient to cover VTL0 and VTL2")
            }
        }
    }
}

/// Allocate VTL2 ram from the partition's memory map.
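///
/// Ram is spread evenly across the numa nodes present in the memory map. The
/// range the IGVM file was loaded into is always included, and the remainder
/// of each node's share is allocated top-down from that node's highest free
/// ranges.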
fn allocate_vtl2_ram(
    params: &ShimParams,
    partition_memory_map: &[MemoryEntry],
    ram_size: Option<u64>,
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // First, calculate how many numa nodes there are by looking at unique numa
    // nodes in the memory map.
    let mut numa_nodes = off_stack!(ArrayVec<u32, MAX_NUMA_NODES>, ArrayVec::new_const());

    for entry in partition_memory_map.iter() {
        match numa_nodes.binary_search(&entry.vnode) {
            Ok(_) => {}
            Err(index) => {
                numa_nodes.insert(index, entry.vnode);
            }
        }
    }

    let numa_node_count = numa_nodes.len();

    let vtl2_size = if let Some(ram_size) = ram_size {
        if ram_size < params.memory_size {
            panic!(
                "host provided vtl2 ram size {:x} is smaller than measured size {:x}",
                ram_size, params.memory_size
            );
        }
        max(ram_size, params.memory_size)
    } else {
        params.memory_size
    };

    // Next, calculate the amount of memory that needs to be allocated per numa
    // node.
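    //
    // Note that the integer division below truncates, so up to
    // numa_node_count - 1 bytes of the requested size go undistributed.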
    let ram_per_node = vtl2_size / numa_node_count as u64;

    // Seed the remaining allocation list with the memory required per node.
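    //
    // Note: memory_per_node is indexed directly by vnode, which assumes the
    // host reports contiguous vnodes starting at zero.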
    let mut memory_per_node = off_stack!(ArrayVec<u64, MAX_NUMA_NODES>, ArrayVec::new_const());
    memory_per_node.extend((0..numa_node_count).map(|_| 0));
    for entry in partition_memory_map.iter() {
        memory_per_node[entry.vnode as usize] = ram_per_node;
    }

    // The range the IGVM file was loaded into is special - it is already
    // counted as "allocated", and may have been split across different numa
    // nodes. Walk the host memory map, adding the file range to vtl2 ram and
    // excluding it from the free list.
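    //
    // `walk_ranges` classifies each subrange by whether it appears only in the
    // left sequence, only in the right, or in both.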
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    let mut free_memory_after_vtl2 = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    let file_memory_range = MemoryRange::new(
        params.memory_start_address..(params.memory_start_address + params.memory_size),
    );

    for (range, result) in walk_ranges(
        [(file_memory_range, ())],
        partition_memory_map.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                // Add this entry to the free list.
                free_memory_after_vtl2.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Both(_, entry) => {
                // Add this entry to the vtl2 ram list.
                vtl2_ram.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                panic!("used file range {range:#x?} is not reported as ram by host memmap")
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Now remove ranges from the free list that were part of the initial launch
    // context.
    let mut free_memory = off_stack!(ArrayVec<MemoryEntry, 1024>, ArrayVec::new_const());
    for (range, result) in walk_ranges(
        params
            .imported_regions()
            .filter_map(|(range, _preaccepted)| {
                if !file_memory_range.contains(&range) {
                    // There should be no partial overlap: an imported range is
                    // either fully contained within the VTL2 file range or
                    // entirely outside it.
                    assert!(
                        !file_memory_range.overlaps(&range),
                        "imported range {range:#x?} overlaps vtl2 range and is not fully contained within vtl2 range"
                    );
                    Some((range, ()))
                } else {
                    None
                }
            }),
        free_memory_after_vtl2.iter().map(|e| (e.range, e)),
    ) {
        match result {
            memory_range::RangeWalkResult::Right(entry) => {
                free_memory.push(MemoryEntry {
                    range,
                    mem_type: entry.mem_type,
                    vnode: entry.vnode,
                });
            }
            memory_range::RangeWalkResult::Left(_) => {
                // On TDX, the reset vector page is not reported as ram by the
                // host, but is preaccepted. Ignore it.
                #[cfg(target_arch = "x86_64")]
                if params.isolation_type == IsolationType::Tdx
                    && range.start_4k_gpn() == 0xFFFFF
                    && range.len() == 0x1000
                {
                    continue;
                }

                panic!("launch context range {range:#x?} is not reported as ram by host memmap")
            }
            memory_range::RangeWalkResult::Both(_, _) => {
                // Range was part of the preaccepted import, is not free to
                // allocate additional VTL2 ram from.
            }
            // Ranges in neither are ignored.
            memory_range::RangeWalkResult::Neither => {}
        }
    }

    // Subtract the already-allocated vtl2 ram from each node's remaining
    // requirement.
    for entry in vtl2_ram.iter() {
        let mem_req = &mut memory_per_node[entry.vnode as usize];

        if entry.range.len() > *mem_req {
            // TODO: Today, if an already-allocated range is larger than the
            // memory required for its node, we just clamp that node's
            // requirement to zero. Should we instead take the excess out of
            // other numa nodes equally?
            log!(
                "entry {entry:?} is larger than required {mem_req} for vnode {}",
                entry.vnode
            );
            *mem_req = 0;
        } else {
            *mem_req -= entry.range.len();
        }
    }

    // Allocate remaining memory per node required.
    for (node, required_mem) in memory_per_node.iter().enumerate() {
        let mut required_mem = *required_mem;
        if required_mem == 0 {
            continue;
        }

        // Allocate from the end of the free list first, i.e. from the highest
        // addresses downward.
        for entry in free_memory.iter_mut().rev() {
            if entry.vnode == node as u32 && !entry.range.is_empty() {
                assert!(required_mem != 0);
                let bytes_to_allocate = core::cmp::min(entry.range.len(), required_mem);

                // Allocate top down from the range.
                let offset = entry.range.len() - bytes_to_allocate;
                let (remaining, alloc) = MemoryRange::split_at_offset(&entry.range, offset);

                entry.range = remaining;
                vtl2_ram.push(MemoryEntry {
                    range: alloc,
                    mem_type: entry.mem_type,
                    vnode: node as u32,
                });

                required_mem -= bytes_to_allocate;

                // Stop once this node's requirement is satisfied.
                if required_mem == 0 {
                    break;
                }
            }
        }

        if required_mem != 0 {
            // TODO: Handle fallback allocations on other numa nodes when a node
            // is exhausted.
            panic!(
                "failed to allocate {required_mem:#x} for vnode {node:#x}, no memory remaining for vnode"
            );
        }
    }

    // Sort VTL2 ram as we may have allocated from different places.
    vtl2_ram.sort_unstable_by_key(|e| e.range.start());

    vtl2_ram
}

/// Parse VTL2 ram from host provided ranges.
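///
/// For hardware isolated VMs, the host's VTL2_PROTECTABLE entries are used
/// only as a size hint, with the base address taken from the measured file.
/// Otherwise the host's VTL2_PROTECTABLE ranges are used directly, falling
/// back to the measured range if the host provided none.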
fn parse_host_vtl2_ram(
    params: &ShimParams,
    memory: &[MemoryEntry],
) -> OffStackRef<'static, impl AsRef<[MemoryEntry]> + use<>> {
    // If no VTL2 protectable ram was provided by the host, use the build time
    // value encoded in ShimParams.
    let mut vtl2_ram = off_stack!(ArrayVec<MemoryEntry, MAX_NUMA_NODES>, ArrayVec::new_const());
    if params.isolation_type.is_hardware_isolated() {
        // Hardware isolated VMs use the size hint provided by the host, but
        // keep the base address encoded in the file.
        let vtl2_size = memory.iter().fold(0, |acc, entry| {
            if entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE {
                acc + entry.range.len()
            } else {
                acc
            }
        });

        log!(
            "host provided vtl2 ram size is {:x}, measured size is {:x}",
            vtl2_size,
            params.memory_size
        );

        let vtl2_size = max(vtl2_size, params.memory_size);
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::new(
                params.memory_start_address..(params.memory_start_address + vtl2_size),
            ),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    } else {
        for &entry in memory
            .iter()
            .filter(|entry| entry.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE)
        {
            vtl2_ram.push(entry);
        }
    }

    if vtl2_ram.is_empty() {
        log!("using measured vtl2 ram");
        vtl2_ram.push(MemoryEntry {
            range: MemoryRange::try_new(
                params.memory_start_address..(params.memory_start_address + params.memory_size),
            )
            .expect("range is valid"),
            mem_type: MemoryMapEntryType::VTL2_PROTECTABLE,
            vnode: 0,
        });
    }

    vtl2_ram
}

impl PartitionInfo {
    /// Read the IGVM provided DT for the vtl2 partition info. If no device
    /// tree was provided by the host, `None` is returned.
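    ///
    /// When `can_trust_host` is false, host controlled values such as the
    /// chosen node command line and the VTL0 alias map are ignored.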
    pub fn read_from_dt<'a>(
        params: &'a ShimParams,
        storage: &'a mut Self,
        can_trust_host: bool,
    ) -> Result<Option<&'a mut Self>, DtError> {
        let dt = params.device_tree();

        if dt[0] == 0 {
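            // A valid FDT begins with the big-endian magic 0xd00dfeed, so a
            // zero first byte means the host provided no device tree.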
            log!("host did not provide a device tree");
            return Ok(None);
        }

        let mut dt_storage = off_stack!(
            ParsedDeviceTree<
                MAX_PARTITION_RAM_RANGES,
                MAX_CPU_COUNT,
                COMMAND_LINE_SIZE,
                MAX_ENTROPY_SIZE,
            >,
            ParsedDeviceTree::new()
        );

        let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?;

        let command_line = params.command_line();

        // Always write the measured command line.
        write!(
            storage.cmdline,
            "{}",
            command_line
                .command_line()
                .expect("measured command line should be valid")
        )
        .map_err(|_| DtError::CommandLineSize)?;

        // Depending on policy, write what the host specified in the chosen node.
        if can_trust_host && command_line.policy == CommandLinePolicy::APPEND_CHOSEN {
            write!(storage.cmdline, " {}", &parsed.command_line)
                .map_err(|_| DtError::CommandLineSize)?;
        }

        // TODO: Decide if isolated guests always use VTL2 allocation mode.

        match parsed.memory_allocation_mode {
            MemoryAllocationMode::Host => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(parse_host_vtl2_ram(params, &parsed.memory).as_ref())
                    .expect("vtl2 ram should only be 64 big");
                storage.memory_allocation_mode = MemoryAllocationMode::Host;
            }
            MemoryAllocationMode::Vtl2 {
                memory_size,
                mmio_size,
            } => {
                storage.vtl2_ram.clear();
                storage
                    .vtl2_ram
                    .try_extend_from_slice(
                        allocate_vtl2_ram(params, &parsed.memory, memory_size).as_ref(),
                    )
                    .expect("vtl2 ram should only be 64 big");
                storage.memory_allocation_mode = MemoryAllocationMode::Vtl2 {
                    memory_size,
                    mmio_size,
                };
            }
        }

        storage.vmbus_vtl2 = parsed.vmbus_vtl2.clone().ok_or(DtError::Vtl2Vmbus)?;
        storage.vmbus_vtl0 = parsed.vmbus_vtl0.clone().ok_or(DtError::Vtl0Vmbus)?;

        // The host is responsible for allocating MMIO ranges for non-isolated
        // guests when it also provides the ram VTL2 should use.
        //
        // For isolated guests, or when VTL2 has been asked to carve out its own
        // memory, carve out a range from the VTL0 allotment.
        if params.isolation_type != IsolationType::None
            || matches!(
                parsed.memory_allocation_mode,
                MemoryAllocationMode::Vtl2 { .. }
            )
        {
            // Decide the amount of mmio VTL2 should allocate. Enforce a minimum
            // of 128 MB mmio for VTL2.
            const MINIMUM_MMIO_SIZE: u64 = 128 * (1 << 20);
            let mmio_size = max(
                match parsed.memory_allocation_mode {
                    MemoryAllocationMode::Vtl2 { mmio_size, .. } => mmio_size.unwrap_or(0),
                    _ => 0,
                },
                MINIMUM_MMIO_SIZE,
            );

            // Decide what mmio vtl2 should use.
            let vtl2_mmio = storage.select_vtl2_mmio_range(mmio_size)?;

            // Update vtl0 mmio to exclude vtl2 mmio.
            let vtl0_mmio = subtract_ranges(storage.vmbus_vtl0.mmio.iter().cloned(), [vtl2_mmio])
                .collect::<ArrayVec<MemoryRange, 2>>();

            // TODO: For now, panic unless exactly two vtl0 mmio ranges remain.
            // In the future, decide if we want to report this as a start
            // failure in usermode, change the allocation strategy, or
            // something else.
            assert_eq!(
                vtl0_mmio.len(),
                2,
                "vtl0 mmio ranges are not 2 {:#x?}",
                vtl0_mmio
            );

            storage.vmbus_vtl2.mmio.clear();
            storage.vmbus_vtl2.mmio.push(vtl2_mmio);
            storage.vmbus_vtl0.mmio = vtl0_mmio;
        }

        // The host provided device tree is marked as normal ram, as the
        // bootshim is responsible for constructing anything usermode needs from
        // it, and passing it via the device tree provided to the kernel.
        let reclaim_base = params.dt_start();
        let reclaim_end = params.dt_start() + params.dt_size();
        let vtl2_config_region_reclaim =
            MemoryRange::try_new(reclaim_base..reclaim_end).expect("range is valid");

        log!("reclaim device tree memory {reclaim_base:x}-{reclaim_end:x}");

        for entry in &parsed.memory {
            storage.partition_ram.push(*entry);
        }

        // Record all the ranges that are not free for further allocation.
        let mut used_ranges =
            off_stack!(ArrayVec<MemoryRange, MAX_VTL2_USED_RANGES>, ArrayVec::new_const());
        used_ranges.push(params.used);
        used_ranges.sort_unstable_by_key(|r| r.start());
        storage.vtl2_used_ranges.clear();
        storage
            .vtl2_used_ranges
            .extend(flatten_ranges(used_ranges.iter().copied()));

        // Decide if we will reserve memory for a VTL2 private pool, taking the
        // larger of the page counts parsed from the final command line and the
        // host provided device tree.
        let vtl2_gpa_pool_size = {
            let dt_page_count = parsed.device_dma_page_count;
            let cmdline_page_count =
                crate::cmdline::parse_boot_command_line(storage.cmdline.as_str())
                    .enable_vtl2_gpa_pool;

            max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0))
        };
        if vtl2_gpa_pool_size != 0 {
            // Reserve the specified number of pages for the pool. Use the used
            // ranges to figure out which VTL2 memory is free to allocate from.
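            // The pool is carved first-fit from the lowest free range large
            // enough to hold it.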
            let pool_size_bytes = vtl2_gpa_pool_size * HV_PAGE_SIZE;
            let free_memory = subtract_ranges(
                storage.vtl2_ram.iter().map(|e| e.range),
                storage.vtl2_used_ranges.iter().copied(),
            );

            let mut pool = MemoryRange::EMPTY;

            for range in free_memory {
                if range.len() >= pool_size_bytes {
                    pool = MemoryRange::new(range.start()..(range.start() + pool_size_bytes));
                    break;
                }
            }

            if pool.is_empty() {
                panic!(
                    "failed to find {pool_size_bytes} bytes of free VTL2 memory for VTL2 GPA pool"
                );
            }

            // Update the used ranges to mark the pool range as used.
            used_ranges.clear();
            used_ranges.extend(storage.vtl2_used_ranges.iter().copied());
            used_ranges.push(pool);
            used_ranges.sort_unstable_by_key(|r| r.start());
            storage.vtl2_used_ranges.clear();
            storage
                .vtl2_used_ranges
                .extend(flatten_ranges(used_ranges.iter().copied()));

            storage.vtl2_pool_memory = pool;
        }

        // If we can trust the host, use the provided alias map.
        if can_trust_host {
            storage.vtl0_alias_map = parsed.vtl0_alias_map;
        }

        // Set remaining struct fields before returning.
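        // Destructure exhaustively so that adding a field to PartitionInfo is
        // a compile error until it is handled here.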
        let Self {
            vtl2_ram: _,
            vtl2_full_config_region: vtl2_config_region,
            vtl2_config_region_reclaim: vtl2_config_region_reclaim_struct,
            vtl2_reserved_region,
            vtl2_pool_memory: _,
            vtl2_used_ranges,
            partition_ram: _,
            isolation,
            bsp_reg,
            cpus,
            vmbus_vtl0: _,
            vmbus_vtl2: _,
            cmdline: _,
            com3_serial_available: com3_serial,
            gic,
            memory_allocation_mode: _,
            entropy,
            vtl0_alias_map: _,
            nvme_keepalive,
        } = storage;

        assert!(!vtl2_used_ranges.is_empty());

        *isolation = params.isolation_type;

        *vtl2_config_region = MemoryRange::new(
            params.parameter_region_start
                ..(params.parameter_region_start + params.parameter_region_size),
        );
        *vtl2_config_region_reclaim_struct = vtl2_config_region_reclaim;
        assert!(vtl2_config_region.contains(&vtl2_config_region_reclaim));
        *vtl2_reserved_region = MemoryRange::new(
            params.vtl2_reserved_region_start
                ..(params.vtl2_reserved_region_start + params.vtl2_reserved_region_size),
        );
        *bsp_reg = parsed.boot_cpuid_phys;
        cpus.extend(parsed.cpus.iter().copied());
        *com3_serial = parsed.com3_serial;
        *gic = parsed.gic.clone();
        *entropy = parsed.entropy.clone();
        *nvme_keepalive = parsed.nvme_keepalive;

        Ok(Some(storage))
    }
}