hvlite_core/worker/vm_loaders/
linux.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use guestmem::GuestMemory;
5use loader::importer::Aarch64Register;
6use loader::importer::X86Register;
7use loader::linux::AcpiConfig;
8use loader::linux::CommandLineConfig;
9use loader::linux::InitrdAddressType;
10use loader::linux::InitrdConfig;
11use loader::linux::RegisterConfig;
12use loader::linux::ZeroPageConfig;
13use std::ffi::CString;
14use std::io::Read;
15use std::io::Seek;
16use thiserror::Error;
17use vm_loader::Loader;
18use vm_topology::memory::MemoryLayout;
19use vm_topology::processor::ProcessorTopology;
20use vm_topology::processor::aarch64::Aarch64Topology;
21
/// Newtype around the device tree builder's error.
///
/// Used as the source of [`Error::Dt`]; its display text embeds the `Debug`
/// rendering of the wrapped builder error.
#[derive(Debug, Error)]
#[error("device tree error: {0:?}")]
pub struct DtError(pub fdt::builder::Error);
25
/// Errors returned by the Linux direct-boot loaders in this module.
#[derive(Debug, Error)]
pub enum Error {
    /// Rewinding or reading the initrd file failed.
    #[error("failed to read initrd file")]
    InitRd(#[source] std::io::Error),
    /// A `loader::linux` routine (image load or register setup) failed.
    #[error("linux loader error")]
    Loader(#[source] loader::linux::Error),
    /// Building the device tree blob failed (aarch64 path).
    #[error("device tree error")]
    Dt(#[source] DtError),
}
35
/// Inputs shared by the x86-64 and aarch64 Linux direct-boot loaders.
#[derive(Debug)]
pub struct KernelConfig<'a> {
    /// The kernel image file; handed to the `loader::linux` routines as the
    /// kernel reader.
    pub kernel: &'a std::fs::File,
    /// Optional initrd file. When present it is rewound and read in full;
    /// an empty result is treated the same as no initrd.
    pub initrd: &'a Option<std::fs::File>,
    /// Kernel command line. Converted to a C string on x86-64 and written as
    /// the `bootargs` device tree property on aarch64.
    pub cmdline: &'a str,
    /// Guest memory layout; passed to the loader and, on aarch64, used to
    /// emit one device tree memory node per RAM range.
    pub mem_layout: &'a MemoryLayout,
}
43
/// ACPI tables produced for a Linux direct boot.
///
/// Note: the field is spelled `rdsp` for compatibility with existing callers;
/// it holds the ACPI RSDP (Root System Description Pointer).
#[derive(Debug)]
pub struct AcpiTables {
    /// The RSDP. Assumed to be given a whole page.
    pub rdsp: Vec<u8>,
    /// The remaining tables pointed to by the RSDP.
    pub tables: Vec<u8>,
}
50
51#[cfg_attr(not(guest_arch = "x86_64"), expect(dead_code))]
52pub fn load_linux_x86(
53    cfg: &KernelConfig<'_>,
54    gm: &GuestMemory,
55    acpi_at_gpa: impl FnOnce(u64) -> AcpiTables,
56) -> Result<Vec<X86Register>, Error> {
57    const GDT_BASE: u64 = 0x1000;
58    const CR3_BASE: u64 = 0x4000;
59    const ZERO_PAGE_BASE: u64 = 0x2000;
60    const CMDLINE_BASE: u64 = 0x3000;
61    const ACPI_BASE: u64 = 0xe0000;
62
63    let kaddr: u64 = 2 * 1024 * 1024;
64    let mut kernel_file = cfg.kernel;
65
66    let mut initrd = Vec::new();
67    if let Some(mut initrd_file) = cfg.initrd.as_ref() {
68        initrd_file.rewind().map_err(Error::InitRd)?;
69        initrd_file
70            .read_to_end(&mut initrd)
71            .map_err(Error::InitRd)?;
72    }
73
74    let initrd_config = InitrdConfig {
75        initrd_address: InitrdAddressType::AfterKernel,
76        initrd: &initrd,
77    };
78
79    let cmdline = CString::new(cfg.cmdline).unwrap();
80    let cmdline_config = CommandLineConfig {
81        address: CMDLINE_BASE,
82        cmdline: &cmdline,
83    };
84
85    let register_config = RegisterConfig {
86        gdt_address: GDT_BASE,
87        page_table_address: CR3_BASE,
88    };
89
90    let acpi_tables = acpi_at_gpa(ACPI_BASE);
91
92    // NOTE: The rdsp is given a whole page.
93    let acpi_len = acpi_tables.tables.len() + 0x1000;
94    let acpi_config = AcpiConfig {
95        rdsp_address: ACPI_BASE,
96        rdsp: &acpi_tables.rdsp,
97        tables_address: ACPI_BASE + 0x1000,
98        tables: &acpi_tables.tables,
99    };
100
101    let zero_page_config = ZeroPageConfig {
102        address: ZERO_PAGE_BASE,
103        mem_layout: cfg.mem_layout,
104        acpi_base_address: ACPI_BASE,
105        acpi_len,
106    };
107
108    let mut loader = Loader::new(gm.clone(), cfg.mem_layout, hvdef::Vtl::Vtl0);
109
110    loader::linux::load_x86(
111        &mut loader,
112        &mut kernel_file,
113        kaddr,
114        if !initrd.is_empty() {
115            Some(initrd_config)
116        } else {
117            None
118        },
119        cmdline_config,
120        zero_page_config,
121        acpi_config,
122        register_config,
123    )
124    .map_err(Error::Loader)?;
125
126    Ok(loader.initial_regs())
127}
128
129/// Returns the device tree blob.
130/// NOTE: if need to use GICv2, then the interrupt level must include flags
131/// derived from the number of CPUs for the PPI interrupts.
132/// TODO: the hvlite's command line should provide a device tree blob, optionally, too.
133/// TODO: this is a large function, break it up.
134/// TODO: disjoint from the VM configuration, must work key off of the VM configuration.
135fn build_dt(
136    cfg: &KernelConfig<'_>,
137    _gm: &GuestMemory,
138    enable_serial: bool,
139    processor_topology: &ProcessorTopology<Aarch64Topology>,
140    initrd_start: u64,
141    initrd_end: u64,
142) -> Result<Vec<u8>, fdt::builder::Error> {
143    // This ID forces the subset of PL011 known as the SBSA UART be used.
144    const PL011_PERIPH_ID: u32 = 0x00041011;
145    const PL011_BAUD: u32 = 115200;
146    const PL011_SERIAL0_BASE: u64 = 0xEFFEC000;
147    const PL011_SERIAL0_IRQ: u32 = 1;
148    const PL011_SERIAL1_BASE: u64 = 0xEFFEB000;
149    const PL011_SERIAL1_IRQ: u32 = 2;
150
151    let num_cpus = processor_topology.vps().len();
152
153    let gic_dist_base: u64 = processor_topology.gic_distributor_base();
154    let gic_dist_size: u64 = aarch64defs::GIC_DISTRIBUTOR_SIZE;
155    let gic_redist_base: u64 = processor_topology.gic_redistributors_base();
156    let gic_redist_size: u64 = aarch64defs::GIC_REDISTRIBUTOR_SIZE * num_cpus as u64;
157
158    // With the default values, that will overlap with the GIC distributor range
159    // if the number of VPs goes above `2048`. That is more than enough for the time being,
160    // both for the Linux and the Windows guests. The debug assert below is for the time
161    // when custom values are used.
162    debug_assert!(
163        !(gic_dist_base..gic_dist_base + gic_dist_size).contains(&gic_redist_base)
164            && !(gic_redist_base..gic_redist_base + gic_redist_size).contains(&gic_dist_base)
165    );
166
167    let mut buffer = vec![0u8; hvdef::HV_PAGE_SIZE as usize * 256];
168
169    let builder_config = fdt::builder::BuilderConfig {
170        blob_buffer: &mut buffer,
171        string_table_cap: 1024,
172        memory_reservations: &[],
173    };
174    let mut builder = fdt::builder::Builder::new(builder_config)?;
175    let p_address_cells = builder.add_string("#address-cells")?;
176    let p_size_cells = builder.add_string("#size-cells")?;
177    let p_model = builder.add_string("model")?;
178    let p_reg = builder.add_string("reg")?;
179    let p_device_type = builder.add_string("device_type")?;
180    let p_status = builder.add_string("status")?;
181    let p_compatible = builder.add_string("compatible")?;
182    let p_ranges = builder.add_string("ranges")?;
183    let p_enable_method = builder.add_string("enable-method")?;
184    let p_method = builder.add_string("method")?;
185    let p_bootargs = builder.add_string("bootargs")?;
186    let p_stdout_path = builder.add_string("stdout-path")?;
187    let p_initrd_start = builder.add_string("linux,initrd-start")?;
188    let p_initrd_end = builder.add_string("linux,initrd-end")?;
189    let p_interrupt_cells = builder.add_string("#interrupt-cells")?;
190    let p_interrupt_controller = builder.add_string("interrupt-controller")?;
191    let p_interrupt_names = builder.add_string("interrupt-names")?;
192    let p_interrupts = builder.add_string("interrupts")?;
193    let p_interrupt_parent = builder.add_string("interrupt-parent")?;
194    let p_always_on = builder.add_string("always-on")?;
195    let p_phandle = builder.add_string("phandle")?;
196    let p_clock_frequency = builder.add_string("clock-frequency")?;
197    let p_clock_output_names = builder.add_string("clock-output-names")?;
198    let p_clock_cells = builder.add_string("#clock-cells")?;
199    let p_clocks = builder.add_string("clocks")?;
200    let p_clock_names = builder.add_string("clock-names")?;
201    let p_current_speed = builder.add_string("current-speed")?;
202    let p_arm_periph_id = builder.add_string("arm,primecell-periphid")?;
203
204    // Property handle values.
205    const PHANDLE_GIC: u32 = 1;
206    const PHANDLE_APB_PCLK: u32 = 2;
207
208    const GIC_SPI: u32 = 0;
209    const GIC_PPI: u32 = 1;
210    const IRQ_TYPE_LEVEL_LOW: u32 = 8;
211    const IRQ_TYPE_LEVEL_HIGH: u32 = 4;
212
213    let mut root_builder = builder
214        .start_node("")?
215        .add_u32(p_address_cells, 2)?
216        .add_u32(p_size_cells, 2)?
217        .add_u32(p_interrupt_parent, PHANDLE_GIC)?
218        .add_str(p_model, "microsoft,hvlite")?
219        .add_str(p_compatible, "microsoft,hvlite")?;
220
221    let mut cpu_builder = root_builder
222        .start_node("cpus")?
223        .add_str(p_compatible, "arm,armv8")?
224        .add_u32(p_address_cells, 1)?
225        .add_u32(p_size_cells, 0)?;
226
227    // Add a CPU node for each cpu.
228    for vp_index in 0..num_cpus {
229        let name = format!("cpu@{}", vp_index);
230        let mut cpu = cpu_builder
231            .start_node(name.as_ref())?
232            .add_u32(p_reg, vp_index as u32)?
233            .add_str(p_device_type, "cpu")?;
234
235        if num_cpus > 1 {
236            cpu = cpu.add_str(p_enable_method, "psci")?;
237        }
238
239        if vp_index == 0 {
240            cpu = cpu.add_str(p_status, "okay")?;
241        } else {
242            cpu = cpu.add_str(p_status, "disabled")?;
243        }
244
245        cpu_builder = cpu.end_node()?;
246    }
247    root_builder = cpu_builder.end_node()?;
248
249    let psci = root_builder
250        .start_node("psci")?
251        .add_str(p_compatible, "arm,psci-0.2")?
252        .add_str(p_method, "hvc")?;
253    root_builder = psci.end_node()?;
254
255    // Add a memory node for each RAM range.
256    for mem_entry in cfg.mem_layout.ram() {
257        let start = mem_entry.range.start();
258        let len = mem_entry.range.len();
259        let name = format!("memory@{:x}", start);
260        let mut mem = root_builder.start_node(&name)?;
261        mem = mem.add_str(p_device_type, "memory")?;
262        mem = mem.add_u64_array(p_reg, &[start, len])?;
263        root_builder = mem.end_node()?;
264    }
265
266    // Advanced Bus Peripheral Clock.
267    root_builder = root_builder
268        .start_node("apb-pclk")?
269        .add_str(p_compatible, "fixed-clock")?
270        .add_u32(p_clock_frequency, 24000000)?
271        .add_str_array(p_clock_output_names, &["clk24mhz"])?
272        .add_u32(p_clock_cells, 0)?
273        .add_u32(p_phandle, PHANDLE_APB_PCLK)?
274        .end_node()?;
275
276    // ARM64 Generic Interrupt Controller aka GIC, v3.
277    let gicv3 = root_builder
278        .start_node(format!("intc@{gic_dist_base:x}").as_str())?
279        .add_str(p_compatible, "arm,gic-v3")?
280        .add_u64_array(
281            p_reg,
282            &[
283                gic_dist_base,
284                gic_dist_size,
285                gic_redist_base,
286                gic_redist_size,
287            ],
288        )?
289        .add_u32(p_address_cells, 2)?
290        .add_u32(p_size_cells, 2)?
291        .add_u32(p_interrupt_cells, 3)?
292        .add_null(p_interrupt_controller)?
293        .add_u32(p_phandle, PHANDLE_GIC)?
294        .add_null(p_ranges)?;
295    root_builder = gicv3.end_node()?;
296
297    // ARM64 Architectural Timer.
298    const HYPERV_VIRT_TIMER_PPI: u32 = 4; // relative to PPI base of 16
299    let timer = root_builder
300        .start_node("timer")?
301        .add_str(p_compatible, "arm,armv8-timer")?
302        .add_u32(p_interrupt_parent, PHANDLE_GIC)?
303        .add_str(p_interrupt_names, "virt")?
304        .add_u32_array(
305            p_interrupts,
306            &[GIC_PPI, HYPERV_VIRT_TIMER_PPI, IRQ_TYPE_LEVEL_LOW],
307        )?
308        .add_null(p_always_on)?;
309    root_builder = timer.end_node()?;
310
311    let mut soc = root_builder
312        .start_node("openvmm")?
313        .add_str(p_compatible, "simple-bus")?
314        .add_u32(p_address_cells, 2)?
315        .add_u32(p_size_cells, 2)?
316        .add_null(p_ranges)?
317        .add_u32(p_interrupt_parent, PHANDLE_GIC)?;
318
319    if enable_serial {
320        // Uses the scoped down "arm,sbsa-aurt" rather than the full "arm,pl011" device.
321        for (serial_base, serial_interrupt) in [
322            (PL011_SERIAL0_BASE, PL011_SERIAL0_IRQ),
323            (PL011_SERIAL1_BASE, PL011_SERIAL1_IRQ),
324        ] {
325            let name = format!("uart@{:x}", serial_base);
326            soc = soc
327                .start_node(name.as_ref())?
328                .add_str_array(p_compatible, &["arm,sbsa-uart", "arm,primecell"])?
329                .add_str_array(p_clock_names, &["apb_pclk"])?
330                .add_u32(p_clocks, PHANDLE_APB_PCLK)?
331                .add_u32(p_interrupt_parent, PHANDLE_GIC)?
332                .add_u64_array(p_reg, &[serial_base, 0x1000])?
333                .add_u32(p_current_speed, PL011_BAUD)?
334                .add_u32(p_arm_periph_id, PL011_PERIPH_ID)?
335                .add_u32_array(
336                    p_interrupts,
337                    &[GIC_SPI, serial_interrupt, IRQ_TYPE_LEVEL_HIGH],
338                )?
339                .add_str(p_status, "okay")?
340                .end_node()?;
341        }
342    }
343
344    root_builder = soc.end_node()?;
345
346    let mut chosen = root_builder
347        .start_node("chosen")?
348        .add_str(p_bootargs, cfg.cmdline)?;
349    chosen = chosen.add_u64(p_initrd_start, initrd_start)?;
350    chosen = chosen.add_u64(p_initrd_end, initrd_end)?;
351    if enable_serial {
352        chosen = chosen.add_str(
353            p_stdout_path,
354            format!("/hvlite/uart@{PL011_SERIAL0_BASE:x}").as_str(),
355        )?;
356    }
357
358    root_builder = chosen.end_node()?;
359
360    let boot_cpu_id = 0;
361    root_builder.end_node()?.build(boot_cpu_id)?;
362
363    Ok(buffer)
364}
365
366#[cfg_attr(not(guest_arch = "aarch64"), expect(dead_code))]
367pub fn load_linux_arm64(
368    cfg: &KernelConfig<'_>,
369    gm: &GuestMemory,
370    enable_serial: bool,
371    processor_topology: &ProcessorTopology<Aarch64Topology>,
372) -> Result<Vec<Aarch64Register>, Error> {
373    let mut loader = Loader::new(gm.clone(), cfg.mem_layout, hvdef::Vtl::Vtl0);
374    let mut kernel_file = cfg.kernel;
375    let mut initrd = Vec::new();
376    if let Some(mut initrd_file) = cfg.initrd.as_ref() {
377        initrd_file.rewind().map_err(Error::InitRd)?;
378        initrd_file
379            .read_to_end(&mut initrd)
380            .map_err(Error::InitRd)?;
381    }
382
383    // Data dependencies:
384    // - DeviceTree carries the start address of the initrd.
385    // - The linux loader loads the kernel, the initrd at the said address,
386    //   and the device tree into the guest memory.
387    //
388    // Thus, we first start with planning the memory layout where
389    // some space at the loader bottom is reserved for the initrd.
390
391    let load_bottom_addr: u64 = 16 << 20;
392    let initrd_start: u64 = load_bottom_addr;
393    let initrd_end: u64 = initrd_start + initrd.len() as u64;
394    // Align the kernel to 2MB
395    let kernel_minimum_start_address: u64 = (initrd_end + 0x1fffff) & !0x1fffff;
396
397    let device_tree = build_dt(
398        cfg,
399        gm,
400        enable_serial,
401        processor_topology,
402        initrd_start,
403        initrd_end,
404    )
405    .map_err(|e| Error::Dt(DtError(e)))?;
406    let load_info = loader::linux::load_kernel_and_initrd_arm64(
407        &mut loader,
408        &mut kernel_file,
409        kernel_minimum_start_address,
410        if !initrd.is_empty() {
411            Some(InitrdConfig {
412                initrd_address: InitrdAddressType::Address(initrd_start),
413                initrd: &initrd,
414            })
415        } else {
416            None
417        },
418        Some(&device_tree),
419    )
420    .map_err(Error::Loader)?;
421
422    // Set the registers separately so they won't conflict with the UEFI boot when
423    // `load_kernel_and_initrd_arm64` is used for VTL2 direct kernel boot.
424    loader::linux::set_direct_boot_registers_arm64(&mut loader, &load_info)
425        .map_err(Error::Loader)?;
426
427    Ok(loader.initial_regs())
428}