openhcl_boot/
sidecar.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4use crate::boot_logger::log;
5use crate::host_params::MAX_CPU_COUNT;
6use crate::host_params::MAX_NUMA_NODES;
7use crate::host_params::PartitionInfo;
8use crate::host_params::shim_params::IsolationType;
9use crate::host_params::shim_params::ShimParams;
10use crate::single_threaded::off_stack;
11use arrayvec::ArrayVec;
12use memory_range::MemoryRange;
13use sidecar_defs::SidecarNodeOutput;
14use sidecar_defs::SidecarNodeParams;
15use sidecar_defs::SidecarOutput;
16use sidecar_defs::SidecarParams;
17
/// The maximum size of a sidecar node, in processors. This is tuned to ensure
/// that there are enough Linux CPUs to manage all the sidecar VPs.
const MAX_SIDECAR_NODE_SIZE: usize = 32;
21
22// Assert that there are enough sidecar nodes for the maximum number of CPUs, if
23// all NUMA nodes but one have one processor.
24const _: () = assert!(
25    sidecar_defs::MAX_NODES >= (MAX_NUMA_NODES - 1) + MAX_CPU_COUNT.div_ceil(MAX_SIDECAR_NODE_SIZE)
26);
27
/// Describes a sidecar that was started by [`start_sidecar`].
pub struct SidecarConfig<'a> {
    /// The memory range spanning `sidecar_base..sidecar_base + sidecar_size`
    /// from the shim parameters.
    pub image: MemoryRange,
    /// The per-node parameters that were handed to the sidecar entry point,
    /// one entry per sidecar node.
    pub node_params: &'a [SidecarNodeParams],
    /// The per-node entries of the `SidecarOutput` that was handed to the
    /// sidecar entry point; same length as `node_params`.
    pub nodes: &'a [SidecarNodeOutput],
    /// The reference time sampled just before the sidecar entry point was
    /// called.
    pub start_reftime: u64,
    /// The reference time sampled just after the sidecar entry point
    /// returned.
    pub end_reftime: u64,
}
35
impl SidecarConfig<'_> {
    /// Returns an object to be appended to the Linux kernel command line to
    /// configure it properly for sidecar.
    ///
    /// The returned value borrows `self` and produces its text through its
    /// `Display` implementation.
    pub fn kernel_command_line(&self) -> SidecarKernelCommandLine<'_> {
        SidecarKernelCommandLine(self)
    }
}
43
/// Display adapter over a [`SidecarConfig`] that formats the `boot_cpus=`
/// kernel command line option listing the base VP of each sidecar node.
pub struct SidecarKernelCommandLine<'a>(&'a SidecarConfig<'a>);
45
46impl core::fmt::Display for SidecarKernelCommandLine<'_> {
47    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
48        // Add something like boot_cpus=0,4,8,12 to the command line so that
49        // Linux boots with the base VP of each sidecar node. Other CPUs will
50        // be brought up by the sidecar kernel.
51        f.write_str("boot_cpus=")?;
52        let mut comma = "";
53        for node in self.0.node_params {
54            write!(f, "{}{}", comma, node.base_vp)?;
55            comma = ",";
56        }
57        Ok(())
58    }
59}
60
61pub fn start_sidecar<'a>(
62    p: &ShimParams,
63    partition_info: &PartitionInfo,
64    sidecar_params: &'a mut SidecarParams,
65    sidecar_output: &'a mut SidecarOutput,
66) -> Option<SidecarConfig<'a>> {
67    if !cfg!(target_arch = "x86_64") || p.isolation_type != IsolationType::None {
68        return None;
69    }
70
71    if p.sidecar_size == 0 {
72        log!("sidecar: not present in image");
73        return None;
74    }
75
76    if !partition_info.boot_options.sidecar {
77        log!("sidecar: disabled via command line");
78        return None;
79    }
80
81    let image = MemoryRange::new(p.sidecar_base..p.sidecar_base + p.sidecar_size);
82
83    // Ensure the host didn't provide an out-of-bounds NUMA node.
84    let max_vnode = partition_info
85        .cpus
86        .iter()
87        .map(|cpu| cpu.vnode)
88        .chain(partition_info.vtl2_ram.iter().map(|e| e.vnode))
89        .max()
90        .unwrap();
91
92    if max_vnode >= MAX_NUMA_NODES as u32 {
93        log!("sidecar: NUMA node {max_vnode} too large");
94        return None;
95    }
96
97    // Compute a free list of VTL2 memory per NUMA node.
98    let mut free_memory = off_stack!(ArrayVec<MemoryRange, MAX_NUMA_NODES>, ArrayVec::new_const());
99    free_memory.extend((0..max_vnode + 1).map(|_| MemoryRange::EMPTY));
100    for (range, r) in memory_range::walk_ranges(
101        partition_info.vtl2_ram.iter().map(|e| (e.range, e.vnode)),
102        partition_info
103            .vtl2_used_ranges
104            .iter()
105            .cloned()
106            .map(|range| (range, ())),
107    ) {
108        if let memory_range::RangeWalkResult::Left(vnode) = r {
109            let free = &mut free_memory[vnode as usize];
110            if range.len() > free.len() {
111                *free = range;
112            }
113        }
114    }
115
116    #[cfg(target_arch = "x86_64")]
117    if !x86defs::cpuid::VersionAndFeaturesEcx::from(
118        safe_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx,
119    )
120    .x2_apic()
121    {
122        // Currently, sidecar needs x2apic to communicate with the kernel
123        log!("sidecar: x2apic not available; not using sidecar");
124        return None;
125    }
126
127    // Split the CPUs by NUMA node, and then into chunks of no more than
128    // MAX_SIDECAR_NODE_SIZE processors.
129    let cpus_by_node = || {
130        partition_info
131            .cpus
132            .chunk_by(|a, b| a.vnode == b.vnode)
133            .flat_map(|cpus| {
134                let chunks = cpus.len().div_ceil(MAX_SIDECAR_NODE_SIZE);
135                cpus.chunks(cpus.len().div_ceil(chunks))
136            })
137    };
138    if cpus_by_node().all(|cpus_by_node| cpus_by_node.len() == 1) {
139        log!("sidecar: all NUMA nodes have one CPU");
140        return None;
141    }
142    let node_count = cpus_by_node().count();
143
144    let mut total_ram;
145    {
146        let SidecarParams {
147            hypercall_page,
148            enable_logging,
149            node_count,
150            nodes,
151        } = sidecar_params;
152
153        *hypercall_page = 0;
154        #[cfg(target_arch = "x86_64")]
155        {
156            *hypercall_page = crate::hypercall::hvcall().hypercall_page();
157        }
158        *enable_logging = partition_info.boot_options.sidecar_logging;
159
160        let mut base_vp = 0;
161        total_ram = 0;
162        for (cpus, node) in cpus_by_node().zip(nodes) {
163            let required_ram = sidecar_defs::required_memory(cpus.len() as u32) as u64;
164            // Take some VTL2 RAM for sidecar use. Try to use the same NUMA node
165            // as the first CPU.
166            let local_vnode = cpus[0].vnode as usize;
167            let mut vtl2_ram = &mut free_memory[local_vnode];
168            if required_ram >= vtl2_ram.len() {
169                // Take RAM from the next NUMA node with enough memory.
170                let remote_vnode = free_memory
171                    .iter()
172                    .enumerate()
173                    .cycle()
174                    .skip(local_vnode + 1)
175                    .take(free_memory.len())
176                    .find_map(|(vnode, mem)| (mem.len() >= required_ram).then_some(vnode));
177                let Some(remote_vnode) = remote_vnode else {
178                    log!("sidecar: not enough memory for sidecar");
179                    return None;
180                };
181                log!(
182                    "sidecar: not enough memory for sidecar on node {local_vnode}, falling back to node {remote_vnode}"
183                );
184                vtl2_ram = &mut free_memory[remote_vnode];
185            }
186            let (rest, mem) = vtl2_ram.split_at_offset(vtl2_ram.len() - required_ram);
187            *vtl2_ram = rest;
188            *node = SidecarNodeParams {
189                memory_base: mem.start(),
190                memory_size: mem.len(),
191                base_vp,
192                vp_count: cpus.len() as u32,
193            };
194            base_vp += cpus.len() as u32;
195            *node_count += 1;
196            total_ram += required_ram;
197        }
198    }
199
200    // SAFETY: the parameter blob is trusted.
201    let sidecar_entry: extern "C" fn(&SidecarParams, &mut SidecarOutput) -> bool =
202        unsafe { core::mem::transmute(p.sidecar_entry_address) };
203
204    let boot_start_reftime = minimal_rt::reftime::reference_time();
205    log!(
206        "sidecar starting, {} nodes, {} cpus, {:#x} total bytes",
207        node_count,
208        partition_info.cpus.len(),
209        total_ram
210    );
211    if !sidecar_entry(sidecar_params, sidecar_output) {
212        panic!(
213            "failed to start sidecar: {}",
214            core::str::from_utf8(&sidecar_output.error.buf[..sidecar_output.error.len as usize])
215                .unwrap()
216        );
217    }
218    let boot_end_reftime = minimal_rt::reftime::reference_time();
219
220    let SidecarOutput { nodes, error: _ } = sidecar_output;
221    Some(SidecarConfig {
222        image,
223        start_reftime: boot_start_reftime,
224        end_reftime: boot_end_reftime,
225        node_params: &sidecar_params.nodes[..node_count],
226        nodes: &nodes[..node_count],
227    })
228}