// sidecar/arch/x86_64/init.rs
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Sidecar initialization code. This code runs once, on the BSP, before the
//! main kernel boots.
use super::AFTER_INIT;
use super::CommandErrorWriter;
use super::ENABLE_LOG;
use super::VSM_CAPABILITIES;
use super::VTL_RETURN_OFFSET;
use super::VpGlobals;
use super::addr_space;
use super::temporary_map;
use crate::arch::x86_64::get_hv_vp_register;
use crate::arch::x86_64::hypercall;
use crate::arch::x86_64::log;
use arrayvec::ArrayVec;
use core::fmt::Display;
use core::fmt::Write;
use core::hint::spin_loop;
use core::mem::MaybeUninit;
use core::ptr::addr_of;
use core::ptr::addr_of_mut;
use core::sync::atomic::AtomicU32;
use core::sync::atomic::Ordering::Acquire;
use core::sync::atomic::Ordering::Relaxed;
use core::sync::atomic::Ordering::Release;
use hvdef::HvError;
use hvdef::HvRegisterVsmCodePageOffsets;
use hvdef::HvX64RegisterName;
use hvdef::HvX64SegmentRegister;
use hvdef::HypercallCode;
use hvdef::hypercall::EnableVpVtlX64;
use hvdef::hypercall::HvInputVtl;
use hvdef::hypercall::StartVirtualProcessorX64;
use memory_range::AlignedSubranges;
use memory_range::MemoryRange;
use minimal_rt::arch::hypercall::HYPERCALL_PAGE;
use minimal_rt::enlightened_panic;
use sidecar_defs::ControlPage;
use sidecar_defs::CpuStatus;
use sidecar_defs::PAGE_SIZE;
use sidecar_defs::PER_VP_PAGES;
use sidecar_defs::PER_VP_SHMEM_PAGES;
use sidecar_defs::SidecarNodeOutput;
use sidecar_defs::SidecarNodeParams;
use sidecar_defs::SidecarOutput;
use sidecar_defs::SidecarParams;
use sidecar_defs::required_memory;
use x86defs::Exception;
use x86defs::GdtEntry;
use x86defs::IdtAttributes;
use x86defs::IdtEntry64;
use x86defs::Pte;
use zerocopy::FromZeros;
57
// Symbols provided by the assembly entry code / linker script — presumably
// entry.S, which `start`'s doc comment references; TODO confirm.
unsafe extern "C" {
    /// The page directory entry covering the sidecar image; copied into each
    /// AP's page tables (see `NodeDefinition::start`).
    static IMAGE_PDE: Pte;
    /// IRQ entry stub installed at vector `IRQ`.
    fn irq_entry();
    /// #GP fault entry stub.
    fn exc_gpf();
    /// #PF fault entry stub.
    fn exc_pf();
}
64
65static GDT: [GdtEntry; 4] = {
66    let default_data_attributes = x86defs::X64_DEFAULT_DATA_SEGMENT_ATTRIBUTES.as_bits();
67    let default_code_attributes = x86defs::X64_DEFAULT_CODE_SEGMENT_ATTRIBUTES.as_bits();
68    let zero = GdtEntry {
69        limit_low: 0,
70        base_low: 0,
71        base_middle: 0,
72        attr_low: 0,
73        attr_high: 0,
74        base_high: 0,
75    };
76
77    [
78        zero,
79        zero,
80        GdtEntry {
81            limit_low: 0xffff,
82            attr_low: default_code_attributes as u8,
83            attr_high: (default_code_attributes >> 8) as u8,
84            ..zero
85        },
86        GdtEntry {
87            limit_low: 0xffff,
88            attr_low: default_data_attributes as u8,
89            attr_high: (default_data_attributes >> 8) as u8,
90            ..zero
91        },
92    ]
93};
94
95const IRQ: u8 = 0x20;
96
/// The IDT shared by all sidecar VPs. Statically zeroed (all gates
/// not-present); the #PF, #GP, and `IRQ` gates are filled in at runtime by
/// `init` before any AP is started.
static mut IDT: [IdtEntry64; IRQ as usize + 1] = {
    let zero = IdtEntry64 {
        offset_low: 0,
        selector: 0,
        attributes: IdtAttributes::new(),
        offset_middle: 0,
        offset_high: 0,
        reserved: 0,
    };
    [zero; IRQ as usize + 1]
};
108
/// Errors that can occur during BSP-side initialization (`init`); reported
/// back to the loader through the output page's error field.
enum InitError {
    /// A node's memory region was smaller than `required_memory` demands.
    RequiredMemory { required: u64, actual: u64 },
    /// Reading the `VsmCodePageOffsets` register from the hypervisor failed.
    GetVsmCodePageOffset(HvError),
    /// Reading the `VsmCapabilities` register from the hypervisor failed.
    GetVsmCapabilities(HvError),
}
114
115impl Display for InitError {
116    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
117        match self {
118            InitError::RequiredMemory { required, actual } => {
119                write!(
120                    f,
121                    "failed to provide required memory: {:#x}, actual: {:#x}",
122                    required, actual
123                )
124            }
125            InitError::GetVsmCodePageOffset(err) => {
126                write!(f, "failed to get vsm code page offset: {err}")
127            }
128            InitError::GetVsmCapabilities(err) => {
129                write!(f, "failed to get vsm capabilities: {err}")
130            }
131        }
132    }
133}
134
/// Errors that can occur while starting an individual AP
/// (`NodeDefinition::start`).
enum InitVpError {
    /// The `HvCallEnableVpVtl` hypercall failed.
    EnableVtl2(HvError),
    /// The `HvCallStartVirtualProcessor` hypercall failed.
    StartVp(HvError),
}
139
140impl Display for InitVpError {
141    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
142        match self {
143            InitVpError::EnableVtl2(err) => write!(f, "failed to enable vtl2: {err}"),
144            InitVpError::StartVp(err) => write!(f, "failed to start vp: {err}"),
145        }
146    }
147}
148
149/// BSP entry point from entry.S. Called with BSS, stack, and page tables
150/// initialized, and relocations applied.
151#[cfg_attr(not(minimal_rt), expect(dead_code))]
152pub extern "C" fn start(params: u64, output: u64) -> bool {
153    enlightened_panic::enable_enlightened_panic();
154
155    let [mut params_mapper, mut output_mapper, mut temp_mapper] = [0, 1, 2].map(|i| {
156        // SAFETY: no concurrent accessors to the same index.
157        unsafe { temporary_map::Mapper::new(i) }
158    });
159    // SAFETY: The page is not being concurrently accessed, and it has no
160    // invariant requirements.
161    let params = unsafe { params_mapper.map::<SidecarParams>(params) };
162    // SAFETY: The page is not being concurrently accessed, and it has no
163    // invariant requirements.
164    let mut output = unsafe { output_mapper.map::<SidecarOutput>(output) };
165    match init(&mut temp_mapper, &params, &mut output) {
166        Ok(()) => {
167            AFTER_INIT.store(true, Release);
168            true
169        }
170        Err(err) => {
171            let _ = write!(CommandErrorWriter(&mut output.error), "{err}");
172            false
173        }
174    }
175}
176
177/// Called on the BSP to initialize all the APs.
178fn init(
179    mapper: &mut temporary_map::Mapper,
180    params: &SidecarParams,
181    output: &mut SidecarOutput,
182) -> Result<(), InitError> {
183    let &SidecarParams {
184        hypercall_page,
185        enable_logging,
186        node_count,
187        ref nodes,
188        ref initial_state,
189    } = params;
190
191    ENABLE_LOG.store(enable_logging, Relaxed);
192    let nodes = &nodes[..node_count as usize];
193
194    // Copy the hypercall page locally since the main kernel will move it after
195    // this function returns.
196    {
197        // SAFETY: The page is not being concurrently accessed, and it has
198        // no invariant requirements.
199        let hypercall_page = unsafe { mapper.map::<[u8; 4096]>(hypercall_page) };
200        // SAFETY: no concurrent accessors to the page.
201        unsafe { (&raw mut HYPERCALL_PAGE).copy_from_nonoverlapping(&*hypercall_page, 1) };
202    }
203
204    // Initialize the IDT.
205    {
206        // SAFETY: no concurrent accessors.
207        let idt = unsafe { &mut *addr_of_mut!(IDT) };
208
209        let offset = exc_pf as *const () as u64;
210        idt[Exception::PAGE_FAULT.0 as usize] = IdtEntry64 {
211            offset_low: offset as u16,
212            selector: 2 * 8,
213            attributes: IdtAttributes::new().with_present(false).with_gate_type(0xf),
214            offset_middle: (offset >> 16) as u16,
215            offset_high: (offset >> 32) as u32,
216            reserved: 0,
217        };
218
219        let offset = exc_gpf as *const () as u64;
220        idt[Exception::GENERAL_PROTECTION_FAULT.0 as usize] = IdtEntry64 {
221            offset_low: offset as u16,
222            selector: 2 * 8,
223            attributes: IdtAttributes::new().with_present(false).with_gate_type(0xf),
224            offset_middle: (offset >> 16) as u16,
225            offset_high: (offset >> 32) as u32,
226            reserved: 0,
227        };
228
229        let offset = irq_entry as *const () as u64;
230        idt[IRQ as usize] = IdtEntry64 {
231            offset_low: offset as u16,
232            selector: 2 * 8,
233            attributes: IdtAttributes::new().with_present(true).with_gate_type(0xe),
234            offset_middle: (offset >> 16) as u16,
235            offset_high: (offset >> 32) as u32,
236            reserved: 0,
237        };
238    }
239
240    // Get the byte offset in the hypercall page of the VTL return function.
241    {
242        let value = HvRegisterVsmCodePageOffsets::from(
243            get_hv_vp_register(
244                HvInputVtl::CURRENT_VTL,
245                HvX64RegisterName::VsmCodePageOffsets.into(),
246            )
247            .map_err(InitError::GetVsmCodePageOffset)?
248            .as_u64(),
249        );
250        // SAFETY: no concurrent accessors.
251        unsafe { VTL_RETURN_OFFSET = value.return_offset() }
252    }
253
254    // Get the reported VSM capabilities.
255    {
256        let value = get_hv_vp_register(
257            HvInputVtl::CURRENT_VTL,
258            HvX64RegisterName::VsmCapabilities.into(),
259        )
260        .map_err(InitError::GetVsmCapabilities)?;
261        // SAFETY: no concurrent accessors.
262        unsafe { VSM_CAPABILITIES = value.as_u64().into() }
263    }
264
265    // SAFETY: no concurrent accesses yet.
266    let node_init = unsafe { &mut *addr_of_mut!(NODE_INIT) };
267
268    // Process each node, building the `node_init` array.
269    for (node_index, (node, node_output)) in nodes.iter().zip(&mut output.nodes).enumerate() {
270        let &SidecarNodeParams {
271            memory_base,
272            memory_size,
273            base_vp,
274            vp_count,
275        } = node;
276        let memory = MemoryRange::new(memory_base..memory_base + memory_size);
277
278        log!("node {node_index}: {vp_count} VPs starting at VP {base_vp}, memory {memory}");
279
280        let required = required_memory(vp_count) as u64;
281        if memory_size < required {
282            return Err(InitError::RequiredMemory {
283                required,
284                actual: memory_size,
285            });
286        }
287
288        let (control_page_range, memory) = memory.split_at_offset(PAGE_SIZE as u64);
289        let (shmem_pages, memory) =
290            memory.split_at_offset(vp_count as u64 * PER_VP_SHMEM_PAGES as u64 * PAGE_SIZE as u64);
291
292        *node_output = SidecarNodeOutput {
293            control_page: control_page_range.start(),
294            shmem_pages_base: shmem_pages.start(),
295            shmem_pages_size: shmem_pages.len(),
296        };
297
298        // Initialize the control page.
299        {
300            // SAFETY: The page is not being concurrently accessed, and it has
301            // no invariant requirements.
302            let mut control = unsafe { mapper.map::<ControlPage>(control_page_range.start()) };
303            let ControlPage {
304                index,
305                base_cpu,
306                cpu_count,
307                request_vector,
308                response_cpu,
309                response_vector,
310                needs_attention,
311                reserved,
312                cpu_status,
313            } = &mut *control;
314            *index = (node_index as u32).into();
315            *base_cpu = base_vp.into();
316            *cpu_count = vp_count.into();
317            *request_vector = (IRQ as u32).into();
318            *response_cpu = 0.into();
319            *response_vector = 0.into();
320            *needs_attention = 0.into();
321            reserved.fill(0);
322            // Default: base VP -> REMOVED (kernel starts it), other VPs -> RUN,
323            // beyond vp_count -> REMOVED.
324            cpu_status[0] = CpuStatus::REMOVED.0.into();
325            cpu_status[1..vp_count as usize].fill_with(|| CpuStatus::RUN.0.into());
326            cpu_status[vp_count as usize..].fill_with(|| CpuStatus::REMOVED.0.into());
327
328            // Apply per-CPU overrides from openhcl_boot when restoring from
329            // servicing with outstanding IO. CPUs marked false in
330            // sidecar_starts_cpu are set to REMOVED so the kernel starts them
331            // directly for immediate interrupt handling.
332            if initial_state.per_cpu_state_specified {
333                log!(
334                    "node {node_index}: applying per-cpu overrides, base_vp={base_vp}, vp_count={vp_count}"
335                );
336                let overrides = &initial_state.sidecar_starts_cpu
337                    [base_vp as usize..(base_vp + vp_count) as usize];
338                for (i, &should_start) in overrides.iter().enumerate() {
339                    cpu_status[i] = if should_start {
340                        CpuStatus::RUN.0.into()
341                    } else {
342                        let vp = base_vp + i as u32;
343                        log!("node {node_index}: VP {vp} (idx {i}) -> REMOVED");
344                        CpuStatus::REMOVED.0.into()
345                    };
346                }
347            }
348        }
349
350        node_init.push(NodeInit {
351            node: NodeDefinition {
352                base_vp,
353                vp_count,
354                control_page_pa: control_page_range.start(),
355                shmem_pages,
356                memory,
357            },
358            next_vp: AtomicU32::new(1), // skip the base VP in each node
359        });
360    }
361
362    // Downgrade the node init array to immutable, then start booting the APs.
363    // Each AP that boots will then start helping boot additional APs.
364    //
365    // SAFETY: no concurrent mutators.
366    let node_init = unsafe { &*addr_of!(NODE_INIT) };
367    start_aps(node_init, mapper);
368
369    // Wait for all the APs to finish starting.
370    {
371        for (node, output) in nodes.iter().zip(&output.nodes) {
372            // SAFETY: The page is not being concurrently accessed, and it has
373            // no invariant requirements.
374            let control = unsafe { mapper.map::<ControlPage>(output.control_page) };
375            for status in &control.cpu_status[0..node.vp_count as usize] {
376                while status.load(Acquire) == CpuStatus::RUN.0 {
377                    spin_loop();
378                }
379            }
380        }
381    }
382
383    Ok(())
384}
385
/// Per-node bookkeeping used while booting the APs.
struct NodeInit {
    // The node's layout and VP range.
    node: NodeDefinition,
    // The next node-relative VP index to claim for starting; shared between
    // the BSP and already-booted APs, which all run `start_aps`.
    next_vp: AtomicU32,
}

/// Built by `init` on the BSP, then accessed via shared reference only
/// (`next_vp` uses atomic updates) by `start_aps`/`ap_init`.
static mut NODE_INIT: ArrayVec<NodeInit, { sidecar_defs::MAX_NODES }> = ArrayVec::new_const();
392
393fn start_aps(node_init: &[NodeInit], mapper: &mut temporary_map::Mapper) {
394    for node in node_init {
395        loop {
396            let node_cpu_index = node.next_vp.fetch_add(1, Relaxed);
397            assert!(node_cpu_index != u32::MAX);
398            if node_cpu_index >= node.node.vp_count {
399                break;
400            }
401
402            // Read this VP's status from the node's control page.
403            // The mapping is scoped so the mapper is free for start().
404            let is_removed = {
405                // SAFETY: control page was initialized; no concurrent mutation yet.
406                let control = unsafe { mapper.map::<ControlPage>(node.node.control_page_pa) };
407                control.cpu_status[node_cpu_index as usize].load(Relaxed) == CpuStatus::REMOVED.0
408            };
409
410            let vp = node.node.base_vp + node_cpu_index;
411            if is_removed {
412                log!("start_aps: skipping VP {vp} (idx {node_cpu_index}): REMOVED");
413                continue;
414            }
415
416            match node.node.start(mapper, node_cpu_index) {
417                Ok(()) => {}
418                Err(err) => panic!("failed to start VP {vp}: {err}"),
419            }
420        }
421    }
422}
423
/// AP entry point: helps boot any remaining APs, then diverges into the AP
/// run loop.
///
/// # Safety
/// Must be called as an AP entry point.
unsafe fn ap_init() -> ! {
    // Start any other pending APs.
    {
        // SAFETY: `NODE_INIT` is set before this routine is called.
        let node_init = unsafe { &*addr_of!(NODE_INIT) };
        // SAFETY: nothing else on this CPU is using the temporary map.
        let mut mapper = unsafe { temporary_map::Mapper::new(0) };
        start_aps(node_init, &mut mapper)
    }
    // SAFETY: this is an entry point.
    unsafe { super::vp::ap_entry() }
}
438
/// The memory layout and VP range of a single sidecar node, as carved out of
/// the node's memory by `init`.
struct NodeDefinition {
    // First hypervisor VP index belonging to this node.
    base_vp: u32,
    // Number of VPs in this node.
    vp_count: u32,
    // Physical address of the node's control page.
    control_page_pa: u64,
    // Shared-memory pages: `PER_VP_SHMEM_PAGES` pages per VP.
    shmem_pages: MemoryRange,
    // Private memory: `PER_VP_PAGES` pages per VP (page tables, globals, ...).
    memory: MemoryRange,
}
446
impl NodeDefinition {
    /// Builds the page tables, globals, and initial register context for one
    /// AP, then starts it in VTL2 via the enable-VTL and start-VP hypercalls.
    ///
    /// `node_cpu_index` is the VP's index within this node; the hypervisor VP
    /// index is `base_vp + node_cpu_index`.
    fn start(
        &self,
        mapper: &mut temporary_map::Mapper,
        node_cpu_index: u32,
    ) -> Result<(), InitVpError> {
        let hv_vp_index = self.base_vp + node_cpu_index;

        // This VP's slice of the node's shared-memory pages: the command page
        // first, then the register page.
        let shmem_pages = self.shmem_pages.start()
            + node_cpu_index as u64 * PER_VP_SHMEM_PAGES as u64 * PAGE_SIZE as u64;
        let command_page_pa = shmem_pages;
        let reg_page_pa = shmem_pages + PAGE_SIZE as u64;
        // This VP's slice of the node's private memory.
        let memory_start =
            self.memory.start() + node_cpu_index as u64 * PER_VP_PAGES as u64 * PAGE_SIZE as u64;
        let memory =
            MemoryRange::new(memory_start..memory_start + PER_VP_PAGES as u64 * PAGE_SIZE as u64);

        // Hand out the VP's memory one page at a time; the first four pages
        // become the page-table hierarchy.
        let mut memory = AlignedSubranges::new(memory)
            .with_max_range_len(PAGE_SIZE as u64)
            .map(|r| r.start());
        let pml4_pa = memory.next().unwrap();
        let pdpt_pa = memory.next().unwrap();
        let pd_pa = memory.next().unwrap();
        let pt_pa = memory.next().unwrap();

        // A writable, present PTE pointing at the given next-level table.
        let pte_table = |addr| {
            Pte::new()
                .with_address(addr)
                .with_read_write(true)
                .with_present(true)
        };

        {
            // SAFETY: The page is not being concurrently accessed, and it has no
            // invariant requirements.
            let mut pml4 = unsafe { mapper.map::<[Pte; 512]>(pml4_pa) };
            pml4[511] = pte_table(pdpt_pa);
        }
        {
            // Only the first PDPT entry is populated — presumably the sidecar
            // VA space lives at pml4[511]/pdpt[0]; TODO confirm against the
            // linker script / addr_space layout.
            //
            // SAFETY: The page is not being concurrently accessed, and it has no
            // invariant requirements.
            let mut pdpt = unsafe { mapper.map::<Pte>(pdpt_pa) };
            *pdpt = pte_table(pd_pa);
        }
        {
            // SAFETY: The page is not being concurrently accessed, and it has no
            // invariant requirements.
            let mut pd = unsafe { mapper.map::<[Pte; 512]>(pd_pa) };
            // Reuse the image's PDE so this AP maps the sidecar image.
            // SAFETY: the PTE is not being concurrently modified.
            pd[0] = unsafe { IMAGE_PDE };
            pd[1] = pte_table(pt_pa);
        }
        // Fill in the per-VP leaf page table (control/command/register page
        // mappings plus remaining private pages); this also yields the
        // physical address of the VP's globals page.
        let globals_pa = {
            // SAFETY: The page is not being concurrently accessed, and it has no
            // invariant requirements.
            let mut pt = unsafe { mapper.map::<[Pte; 512]>(pt_pa) };
            addr_space::init_ap(
                &mut pt,
                pt_pa,
                self.control_page_pa,
                command_page_pa,
                reg_page_pa,
                &mut memory,
            )
        };
        {
            // SAFETY: The page is not being concurrently accessed, and it has no
            // invariant requirements.
            let mut globals = unsafe { mapper.map::<MaybeUninit<VpGlobals>>(globals_pa) };
            globals.write(VpGlobals {
                hv_vp_index,
                node_cpu_index,
                overlays_mapped: false,
                register_page_mapped: false,
            });
        }

        // Initial register state: 64-bit long mode using the shared GDT/IDT,
        // entering at `ap_init` on this VP's stack.
        let cs = HvX64SegmentRegister {
            base: 0,
            limit: !0,
            selector: 2 * 8,
            attributes: x86defs::X64_DEFAULT_CODE_SEGMENT_ATTRIBUTES.into(),
        };
        let ds = HvX64SegmentRegister {
            base: 0,
            limit: !0,
            selector: 3 * 8,
            attributes: x86defs::X64_DEFAULT_DATA_SEGMENT_ATTRIBUTES.into(),
        };
        let gdtr = hvdef::HvX64TableRegister {
            base: addr_of!(GDT) as u64,
            limit: size_of_val(&GDT) as u16 - 1,
            pad: [0; 3],
        };
        let idtr = hvdef::HvX64TableRegister {
            base: addr_of!(IDT) as u64,
            // SAFETY: just getting the size
            limit: size_of_val(unsafe { &*addr_of!(IDT) }) as u16 - 1,
            pad: [0; 3],
        };
        let context = hvdef::hypercall::InitialVpContextX64 {
            rip: ap_init as *const () as u64,
            rsp: addr_space::stack().end() - 8, // start unaligned to match calling convention
            rflags: x86defs::RFlags::at_reset().into(),
            cs,
            ds,
            es: ds,
            fs: ds,
            gs: ds,
            ss: ds,
            tr: HvX64SegmentRegister {
                base: 0,
                limit: 0xffff,
                selector: 0,
                attributes: x86defs::X64_BUSY_TSS_SEGMENT_ATTRIBUTES.into(),
            },
            ldtr: FromZeros::new_zeroed(),
            idtr,
            gdtr,
            efer: x86defs::X64_EFER_LMA | x86defs::X64_EFER_LME | x86defs::X64_EFER_NXE,
            cr0: x86defs::X64_CR0_PG | x86defs::X64_CR0_PE | x86defs::X64_CR0_NE,
            cr3: pml4_pa,
            cr4: x86defs::X64_CR4_PAE | x86defs::X64_CR4_MCE | x86defs::X64_CR4_FXSR,
            msr_cr_pat: x86defs::X86X_MSR_DEFAULT_PAT,
        };

        // Enable VTL2 on the target VP. `VtlAlreadyEnabled` is treated as
        // success.
        {
            // SAFETY: no concurrent accessors.
            let input_page = unsafe { &mut *addr_space::hypercall_input().cast() };
            let EnableVpVtlX64 {
                partition_id,
                vp_index,
                target_vtl,
                reserved,
                vp_vtl_context,
            } = input_page;

            *partition_id = hvdef::HV_PARTITION_ID_SELF;
            *vp_index = hv_vp_index;
            *target_vtl = hvdef::Vtl::Vtl2.into();
            *vp_vtl_context = context;
            *reserved = [0; 3];
        }
        match hypercall(HypercallCode::HvCallEnableVpVtl, 0) {
            Ok(()) | Err(HvError::VtlAlreadyEnabled) => {}
            Err(err) => return Err(InitVpError::EnableVtl2(err)),
        }

        // Start the VP running in VTL2 with the context built above. The
        // hypercall input page is rebuilt here since the previous hypercall
        // consumed it.
        {
            // SAFETY: no concurrent accessors.
            let input_page = unsafe { &mut *addr_space::hypercall_input().cast() };
            let StartVirtualProcessorX64 {
                partition_id,
                vp_index,
                target_vtl,
                rsvd0,
                rsvd1,
                vp_context,
            } = input_page;

            *partition_id = hvdef::HV_PARTITION_ID_SELF;
            *vp_index = hv_vp_index;
            *target_vtl = hvdef::Vtl::Vtl2.into();
            *rsvd0 = 0;
            *rsvd1 = 0;
            *vp_context = context;
        }
        hypercall(HypercallCode::HvCallStartVirtualProcessor, 0).map_err(InitVpError::StartVp)?;

        Ok(())
    }
}
618}