virt_mshv_vtl/processor/tdx/mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Processor support for TDX partitions.
5
6mod tlb_flush;
7
8use super::BackingPrivate;
9use super::BackingSharedParams;
10use super::HardwareIsolatedBacking;
11use super::UhEmulationState;
12use super::UhHypercallHandler;
13use super::UhRunVpError;
14use super::hardware_cvm;
15use super::vp_state;
16use super::vp_state::UhVpStateAccess;
17use crate::BackingShared;
18use crate::GuestVtl;
19use crate::TlbFlushLockAccess;
20use crate::UhCvmPartitionState;
21use crate::UhCvmVpState;
22use crate::UhPartitionInner;
23use crate::UhPartitionNewParams;
24use crate::UhProcessor;
25use crate::WakeReason;
26use cvm_tracing::CVM_ALLOWED;
27use cvm_tracing::CVM_CONFIDENTIAL;
28use guestmem::GuestMemory;
29use hcl::ioctl::ProcessorRunner;
30use hcl::ioctl::tdx::Tdx;
31use hcl::ioctl::tdx::TdxPrivateRegs;
32use hcl::protocol::hcl_intr_offload_flags;
33use hcl::protocol::tdx_tdg_vp_enter_exit_info;
34use hv1_emulator::hv::ProcessorVtlHv;
35use hv1_emulator::synic::GlobalSynic;
36use hv1_emulator::synic::ProcessorSynic;
37use hv1_hypercall::AsHandler;
38use hv1_hypercall::HvRepResult;
39use hv1_hypercall::HypercallIo;
40use hv1_structs::ProcessorSet;
41use hv1_structs::VtlArray;
42use hvdef::HV_PAGE_SIZE;
43use hvdef::HvError;
44use hvdef::HvSynicSimpSiefp;
45use hvdef::HvX64PendingExceptionEvent;
46use hvdef::HvX64RegisterName;
47use hvdef::Vtl;
48use hvdef::hypercall::HvFlushFlags;
49use hvdef::hypercall::HvGvaRange;
50use inspect::Inspect;
51use inspect::InspectMut;
52use inspect_counters::Counter;
53use std::sync::atomic::AtomicU8;
54use std::sync::atomic::Ordering;
55use tlb_flush::FLUSH_GVA_LIST_SIZE;
56use tlb_flush::TdxFlushState;
57use tlb_flush::TdxPartitionFlushState;
58use virt::Processor;
59use virt::VpHaltReason;
60use virt::VpIndex;
61use virt::io::CpuIo;
62use virt::state::StateElement;
63use virt::vp;
64use virt::vp::AccessVpState;
65use virt::vp::MpState;
66use virt::vp::Registers;
67use virt::x86::MsrError;
68use virt::x86::MsrErrorExt;
69use virt::x86::SegmentRegister;
70use virt::x86::TableRegister;
71use virt_support_apic::ApicClient;
72use virt_support_apic::OffloadNotSupported;
73use virt_support_x86emu::emulate::EmulatedMemoryOperation;
74use virt_support_x86emu::emulate::EmulatorSupport as X86EmulatorSupport;
75use virt_support_x86emu::emulate::TranslateMode;
76use virt_support_x86emu::emulate::emulate_insn_memory_op;
77use virt_support_x86emu::emulate::emulate_io;
78use virt_support_x86emu::emulate::emulate_translate_gva;
79use virt_support_x86emu::translate::TranslationRegisters;
80use vm_topology::memory::AddressType;
81use vmcore::vmtime::VmTimeAccess;
82use x86defs::RFlags;
83use x86defs::X64_CR0_ET;
84use x86defs::X64_CR0_NE;
85use x86defs::X64_CR0_PE;
86use x86defs::X64_CR0_PG;
87use x86defs::X64_CR4_MCE;
88use x86defs::X64_CR4_UMIP;
89use x86defs::X64_CR4_VMXE;
90use x86defs::X64_EFER_FFXSR;
91use x86defs::X64_EFER_LMA;
92use x86defs::X64_EFER_LME;
93use x86defs::X64_EFER_NXE;
94use x86defs::X64_EFER_SVME;
95use x86defs::X86X_MSR_EFER;
96use x86defs::apic::X2APIC_MSR_BASE;
97use x86defs::tdx::TdCallResultCode;
98use x86defs::tdx::TdVmCallR10Result;
99use x86defs::tdx::TdxGp;
100use x86defs::tdx::TdxInstructionInfo;
101use x86defs::tdx::TdxL2Ctls;
102use x86defs::tdx::TdxVpEnterRaxResult;
103use x86defs::vmx::ApicPage;
104use x86defs::vmx::ApicRegister;
105use x86defs::vmx::CR_ACCESS_TYPE_LMSW;
106use x86defs::vmx::CR_ACCESS_TYPE_MOV_TO_CR;
107use x86defs::vmx::CrAccessQualification;
108use x86defs::vmx::ExitQualificationIo;
109use x86defs::vmx::GdtrOrIdtrInstruction;
110use x86defs::vmx::GdtrOrIdtrInstructionInfo;
111use x86defs::vmx::INTERRUPT_TYPE_EXTERNAL;
112use x86defs::vmx::INTERRUPT_TYPE_HARDWARE_EXCEPTION;
113use x86defs::vmx::INTERRUPT_TYPE_NMI;
114use x86defs::vmx::IO_SIZE_8_BIT;
115use x86defs::vmx::IO_SIZE_16_BIT;
116use x86defs::vmx::IO_SIZE_32_BIT;
117use x86defs::vmx::Interruptibility;
118use x86defs::vmx::InterruptionInformation;
119use x86defs::vmx::LdtrOrTrInstruction;
120use x86defs::vmx::LdtrOrTrInstructionInfo;
121use x86defs::vmx::ProcessorControls;
122use x86defs::vmx::SecondaryProcessorControls;
123use x86defs::vmx::VMX_ENTRY_CONTROL_LONG_MODE_GUEST;
124use x86defs::vmx::VMX_FEATURE_CONTROL_LOCKED;
125use x86defs::vmx::VmcsField;
126use x86defs::vmx::VmxEptExitQualification;
127use x86defs::vmx::VmxExit;
128use x86defs::vmx::VmxExitBasic;
129use x86emu::Gp;
130use x86emu::Segment;
131
132/// MSRs that are allowed to be read by the guest without interception.
133const MSR_ALLOWED_READ: &[u32] = &[
134    x86defs::X86X_MSR_TSC,
135    x86defs::X86X_MSR_TSC_AUX,
136    X86X_MSR_EFER,
137    x86defs::X86X_MSR_STAR,
138    x86defs::X86X_MSR_LSTAR,
139    x86defs::X86X_MSR_SFMASK,
140    x86defs::X86X_MSR_SYSENTER_CS,
141    x86defs::X86X_MSR_SYSENTER_ESP,
142    x86defs::X86X_MSR_SYSENTER_EIP,
143];
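// Note that writes to the MSRs above are still intercepted (only reads are
// passed through; see `BackingPrivate::new` below), which lets the paravisor
// handle writes itself; EFER in particular is tracked per VTL in `TdxVtl::efer`.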
144
145/// MSRs that are allowed to be read and written by the guest without interception.
146const MSR_ALLOWED_READ_WRITE: &[u32] = &[
147    x86defs::X64_MSR_FS_BASE,
148    x86defs::X64_MSR_GS_BASE,
149    x86defs::X64_MSR_KERNEL_GS_BASE,
150    x86defs::X86X_MSR_SPEC_CTRL,
151    x86defs::X86X_MSR_U_CET,
152    x86defs::X86X_MSR_S_CET,
153    x86defs::X86X_MSR_PL0_SSP,
154    x86defs::X86X_MSR_PL1_SSP,
155    x86defs::X86X_MSR_PL2_SSP,
156    x86defs::X86X_MSR_PL3_SSP,
157    x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
158    x86defs::X86X_IA32_MSR_XFD,
159    x86defs::X86X_IA32_MSR_XFD_ERR,
160];
161
162#[derive(Debug)]
163struct TdxExit<'a>(&'a tdx_tdg_vp_enter_exit_info);
164
165impl TdxExit<'_> {
166    fn code(&self) -> TdxVpEnterRaxResult {
167        self.0.rax.into()
168    }
169    fn qualification(&self) -> u64 {
170        self.0.rcx
171    }
172    fn gla(&self) -> Option<u64> {
173        // Only valid for EPT exits.
174        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
175            Some(self.0.rdx)
176        } else {
177            None
178        }
179    }
180    fn gpa(&self) -> Option<u64> {
181        // Only valid for EPT exits.
182        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
183            Some(self.0.r8)
184        } else {
185            None
186        }
187    }
188    fn _exit_interruption_info(&self) -> InterruptionInformation {
189        (self.0.r9 as u32).into()
190    }
191    fn _exit_interruption_error_code(&self) -> u32 {
192        (self.0.r9 >> 32) as u32
193    }
194    fn idt_vectoring_info(&self) -> InterruptionInformation {
195        (self.0.r10 as u32).into()
196    }
197    fn idt_vectoring_error_code(&self) -> u32 {
198        (self.0.r10 >> 32) as u32
199    }
200    fn instr_info(&self) -> TdxInstructionInfo {
201        self.0.r11.into()
202    }
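    // Judging from the field extraction below, the CS segment is packed into
    // RSI/RDI by the TD VP enter exit info: RSI holds the selector (bits 15:0),
    // attributes (bits 31:16), and limit (bits 63:32), while RDI holds the base.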
203    fn cs(&self) -> SegmentRegister {
204        SegmentRegister {
205            selector: self.0.rsi as u16,
206            base: self.0.rdi,
207            limit: (self.0.rsi >> 32) as u32,
208            attributes: (self.0.rsi >> 16) as u16,
209        }
210    }
211    fn cpl(&self) -> u8 {
212        self.0.r12 as u8 & 3
213    }
214}
215
216/// Registers that can be virtual and shadowed.
217#[derive(Debug, Inspect)]
218enum ShadowedRegister {
219    Cr0,
220    Cr4,
221}
222
223impl ShadowedRegister {
224    fn name(&self) -> &'static str {
225        match self {
226            Self::Cr0 => "cr0",
227            Self::Cr4 => "cr4",
228        }
229    }
230
231    fn physical_vmcs_field(&self) -> VmcsField {
232        match self {
233            Self::Cr0 => VmcsField::VMX_VMCS_GUEST_CR0,
234            Self::Cr4 => VmcsField::VMX_VMCS_GUEST_CR4,
235        }
236    }
237
238    fn shadow_vmcs_field(&self) -> VmcsField {
239        match self {
240            Self::Cr0 => VmcsField::VMX_VMCS_CR0_READ_SHADOW,
241            Self::Cr4 => VmcsField::VMX_VMCS_CR4_READ_SHADOW,
242        }
243    }
244
245    fn guest_owned_mask(&self) -> u64 {
246        // Control register bits that are guest owned by default. A bit is guest
247        // owned when the physical register bit is always set to the virtual
248        // register bit (subject to validation of the virtual register).
249        match self {
250            Self::Cr0 => {
251                X64_CR0_ET
252                    | x86defs::X64_CR0_MP
253                    | x86defs::X64_CR0_EM
254                    | x86defs::X64_CR0_TS
255                    | x86defs::X64_CR0_WP
256                    | x86defs::X64_CR0_AM
257                    | X64_CR0_PE
258                    | X64_CR0_PG
259            }
260            Self::Cr4 => {
261                x86defs::X64_CR4_VME
262                    | x86defs::X64_CR4_PVI
263                    | x86defs::X64_CR4_TSD
264                    | x86defs::X64_CR4_DE
265                    | x86defs::X64_CR4_PSE
266                    | x86defs::X64_CR4_PAE
267                    | x86defs::X64_CR4_PGE
268                    | x86defs::X64_CR4_PCE
269                    | x86defs::X64_CR4_FXSR
270                    | x86defs::X64_CR4_XMMEXCPT
271                    | X64_CR4_UMIP
272                    | x86defs::X64_CR4_LA57
273                    | x86defs::X64_CR4_RWFSGS
274                    | x86defs::X64_CR4_PCIDE
275                    | x86defs::X64_CR4_OSXSAVE
276                    | x86defs::X64_CR4_SMEP
277                    | x86defs::X64_CR4_SMAP
278                    | x86defs::X64_CR4_CET
279            }
280        }
281    }
282}
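// For example, CR0.NE is absent from the CR0 guest-owned mask above: TDX
// requires it to remain set in the physical register, so the guest's view of
// that bit is served from the read shadow via `VirtualRegister` below.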
283
284/// A virtual register that is shadowed by the virtstack.
285///
286/// Some bits are owned by the guest while others are owned by the virtstack,
287/// due to TDX requirements.
288#[derive(Inspect)]
289struct VirtualRegister {
290    /// The register being shadowed.
291    register: ShadowedRegister,
292    /// The VTL this register is shadowed for.
293    vtl: GuestVtl,
294    /// The value the guest sees.
295    shadow_value: u64,
296    /// Additional constraints on bits.
297    allowed_bits: u64,
298}
299
300impl VirtualRegister {
301    fn new(reg: ShadowedRegister, vtl: GuestVtl, initial_value: u64, allowed_bits: u64) -> Self {
302        Self {
303            register: reg,
304            vtl,
305            shadow_value: initial_value,
306            allowed_bits,
307        }
308    }
309
310    /// Write a new value to the virtual register. This updates host owned bits
311    /// in the shadowed value, and updates guest owned bits in the physical
312    /// register in the vmcs.
313    fn write<'a>(
314        &mut self,
315        value: u64,
316        runner: &mut ProcessorRunner<'a, Tdx<'a>>,
317    ) -> Result<(), vp_state::Error> {
318        tracing::trace!(?self.register, value, "write virtual register");
319
320        if value & !self.allowed_bits != 0 {
321            return Err(vp_state::Error::InvalidValue(
322                value,
323                self.register.name(),
324                "disallowed bit set",
325            ));
326        }
327
328        // If guest owned bits of the physical register have changed, then update
329        // the guest owned bits of the physical field.
330        let old_physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
331
332        tracing::trace!(old_physical_reg, "old_physical_reg");
333
334        let guest_owned_mask = self.register.guest_owned_mask();
335        if (old_physical_reg ^ value) & guest_owned_mask != 0 {
336            let new_physical_reg =
337                (old_physical_reg & !guest_owned_mask) | (value & guest_owned_mask);
338
339            tracing::trace!(new_physical_reg, "new_physical_reg");
340
341            runner.write_vmcs64(
342                self.vtl,
343                self.register.physical_vmcs_field(),
344                !0,
345                new_physical_reg,
346            );
347        }
348
349        self.shadow_value = value;
350        runner.write_vmcs64(self.vtl, self.register.shadow_vmcs_field(), !0, value);
351        Ok(())
352    }
353
354    fn read<'a>(&self, runner: &ProcessorRunner<'a, Tdx<'a>>) -> u64 {
355        let physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
356
357        // Get the bits owned by the host from the shadow and the bits owned by the
358        // guest from the physical value.
359        let guest_owned_mask = self.register.guest_owned_mask();
        (self.shadow_value & !guest_owned_mask) | (physical_reg & guest_owned_mask)
361    }
362}
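// A minimal test-only sketch of the bit-combining rule used by
// `VirtualRegister::read` and `write`: guest-owned bits follow the physical
// VMCS value, host-owned bits follow the shadow value. The module and test
// below are illustrative and not referenced by the code above.
#[cfg(test)]
mod virtual_register_mask_tests {
    use super::*;

    #[test]
    fn read_combines_shadow_and_physical_bits() {
        let guest_owned = ShadowedRegister::Cr0.guest_owned_mask();
        // CR0.NE is host owned; CR0.PE and CR0.PG are guest owned.
        let shadow = X64_CR0_PE | X64_CR0_NE;
        let physical = X64_CR0_PE | X64_CR0_PG | X64_CR0_NE;
        let combined = (shadow & !guest_owned) | (physical & guest_owned);
        // PG is visible because the guest owns it in the physical register;
        // NE comes from the shadow.
        assert_eq!(combined, X64_CR0_PE | X64_CR0_PG | X64_CR0_NE);
    }
}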
363
364/// Backing for TDX partitions.
365#[derive(InspectMut)]
366pub struct TdxBacked {
367    #[inspect(mut)]
368    vtls: VtlArray<TdxVtl, 2>,
369
370    untrusted_synic: Option<ProcessorSynic>,
371    #[inspect(hex, iter_by_index)]
372    eoi_exit_bitmap: [u64; 4],
373
374    /// A mapped page used for issuing INVGLA hypercalls.
375    #[inspect(skip)]
376    flush_page: user_driver::memory::MemoryBlock,
377
378    #[inspect(flatten)]
379    cvm: UhCvmVpState,
380}
381
382#[derive(InspectMut)]
383struct TdxVtl {
384    /// The EFER value for this VP.
385    efer: u64,
386    /// Virtual cr0.
387    cr0: VirtualRegister,
388    /// Virtual cr4.
389    cr4: VirtualRegister,
390
391    // CSTAR doesn't exist on TDX, but Windows likes to verify that values are sticky.
392    msr_cstar: u64,
393
394    tpr_threshold: u8,
395    #[inspect(skip)]
396    processor_controls: ProcessorControls,
397    #[inspect(skip)]
398    interruption_information: InterruptionInformation,
399    exception_error_code: u32,
400    interruption_set: bool,
401
402    #[inspect(mut)]
403    private_regs: TdxPrivateRegs,
404
405    /// TDX only TLB flush state.
406    flush_state: TdxFlushState,
407
408    enter_stats: EnterStats,
409    exit_stats: ExitStats,
410}
411
412#[derive(Default)]
413pub struct TdxEmulationCache {
414    segs: [Option<SegmentRegister>; 6],
415    cr0: Option<u64>,
416}
417
418#[derive(Inspect, Default)]
419struct EnterStats {
420    success: Counter,
421    host_routed_async: Counter,
422    l2_exit_pending_intr: Counter,
423    pending_intr: Counter,
424    host_routed_td_vmcall: Counter,
425}
426
427#[derive(Inspect, Default)]
428struct ExitStats {
429    io: Counter,
430    msr_read: Counter,
431    msr_write: Counter,
432    ept_violation: Counter,
433    cpuid: Counter,
434    cr_access: Counter,
435    xsetbv: Counter,
436    tpr_below_threshold: Counter,
437    interrupt_window: Counter,
438    nmi_window: Counter,
439    vmcall: Counter,
440    smi_intr: Counter,
441    wbinvd: Counter,
442    hw_interrupt: Counter,
443    tdcall: Counter,
444    hlt: Counter,
445    pause: Counter,
446    needs_interrupt_reinject: Counter,
447    exception: Counter,
448    descriptor_table: Counter,
449}
450
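/// Direct overlay pages allocated per VP from the shared pool: `Sipp` backs the
/// synic message page (SIMP) and `Sifp` backs the synic event flags page (SIEFP).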
451enum UhDirectOverlay {
452    Sipp,
453    Sifp,
454    Count,
455}
456
457impl HardwareIsolatedBacking for TdxBacked {
458    fn cvm_state(&self) -> &UhCvmVpState {
459        &self.cvm
460    }
461
462    fn cvm_state_mut(&mut self) -> &mut UhCvmVpState {
463        &mut self.cvm
464    }
465
466    fn cvm_partition_state(shared: &Self::Shared) -> &UhCvmPartitionState {
467        &shared.cvm
468    }
469
470    fn switch_vtl(this: &mut UhProcessor<'_, Self>, _source_vtl: GuestVtl, target_vtl: GuestVtl) {
471        // The GPs, Fxsave, and CR2 are saved in the shared kernel state. No copying needed.
472        // Debug registers and XFEM are shared architecturally. No copying needed.
473
474        this.backing.cvm_state_mut().exit_vtl = target_vtl;
475    }
476
477    fn translation_registers(
478        &self,
479        this: &UhProcessor<'_, Self>,
480        vtl: GuestVtl,
481    ) -> TranslationRegisters {
482        let cr0 = this.backing.vtls[vtl].cr0.read(&this.runner);
483        let cr4 = this.backing.vtls[vtl].cr4.read(&this.runner);
484        let efer = this.backing.vtls[vtl].efer;
485        let cr3 = this.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
486        let ss = this.read_segment(vtl, TdxSegmentReg::Ss).into();
487        let rflags = this.backing.vtls[vtl].private_regs.rflags;
488
489        TranslationRegisters {
490            cr0,
491            cr4,
492            efer,
493            cr3,
494            ss,
495            rflags,
496            encryption_mode: this.partition.caps.vtom.map_or(
497                virt_support_x86emu::translate::EncryptionMode::None,
498                virt_support_x86emu::translate::EncryptionMode::Vtom,
499            ),
500        }
501    }
502
503    fn tlb_flush_lock_access<'a>(
504        vp_index: Option<VpIndex>,
505        partition: &'a UhPartitionInner,
506        shared: &'a Self::Shared,
507    ) -> impl TlbFlushLockAccess + 'a {
508        TdxTlbLockFlushAccess {
509            vp_index,
510            partition,
511            shared,
512        }
513    }
514
515    fn pending_event_vector(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> Option<u8> {
516        let event_inject = this.backing.vtls[vtl].interruption_information;
517        if event_inject.valid() {
518            Some(event_inject.vector())
519        } else {
520            None
521        }
522    }
523
524    fn set_pending_exception(
525        this: &mut UhProcessor<'_, Self>,
526        vtl: GuestVtl,
527        event: HvX64PendingExceptionEvent,
528    ) {
529        let new_intr = InterruptionInformation::new()
530            .with_valid(true)
531            .with_deliver_error_code(event.deliver_error_code())
532            .with_vector(event.vector().try_into().unwrap())
533            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
534
535        this.backing.vtls[vtl].interruption_information = new_intr;
536        this.backing.vtls[vtl].exception_error_code = event.error_code();
537    }
538
539    fn cr0(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
540        this.read_cr0(vtl)
541    }
542
543    fn cr4(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
544        this.read_cr4(vtl)
545    }
546
547    fn intercept_message_state(
548        this: &UhProcessor<'_, Self>,
549        vtl: GuestVtl,
550        include_optional_state: bool,
551    ) -> super::InterceptMessageState {
552        let exit = TdxExit(this.runner.tdx_vp_enter_exit_info());
553        let backing_vtl = &this.backing.vtls[vtl];
554        let shared_gps = this.runner.tdx_enter_guest_gps();
555
556        super::InterceptMessageState {
557            instruction_length_and_cr8: exit.instr_info().length() as u8,
558            cpl: exit.cpl(),
559            efer_lma: backing_vtl.efer & X64_EFER_LMA != 0,
560            cs: exit.cs().into(),
561            rip: backing_vtl.private_regs.rip,
562            rflags: backing_vtl.private_regs.rflags,
563            rax: shared_gps[TdxGp::RAX],
564            rdx: shared_gps[TdxGp::RDX],
565            optional: if include_optional_state {
566                Some(super::InterceptMessageOptionalState {
567                    ds: this.read_segment(vtl, TdxSegmentReg::Ds).into(),
568                    es: this.read_segment(vtl, TdxSegmentReg::Es).into(),
569                })
570            } else {
571                None
572            },
573            rcx: shared_gps[TdxGp::RCX],
574            rsi: shared_gps[TdxGp::RSI],
575            rdi: shared_gps[TdxGp::RDI],
576        }
577    }
578
579    fn cr_intercept_registration(
580        this: &mut UhProcessor<'_, Self>,
581        intercept_control: hvdef::HvRegisterCrInterceptControl,
582    ) {
583        // Today we only support intercepting VTL 0 on behalf of VTL 1.
584        let vtl = GuestVtl::Vtl0;
585        let intercept_masks = &this
586            .backing
587            .cvm_state()
588            .vtl1
589            .as_ref()
590            .unwrap()
591            .reg_intercept;
592
593        // Update CR0 and CR4 intercept masks in the VMCS.
594        this.runner.write_vmcs64(
595            vtl,
596            VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
597            !0,
598            this.shared.cr_guest_host_mask(ShadowedRegister::Cr0)
599                | if intercept_control.cr0_write() {
600                    intercept_masks.cr0_mask
601                } else {
602                    0
603                },
604        );
605        this.runner.write_vmcs64(
606            vtl,
607            VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
608            !0,
609            this.shared.cr_guest_host_mask(ShadowedRegister::Cr4)
610                | if intercept_control.cr4_write() {
611                    intercept_masks.cr4_mask
612                } else {
613                    0
614                },
615        );
616
617        // Update descriptor table intercepts.
618        let intercept_tables = intercept_control.gdtr_write()
619            | intercept_control.idtr_write()
620            | intercept_control.ldtr_write()
621            | intercept_control.tr_write();
622        this.runner.write_vmcs32(
623            vtl,
624            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
625            SecondaryProcessorControls::new()
626                .with_descriptor_table_exiting(true)
627                .into_bits(),
628            SecondaryProcessorControls::new()
629                .with_descriptor_table_exiting(intercept_tables)
630                .into_bits(),
631        );
632
633        // Update MSR intercepts. We only need to update those that are allowed
634        // to be passed through, as the default otherwise is to always intercept.
635        // See [`MSR_ALLOWED_READ_WRITE`].
636        this.runner.set_msr_bit(
637            vtl,
638            x86defs::X86X_MSR_S_CET,
639            true,
640            intercept_control.msr_scet_write(),
641        );
642        this.runner.set_msr_bit(
643            vtl,
644            x86defs::X86X_MSR_PL0_SSP,
645            true,
646            intercept_control.msr_pls_ssp_write(),
647        );
648        this.runner.set_msr_bit(
649            vtl,
650            x86defs::X86X_MSR_PL1_SSP,
651            true,
652            intercept_control.msr_pls_ssp_write(),
653        );
654        this.runner.set_msr_bit(
655            vtl,
656            x86defs::X86X_MSR_PL2_SSP,
657            true,
658            intercept_control.msr_pls_ssp_write(),
659        );
660        this.runner.set_msr_bit(
661            vtl,
662            x86defs::X86X_MSR_PL3_SSP,
663            true,
664            intercept_control.msr_pls_ssp_write(),
665        );
666        this.runner.set_msr_bit(
667            vtl,
668            x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
669            true,
670            intercept_control.msr_pls_ssp_write(),
671        );
672    }
673
674    fn is_interrupt_pending(
675        this: &mut UhProcessor<'_, Self>,
676        vtl: GuestVtl,
677        check_rflags: bool,
678        dev: &impl CpuIo,
679    ) -> bool {
680        let backing_vtl = &this.backing.vtls[vtl];
681        if backing_vtl.interruption_information.valid()
682            && backing_vtl.interruption_information.interruption_type() == INTERRUPT_TYPE_NMI
683        {
684            return true;
685        }
686
687        let (vector, ppr) = if this.backing.cvm.lapics[vtl].lapic.is_offloaded() {
688            let vector = backing_vtl.private_regs.rvi;
689            let ppr = std::cmp::max(
690                backing_vtl.private_regs.svi.into(),
691                this.runner.tdx_apic_page(vtl).tpr.value,
692            );
693            (vector, ppr)
694        } else {
695            let lapic = &mut this.backing.cvm.lapics[vtl].lapic;
696            let vector = lapic.next_irr().unwrap_or(0);
697            let ppr = lapic
698                .access(&mut TdxApicClient {
699                    partition: this.partition,
700                    apic_page: this.runner.tdx_apic_page_mut(vtl),
701                    dev,
702                    vmtime: &this.vmtime,
703                    vtl,
704                })
705                .get_ppr();
706            (vector, ppr)
707        };
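        // An interrupt is only pending for delivery if its priority class
        // (vector bits 7:4) is strictly greater than the processor priority
        // class.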
708        let vector_priority = (vector as u32) >> 4;
709        let ppr_priority = ppr >> 4;
710
711        if vector_priority <= ppr_priority {
712            return false;
713        }
714
715        if check_rflags && !RFlags::from_bits(backing_vtl.private_regs.rflags).interrupt_enable() {
716            return false;
717        }
718
719        let interruptibility: Interruptibility = this
720            .runner
721            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
722            .into();
723
724        if interruptibility.blocked_by_sti() || interruptibility.blocked_by_movss() {
725            return false;
726        }
727
728        true
729    }
730
731    fn untrusted_synic_mut(&mut self) -> Option<&mut ProcessorSynic> {
732        self.untrusted_synic.as_mut()
733    }
734}
735
736/// Partition-wide shared data for TDX VPs.
737#[derive(Inspect)]
738pub struct TdxBackedShared {
739    #[inspect(flatten)]
740    pub(crate) cvm: UhCvmPartitionState,
741    /// The synic state used for untrusted SINTs, that is, the SINTs for which
742    /// the guest thinks it is interacting directly with the untrusted
743    /// hypervisor via an architecture-specific interface.
744    pub(crate) untrusted_synic: Option<GlobalSynic>,
745    flush_state: VtlArray<TdxPartitionFlushState, 2>,
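    /// The VTL currently running on each VP, used to decide whether a VP must
    /// be woken to process a TLB flush. VPs start out in VTL 2.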
746    #[inspect(iter_by_index)]
747    active_vtl: Vec<AtomicU8>,
748    /// CR4 bits that the guest is allowed to set to 1.
749    cr4_allowed_bits: u64,
750}
751
752impl TdxBackedShared {
753    pub(crate) fn new(
754        partition_params: &UhPartitionNewParams<'_>,
755        params: BackingSharedParams<'_>,
756    ) -> Result<Self, crate::Error> {
757        // Create a second synic to fully manage the untrusted SINTs
758        // here. At time of writing, the hypervisor does not support
759        // sharing the untrusted SINTs with the TDX L1. Even if it did,
760        // performance would be poor for cases where the L1 implements
761        // high-performance devices.
762        let untrusted_synic = (partition_params.handle_synic && !partition_params.hide_isolation)
763            .then(|| GlobalSynic::new(partition_params.topology.vp_count()));
764
765        // TODO TDX: Consider just using MSR kernel module instead of explicit ioctl.
766        let cr4_fixed1 = params.hcl.read_vmx_cr4_fixed1();
767        let cr4_allowed_bits =
768            (ShadowedRegister::Cr4.guest_owned_mask() | X64_CR4_MCE) & cr4_fixed1;
769
770        Ok(Self {
771            untrusted_synic,
772            flush_state: VtlArray::from_fn(|_| TdxPartitionFlushState::new()),
773            cvm: params.cvm_state.unwrap(),
774            // VPs start in VTL 2.
775            active_vtl: std::iter::repeat_n(2, partition_params.topology.vp_count() as usize)
776                .map(AtomicU8::new)
777                .collect(),
778            cr4_allowed_bits,
779        })
780    }
781
782    /// Get the default guest host mask for the specified register.
783    fn cr_guest_host_mask(&self, reg: ShadowedRegister) -> u64 {
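        // In a VMX guest/host mask, a 1 bit is host owned: guest reads of the
        // bit return the read shadow value, and guest writes that would change
        // it cause a VM exit. A 0 bit is owned directly by the guest.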
784        match reg {
785            ShadowedRegister::Cr0 => {
786                !ShadowedRegister::Cr0.guest_owned_mask() | X64_CR0_PE | X64_CR0_PG
787            }
788            ShadowedRegister::Cr4 => {
789                !(ShadowedRegister::Cr4.guest_owned_mask() & self.cr4_allowed_bits)
790            }
791        }
792    }
793}
794
795impl TdxBacked {
796    /// Gets the number of pages that will be allocated from the shared page pool
797    /// for each CPU.
798    pub fn shared_pages_required_per_cpu() -> u64 {
799        UhDirectOverlay::Count as u64
800    }
801}
802
803// The memory used to back the untrusted synic is not guest-visible, but rather
804// is allocated from our shared pool. Therefore it does not need to go through
805// the normal memory protections path.
806struct UntrustedSynicVtlProts<'a>(&'a GuestMemory);
807
808impl hv1_emulator::VtlProtectAccess for UntrustedSynicVtlProts<'_> {
809    fn check_modify_and_lock_overlay_page(
810        &mut self,
811        gpn: u64,
812        _check_perms: hvdef::HvMapGpaFlags,
813        _new_perms: Option<hvdef::HvMapGpaFlags>,
814    ) -> Result<guestmem::LockedPages, HvError> {
815        self.0
816            .lock_gpns(false, &[gpn])
817            .map_err(|_| HvError::OperationFailed)
818    }
819
820    fn unlock_overlay_page(&mut self, _gpn: u64) -> Result<(), HvError> {
821        Ok(())
822    }
823}
824
825#[expect(private_interfaces)]
826impl BackingPrivate for TdxBacked {
827    type HclBacking<'tdx> = Tdx<'tdx>;
828    type Shared = TdxBackedShared;
829    type EmulationCache = TdxEmulationCache;
830
831    fn shared(shared: &BackingShared) -> &Self::Shared {
832        let BackingShared::Tdx(shared) = shared else {
833            unreachable!()
834        };
835        shared
836    }
837
838    fn new(
839        params: super::BackingParams<'_, '_, Self>,
840        shared: &TdxBackedShared,
841    ) -> Result<Self, crate::Error> {
842        // TODO TDX: ssp is for shadow stack
843        // TODO TDX: direct overlay like snp?
844        // TODO TDX: lapic / APIC setup?
845        // TODO TDX: see ValInitializeVplc
846        // TODO TDX: XCR_XFMEM setup?
847
848        // Turn on MBEC for just VTL 0.
849        params.runner.write_vmcs32(
850            GuestVtl::Vtl0,
851            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
852            SecondaryProcessorControls::new()
853                .with_mode_based_execute_control(true)
854                .into(),
855            SecondaryProcessorControls::new()
856                .with_mode_based_execute_control(true)
857                .into(),
858        );
859
860        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
861            let controls = TdxL2Ctls::new()
862                // Configure L2 controls to permit shared memory.
863                .with_enable_shared_ept(!shared.cvm.hide_isolation)
864                // If the synic is to be managed by the hypervisor, then enable TDVMCALLs.
865                .with_enable_tdvmcall(
866                    shared.untrusted_synic.is_none() && !shared.cvm.hide_isolation,
867                );
868
869            params
870                .runner
871                .set_l2_ctls(vtl, controls)
872                .map_err(crate::Error::FailedToSetL2Ctls)?;
873
            // Set guest/host masks for CR0 and CR4. This enables shadowing of
            // these registers, since TDX requires certain bits to be set at all
            // times.
876            let initial_cr0 = params
877                .runner
878                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
879            assert_eq!(initial_cr0, X64_CR0_PE | X64_CR0_NE);
880
            // N.B. CR0.PE and CR0.PG are guest owned, but changes to them are
            // still intercepted for caching purposes and to ensure EFER is
            // managed properly when the execution state changes.
884            params.runner.write_vmcs64(
885                vtl,
886                VmcsField::VMX_VMCS_CR0_READ_SHADOW,
887                !0,
888                X64_CR0_PE | X64_CR0_NE,
889            );
890            params.runner.write_vmcs64(
891                vtl,
892                VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
893                !0,
894                shared.cr_guest_host_mask(ShadowedRegister::Cr0),
895            );
896
897            let initial_cr4 = params
898                .runner
899                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
900            assert_eq!(initial_cr4, X64_CR4_MCE | X64_CR4_VMXE);
901
902            params
903                .runner
904                .write_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW, !0, 0);
905            params.runner.write_vmcs64(
906                vtl,
907                VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
908                !0,
909                shared.cr_guest_host_mask(ShadowedRegister::Cr4),
910            );
911
            // Configure the MSR bitmap for this VP. Since the default MSR bitmap
            // intercepts everything, only the MSRs that we want to pass through
            // need to be set.
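            // Judging from the call sites here and in `cr_intercept_registration`,
            // `set_msr_bit`'s third argument selects the write bitmap (true) or
            // the read bitmap (false), and the fourth selects whether the access
            // is intercepted.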
915            for msr in MSR_ALLOWED_READ {
916                params.runner.set_msr_bit(vtl, *msr, false, false);
917            }
918            for msr in MSR_ALLOWED_READ_WRITE {
919                params.runner.set_msr_bit(vtl, *msr, false, false);
920                params.runner.set_msr_bit(vtl, *msr, true, false);
921            }
922
923            // Set the exception bitmap.
924            if params.partition.intercept_debug_exceptions {
925                if cfg!(feature = "gdb") {
926                    let initial_exception_bitmap = params
927                        .runner
928                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
929
930                    let exception_bitmap =
931                        initial_exception_bitmap | (1 << x86defs::Exception::DEBUG.0);
932
933                    params.runner.write_vmcs32(
934                        vtl,
935                        VmcsField::VMX_VMCS_EXCEPTION_BITMAP,
936                        !0,
937                        exception_bitmap,
938                    );
939                } else {
940                    return Err(super::Error::InvalidDebugConfiguration);
941                }
942            }
943        }
944
945        let flush_page = shared
946            .cvm
947            .private_dma_client
948            .allocate_dma_buffer(HV_PAGE_SIZE as usize)
949            .map_err(crate::Error::AllocateTlbFlushPage)?;
950
951        let untrusted_synic = shared
952            .untrusted_synic
953            .as_ref()
954            .map(|synic| synic.add_vp(params.vp_info.base.vp_index));
955
956        Ok(Self {
957            vtls: VtlArray::from_fn(|vtl| {
958                let vtl: GuestVtl = vtl.try_into().unwrap();
959                TdxVtl {
960                    efer: params
961                        .runner
962                        .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER),
963                    cr0: VirtualRegister::new(
964                        ShadowedRegister::Cr0,
965                        vtl,
966                        params
967                            .runner
968                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0),
969                        !0,
970                    ),
971                    cr4: VirtualRegister::new(
972                        ShadowedRegister::Cr4,
973                        vtl,
974                        params
975                            .runner
976                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4),
977                        shared.cr4_allowed_bits,
978                    ),
979                    msr_cstar: 0,
980                    tpr_threshold: 0,
981                    processor_controls: params
982                        .runner
983                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS)
984                        .into(),
985                    interruption_information: Default::default(),
986                    exception_error_code: 0,
987                    interruption_set: false,
988                    flush_state: TdxFlushState::new(),
989                    private_regs: TdxPrivateRegs::new(vtl),
990                    enter_stats: Default::default(),
991                    exit_stats: Default::default(),
992                }
993            }),
994            untrusted_synic,
995            eoi_exit_bitmap: [0; 4],
996            flush_page,
997            cvm: UhCvmVpState::new(
998                &shared.cvm,
999                params.partition,
1000                params.vp_info,
1001                UhDirectOverlay::Count as usize,
1002            )?,
1003        })
1004    }
1005
1006    type StateAccess<'p, 'a>
1007        = UhVpStateAccess<'a, 'p, Self>
1008    where
1009        Self: 'a + 'p,
1010        'p: 'a;
1011
1012    fn access_vp_state<'a, 'p>(
1013        this: &'a mut UhProcessor<'p, Self>,
1014        vtl: GuestVtl,
1015    ) -> Self::StateAccess<'p, 'a> {
1016        UhVpStateAccess::new(this, vtl)
1017    }
1018
1019    fn init(this: &mut UhProcessor<'_, Self>) {
1020        // Configure the synic direct overlays.
1021        // So far, only VTL 0 is using these (for VMBus).
1022        let pfns = &this.backing.cvm.direct_overlay_handle.pfns();
1023        let reg = |gpn| {
1024            u64::from(
1025                HvSynicSimpSiefp::new()
1026                    .with_base_gpn(gpn)
1027                    .with_enabled(true),
1028            )
1029        };
1030
1031        let values: &[(HvX64RegisterName, u64); 2] = &[
1032            (
1033                HvX64RegisterName::Sifp,
1034                reg(pfns[UhDirectOverlay::Sifp as usize]),
1035            ),
1036            (
1037                HvX64RegisterName::Sipp,
1038                reg(pfns[UhDirectOverlay::Sipp as usize]),
1039            ),
1040        ];
1041
1042        let reg_count = if let Some(synic) = &mut this.backing.untrusted_synic {
1043            let prot_access = &mut UntrustedSynicVtlProts(&this.partition.gm[GuestVtl::Vtl0]);
1044
1045            synic
1046                .set_simp(reg(pfns[UhDirectOverlay::Sipp as usize]), prot_access)
1047                .unwrap();
1048            synic
1049                .set_siefp(reg(pfns[UhDirectOverlay::Sifp as usize]), prot_access)
1050                .unwrap();
1051            // Set the SIEFP in the hypervisor so that the hypervisor can
1052            // directly signal synic events. Don't set the SIMP, since the
1053            // message page is owned by the paravisor.
1054            1
1055        } else {
1056            2
1057        };
1058
1059        this.runner
1060            .set_vp_registers_hvcall(Vtl::Vtl0, &values[..reg_count])
1061            .expect("set_vp_registers hypercall for direct overlays should succeed");
1062
1063        // Enable APIC offload by default for VTL 0.
1064        this.set_apic_offload(GuestVtl::Vtl0, true);
1065        this.backing.cvm.lapics[GuestVtl::Vtl0]
1066            .lapic
1067            .enable_offload();
1068
1069        // But disable it for VTL 1.
1070        this.set_apic_offload(GuestVtl::Vtl1, false);
1071
1072        // Initialize registers to the reset state, since this may be different
1073        // than what's on the VMCS and is certainly different than what's in the
1074        // VP enter and private register state (which was mostly zero
1075        // initialized).
1076        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
1077            let registers = Registers::at_reset(&this.partition.caps, &this.inner.vp_info);
1078
1079            let mut state = this.access_state(vtl.into());
1080            state
1081                .set_registers(&registers)
1082                .expect("Resetting to architectural state should succeed");
1083
1084            state.commit().expect("committing state should succeed");
1085        }
1086
1087        // FX regs and XMM registers are zero-initialized by the kernel. Set
1088        // them to the arch default.
1089        *this.runner.fx_state_mut() =
1090            vp::Xsave::at_reset(&this.partition.caps, &this.inner.vp_info).fxsave();
1091    }
1092
1093    async fn run_vp(
1094        this: &mut UhProcessor<'_, Self>,
1095        dev: &impl CpuIo,
1096        _stop: &mut virt::StopVp<'_>,
1097    ) -> Result<(), VpHaltReason<UhRunVpError>> {
1098        this.run_vp_tdx(dev).await
1099    }
1100
1101    fn poll_apic(
1102        this: &mut UhProcessor<'_, Self>,
1103        vtl: GuestVtl,
1104        scan_irr: bool,
1105    ) -> Result<(), UhRunVpError> {
1106        if !this.try_poll_apic(vtl, scan_irr)? {
1107            tracing::info!(CVM_ALLOWED, "disabling APIC offload due to auto EOI");
1108            let page = this.runner.tdx_apic_page_mut(vtl);
1109            let (irr, isr) = pull_apic_offload(page);
1110
1111            this.backing.cvm.lapics[vtl]
1112                .lapic
1113                .disable_offload(&irr, &isr);
1114            this.set_apic_offload(vtl, false);
1115            this.try_poll_apic(vtl, false)?;
1116        }
1117
1118        Ok(())
1119    }
1120
1121    fn request_extint_readiness(_this: &mut UhProcessor<'_, Self>) {
1122        unreachable!("extint managed through software apic")
1123    }
1124
1125    fn request_untrusted_sint_readiness(this: &mut UhProcessor<'_, Self>, sints: u16) {
1126        if let Some(synic) = &mut this.backing.untrusted_synic {
1127            synic.request_sint_readiness(sints);
1128        } else {
1129            tracelimit::error_ratelimited!(CVM_ALLOWED, "untrusted synic is not configured");
1130        }
1131    }
1132
1133    fn hv(&self, vtl: GuestVtl) -> Option<&ProcessorVtlHv> {
1134        Some(&self.cvm.hv[vtl])
1135    }
1136
1137    fn hv_mut(&mut self, vtl: GuestVtl) -> Option<&mut ProcessorVtlHv> {
1138        Some(&mut self.cvm.hv[vtl])
1139    }
1140
1141    fn handle_vp_start_enable_vtl_wake(
1142        this: &mut UhProcessor<'_, Self>,
1143        vtl: GuestVtl,
1144    ) -> Result<(), UhRunVpError> {
1145        this.hcvm_handle_vp_start_enable_vtl(vtl)
1146    }
1147
1148    fn vtl1_inspectable(this: &UhProcessor<'_, Self>) -> bool {
1149        this.hcvm_vtl1_inspectable()
1150    }
1151
1152    fn process_interrupts(
1153        this: &mut UhProcessor<'_, Self>,
1154        scan_irr: VtlArray<bool, 2>,
1155        first_scan_irr: &mut bool,
1156        dev: &impl CpuIo,
1157    ) -> Result<bool, VpHaltReason<UhRunVpError>> {
1158        this.cvm_process_interrupts(scan_irr, first_scan_irr, dev)
1159    }
1160}
1161
1162impl UhProcessor<'_, TdxBacked> {
1163    /// Returns `Ok(false)` if the APIC offload needs to be disabled and the
1164    /// poll retried.
1165    fn try_poll_apic(&mut self, vtl: GuestVtl, scan_irr: bool) -> Result<bool, UhRunVpError> {
1166        let mut scan = TdxApicScanner {
1167            processor_controls: self.backing.vtls[vtl]
1168                .processor_controls
1169                .with_nmi_window_exiting(false)
1170                .with_interrupt_window_exiting(false),
1171            vp: self,
1172            tpr_threshold: 0,
1173        };
1174
1175        // TODO TDX: filter proxy IRRs by setting the `proxy_irr_blocked` field of the run page
1176        hardware_cvm::apic::poll_apic_core(&mut scan, vtl, scan_irr)?;
1177
1178        let TdxApicScanner {
1179            vp: _,
1180            processor_controls: new_processor_controls,
1181            tpr_threshold: new_tpr_threshold,
1182        } = scan;
1183
1184        // Interrupts are ignored while waiting for SIPI.
1185        if self.backing.cvm.lapics[vtl].activity != MpState::WaitForSipi
1186            && self.backing.vtls[vtl].tpr_threshold != new_tpr_threshold
1187        {
1188            tracing::trace!(new_tpr_threshold, ?vtl, "setting tpr threshold");
1189            self.runner.write_vmcs32(
1190                vtl,
1191                VmcsField::VMX_VMCS_TPR_THRESHOLD,
1192                !0,
1193                new_tpr_threshold.into(),
1194            );
1195            self.backing.vtls[vtl].tpr_threshold = new_tpr_threshold;
1196        }
1197
1198        if self.backing.vtls[vtl].processor_controls != new_processor_controls {
1199            tracing::trace!(?new_processor_controls, ?vtl, "requesting window change");
1200            self.runner.write_vmcs32(
1201                vtl,
1202                VmcsField::VMX_VMCS_PROCESSOR_CONTROLS,
1203                !0,
1204                new_processor_controls.into(),
1205            );
1206            self.backing.vtls[vtl].processor_controls = new_processor_controls;
1207        }
1208
        // Offloading and proxying are only done with VTL 0 today.
1210        if vtl == GuestVtl::Vtl0 {
1211            let mut update_rvi = false;
1212            let r: Result<(), OffloadNotSupported> = self.backing.cvm.lapics[vtl]
1213                .lapic
1214                .push_to_offload(|irr, isr, tmr| {
1215                    let apic_page = self.runner.tdx_apic_page_mut(vtl);
1216
1217                    for (((irr, page_irr), isr), page_isr) in irr
1218                        .iter()
1219                        .zip(&mut apic_page.irr)
1220                        .zip(isr)
1221                        .zip(&mut apic_page.isr)
1222                    {
1223                        page_irr.value |= *irr;
1224                        page_isr.value |= *isr;
1225                    }
1226
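                    // In VMX virtual interrupt delivery terms, RVI is the vector
                    // of the highest-priority pending (requested) virtual
                    // interrupt and SVI is the vector of the highest-priority
                    // interrupt currently in service.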
1227                    // Update SVI and RVI.
1228                    let svi = top_vector(&apic_page.isr);
1229                    self.backing.vtls[vtl].private_regs.svi = svi;
1230                    update_rvi = true;
1231
1232                    // Ensure the EOI exit bitmap is up to date.
1233                    let fields = [
1234                        VmcsField::VMX_VMCS_EOI_EXIT_0,
1235                        VmcsField::VMX_VMCS_EOI_EXIT_1,
1236                        VmcsField::VMX_VMCS_EOI_EXIT_2,
1237                        VmcsField::VMX_VMCS_EOI_EXIT_3,
1238                    ];
1239                    for ((&field, eoi_exit), (i, tmr)) in fields
1240                        .iter()
1241                        .zip(&mut self.backing.eoi_exit_bitmap)
1242                        .zip(tmr.chunks_exact(2).enumerate())
1243                    {
1244                        let tmr = tmr[0] as u64 | ((tmr[1] as u64) << 32);
1245                        if *eoi_exit != tmr {
1246                            self.runner.write_vmcs64(vtl, field, !0, tmr);
1247                            *eoi_exit = tmr;
1248                            // The kernel driver supports some common APIC functionality (ICR writes,
1249                            // interrupt injection). When the kernel driver handles an interrupt, it
1250                            // must know if that interrupt was previously level-triggered. Otherwise,
1251                            // the EOI will be incorrectly treated as level-triggered. We keep a copy
1252                            // of the tmr in the kernel so it knows when this scenario occurs.
1253                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2] = tmr as u32;
1254                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2 + 1] = (tmr >> 32) as u32;
1255                        }
1256                    }
1257                });
1258
1259            if let Err(OffloadNotSupported) = r {
1260                // APIC needs offloading to be disabled to support auto-EOI. The caller
1261                // will disable offload and try again.
1262                return Ok(false);
1263            }
1264
1265            if update_rvi {
1266                let page = self.runner.tdx_apic_page_mut(vtl);
1267                let rvi = top_vector(&page.irr);
1268                self.backing.vtls[vtl].private_regs.rvi = rvi;
1269            }
1270        }
1271
1272        // If there is a pending interrupt, clear the halted and idle state.
1273        if (self.backing.cvm.lapics[vtl].activity != MpState::Running)
1274            && self.backing.cvm.lapics[vtl].lapic.is_offloaded()
1275            && self.backing.vtls[vtl].private_regs.rvi != 0
1276        {
1277            // To model a non-virtualized processor, we should only do this if
1278            // TPR and IF and interrupt shadow allow. However, fetching the
1279            // interrupt shadow state is expensive (tdcall). This shouldn't
1280            // matter much, because real guests don't issue hlt while in
1281            // interrupt shadow or with interrupts disabled or with a non-zero
1282            // TPR.
1283            //
1284            // Note that the processor will not actually inject the interrupt
1285            // until conditions hold. So, unless the guest fails to loop around
1286            // and hlt again (which we already treat as a guest bug, since
1287            // Hyper-V in general does not guarantee hlt will stick until an
1288            // interrupt is pending), at worst this will just burn some CPU.
1289            self.backing.cvm.lapics[vtl].activity = MpState::Running;
1290        }
1291
1292        Ok(true)
1293    }
1294
1295    fn access_apic_without_offload<R>(
1296        &mut self,
1297        vtl: GuestVtl,
1298        f: impl FnOnce(&mut Self) -> R,
1299    ) -> R {
1300        let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded();
1301        if offloaded {
1302            let (irr, isr) = pull_apic_offload(self.runner.tdx_apic_page_mut(vtl));
1303            self.backing.cvm.lapics[vtl]
1304                .lapic
1305                .disable_offload(&irr, &isr);
1306        }
1307        let r = f(self);
1308        if offloaded {
1309            self.backing.cvm.lapics[vtl].lapic.enable_offload();
1310        }
1311        r
1312    }
1313
1314    fn set_apic_offload(&mut self, vtl: GuestVtl, offload: bool) {
1315        // Update the APIC portion of the MSR bitmap.
1316        let offload_bitmap = if offload {
1317            (1 << x86defs::apic::ApicRegister::TPR.0)
1318                | (1 << x86defs::apic::ApicRegister::EOI.0)
1319                | (1 << x86defs::apic::ApicRegister::SELF_IPI.0)
1320        } else {
1321            0
1322        };
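        // The MSR bitmap is addressed in 64-bit chunks here: the read bitmap
        // starts at chunk 0 and the write bitmap at chunk 0x100 of the 4KiB
        // bitmap page, and X2APIC_MSR_BASE / 64 selects the chunk covering the
        // x2APIC MSR range.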
1323        // Once for read and once for write.
1324        for offset in [0, 0x100] {
1325            self.runner
1326                .write_msr_bitmap(vtl, offset + X2APIC_MSR_BASE / 64, !0, !offload_bitmap);
1327        }
1328
1329        // Update virtual-interrupt delivery.
1330        self.runner.write_vmcs32(
1331            vtl,
1332            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
1333            SecondaryProcessorControls::new()
1334                .with_virtual_interrupt_delivery(true)
1335                .into(),
1336            SecondaryProcessorControls::new()
1337                .with_virtual_interrupt_delivery(offload)
1338                .into(),
1339        );
1340
1341        // Clear any pending external interrupt when enabling the APIC offload.
1342        if offload
1343            && self.backing.vtls[vtl]
1344                .interruption_information
1345                .interruption_type()
1346                == INTERRUPT_TYPE_EXTERNAL
1347        {
1348            self.backing.vtls[vtl]
1349                .interruption_information
1350                .set_valid(false);
1351        }
1352    }
1353}
1354
1355struct TdxApicScanner<'a, 'b> {
1356    vp: &'a mut UhProcessor<'b, TdxBacked>,
1357    processor_controls: ProcessorControls,
1358    tpr_threshold: u8,
1359}
1360
1361impl<'b> hardware_cvm::apic::ApicBacking<'b, TdxBacked> for TdxApicScanner<'_, 'b> {
1362    fn vp(&mut self) -> &mut UhProcessor<'b, TdxBacked> {
1363        self.vp
1364    }
1365
1366    fn handle_interrupt(&mut self, vtl: GuestVtl, vector: u8) -> Result<(), UhRunVpError> {
1367        // Exit idle when an interrupt is received, regardless of IF
1368        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1369            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1370        }
1371        // If there is a higher-priority pending event of some kind, then
1372        // just request an exit after it has resolved, after which we will
1373        // try again.
1374        if self.vp.backing.vtls[vtl].interruption_information.valid()
1375            && self.vp.backing.vtls[vtl]
1376                .interruption_information
1377                .interruption_type()
1378                != INTERRUPT_TYPE_EXTERNAL
1379        {
1380            self.processor_controls.set_interrupt_window_exiting(true);
1381            return Ok(());
1382        }
1383
1384        // Ensure the interrupt is not blocked by RFLAGS.IF or interrupt shadow.
1385        let interruptibility: Interruptibility = self
1386            .vp
1387            .runner
1388            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1389            .into();
1390
1391        let rflags = RFlags::from(self.vp.backing.vtls[vtl].private_regs.rflags);
1392        if !rflags.interrupt_enable()
1393            || interruptibility.blocked_by_sti()
1394            || interruptibility.blocked_by_movss()
1395        {
1396            self.processor_controls.set_interrupt_window_exiting(true);
1397            return Ok(());
1398        }
1399
1400        let priority = vector >> 4;
1401        let apic = self.vp.runner.tdx_apic_page(vtl);
1402        if (apic.tpr.value as u8 >> 4) >= priority {
1403            self.tpr_threshold = priority;
1404            return Ok(());
1405        }
1406
1407        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1408            .with_valid(true)
1409            .with_vector(vector)
1410            .with_interruption_type(INTERRUPT_TYPE_EXTERNAL);
1411
1412        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1413        Ok(())
1414    }
1415
1416    fn handle_nmi(&mut self, vtl: GuestVtl) -> Result<(), UhRunVpError> {
1417        // Exit idle when an interrupt is received, regardless of IF
1418        // TODO: Investigate lifting more activity management into poll_apic_core
1419        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1420            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1421        }
1422        // If there is a higher-priority pending event of some kind, then
1423        // just request an exit after it has resolved, after which we will
1424        // try again.
1425        if self.vp.backing.vtls[vtl].interruption_information.valid()
1426            && self.vp.backing.vtls[vtl]
1427                .interruption_information
1428                .interruption_type()
1429                != INTERRUPT_TYPE_EXTERNAL
1430        {
1431            self.processor_controls.set_nmi_window_exiting(true);
1432            return Ok(());
1433        }
1434
1435        let interruptibility: Interruptibility = self
1436            .vp
1437            .runner
1438            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1439            .into();
1440
1441        if interruptibility.blocked_by_nmi()
1442            || interruptibility.blocked_by_sti()
1443            || interruptibility.blocked_by_movss()
1444        {
1445            self.processor_controls.set_nmi_window_exiting(true);
1446            return Ok(());
1447        }
1448
1449        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1450            .with_valid(true)
1451            .with_vector(2)
1452            .with_interruption_type(INTERRUPT_TYPE_NMI);
1453
1454        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1455        Ok(())
1456    }
1457
1458    fn handle_sipi(&mut self, vtl: GuestVtl, cs: SegmentRegister) -> Result<(), UhRunVpError> {
1459        self.vp.write_segment(vtl, TdxSegmentReg::Cs, cs).unwrap();
1460        self.vp.backing.vtls[vtl].private_regs.rip = 0;
1461        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1462
1463        Ok(())
1464    }
1465}
1466
1467impl UhProcessor<'_, TdxBacked> {
1468    async fn run_vp_tdx(&mut self, dev: &impl CpuIo) -> Result<(), VpHaltReason<UhRunVpError>> {
1469        let next_vtl = self.backing.cvm.exit_vtl;
1470
1471        if self.backing.vtls[next_vtl].interruption_information.valid() {
1472            tracing::trace!(
1473                vector = self.backing.vtls[next_vtl]
1474                    .interruption_information
1475                    .vector(),
1476                vp_index = self.vp_index().index(),
1477                ?next_vtl,
1478                "injecting interrupt"
1479            );
1480
1481            self.runner.write_vmcs32(
1482                next_vtl,
1483                VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO,
1484                !0,
1485                self.backing.vtls[next_vtl].interruption_information.into(),
1486            );
1487            if self.backing.vtls[next_vtl]
1488                .interruption_information
1489                .deliver_error_code()
1490            {
1491                self.runner.write_vmcs32(
1492                    next_vtl,
1493                    VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE,
1494                    !0,
1495                    self.backing.vtls[next_vtl].exception_error_code,
1496                );
1497            }
1498            self.backing.vtls[next_vtl].interruption_set = true;
1499        } else if self.backing.vtls[next_vtl].interruption_set {
1500            self.runner
1501                .write_vmcs32(next_vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO, !0, 0);
1502            self.backing.vtls[next_vtl].interruption_set = false;
1503        }
1504
1505        // We're about to return to a lower VTL, so set active_vtl for other VPs,
1506        // do any pending flushes, unlock our TLB locks, and wait for any others
1507        // we're supposed to.
1508
1509        // active_vtl needs SeqCst ordering here in order to correctly synchronize
1510        // access with the TLB address flush list. We need to ensure that, when
1511        // other VPs are adding entries to the list, they always observe the
1512        // correct lower active VTL. Otherwise they might choose to not send this
1513        // VP a wake, leading to a stall, until this VP happens to exit to VTL 2 again.
1514        //
1515        // This does technically leave open a small window for potential spurious
1516        // wakes, but that's preferable, and will cause no problems besides a
1517        // small amount of wasted time.
1518        self.shared.active_vtl[self.vp_index().index() as usize]
1519            .store(next_vtl as u8, Ordering::SeqCst);
1520
1521        self.do_tlb_flush(next_vtl);
1522        self.unlock_tlb_lock(Vtl::Vtl2);
1523        let tlb_halt = self.should_halt_for_tlb_unlock(next_vtl);
1524
1525        // If we are halted in the kernel due to hlt or idle, and we receive an interrupt,
1526        // we'd like to unhalt, inject the interrupt, and resume vtl0 without returning to
1527        // user-mode. To enable this, the kernel must know why we are halted.
1528        let activity = self.backing.cvm.lapics[next_vtl].activity;
1529        let kernel_known_state =
1530            matches!(activity, MpState::Running | MpState::Halted | MpState::Idle);
1531        let halted_other = tlb_halt || !kernel_known_state;
1532
1533        self.runner
1534            .set_halted(activity != MpState::Running || tlb_halt);
1535
1536        // Turn on kernel interrupt handling if possible. This will cause the
1537        // kernel to handle some exits internally, without returning to user
1538        // mode, to improve performance.
1539        //
1540        // Do not do this if there is a pending interruption, since we need to
1541        // run code on the next exit to clear it. If we miss this opportunity,
1542        // we will probably double-inject the interruption, wreaking havoc.
1543        //
1544        // Also do not do this if there is a pending TLB flush, since we need to
1545        // run code on the next exit to clear it. If we miss this opportunity,
1546        // we could double-inject the TLB flush unnecessarily.
1547        let offload_enabled = self.backing.cvm.lapics[next_vtl].lapic.can_offload_irr()
1548            && !self.backing.vtls[next_vtl].interruption_information.valid()
1549            && self.backing.vtls[next_vtl]
1550                .private_regs
1551                .vp_entry_flags
1552                .invd_translations()
1553                == 0;
1554        let x2apic_enabled = self.backing.cvm.lapics[next_vtl].lapic.x2apic_enabled();
1555
1556        let offload_flags = hcl_intr_offload_flags::new()
1557            .with_offload_intr_inject(offload_enabled)
1558            .with_offload_x2apic(offload_enabled && x2apic_enabled)
1559            .with_halted_other(halted_other)
1560            .with_halted_hlt(activity == MpState::Halted)
1561            .with_halted_idle(activity == MpState::Idle);
1562
1563        *self.runner.offload_flags_mut() = offload_flags;
1564
1565        self.runner
1566            .write_private_regs(&self.backing.vtls[next_vtl].private_regs);
1567
1568        let has_intercept = self
1569            .runner
1570            .run()
1571            .map_err(|e| VpHaltReason::Hypervisor(UhRunVpError::Run(e)))?;
1572
1573        // TLB flushes can only target lower VTLs, so it is fine to use a relaxed
1574        // ordering here. The worst that can happen is some spurious wakes, due
1575        // to another VP observing that this VP is still in a lower VTL.
1576        self.shared.active_vtl[self.vp_index().index() as usize].store(2, Ordering::Relaxed);
1577
1578        let entered_from_vtl = next_vtl;
1579        self.runner
1580            .read_private_regs(&mut self.backing.vtls[entered_from_vtl].private_regs);
1581
1582        // Kernel offload may have set or cleared the halt/idle states
1583        if offload_enabled && kernel_known_state {
1584            let offload_flags = self.runner.offload_flags_mut();
1585
1586            self.backing.cvm.lapics[entered_from_vtl].activity =
1587                match (offload_flags.halted_hlt(), offload_flags.halted_idle()) {
1588                    (false, false) => MpState::Running,
1589                    (true, false) => MpState::Halted,
1590                    (false, true) => MpState::Idle,
1591                    (true, true) => {
1592                        tracelimit::warn_ratelimited!(
1593                            CVM_ALLOWED,
1594                            "Kernel indicates VP is both halted and idle!"
1595                        );
1596                        activity
1597                    }
1598                };
1599        }
1600
1601        if !has_intercept {
1602            return Ok(());
1603        }
1604
1605        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1606
1607        // Result codes at or above PENDING_INTERRUPT indicate the L2 was never entered.
1608        if exit_info.code().tdx_exit() >= TdCallResultCode::PENDING_INTERRUPT {
1609            self.backing.vtls[entered_from_vtl]
1610                .enter_stats
1611                .pending_intr
1612                .increment();
1613            return Ok(());
1614        }
1615
1616        // Since the L2 was entered we can clear any TLB flush requests
1617        self.backing.vtls[entered_from_vtl]
1618            .private_regs
1619            .vp_entry_flags
1620            .set_invd_translations(0);
1621
1622        // The L2 was entered, so process the exit.
1623        let stat = match exit_info.code().tdx_exit() {
1624            TdCallResultCode::SUCCESS => {
1625                &mut self.backing.vtls[entered_from_vtl].enter_stats.success
1626            }
1627            TdCallResultCode::L2_EXIT_HOST_ROUTED_ASYNC => {
1628                &mut self.backing.vtls[entered_from_vtl]
1629                    .enter_stats
1630                    .host_routed_async
1631            }
1632            TdCallResultCode::L2_EXIT_PENDING_INTERRUPT => {
1633                &mut self.backing.vtls[entered_from_vtl]
1634                    .enter_stats
1635                    .l2_exit_pending_intr
1636            }
1637            TdCallResultCode::L2_EXIT_HOST_ROUTED_TDVMCALL => {
1638                // This is expected, and means that the hypervisor completed a
1639                // TD.VMCALL from the L2 and has requested to resume the L2 to
1640                // the L1.
1641                //
1642                // There is nothing to do here.
1643                assert_eq!(
1644                    exit_info.code().vmx_exit(),
1645                    VmxExit::new().with_basic_reason(VmxExitBasic::TDCALL)
1646                );
1647                &mut self.backing.vtls[entered_from_vtl]
1648                    .enter_stats
1649                    .host_routed_td_vmcall
1650            }
1651            _ => panic!("unexpected tdx exit code {:?}", exit_info.code()),
1652        };
1653
1654        stat.increment();
1655        self.handle_vmx_exit(dev, entered_from_vtl).await?;
1656        Ok(())
1657    }
1658
1659    async fn handle_vmx_exit(
1660        &mut self,
1661        dev: &impl CpuIo,
1662        intercepted_vtl: GuestVtl,
1663    ) -> Result<(), VpHaltReason<UhRunVpError>> {
1664        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1665
1666        // First, check that the VM entry was even successful.
1667        let vmx_exit = exit_info.code().vmx_exit();
1668        if vmx_exit.vm_enter_failed() {
1669            return Err(self.handle_vm_enter_failed(intercepted_vtl, vmx_exit));
1670        }
1671
1672        let next_interruption = exit_info.idt_vectoring_info();
1673
1674        // Acknowledge the APIC interrupt/NMI if it was delivered.
1675        if self.backing.vtls[intercepted_vtl]
1676            .interruption_information
1677            .valid()
1678            && (!next_interruption.valid()
1679                || self.backing.vtls[intercepted_vtl]
1680                    .interruption_information
1681                    .interruption_type()
1682                    != next_interruption.interruption_type())
1683        {
1684            match self.backing.vtls[intercepted_vtl]
1685                .interruption_information
1686                .interruption_type()
1687            {
1688                INTERRUPT_TYPE_EXTERNAL
1689                    if !self.backing.cvm.lapics[intercepted_vtl]
1690                        .lapic
1691                        .is_offloaded() =>
1692                {
1693                    // This must be a pending APIC interrupt. Acknowledge it.
1694                    tracing::trace!(
1695                        vector = self.backing.vtls[intercepted_vtl]
1696                            .interruption_information
1697                            .vector(),
1698                        "acknowledging interrupt"
1699                    );
1700                    self.backing.cvm.lapics[intercepted_vtl]
1701                        .lapic
1702                        .acknowledge_interrupt(
1703                            self.backing.vtls[intercepted_vtl]
1704                                .interruption_information
1705                                .vector(),
1706                        );
1707                }
1708                INTERRUPT_TYPE_NMI => {
1709                    // This must be a pending NMI.
1710                    tracing::debug!("acknowledging NMI");
1711                    self.backing.cvm.lapics[intercepted_vtl].nmi_pending = false;
1712                }
1713                _ => {}
1714            }
1715        }
1716
1717        if self.backing.cvm.lapics[intercepted_vtl]
1718            .lapic
1719            .is_offloaded()
1720        {
1721            // It's possible with vAPIC that we take an exit in the window where
1722            // hardware has moved a bit from IRR to ISR, but has not injected
1723            // the interrupt into the guest. In this case, we need to track that
1724            // we must inject the interrupt before we return to the guest,
1725            // otherwise the interrupt will be lost and the guest left in a bad
1726            // state.
1727            //
1728            // TODO TDX: Unclear what kind of exits these would be, but they
1729            // should be spurious EPT exits. Can we validate or assert that
1730            // somehow? If we were to somehow call some other path which would
1731            // set interruption_information before we inject this one, we would
1732            // lose this interrupt.
1733            if next_interruption.valid() {
1734                tracing::debug!(
1735                    ?next_interruption,
1736                    vp_index = self.vp_index().index(),
1737                    "exit requires reinjecting interrupt"
1738                );
1739                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1740                self.backing.vtls[intercepted_vtl].exception_error_code =
1741                    exit_info.idt_vectoring_error_code();
1742                self.backing.vtls[intercepted_vtl]
1743                    .exit_stats
1744                    .needs_interrupt_reinject
1745                    .increment();
1746            } else {
1747                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1748            }
1749        } else {
1750            // Ignore (and later recalculate) the next interruption if it is an
1751            // external interrupt or NMI, since it may change if the APIC state
1752            // changes.
1753            if next_interruption.valid()
1754                && !matches!(
1755                    next_interruption.interruption_type(),
1756                    INTERRUPT_TYPE_EXTERNAL | INTERRUPT_TYPE_NMI
1757                )
1758            {
1759                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1760                self.backing.vtls[intercepted_vtl].exception_error_code =
1761                    exit_info.idt_vectoring_error_code();
1762            } else {
1763                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1764            }
1765        }
1766
1767        let mut breakpoint_debug_exception = false;
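        // Dispatch on the basic VM-exit reason; each arm returns the per-exit stat
        // counter, which is incremented after the match completes.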
1768        let stat = match vmx_exit.basic_reason() {
1769            VmxExitBasic::IO_INSTRUCTION => {
1770                let io_qual = ExitQualificationIo::from(exit_info.qualification() as u32);
1771
1772                let len = match io_qual.access_size() {
1773                    IO_SIZE_8_BIT => 1,
1774                    IO_SIZE_16_BIT => 2,
1775                    IO_SIZE_32_BIT => 4,
1776                    _ => panic!(
1777                        "tdx module returned invalid io instr size {}",
1778                        io_qual.access_size()
1779                    ),
1780                };
1781
1782                let port_access_protected = self.cvm_try_protect_io_port_access(
1783                    intercepted_vtl,
1784                    io_qual.port(),
1785                    io_qual.is_in(),
1786                    len,
1787                    io_qual.is_string(),
1788                    io_qual.rep_prefix(),
1789                );
1790
1791                if !port_access_protected {
1792                    if io_qual.is_string() || io_qual.rep_prefix() {
1793                        // TODO GUEST VSM: consider changing the emulation path
1794                        // to also check for io port installation, mainly for
1795                        // handling rep instructions.
1796
1797                        self.emulate(
1798                            dev,
1799                            self.backing.vtls[intercepted_vtl]
1800                                .interruption_information
1801                                .valid(),
1802                            intercepted_vtl,
1803                            TdxEmulationCache::default(),
1804                        )
1805                        .await?;
1806                    } else {
1807                        let mut rax = self.runner.tdx_enter_guest_gps()[TdxGp::RAX];
1808                        emulate_io(
1809                            self.inner.vp_info.base.vp_index,
1810                            !io_qual.is_in(),
1811                            io_qual.port(),
1812                            &mut rax,
1813                            len,
1814                            dev,
1815                        )
1816                        .await;
1817                        self.runner.tdx_enter_guest_gps_mut()[TdxGp::RAX] = rax;
1818
1819                        self.advance_to_next_instruction(intercepted_vtl);
1820                    }
1821                }
1822
1823                &mut self.backing.vtls[intercepted_vtl].exit_stats.io
1824            }
1825            VmxExitBasic::MSR_READ => {
1826                let msr = self.runner.tdx_enter_guest_gps()[TdxGp::RCX] as u32;
1827
1828                let result = self.backing.cvm.lapics[intercepted_vtl]
1829                    .lapic
1830                    .access(&mut TdxApicClient {
1831                        partition: self.partition,
1832                        vmtime: &self.vmtime,
1833                        apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1834                        dev,
1835                        vtl: intercepted_vtl,
1836                    })
1837                    .msr_read(msr)
1838                    .or_else_if_unknown(|| self.read_msr_cvm(msr, intercepted_vtl))
1839                    .or_else_if_unknown(|| self.read_msr_tdx(msr, intercepted_vtl));
1840
1841                let value = match result {
1842                    Ok(v) => Some(v),
1843                    Err(MsrError::Unknown) => {
1844                        tracelimit::warn_ratelimited!(CVM_ALLOWED, msr, "unknown tdx vm msr read");
1845                        Some(0)
1846                    }
1847                    Err(MsrError::InvalidAccess) => None,
1848                };
1849
1850                let inject_gp = if let Some(value) = value {
1851                    let gps = self.runner.tdx_enter_guest_gps_mut();
1852                    gps[TdxGp::RAX] = (value as u32).into();
1853                    gps[TdxGp::RDX] = ((value >> 32) as u32).into();
1854                    false
1855                } else {
1856                    true
1857                };
1858
1859                if inject_gp {
1860                    self.inject_gpf(intercepted_vtl);
1861                } else {
1862                    self.advance_to_next_instruction(intercepted_vtl);
1863                }
1864                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_read
1865            }
1866            VmxExitBasic::MSR_WRITE => {
1867                let gps = self.runner.tdx_enter_guest_gps();
1868                let msr = gps[TdxGp::RCX] as u32;
1869                let value =
1870                    (gps[TdxGp::RAX] as u32 as u64) | ((gps[TdxGp::RDX] as u32 as u64) << 32);
1871
1872                if !self.cvm_try_protect_msr_write(intercepted_vtl, msr) {
1873                    let result = self.backing.cvm.lapics[intercepted_vtl]
1874                        .lapic
1875                        .access(&mut TdxApicClient {
1876                            partition: self.partition,
1877                            vmtime: &self.vmtime,
1878                            apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1879                            dev,
1880                            vtl: intercepted_vtl,
1881                        })
1882                        .msr_write(msr, value)
1883                        .or_else_if_unknown(|| self.write_msr_cvm(msr, value, intercepted_vtl))
1884                        .or_else_if_unknown(|| self.write_msr_tdx(msr, value, intercepted_vtl))
1885                        .or_else_if_unknown(|| {
1886                            // Sanity check
1887                            if MSR_ALLOWED_READ_WRITE.contains(&msr) {
1888                                unreachable!("intercepted a write to MSR {msr}, configured for passthrough by default, that wasn't registered for intercepts by a higher VTL");
1889                            }
1890                            Err(MsrError::Unknown)
1891                        });
1892
1893                    let inject_gp = match result {
1894                        Ok(()) => false,
1895                        Err(MsrError::Unknown) => {
1896                            tracelimit::warn_ratelimited!(
1897                                CVM_ALLOWED,
1898                                msr,
1899                                "unknown tdx vm msr write"
1900                            );
1901                            tracelimit::warn_ratelimited!(
1902                                CVM_CONFIDENTIAL,
1903                                value,
1904                                "unknown tdx vm msr write"
1905                            );
1906                            false
1907                        }
1908                        Err(MsrError::InvalidAccess) => true,
1909                    };
1910
1911                    if inject_gp {
1912                        self.inject_gpf(intercepted_vtl);
1913                    } else {
1914                        self.advance_to_next_instruction(intercepted_vtl);
1915                    }
1916                }
1917                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_write
1918            }
1919            VmxExitBasic::CPUID => {
1920                let gps = self.runner.tdx_enter_guest_gps();
1921                let leaf = gps[TdxGp::RAX] as u32;
1922                let subleaf = gps[TdxGp::RCX] as u32;
1923                let [eax, ebx, ecx, edx] = self.cvm_cpuid_result(intercepted_vtl, leaf, subleaf);
1924                let gps = self.runner.tdx_enter_guest_gps_mut();
1925                gps[TdxGp::RAX] = eax.into();
1926                gps[TdxGp::RBX] = ebx.into();
1927                gps[TdxGp::RCX] = ecx.into();
1928                gps[TdxGp::RDX] = edx.into();
1929                self.advance_to_next_instruction(intercepted_vtl);
1930                &mut self.backing.vtls[intercepted_vtl].exit_stats.cpuid
1931            }
1932            VmxExitBasic::VMCALL_INSTRUCTION => {
1933                if exit_info.cpl() != 0 {
1934                    self.inject_gpf(intercepted_vtl);
1935                } else {
1936                    let is_64bit = self.long_mode(intercepted_vtl);
1937                    let guest_memory = &self.partition.gm[intercepted_vtl];
1938                    let handler = UhHypercallHandler {
1939                        trusted: !self.cvm_partition().hide_isolation,
1940                        vp: &mut *self,
1941                        bus: dev,
1942                        intercepted_vtl,
1943                    };
1944
1945                    UhHypercallHandler::TDX_DISPATCHER.dispatch(
1946                        guest_memory,
1947                        hv1_hypercall::X64RegisterIo::new(handler, is_64bit),
1948                    );
1949                }
1950                &mut self.backing.vtls[intercepted_vtl].exit_stats.vmcall
1951            }
1952            VmxExitBasic::HLT_INSTRUCTION => {
1953                self.backing.cvm.lapics[intercepted_vtl].activity = MpState::Halted;
1954                self.clear_interrupt_shadow(intercepted_vtl);
1955                self.advance_to_next_instruction(intercepted_vtl);
1956                &mut self.backing.vtls[intercepted_vtl].exit_stats.hlt
1957            }
1958            VmxExitBasic::CR_ACCESS => {
1959                let qual = CrAccessQualification::from(exit_info.qualification());
1960                let cr;
1961                let value;
1962                match qual.access_type() {
1963                    CR_ACCESS_TYPE_MOV_TO_CR => {
1964                        cr = qual.cr();
1965                        value = self.runner.tdx_enter_guest_gps()[qual.gp_register() as usize];
1966                    }
1967                    CR_ACCESS_TYPE_LMSW => {
1968                        cr = 0;
1969                        let cr0 = self.backing.vtls[intercepted_vtl].cr0.read(&self.runner);
1970                        // LMSW updates the low four bits only.
1971                        value = (qual.lmsw_source_data() as u64 & 0xf) | (cr0 & !0xf);
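                        // Example with hypothetical values: lmsw source data 0x3
                        // and cr0 0xC000_0011 produce 0xC000_0013.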
1972                    }
1973                    access_type => unreachable!("not registered for cr access type {access_type}"),
1974                }
1975
1976                let cr = match cr {
1977                    0 => HvX64RegisterName::Cr0,
1978                    4 => HvX64RegisterName::Cr4,
1979                    _ => unreachable!("not registered for cr{cr} accesses"),
1980                };
1981
1982                if !self.cvm_try_protect_secure_register_write(intercepted_vtl, cr, value) {
1983                    let r = match cr {
1984                        HvX64RegisterName::Cr0 => self.backing.vtls[intercepted_vtl]
1985                            .cr0
1986                            .write(value, &mut self.runner),
1987                        HvX64RegisterName::Cr4 => self.backing.vtls[intercepted_vtl]
1988                            .cr4
1989                            .write(value, &mut self.runner),
1990                        _ => unreachable!(),
1991                    };
1992                    if r.is_ok() {
1993                        self.update_execution_mode(intercepted_vtl);
1994                        self.advance_to_next_instruction(intercepted_vtl);
1995                    } else {
1996                        tracelimit::warn_ratelimited!(
1997                            CVM_ALLOWED,
1998                            ?cr,
1999                            value,
2000                            "failed to write cr"
2001                        );
2002                        self.inject_gpf(intercepted_vtl);
2003                    }
2004                }
2005                &mut self.backing.vtls[intercepted_vtl].exit_stats.cr_access
2006            }
2007            VmxExitBasic::XSETBV => {
2008                let gps = self.runner.tdx_enter_guest_gps();
2009                if let Some(value) =
2010                    hardware_cvm::validate_xsetbv_exit(hardware_cvm::XsetbvExitInput {
2011                        rax: gps[TdxGp::RAX],
2012                        rcx: gps[TdxGp::RCX],
2013                        rdx: gps[TdxGp::RDX],
2014                        cr4: self.backing.vtls[intercepted_vtl].cr4.read(&self.runner),
2015                        cpl: exit_info.cpl(),
2016                    })
2017                {
2018                    if !self.cvm_try_protect_secure_register_write(
2019                        intercepted_vtl,
2020                        HvX64RegisterName::Xfem,
2021                        value,
2022                    ) {
2023                        self.runner
2024                            .set_vp_register(intercepted_vtl, HvX64RegisterName::Xfem, value.into())
2025                            .map_err(|err| {
2026                                VpHaltReason::Hypervisor(UhRunVpError::EmulationState(err))
2027                            })?;
2028                        self.advance_to_next_instruction(intercepted_vtl);
2029                    }
2030                } else {
2031                    self.inject_gpf(intercepted_vtl);
2032                }
2033                &mut self.backing.vtls[intercepted_vtl].exit_stats.xsetbv
2034            }
2035            VmxExitBasic::WBINVD_INSTRUCTION => {
2036                // Ask the kernel to flush the cache before issuing VP.ENTER.
2037                let no_invalidate = exit_info.qualification() != 0;
2038                if no_invalidate {
2039                    self.runner.tdx_vp_state_flags_mut().set_wbnoinvd(true);
2040                } else {
2041                    self.runner.tdx_vp_state_flags_mut().set_wbinvd(true);
2042                }
2043
2044                self.advance_to_next_instruction(intercepted_vtl);
2045                &mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
2046            }
2047            VmxExitBasic::EPT_VIOLATION => {
2048                let gpa = exit_info.gpa().expect("is EPT exit");
2049                let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2050                // If this was an EPT violation while handling an iret, and
2051                // that iret cleared the NMI blocking state, restore it.
2052                if !next_interruption.valid() && ept_info.nmi_unmasking_due_to_iret() {
2053                    let mask = Interruptibility::new().with_blocked_by_nmi(true);
2054                    let value = Interruptibility::new().with_blocked_by_nmi(true);
2055                    let old_interruptibility: Interruptibility = self
2056                        .runner
2057                        .write_vmcs32(
2058                            intercepted_vtl,
2059                            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2060                            mask.into(),
2061                            value.into(),
2062                        )
2063                        .into();
2064                    assert!(!old_interruptibility.blocked_by_nmi());
2065                } else {
2066                    self.handle_ept(intercepted_vtl, dev, gpa, ept_info).await?;
2067                }
2068
2069                &mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
2070            }
2071            VmxExitBasic::TPR_BELOW_THRESHOLD => {
2072                // Loop around to reevaluate the APIC.
2073                &mut self.backing.vtls[intercepted_vtl]
2074                    .exit_stats
2075                    .tpr_below_threshold
2076            }
2077            VmxExitBasic::INTERRUPT_WINDOW => {
2078                // Loop around to reevaluate the APIC.
2079                &mut self.backing.vtls[intercepted_vtl]
2080                    .exit_stats
2081                    .interrupt_window
2082            }
2083            VmxExitBasic::NMI_WINDOW => {
2084                // Loop around to reevaluate pending NMIs.
2085                &mut self.backing.vtls[intercepted_vtl].exit_stats.nmi_window
2086            }
2087            VmxExitBasic::HW_INTERRUPT => {
2088                if cfg!(feature = "gdb") {
2089                    // Check if the interrupt was triggered by a hardware breakpoint.
2090                    let debug_regs = self
2091                        .access_state(intercepted_vtl.into())
2092                        .debug_regs()
2093                        .expect("register query should not fail");
2094                    // The lowest four bits of DR6 indicate which of the
2095                    // four breakpoints triggered.
2096                    breakpoint_debug_exception = debug_regs.dr6.trailing_zeros() < 4;
2097                }
2098                &mut self.backing.vtls[intercepted_vtl].exit_stats.hw_interrupt
2099            }
2100            VmxExitBasic::SMI_INTR => &mut self.backing.vtls[intercepted_vtl].exit_stats.smi_intr,
2101            VmxExitBasic::PAUSE_INSTRUCTION => {
2102                &mut self.backing.vtls[intercepted_vtl].exit_stats.pause
2103            }
2104            VmxExitBasic::TDCALL => {
2105                // If the proxy synic is local, then the host did not get this
2106                // instruction, and we need to handle it.
2107                if self.backing.untrusted_synic.is_some() {
2108                    assert_eq!(intercepted_vtl, GuestVtl::Vtl0);
2109                    self.handle_tdvmcall(dev, intercepted_vtl);
2110                } else if self.cvm_partition().hide_isolation {
2111                    // TDCALL is not valid when hiding isolation. Inject a #UD.
2112                    self.backing.vtls[intercepted_vtl].interruption_information =
2113                        InterruptionInformation::new()
2114                            .with_valid(true)
2115                            .with_vector(x86defs::Exception::INVALID_OPCODE.0)
2116                            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2117                }
2118                &mut self.backing.vtls[intercepted_vtl].exit_stats.tdcall
2119            }
2120            VmxExitBasic::EXCEPTION => {
2121                tracing::trace!(
2122                    "Caught Exception: {:?}",
2123                    exit_info._exit_interruption_info()
2124                );
2125                if cfg!(feature = "gdb") {
2126                    breakpoint_debug_exception = true;
2127                }
2128                &mut self.backing.vtls[intercepted_vtl].exit_stats.exception
2129            }
2130            VmxExitBasic::TRIPLE_FAULT => {
2131                return Err(VpHaltReason::TripleFault {
2132                    vtl: intercepted_vtl.into(),
2133                });
2134            }
2135            VmxExitBasic::GDTR_OR_IDTR => {
2136                let info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
2137                tracing::trace!("Intercepted GDT or IDT instruction: {:?}", info);
2138                let reg = match info.instruction() {
2139                    GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Lidt => {
2140                        HvX64RegisterName::Idtr
2141                    }
2142                    GdtrOrIdtrInstruction::Sgdt | GdtrOrIdtrInstruction::Lgdt => {
2143                        HvX64RegisterName::Gdtr
2144                    }
2145                };
2146                // We only support forwarding intercepts for descriptor table loads today.
2147                if (info.instruction().is_load()
2148                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2149                    || !info.instruction().is_load()
2150                {
2151                    self.emulate_gdtr_or_idtr(intercepted_vtl, dev).await?;
2152                }
2153                &mut self.backing.vtls[intercepted_vtl]
2154                    .exit_stats
2155                    .descriptor_table
2156            }
2157            VmxExitBasic::LDTR_OR_TR => {
2158                let info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
2159                tracing::trace!("Intercepted LDT or TR instruction: {:?}", info);
2160                let reg = match info.instruction() {
2161                    LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Lldt => {
2162                        HvX64RegisterName::Ldtr
2163                    }
2164                    LdtrOrTrInstruction::Str | LdtrOrTrInstruction::Ltr => HvX64RegisterName::Tr,
2165                };
2166                // We only support forwarding intercepts for descriptor table loads today.
2167                if (info.instruction().is_load()
2168                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2169                    || !info.instruction().is_load()
2170                {
2171                    self.emulate_ldtr_or_tr(intercepted_vtl, dev).await?;
2172                }
2173                &mut self.backing.vtls[intercepted_vtl]
2174                    .exit_stats
2175                    .descriptor_table
2176            }
2177            _ => {
2178                return Err(VpHaltReason::Hypervisor(UhRunVpError::UnknownVmxExit(
2179                    exit_info.code().vmx_exit(),
2180                )));
2181            }
2182        };
2183        stat.increment();
2184
2185        // Breakpoint exceptions may return a non-fatal error.
2186        // We dispatch here to correctly increment the counter.
2187        if cfg!(feature = "gdb") && breakpoint_debug_exception {
2188            self.handle_debug_exception(intercepted_vtl)?;
2189        }
2190
2191        Ok(())
2192    }
2193
2194    /// Trace processor state for debugging purposes.
2195    fn trace_processor_state(&self, vtl: GuestVtl) {
2196        let raw_exit = self.runner.tdx_vp_enter_exit_info();
2197        tracing::error!(CVM_CONFIDENTIAL, ?raw_exit, "raw tdx vp enter exit info");
2198
2199        let gprs = self.runner.tdx_enter_guest_gps();
2200        tracing::error!(CVM_CONFIDENTIAL, ?gprs, "guest gpr list");
2201
2202        let TdxPrivateRegs {
2203            rflags,
2204            rip,
2205            rsp,
2206            ssp,
2207            rvi,
2208            svi,
2209            msr_kernel_gs_base,
2210            msr_star,
2211            msr_lstar,
2212            msr_sfmask,
2213            msr_xss,
2214            msr_tsc_aux,
2215            vp_entry_flags,
2216        } = self.backing.vtls[vtl].private_regs;
2217        tracing::error!(
2218            CVM_CONFIDENTIAL,
2219            rflags,
2220            rip,
2221            rsp,
2222            ssp,
2223            rvi,
2224            svi,
2225            msr_kernel_gs_base,
2226            msr_star,
2227            msr_lstar,
2228            msr_sfmask,
2229            msr_xss,
2230            msr_tsc_aux,
2231            ?vp_entry_flags,
2232            "private registers"
2233        );
2234
2235        let physical_cr0 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
2236        let shadow_cr0 = self
2237            .runner
2238            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_READ_SHADOW);
2239        let cr0_guest_host_mask: u64 = self
2240            .runner
2241            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK);
2242        tracing::error!(
2243            CVM_CONFIDENTIAL,
2244            physical_cr0,
2245            shadow_cr0,
2246            cr0_guest_host_mask,
2247            "cr0 values"
2248        );
2249
2250        let physical_cr4 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
2251        let shadow_cr4 = self
2252            .runner
2253            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW);
2254        let cr4_guest_host_mask = self
2255            .runner
2256            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK);
2257        tracing::error!(
2258            CVM_CONFIDENTIAL,
2259            physical_cr4,
2260            shadow_cr4,
2261            cr4_guest_host_mask,
2262            "cr4 values"
2263        );
2264
2265        let cr3 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
2266        tracing::error!(CVM_CONFIDENTIAL, cr3, "cr3");
2267
2268        let cached_efer = self.backing.vtls[vtl].efer;
2269        let vmcs_efer = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER);
2270        let entry_controls = self
2271            .runner
2272            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_CONTROLS);
2273        tracing::error!(CVM_CONFIDENTIAL, cached_efer, vmcs_efer, "efer");
2274        tracing::error!(CVM_CONFIDENTIAL, entry_controls, "entry controls");
2275
2276        let cs = self.read_segment(vtl, TdxSegmentReg::Cs);
2277        let ds = self.read_segment(vtl, TdxSegmentReg::Ds);
2278        let es = self.read_segment(vtl, TdxSegmentReg::Es);
2279        let fs = self.read_segment(vtl, TdxSegmentReg::Fs);
2280        let gs = self.read_segment(vtl, TdxSegmentReg::Gs);
2281        let ss = self.read_segment(vtl, TdxSegmentReg::Ss);
2282        let tr = self.read_segment(vtl, TdxSegmentReg::Tr);
2283        let ldtr = self.read_segment(vtl, TdxSegmentReg::Ldtr);
2284
2285        tracing::error!(
2286            CVM_CONFIDENTIAL,
2287            ?cs,
2288            ?ds,
2289            ?es,
2290            ?fs,
2291            ?gs,
2292            ?ss,
2293            ?tr,
2294            ?ldtr,
2295            "segment values"
2296        );
2297
2298        let exception_bitmap = self
2299            .runner
2300            .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
2301        tracing::error!(CVM_CONFIDENTIAL, exception_bitmap, "exception bitmap");
2302
2303        let cached_processor_controls = self.backing.vtls[vtl].processor_controls;
2304        let vmcs_processor_controls = ProcessorControls::from(
2305            self.runner
2306                .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS),
2307        );
2308        let vmcs_secondary_processor_controls = SecondaryProcessorControls::from(
2309            self.runner
2310                .read_vmcs32(vtl, VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS),
2311        );
2312        tracing::error!(
2313            CVM_CONFIDENTIAL,
2314            ?cached_processor_controls,
2315            ?vmcs_processor_controls,
2316            ?vmcs_secondary_processor_controls,
2317            "processor controls"
2318        );
2319
2320        if cached_processor_controls != vmcs_processor_controls {
2321            tracing::error!(CVM_ALLOWED, "BUGBUG: processor controls mismatch");
2322        }
2323
2324        let cached_tpr_threshold = self.backing.vtls[vtl].tpr_threshold;
2325        let vmcs_tpr_threshold = self
2326            .runner
2327            .read_vmcs32(vtl, VmcsField::VMX_VMCS_TPR_THRESHOLD);
2328        tracing::error!(
2329            CVM_CONFIDENTIAL,
2330            cached_tpr_threshold,
2331            vmcs_tpr_threshold,
2332            "tpr threshold"
2333        );
2334
2335        let cached_eoi_exit_bitmap = self.backing.eoi_exit_bitmap;
2336        let vmcs_eoi_exit_bitmap = {
2337            let fields = [
2338                VmcsField::VMX_VMCS_EOI_EXIT_0,
2339                VmcsField::VMX_VMCS_EOI_EXIT_1,
2340                VmcsField::VMX_VMCS_EOI_EXIT_2,
2341                VmcsField::VMX_VMCS_EOI_EXIT_3,
2342            ];
2343            fields
2344                .iter()
2345                .map(|field| self.runner.read_vmcs64(vtl, *field))
2346                .collect::<Vec<_>>()
2347        };
2348        tracing::error!(
2349            CVM_CONFIDENTIAL,
2350            ?cached_eoi_exit_bitmap,
2351            ?vmcs_eoi_exit_bitmap,
2352            "eoi exit bitmap"
2353        );
2354
2355        let cached_interrupt_information = self.backing.vtls[vtl].interruption_information;
2356        let cached_interruption_set = self.backing.vtls[vtl].interruption_set;
2357        let vmcs_interrupt_information = self
2358            .runner
2359            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO);
2360        let vmcs_entry_exception_code = self
2361            .runner
2362            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE);
2363        tracing::error!(
2364            CVM_CONFIDENTIAL,
2365            ?cached_interrupt_information,
2366            cached_interruption_set,
2367            vmcs_interrupt_information,
2368            vmcs_entry_exception_code,
2369            "interrupt information"
2370        );
2371
2372        let guest_interruptibility = self
2373            .runner
2374            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY);
2375        tracing::error!(
2376            CVM_CONFIDENTIAL,
2377            guest_interruptibility,
2378            "guest interruptibility"
2379        );
2380
2381        let vmcs_sysenter_cs = self
2382            .runner
2383            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR);
2384        let vmcs_sysenter_esp = self
2385            .runner
2386            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
2387        let vmcs_sysenter_eip = self
2388            .runner
2389            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
2390        tracing::error!(
2391            CVM_CONFIDENTIAL,
2392            vmcs_sysenter_cs,
2393            vmcs_sysenter_esp,
2394            vmcs_sysenter_eip,
2395            "sysenter values"
2396        );
2397
2398        let vmcs_pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2399        tracing::error!(CVM_CONFIDENTIAL, vmcs_pat, "guest PAT");
2400    }
2401
2402    fn handle_vm_enter_failed(
2403        &self,
2404        vtl: GuestVtl,
2405        vmx_exit: VmxExit,
2406    ) -> VpHaltReason<UhRunVpError> {
2407        assert!(vmx_exit.vm_enter_failed());
2408        match vmx_exit.basic_reason() {
2409            VmxExitBasic::BAD_GUEST_STATE => {
2410                // Log system register state for debugging why we were
2411                // unable to enter the guest. This is a VMM bug.
2412                tracing::error!(CVM_ALLOWED, "VP.ENTER failed with bad guest state");
2413                self.trace_processor_state(vtl);
2414
2415                // TODO: panic instead?
2416                VpHaltReason::Hypervisor(UhRunVpError::VmxBadGuestState)
2417            }
2418            _ => VpHaltReason::Hypervisor(UhRunVpError::UnknownVmxExit(vmx_exit)),
2419        }
2420    }
2421
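    /// Advances RIP past the intercepted instruction, using the instruction
    /// length reported in the VM-exit instruction info.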
2422    fn advance_to_next_instruction(&mut self, vtl: GuestVtl) {
2423        let instr_info = TdxExit(self.runner.tdx_vp_enter_exit_info()).instr_info();
2424        let rip = &mut self.backing.vtls[vtl].private_regs.rip;
2425        *rip = rip.wrapping_add(instr_info.length().into());
2426    }
2427
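    /// Clears the STI-induced interrupt shadow in the guest interruptibility state.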
2428    fn clear_interrupt_shadow(&mut self, vtl: GuestVtl) {
2429        let mask = Interruptibility::new().with_blocked_by_sti(true);
2430        let value = Interruptibility::new().with_blocked_by_sti(false);
2431        self.runner.write_vmcs32(
2432            vtl,
2433            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2434            mask.into(),
2435            value.into(),
2436        );
2437    }
2438
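    /// Queues a #GP(0) fault for injection on the next entry to the given VTL.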
2439    fn inject_gpf(&mut self, vtl: GuestVtl) {
2440        self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2441            .with_valid(true)
2442            .with_vector(x86defs::Exception::GENERAL_PROTECTION_FAULT.0)
2443            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
2444            .with_deliver_error_code(true);
2445        self.backing.vtls[vtl].exception_error_code = 0;
2446    }
2447
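    /// Queues a machine check (#MC) for injection on the next entry to the given VTL.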
2448    fn inject_mc(&mut self, vtl: GuestVtl) {
2449        self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2450            .with_valid(true)
2451            .with_vector(x86defs::Exception::MACHINE_CHECK.0)
2452            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2453    }
2454
2455    async fn handle_ept(
2456        &mut self,
2457        intercepted_vtl: GuestVtl,
2458        dev: &impl CpuIo,
2459        gpa: u64,
2460        ept_info: VmxEptExitQualification,
2461    ) -> Result<(), VpHaltReason<UhRunVpError>> {
2462        let vtom = self.partition.caps.vtom.unwrap_or(0);
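        // vtom (virtual top of memory), when nonzero, is the shared-GPA boundary
        // bit: a GPA with this bit set refers to the shared alias of the page.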
2463        let is_shared = (gpa & vtom) == vtom && vtom != 0;
2464        let canonical_gpa = gpa & !vtom;
2465
2466        // Only emulate the access if the gpa is mmio or outside of ram.
2467        let address_type = self
2468            .partition
2469            .lower_vtl_memory_layout
2470            .probe_address(canonical_gpa);
2471
2472        match address_type {
2473            Some(AddressType::Mmio) => {
2474                // Emulate the access.
2475                self.emulate(
2476                    dev,
2477                    self.backing.vtls[intercepted_vtl]
2478                        .interruption_information
2479                        .valid(),
2480                    intercepted_vtl,
2481                    TdxEmulationCache::default(),
2482                )
2483                .await?;
2484            }
2485            Some(AddressType::Ram) => {
2486                // TODO TDX: This path changes when we support VTL page
2487                // protections and MNF. That will require injecting events to
2488                // VTL1 or other handling.
2489                //
2490                // For now, we just check if the exit was spurious or if we
2491                // should inject a machine check. An exit is considered spurious
2492                // if the gpa is accessible.
2493                if self.partition.gm[intercepted_vtl]
2494                    .probe_gpa_readable(gpa)
2495                    .is_ok()
2496                {
2497                    tracelimit::warn_ratelimited!(
2498                        CVM_ALLOWED,
2499                        gpa,
2500                        "possible spurious EPT violation, ignoring"
2501                    );
2502                } else {
2503                    // TODO: It would be better to show what exact bitmap check
2504                    // failed, but that requires some refactoring of how the
2505                    // different bitmaps are stored. Do this when we support VTL
2506                    // protections or MNF.
2507                    //
2508                    // If we entered this path, it means the bitmap check on
2509                    // `probe_gpa_readable` failed, so we can assume that if the
2510                    // address is shared, the actual state of the page is
2511                    // private, and vice versa. This is because the address
2512                    // should have already been checked to be valid memory
2513                    // described to the guest or not.
2514                    tracelimit::warn_ratelimited!(
2515                        CVM_ALLOWED,
2516                        gpa,
2517                        is_shared,
2518                        ?ept_info,
2519                        "guest accessed inaccessible gpa, injecting MC"
2520                    );
2521
2522                    // TODO: Implement IA32_MCG_STATUS MSR for more reporting
2523                    self.inject_mc(intercepted_vtl);
2524                }
2525            }
2526            None => {
2527                if !self.cvm_partition().hide_isolation {
2528                    // TODO: Addresses outside of ram and mmio probably should
2529                    // not be accessed by the guest, if it has been told about
2530                    // isolation. While it's okay as we will return FFs or
2531                    // discard writes for addresses that are not mmio, we should
2532                    // consider if instead we should also inject a machine check
2533                    // for such accesses. The guest should not access any
2534                    // addresses not described to it.
2535                    //
2536                    // For now, log that the guest did this.
2537                    tracelimit::warn_ratelimited!(
2538                        CVM_ALLOWED,
2539                        gpa,
2540                        is_shared,
2541                        ?ept_info,
2542                        "guest accessed gpa not described in memory layout, emulating anyways"
2543                    );
2544                }
2545
2546                // Emulate the access.
2547                self.emulate(
2548                    dev,
2549                    self.backing.vtls[intercepted_vtl]
2550                        .interruption_information
2551                        .valid(),
2552                    intercepted_vtl,
2553                    TdxEmulationCache::default(),
2554                )
2555                .await?;
2556            }
2557        }
2558
2559        Ok(())
2560    }
2561
2562    fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
2563        let regs = self.runner.tdx_enter_guest_gps();
2564        if regs[TdxGp::R10] == 0 {
2565            // Architectural VMCALL.
2566            let result = match VmxExitBasic(regs[TdxGp::R11] as u16) {
2567                VmxExitBasic::MSR_WRITE => {
2568                    let msr = regs[TdxGp::R12] as u32;
2569                    let value = regs[TdxGp::R13];
2570                    match self.write_tdvmcall_msr(msr, value, intercepted_vtl) {
2571                        Ok(()) => {
2572                            tracing::debug!(msr, value, "tdvmcall msr write");
2573                            TdVmCallR10Result::SUCCESS
2574                        }
2575                        Err(err) => {
2576                            tracelimit::warn_ratelimited!(
2577                                CVM_ALLOWED,
2578                                msr,
2579                                ?err,
2580                                "failed tdvmcall msr write"
2581                            );
2582                            tracelimit::warn_ratelimited!(
2583                                CVM_CONFIDENTIAL,
2584                                value,
2585                                "failed tdvmcall msr write"
2586                            );
2587                            TdVmCallR10Result::OPERAND_INVALID
2588                        }
2589                    }
2590                }
2591                VmxExitBasic::MSR_READ => {
2592                    let msr = regs[TdxGp::R12] as u32;
2593                    match self.read_tdvmcall_msr(msr, intercepted_vtl) {
2594                        Ok(value) => {
2595                            tracing::debug!(msr, value, "tdvmcall msr read");
2596                            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = value;
2597                            TdVmCallR10Result::SUCCESS
2598                        }
2599                        Err(err) => {
2600                            tracelimit::warn_ratelimited!(
2601                                CVM_ALLOWED,
2602                                msr,
2603                                ?err,
2604                                "failed tdvmcall msr read"
2605                            );
2606                            TdVmCallR10Result::OPERAND_INVALID
2607                        }
2608                    }
2609                }
2610                subfunction => {
2611                    tracelimit::warn_ratelimited!(
2612                        CVM_ALLOWED,
2613                        ?subfunction,
2614                        "architectural vmcall not supported"
2615                    );
2616                    TdVmCallR10Result::OPERAND_INVALID
2617                }
2618            };
2619            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = result.0;
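            // Skip the 4-byte TDCALL instruction.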
2620            self.backing.vtls[intercepted_vtl].private_regs.rip = self.backing.vtls
2621                [intercepted_vtl]
2622                .private_regs
2623                .rip
2624                .wrapping_add(4);
2625        } else {
2626            // This hypercall is normally handled by the hypervisor, so the gpas
2627            // given by the guest should all be shared. The hypervisor allows
2628            // gpas to be set with or without the shared gpa boundary bit, which
2629            // untrusted_dma_memory correctly models. Note that some Linux
2630            // guests will issue hypercalls without the boundary bit set,
2631            // whereas UEFI will issue with the bit set.
2632            let guest_memory = &self.shared.cvm.shared_memory;
2633            let handler = UhHypercallHandler {
2634                vp: &mut *self,
2635                bus: dev,
2636                trusted: false,
2637                intercepted_vtl,
2638            };
2639
2640            UhHypercallHandler::TDCALL_DISPATCHER.dispatch(guest_memory, TdHypercall(handler));
2641        }
2642    }
2643
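    /// Reads an MSR on behalf of a TDVMCALL. The guest OS ID and VP index MSRs
    /// are handled by the local hv emulator; all other MSRs are routed to the
    /// untrusted synic, which must be present for this path to be reached.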
2644    fn read_tdvmcall_msr(&mut self, msr: u32, intercepted_vtl: GuestVtl) -> Result<u64, MsrError> {
2645        match msr {
2646            msr @ (hvdef::HV_X64_MSR_GUEST_OS_ID | hvdef::HV_X64_MSR_VP_INDEX) => {
2647                self.backing.cvm.hv[intercepted_vtl].msr_read(msr)
2648            }
2649            _ => self
2650                .backing
2651                .untrusted_synic
2652                .as_mut()
2653                .unwrap()
2654                .read_nontimer_msr(msr),
2655        }
2656    }
2657
2658    fn write_tdvmcall_msr(
2659        &mut self,
2660        msr: u32,
2661        value: u64,
2662        intercepted_vtl: GuestVtl,
2663    ) -> Result<(), MsrError> {
2664        match msr {
2665            hvdef::HV_X64_MSR_GUEST_OS_ID => {
2666                self.backing.cvm.hv[intercepted_vtl].msr_write_guest_os_id(value)
2667            }
2668            _ => {
2669                // If we get here we must have an untrusted synic, as otherwise
2670                // we wouldn't be handling the TDVMCALL that ends up here. Therefore
2671                // this is fine to unwrap.
2672                self.backing
2673                    .untrusted_synic
2674                    .as_mut()
2675                    .unwrap()
2676                    .write_nontimer_msr(
2677                        msr,
2678                        value,
2679                        &mut UntrustedSynicVtlProts(&self.partition.gm[GuestVtl::Vtl0]),
2680                    )?;
2681                // Propagate sint MSR writes to the hypervisor as well
2682                // so that the hypervisor can directly inject events.
2683                if matches!(msr, hvdef::HV_X64_MSR_SINT0..=hvdef::HV_X64_MSR_SINT15) {
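                    // Map the SINT MSR to the corresponding synthetic register,
                    // e.g. HV_X64_MSR_SINT3 -> HvX64RegisterName::Sint3.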
2684                    if let Err(err) = self.runner.set_vp_register(
2685                        intercepted_vtl,
2686                        HvX64RegisterName(
2687                            HvX64RegisterName::Sint0.0 + (msr - hvdef::HV_X64_MSR_SINT0),
2688                        ),
2689                        value.into(),
2690                    ) {
2691                        tracelimit::warn_ratelimited!(
2692                            CVM_ALLOWED,
2693                            error = &err as &dyn std::error::Error,
2694                            "failed to set sint register"
2695                        );
2696                    }
2697                }
2698            }
2699        }
2700
2701        Ok(())
2702    }
2703
2704    fn read_msr_tdx(&mut self, msr: u32, vtl: GuestVtl) -> Result<u64, MsrError> {
2705        // TODO TDX: port remaining tdx and common values
2706        //
2707        // TODO TDX: consider if this can be shared with SnpBacked's
2708        // implementation. For the most part other than Intel/TDX specific
2709        // registers, MSR handling should be the same.
2710
2711        match msr {
2712            // TODO TDX: LIFTED FROM WHP
2713            x86defs::X86X_IA32_MSR_PLATFORM_ID => {
2714                // Windows requires accessing this to boot. WHP
2715                // used to pass this through to the hardware,
2716                // but this regressed. Zero seems to work fine
2717                // for Windows.
2718                //
2719                // TODO: Pass through the host value if it can
2720                //       be retrieved.
2721                Ok(0)
2722            }
2723
2724            x86defs::X86X_MSR_MTRR_CAP => {
2725                // Advertise the absence of MTRR capabilities, but include the availability of write
2726                // combining.
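                // 0x400 sets only bit 10 (write combining supported), leaving
                // VCNT (bits 7:0) and the fixed-range bit clear.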
2727                Ok(0x400)
2728            }
2729            x86defs::X86X_MSR_MTRR_DEF_TYPE => {
2730                // Because the MTRR registers are advertised via CPUID, a guest may
2731                // choose to write to this MSR even though no actual ranges are
2732                // supported. Implement it as read-as-zero/write-ignore.
2733                Ok(0)
2734            }
2735            x86defs::X86X_MSR_CSTAR => Ok(self.backing.vtls[vtl].msr_cstar),
2736            x86defs::X86X_MSR_MCG_CAP => Ok(0),
2737            x86defs::X86X_MSR_MCG_STATUS => Ok(0),
2738            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => Ok(0xFFFFFFFF),
2739            x86defs::X86X_MSR_XSS => Ok(self.backing.vtls[vtl].private_regs.msr_xss),
2740            x86defs::X86X_IA32_MSR_MISC_ENABLE => Ok(hv1_emulator::x86::MISC_ENABLE.into()),
2741            x86defs::X86X_IA32_MSR_FEATURE_CONTROL => Ok(VMX_FEATURE_CONTROL_LOCKED),
2742            x86defs::X86X_MSR_CR_PAT => {
2743                let pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2744                Ok(pat)
2745            }
2746
2747            // The following MSRs are unconditionally read by Linux guests.
2748            // They are not virtualized and are unsupported for L2 VMs.
2749            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2750            | x86defs::X86X_MSR_PLATFORM_INFO
2751            | x86defs::X86X_MSR_PPIN_CTL
2752            | x86defs::X86X_IA32_MSR_SMI_COUNT
2753            | x86defs::X86X_MSR_UMWAIT_CONTROL
2754            | x86defs::X86X_AMD_MSR_DE_CFG
2755            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2756            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2757            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2758            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => Ok(0),
2759
2760            hvdef::HV_X64_MSR_GUEST_IDLE => {
2761                self.backing.cvm.lapics[vtl].activity = MpState::Idle;
2762                self.clear_interrupt_shadow(vtl);
2763                Ok(0)
2764            }
2765            X86X_MSR_EFER => Ok(self.backing.vtls[vtl].efer),
2766
2767            _ => Err(MsrError::Unknown),
2768        }
2769    }
2770
2771    fn write_msr_tdx(&mut self, msr: u32, value: u64, vtl: GuestVtl) -> Result<(), MsrError> {
2772        let state = &mut self.backing.vtls[vtl].private_regs;
2773
2774        match msr {
2775            X86X_MSR_EFER => {
2776                self.write_efer(vtl, value)
2777                    .map_err(|_| MsrError::InvalidAccess)?;
2778                self.update_execution_mode(vtl);
2779            }
2780            x86defs::X86X_MSR_STAR => state.msr_star = value,
2781            x86defs::X86X_MSR_CSTAR => self.backing.vtls[vtl].msr_cstar = value,
2782            x86defs::X86X_MSR_LSTAR => state.msr_lstar = value,
2783            x86defs::X86X_MSR_SFMASK => state.msr_sfmask = value,
2784            x86defs::X86X_MSR_TSC_AUX => state.msr_tsc_aux = value,
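            // The SYSENTER MSRs are not part of the TDX private register
            // state, so they are written through to the VMCS guest fields.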
2785            x86defs::X86X_MSR_SYSENTER_CS => {
2786                self.runner.write_vmcs32(
2787                    vtl,
2788                    VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
2789                    !0,
2790                    value as u32,
2791                );
2792            }
2793            x86defs::X86X_MSR_SYSENTER_EIP => {
2794                self.runner.write_vmcs64(
2795                    vtl,
2796                    VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
2797                    !0,
2798                    value,
2799                );
2800            }
2801            x86defs::X86X_MSR_SYSENTER_ESP => {
2802                self.runner.write_vmcs64(
2803                    vtl,
2804                    VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
2805                    !0,
2806                    value,
2807                );
2808            }
2809            x86defs::X86X_MSR_XSS => state.msr_xss = value,
2810            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => {
2811                // Writing zero on Intel platforms is allowed and ignored.
2812                if value != 0 {
2813                    return Err(MsrError::InvalidAccess);
2814                }
2815            }
2816            x86defs::X86X_IA32_MSR_MISC_ENABLE => {}
2817            x86defs::X86X_MSR_CR_PAT => {
2818                self.runner
2819                    .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value);
2820            }
2821
2822            x86defs::X86X_MSR_MCG_STATUS => {
2823                // Writes are swallowed, except for reserved-bit violations.
2824                if x86defs::X86xMcgStatusRegister::from(value).reserved0() != 0 {
2825                    return Err(MsrError::InvalidAccess);
2826                }
2827            }
2828
2829            // Ignore writes to this MSR
2830            x86defs::X86X_MSR_MTRR_DEF_TYPE => {}
2831
2832            // The following MSRs are sometimes written by Windows guests.
2833            // They are not virtualized and are unsupported for L2 VMs.
2834            x86defs::X86X_MSR_BIOS_UPDT_TRIG => {}
2835
2836            // The following MSRs are unconditionally written by Linux guests.
2837            // They are not virtualized and are unsupported for L2 VMs.
2838            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2839            | x86defs::X86X_MSR_PLATFORM_INFO
2840            | x86defs::X86X_MSR_PPIN_CTL
2841            | x86defs::X86X_IA32_MSR_SMI_COUNT
2842            | x86defs::X86X_MSR_UMWAIT_CONTROL
2843            | x86defs::X86X_AMD_MSR_DE_CFG
2844            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2845            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2846            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2847            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => {}
2848
2849            _ => return Err(MsrError::Unknown),
2850        }
2851
2852        Ok(())
2853    }
2854
2855    fn write_segment(
2856        &mut self,
2857        vtl: GuestVtl,
2858        seg: TdxSegmentReg,
2859        reg: SegmentRegister,
2860    ) -> Result<(), vp_state::Error> {
2861        // Write the selector, base, and limit.
2862        self.runner
2863            .write_vmcs16(vtl, seg.selector(), !0, reg.selector);
2864        self.runner.write_vmcs64(vtl, seg.base(), !0, reg.base);
2865        self.runner.write_vmcs32(vtl, seg.limit(), !0, reg.limit);
2866
2867        // Mark the segment as not valid if its attributes indicate it is not present.
2868        let mut attributes = x86defs::vmx::VmxSegmentAttributes::from(reg.attributes as u32);
2869        attributes.set_null(!attributes.present());
2870
2871        self.runner
2872            .write_vmcs32(vtl, seg.attributes(), !0, attributes.into());
2873
2874        Ok(())
2875    }
2876
2877    fn read_segment(&self, vtl: GuestVtl, seg: TdxSegmentReg) -> SegmentRegister {
2878        let selector = self.runner.read_vmcs16(vtl, seg.selector());
2879        let base = self.runner.read_vmcs64(vtl, seg.base());
2880        let limit = self.runner.read_vmcs32(vtl, seg.limit());
2881        let attributes = self.runner.read_vmcs32(vtl, seg.attributes());
2882
2883        SegmentRegister {
2884            selector,
2885            base,
2886            limit,
2887            attributes: attributes as u16,
2888        }
2889    }
2890
2891    fn long_mode(&self, vtl: GuestVtl) -> bool {
2892        let backing = &self.backing.vtls[vtl];
2893        backing.cr0.read(&self.runner) & X64_CR0_PE != 0 && backing.efer & X64_EFER_LMA != 0
2894    }
2895}
2896
2897impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
2898    type Error = UhRunVpError;
2899
2900    fn vp_index(&self) -> VpIndex {
2901        self.vp.vp_index()
2902    }
2903
2904    fn flush(&mut self) -> Result<(), Self::Error> {
2905        // No cached registers are modifiable by the emulator for TDX.
2906        Ok(())
2907    }
2908
2909    fn vendor(&self) -> x86defs::cpuid::Vendor {
2910        self.vp.partition.caps.vendor
2911    }
2912
2913    fn gp(&mut self, reg: Gp) -> u64 {
2914        self.vp.runner.tdx_enter_guest_gps()[reg as usize]
2915    }
2916
2917    fn set_gp(&mut self, reg: Gp, v: u64) {
2918        self.vp.runner.tdx_enter_guest_gps_mut()[reg as usize] = v;
2919    }
2920
2921    fn xmm(&mut self, index: usize) -> u128 {
2922        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[index])
2923    }
2924
2925    fn set_xmm(&mut self, index: usize, v: u128) -> Result<(), Self::Error> {
2926        self.vp.runner.fx_state_mut().xmm[index] = v.to_ne_bytes();
2927        Ok(())
2928    }
2929
2930    fn rip(&mut self) -> u64 {
2931        self.vp.backing.vtls[self.vtl].private_regs.rip
2932    }
2933
2934    fn set_rip(&mut self, v: u64) {
2935        self.vp.backing.vtls[self.vtl].private_regs.rip = v;
2936    }
2937
2938    fn segment(&mut self, index: Segment) -> x86defs::SegmentRegister {
2939        let tdx_segment_index = match index {
2940            Segment::CS => TdxSegmentReg::Cs,
2941            Segment::ES => TdxSegmentReg::Es,
2942            Segment::SS => TdxSegmentReg::Ss,
2943            Segment::DS => TdxSegmentReg::Ds,
2944            Segment::FS => TdxSegmentReg::Fs,
2945            Segment::GS => TdxSegmentReg::Gs,
2946        };
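        // CS is reported directly in the TDG.VP.ENTER exit information, so it
        // can be taken from there; the other segments require VMCS reads.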
2947        let reg = match tdx_segment_index {
2948            TdxSegmentReg::Cs => self.cache.segs[index as usize]
2949                .get_or_insert_with(|| TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).cs()),
2950            _ => self.cache.segs[index as usize]
2951                .get_or_insert_with(|| self.vp.read_segment(self.vtl, tdx_segment_index)),
2952        };
2953        (*reg).into()
2954    }
2955
2956    fn efer(&mut self) -> u64 {
2957        self.vp.backing.vtls[self.vtl].efer
2958    }
2959
2960    fn cr0(&mut self) -> u64 {
2961        let reg = self
2962            .cache
2963            .cr0
2964            .get_or_insert_with(|| self.vp.backing.vtls[self.vtl].cr0.read(&self.vp.runner));
2965        *reg
2966    }
2967
2968    fn rflags(&mut self) -> RFlags {
2969        self.vp.backing.vtls[self.vtl].private_regs.rflags.into()
2970    }
2971
2972    fn set_rflags(&mut self, v: RFlags) {
2973        self.vp.backing.vtls[self.vtl].private_regs.rflags = v.into();
2974    }
2975
2976    fn instruction_bytes(&self) -> &[u8] {
2977        &[]
2978    }
2979
2980    fn physical_address(&self) -> Option<u64> {
2981        TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
2982    }
2983
2984    fn initial_gva_translation(
2985        &mut self,
2986    ) -> Option<virt_support_x86emu::emulate::InitialTranslation> {
2987        let exit_info = TdxExit(self.vp.runner.tdx_vp_enter_exit_info());
2988        let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2989
2990        if exit_info.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION
2991            && ept_info.gva_valid()
2992        {
2993            Some(virt_support_x86emu::emulate::InitialTranslation {
2994                gva: exit_info.gla().expect("already validated EPT exit"),
2995                gpa: exit_info.gpa().expect("already validated EPT exit"),
2996                translate_mode: match ept_info.access_mask() {
2997                    0x1 => TranslateMode::Read,
2998                    // As defined in "Table 28-7. Exit Qualification for EPT
2999                    // Violations" in the Intel SDM, the processor may set both
3000                    // the read and write bits in certain conditions:
3001                    //
3002                    // If accessed and dirty flags for EPT are enabled,
3003                    // processor accesses to guest paging-structure entries are
3004                    // treated as writes with regard to EPT violations (see
3005                    // Section 29.3.3.2). If such an access causes an EPT
3006                    // violation, the processor sets both bit 0 and bit 1 of the
3007                    // exit qualification.
3008                    //
3009                    // Treat both 0x2 and 0x3 as writes.
3010                    0x2 | 0x3 => TranslateMode::Write,
3011                    0x4 => TranslateMode::Execute,
3012                    _ => panic!("unexpected ept access mask 0x{:x}", ept_info.access_mask()),
3013                },
3014            })
3015        } else {
3016            None
3017        }
3018    }
3019
3020    fn interruption_pending(&self) -> bool {
3021        self.interruption_pending
3022    }
3023
3024    fn check_vtl_access(
3025        &mut self,
3026        _gpa: u64,
3027        _mode: TranslateMode,
3028    ) -> Result<(), virt_support_x86emu::emulate::EmuCheckVtlAccessError<Self::Error>> {
3029        // Nothing to do here, the guest memory object will handle the check.
3030        Ok(())
3031    }
3032
3033    fn translate_gva(
3034        &mut self,
3035        gva: u64,
3036        mode: TranslateMode,
3037    ) -> Result<
3038        Result<
3039            virt_support_x86emu::emulate::EmuTranslateResult,
3040            virt_support_x86emu::emulate::EmuTranslateError,
3041        >,
3042        Self::Error,
3043    > {
3044        emulate_translate_gva(self, gva, mode)
3045    }
3046
3047    fn inject_pending_event(&mut self, event_info: hvdef::HvX64PendingEvent) {
3048        assert!(event_info.reg_0.event_pending());
3049        assert_eq!(
3050            event_info.reg_0.event_type(),
3051            hvdef::HV_X64_PENDING_EVENT_EXCEPTION
3052        );
3053        assert!(!self.interruption_pending);
3054
3055        // There's no interruption pending, so just inject the exception
3056        // directly without checking for double fault.
3057        TdxBacked::set_pending_exception(
3058            self.vp,
3059            self.vtl,
3060            HvX64PendingExceptionEvent::from(event_info.reg_0.into_bits()),
3061        );
3062    }
3063
3064    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
3065        // Ignore the VTOM address bit when checking, since memory is mirrored
3066        // across the VTOM.
3067        let vtom = self.vp.partition.caps.vtom.unwrap_or(0);
3068        debug_assert!(vtom == 0 || vtom.is_power_of_two());
3069        self.vp.partition.is_gpa_mapped(gpa & !vtom, write)
3070    }
3071
3072    fn lapic_base_address(&self) -> Option<u64> {
3073        self.vp.backing.cvm.lapics[self.vtl].lapic.base_address()
3074    }
3075
3076    fn lapic_read(&mut self, address: u64, data: &mut [u8]) {
3077        self.vp.backing.cvm.lapics[self.vtl]
3078            .lapic
3079            .access(&mut TdxApicClient {
3080                partition: self.vp.partition,
3081                dev: self.devices,
3082                vmtime: &self.vp.vmtime,
3083                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
3084                vtl: self.vtl,
3085            })
3086            .mmio_read(address, data);
3087    }
3088
3089    fn lapic_write(&mut self, address: u64, data: &[u8]) {
3090        self.vp.backing.cvm.lapics[self.vtl]
3091            .lapic
3092            .access(&mut TdxApicClient {
3093                partition: self.vp.partition,
3094                dev: self.devices,
3095                vmtime: &self.vp.vmtime,
3096                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
3097                vtl: self.vtl,
3098            })
3099            .mmio_write(address, data);
3100    }
3101}
3102
3103#[derive(Debug)]
3104enum TdxSegmentReg {
3105    Es,
3106    Cs,
3107    Ss,
3108    Ds,
3109    Fs,
3110    Gs,
3111    Ldtr,
3112    Tr,
3113}
3114
3115impl TdxSegmentReg {
3116    /// The selector vmcs field code.
3117    fn selector(&self) -> VmcsField {
3118        match self {
3119            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_SELECTOR,
3120            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_SELECTOR,
3121            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_SELECTOR,
3122            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_SELECTOR,
3123            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_SELECTOR,
3124            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_SELECTOR,
3125            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_SELECTOR,
3126            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_SELECTOR,
3127        }
3128    }
3129
3130    /// The base vmcs field code.
3131    fn base(&self) -> VmcsField {
3132        match self {
3133            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_BASE,
3134            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_BASE,
3135            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_BASE,
3136            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_BASE,
3137            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_BASE,
3138            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_BASE,
3139            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_BASE,
3140            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_BASE,
3141        }
3142    }
3143
3144    /// The limit vmcs field code.
3145    fn limit(&self) -> VmcsField {
3146        match self {
3147            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_LIMIT,
3148            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_LIMIT,
3149            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_LIMIT,
3150            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_LIMIT,
3151            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_LIMIT,
3152            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_LIMIT,
3153            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_LIMIT,
3154            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_LIMIT,
3155        }
3156    }
3157
3158    /// The attributes vmcs field code.
3159    fn attributes(&self) -> VmcsField {
3160        match self {
3161            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_AR,
3162            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_AR,
3163            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_AR,
3164            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_AR,
3165            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_AR,
3166            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_AR,
3167            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_AR,
3168            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_AR,
3169        }
3170    }
3171}
3172
3173#[derive(Debug)]
3174enum TdxTableReg {
3175    Idtr,
3176    Gdtr,
3177}
3178
3179impl TdxTableReg {
3180    fn base_code(&self) -> VmcsField {
3181        match self {
3182            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_BASE,
3183            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_BASE,
3184        }
3185    }
3186
3187    fn limit_code(&self) -> VmcsField {
3188        match self {
3189            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_LIMIT,
3190            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_LIMIT,
3191        }
3192    }
3193}
3194
3195impl UhProcessor<'_, TdxBacked> {
3196    /// Handle a write to EFER, which requires special handling on TDX due to
3197    /// required bits and state updates.
3198    ///
3199    /// Note that a caller must also call [`Self::update_execution_mode`] after
3200    /// updating EFER.
3201    fn write_efer(&mut self, vtl: GuestVtl, efer: u64) -> Result<(), vp_state::Error> {
3202        if efer & (X64_EFER_SVME | X64_EFER_FFXSR) != 0 {
3203            return Err(vp_state::Error::InvalidValue(
3204                efer,
3205                "EFER",
3206                "SVME or FFXSR set",
3207            ));
3208        }
3209
3210        // EFER.NXE must be 1.
3211        if efer & X64_EFER_NXE == 0 {
3212            return Err(vp_state::Error::InvalidValue(efer, "EFER", "NXE not set"));
3213        }
3214
3215        // Update the local value of EFER and the VMCS.
3216        if self.backing.vtls[vtl].efer != efer {
3217            self.backing.vtls[vtl].efer = efer;
3218            self.runner
3219                .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER, !0, efer);
3220        }
3221
3222        Ok(())
3223    }
3224
3225    /// Read CR0, including the guest-shadowed bits. This is the value the
3226    /// guest sees.
3227    fn read_cr0(&self, vtl: GuestVtl) -> u64 {
3228        self.backing.vtls[vtl].cr0.read(&self.runner)
3229    }
3230
3231    /// Write to the guest CR0.
3232    fn write_cr0(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3233        self.backing.vtls[vtl]
3234            .cr0
3235            .write(value | X64_CR0_ET, &mut self.runner)
3236    }
3237
3238    fn read_cr4(&self, vtl: GuestVtl) -> u64 {
3239        self.backing.vtls[vtl].cr4.read(&self.runner)
3240    }
3241
3242    fn write_cr4(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3243        self.backing.vtls[vtl].cr4.write(value, &mut self.runner)
3244    }
3245
3246    fn write_table_register(&mut self, vtl: GuestVtl, table: TdxTableReg, reg: TableRegister) {
3247        self.runner
3248            .write_vmcs64(vtl, table.base_code(), !0, reg.base);
3249        self.runner
3250            .write_vmcs32(vtl, table.limit_code(), !0, reg.limit.into());
3251    }
3252
3253    fn read_table_register(&self, vtl: GuestVtl, table: TdxTableReg) -> TableRegister {
3254        let base = self.runner.read_vmcs64(vtl, table.base_code());
3255        let limit = self.runner.read_vmcs32(vtl, table.limit_code());
3256
3257        TableRegister {
3258            base,
3259            limit: limit as u16,
3260        }
3261    }
3262
3263    /// Update execution mode when CR0 or EFER is changed.
3264    fn update_execution_mode(&mut self, vtl: GuestVtl) {
3265        let lme = self.backing.vtls[vtl].efer & X64_EFER_LME == X64_EFER_LME;
3266        let pg = self.read_cr0(vtl) & X64_CR0_PG == X64_CR0_PG;
3267        let efer_lma = self.backing.vtls[vtl].efer & X64_EFER_LMA == X64_EFER_LMA;
3268        let lma = lme && pg;
3269
3270        if lma != efer_lma {
3271            // Flip only the LMA bit.
3272            let new_efer = self.backing.vtls[vtl].efer ^ X64_EFER_LMA;
3273            self.write_efer(vtl, new_efer)
3274                .expect("EFER was valid before, it should still be valid");
3275        }
3276
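        // Keep the VM-entry control in sync so that the next VM entry loads
        // the guest in the matching (long or legacy) mode.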
3277        self.runner.write_vmcs32(
3278            vtl,
3279            VmcsField::VMX_VMCS_ENTRY_CONTROLS,
3280            VMX_ENTRY_CONTROL_LONG_MODE_GUEST,
3281            if lma {
3282                VMX_ENTRY_CONTROL_LONG_MODE_GUEST
3283            } else {
3284                0
3285            },
3286        );
3287    }
3288
3289    async fn emulate_gdtr_or_idtr(
3290        &mut self,
3291        vtl: GuestVtl,
3292        dev: &impl CpuIo,
3293    ) -> Result<(), VpHaltReason<UhRunVpError>> {
3294        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3295        assert_eq!(
3296            exit_info.code().vmx_exit().basic_reason(),
3297            VmxExitBasic::GDTR_OR_IDTR
3298        );
3299        let instr_info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
3300
3301        // Inject #GP if a load instruction is executed outside of kernel mode,
3302        // or if a store instruction is blocked by UMIP.
3303        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3304            || (!instr_info.instruction().is_load()
3305                && exit_info.cpl() > 0
3306                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3307        {
3308            self.inject_gpf(vtl);
3309            return Ok(());
3310        }
3311
3312        let (gva, segment) = self.compute_gva_for_table_access_emulation(
3313            exit_info.qualification(),
3314            (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3315            (!instr_info.index_register_invalid()).then_some(instr_info.index_register()),
3316            instr_info.scaling(),
3317            instr_info.address_size(),
3318            instr_info.segment_register(),
3319        );
3320
3321        let gm = &self.partition.gm[vtl];
3322        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
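        // The descriptor-table memory operand is a 2-byte limit followed by a
        // 4-byte base, or an 8-byte base in long mode.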
3323        let len = 2 + if self.long_mode(vtl) { 8 } else { 4 };
3324        let mut buf = [0u8; 10];
3325
3326        match instr_info.instruction() {
3327            GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Sgdt => {
3328                let table = self.read_table_register(
3329                    vtl,
3330                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Sidt) {
3331                        TdxTableReg::Idtr
3332                    } else {
3333                        TdxTableReg::Gdtr
3334                    },
3335                );
3336                buf[..2].copy_from_slice(&table.limit.to_le_bytes());
3337                buf[2..].copy_from_slice(&table.base.to_le_bytes());
3338                let mut emulation_state = UhEmulationState {
3339                    vp: &mut *self,
3340                    interruption_pending,
3341                    devices: dev,
3342                    vtl,
3343                    cache: TdxEmulationCache::default(),
3344                };
3345                emulate_insn_memory_op(
3346                    &mut emulation_state,
3347                    gm,
3348                    dev,
3349                    gva,
3350                    segment,
3351                    x86emu::AlignmentMode::Unaligned,
3352                    EmulatedMemoryOperation::Write(&buf[..len]),
3353                )
3354                .await?;
3355            }
3356
3357            GdtrOrIdtrInstruction::Lgdt | GdtrOrIdtrInstruction::Lidt => {
3358                let mut emulation_state = UhEmulationState {
3359                    vp: &mut *self,
3360                    interruption_pending,
3361                    devices: dev,
3362                    vtl,
3363                    cache: TdxEmulationCache::default(),
3364                };
3365                emulate_insn_memory_op(
3366                    &mut emulation_state,
3367                    gm,
3368                    dev,
3369                    gva,
3370                    segment,
3371                    x86emu::AlignmentMode::Unaligned,
3372                    EmulatedMemoryOperation::Read(&mut buf[..len]),
3373                )
3374                .await?;
3375                let table = TableRegister {
3376                    limit: u16::from_le_bytes(buf[..2].try_into().unwrap()),
3377                    base: u64::from_le_bytes(buf[2..len].try_into().unwrap()),
3378                };
3379                self.write_table_register(
3380                    vtl,
3381                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Lidt) {
3382                        TdxTableReg::Idtr
3383                    } else {
3384                        TdxTableReg::Gdtr
3385                    },
3386                    table,
3387                );
3388            }
3389        }
3390
3391        self.advance_to_next_instruction(vtl);
3392        Ok(())
3393    }
3394
3395    async fn emulate_ldtr_or_tr(
3396        &mut self,
3397        vtl: GuestVtl,
3398        dev: &impl CpuIo,
3399    ) -> Result<(), VpHaltReason<UhRunVpError>> {
3400        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3401        assert_eq!(
3402            exit_info.code().vmx_exit().basic_reason(),
3403            VmxExitBasic::LDTR_OR_TR
3404        );
3405        let instr_info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
3406
3407        // Inject #GP if a load instruction is executed outside of kernel mode,
3408        // or if a store instruction is blocked by UMIP.
3409        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3410            || (!instr_info.instruction().is_load()
3411                && exit_info.cpl() > 0
3412                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3413        {
3414            self.inject_gpf(vtl);
3415            return Ok(());
3416        }
3417
3418        let gm = &self.partition.gm[vtl];
3419        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
3420
3421        match instr_info.instruction() {
3422            LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Str => {
3423                let value = self.runner.read_vmcs16(
3424                    vtl,
3425                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Sldt) {
3426                        TdxSegmentReg::Ldtr
3427                    } else {
3428                        TdxSegmentReg::Tr
3429                    }
3430                    .selector(),
3431                );
3432
3433                if instr_info.memory_or_register() {
3434                    let gps = self.runner.tdx_enter_guest_gps_mut();
3435                    gps[instr_info.register_1() as usize] = value.into();
3436                } else {
3437                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3438                        exit_info.qualification(),
3439                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3440                        (!instr_info.index_register_invalid())
3441                            .then_some(instr_info.index_register()),
3442                        instr_info.scaling(),
3443                        instr_info.address_size(),
3444                        instr_info.segment_register(),
3445                    );
3446                    let mut emulation_state = UhEmulationState {
3447                        vp: &mut *self,
3448                        interruption_pending,
3449                        devices: dev,
3450                        vtl,
3451                        cache: TdxEmulationCache::default(),
3452                    };
3453                    emulate_insn_memory_op(
3454                        &mut emulation_state,
3455                        gm,
3456                        dev,
3457                        gva,
3458                        segment,
3459                        x86emu::AlignmentMode::Standard,
3460                        EmulatedMemoryOperation::Write(&value.to_le_bytes()),
3461                    )
3462                    .await?;
3463                }
3464            }
3465
3466            LdtrOrTrInstruction::Lldt | LdtrOrTrInstruction::Ltr => {
3467                let value = if instr_info.memory_or_register() {
3468                    let gps = self.runner.tdx_enter_guest_gps();
3469                    gps[instr_info.register_1() as usize] as u16
3470                } else {
3471                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3472                        exit_info.qualification(),
3473                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3474                        (!instr_info.index_register_invalid())
3475                            .then_some(instr_info.index_register()),
3476                        instr_info.scaling(),
3477                        instr_info.address_size(),
3478                        instr_info.segment_register(),
3479                    );
3480                    let mut emulation_state = UhEmulationState {
3481                        vp: &mut *self,
3482                        interruption_pending,
3483                        devices: dev,
3484                        vtl,
3485                        cache: TdxEmulationCache::default(),
3486                    };
3487                    let mut buf = [0u8; 2];
3488                    emulate_insn_memory_op(
3489                        &mut emulation_state,
3490                        gm,
3491                        dev,
3492                        gva,
3493                        segment,
3494                        x86emu::AlignmentMode::Standard,
3495                        EmulatedMemoryOperation::Read(&mut buf),
3496                    )
3497                    .await?;
3498                    u16::from_le_bytes(buf)
3499                };
3500                self.runner.write_vmcs16(
3501                    vtl,
3502                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Lldt) {
3503                        TdxSegmentReg::Ldtr
3504                    } else {
3505                        TdxSegmentReg::Tr
3506                    }
3507                    .selector(),
3508                    !0,
3509                    value,
3510                );
3511            }
3512        }
3513
3514        self.advance_to_next_instruction(vtl);
3515        Ok(())
3516    }
3517
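    /// Computes the guest virtual address and segment for an emulated
    /// descriptor-table access from the VM-exit instruction information
    /// fields and the exit qualification. For example, an access through
    /// `[rbx + rcx*4 + disp]` is reported with scaling 2, so the result is
    /// `disp + rbx + (rcx << 2)`, truncated to the instruction's address size.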
3518    fn compute_gva_for_table_access_emulation(
3519        &self,
3520        qualification: u64,
3521        base_reg: Option<u8>,
3522        index_reg: Option<u8>,
3523        scaling: u8,
3524        address_size: u8,
3525        segment_register: u8,
3526    ) -> (u64, Segment) {
3527        let gps = self.runner.tdx_enter_guest_gps();
3528
3529        // Displacement is stored in the qualification field for these instructions.
3530        let mut gva = qualification;
3531        if let Some(base_register) = base_reg {
3532            gva += gps[base_register as usize];
3533        }
3534        if let Some(index_register) = index_reg {
3535            gva += gps[index_register as usize] << scaling;
3536        }
3537        match address_size {
3538            // 16-bit address size
3539            0 => gva &= 0xFFFF,
3540            // 32-bit address size
3541            1 => gva &= 0xFFFFFFFF,
3542            // 64-bit address size
3543            2 => {}
3544            _ => unreachable!(),
3545        }
3546
3547        let segment = match segment_register {
3548            0 => Segment::ES,
3549            1 => Segment::CS,
3550            2 => Segment::SS,
3551            3 => Segment::DS,
3552            4 => Segment::FS,
3553            5 => Segment::GS,
3554            _ => unreachable!(),
3555        };
3556
3557        (gva, segment)
3558    }
3559}
3560
3561struct TdxApicClient<'a, T> {
3562    partition: &'a UhPartitionInner,
3563    apic_page: &'a mut ApicPage,
3564    dev: &'a T,
3565    vmtime: &'a VmTimeAccess,
3566    vtl: GuestVtl,
3567}
3568
3569impl<T: CpuIo> ApicClient for TdxApicClient<'_, T> {
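    // CR8 maps to bits 7:4 of the APIC TPR, hence the shift by 4 in both
    // directions.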
3570    fn cr8(&mut self) -> u32 {
3571        self.apic_page.tpr.value >> 4
3572    }
3573
3574    fn set_cr8(&mut self, value: u32) {
3575        self.apic_page.tpr.value = value << 4;
3576    }
3577
3578    fn set_apic_base(&mut self, _value: u64) {
3579        // No-op: the APIC base is stored in the APIC itself.
3580    }
3581
3582    fn wake(&mut self, vp_index: VpIndex) {
3583        self.partition.vps[vp_index.index() as usize].wake(self.vtl, WakeReason::INTCON);
3584    }
3585
3586    fn eoi(&mut self, vector: u8) {
3587        self.dev.handle_eoi(vector.into())
3588    }
3589
3590    fn now(&mut self) -> vmcore::vmtime::VmTime {
3591        self.vmtime.now()
3592    }
3593
3594    fn pull_offload(&mut self) -> ([u32; 8], [u32; 8]) {
3595        pull_apic_offload(self.apic_page)
3596    }
3597}
3598
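// Drains the offloaded IRR and ISR words from the APIC page, zeroing the page
// copies so the software APIC becomes the sole owner of those bits.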
3599fn pull_apic_offload(page: &mut ApicPage) -> ([u32; 8], [u32; 8]) {
3600    let mut irr = [0; 8];
3601    let mut isr = [0; 8];
3602    for (((irr, page_irr), isr), page_isr) in irr
3603        .iter_mut()
3604        .zip(page.irr.iter_mut())
3605        .zip(isr.iter_mut())
3606        .zip(page.isr.iter_mut())
3607    {
3608        *irr = std::mem::take(&mut page_irr.value);
3609        *isr = std::mem::take(&mut page_isr.value);
3610    }
3611    (irr, isr)
3612}
3613
3614impl<T> hv1_hypercall::X64RegisterState for UhHypercallHandler<'_, '_, T, TdxBacked> {
3615    fn rip(&mut self) -> u64 {
3616        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip
3617    }
3618
3619    fn set_rip(&mut self, rip: u64) {
3620        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip = rip;
3621    }
3622
3623    fn gp(&mut self, n: hv1_hypercall::X64HypercallRegister) -> u64 {
3624        let gps = self.vp.runner.tdx_enter_guest_gps();
3625        match n {
3626            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX],
3627            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX],
3628            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX],
3629            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX],
3630            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI],
3631            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI],
3632            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8],
3633        }
3634    }
3635
3636    fn set_gp(&mut self, n: hv1_hypercall::X64HypercallRegister, value: u64) {
3637        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3638        match n {
3639            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX] = value,
3640            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX] = value,
3641            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX] = value,
3642            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX] = value,
3643            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI] = value,
3644            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI] = value,
3645            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8] = value,
3646        }
3647    }
3648
3649    // TODO: clean up xmm to not use the same implementation as mshv.
3650    fn xmm(&mut self, n: usize) -> u128 {
3651        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[n])
3652    }
3653
3654    fn set_xmm(&mut self, n: usize, value: u128) {
3655        self.vp.runner.fx_state_mut().xmm[n] = value.to_ne_bytes();
3656    }
3657}
3658
3659impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
3660    const TDX_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3661        Self,
3662        [
3663            hv1_hypercall::HvModifySparseGpaPageHostVisibility,
3664            hv1_hypercall::HvQuerySparseGpaPageHostVisibility,
3665            hv1_hypercall::HvX64StartVirtualProcessor,
3666            hv1_hypercall::HvGetVpIndexFromApicId,
3667            hv1_hypercall::HvRetargetDeviceInterrupt,
3668            hv1_hypercall::HvFlushVirtualAddressList,
3669            hv1_hypercall::HvFlushVirtualAddressListEx,
3670            hv1_hypercall::HvFlushVirtualAddressSpace,
3671            hv1_hypercall::HvFlushVirtualAddressSpaceEx,
3672            hv1_hypercall::HvPostMessage,
3673            hv1_hypercall::HvSignalEvent,
3674            hv1_hypercall::HvExtQueryCapabilities,
3675            hv1_hypercall::HvGetVpRegisters,
3676            hv1_hypercall::HvSetVpRegisters,
3677            hv1_hypercall::HvEnablePartitionVtl,
3678            hv1_hypercall::HvX64EnableVpVtl,
3679            hv1_hypercall::HvVtlCall,
3680            hv1_hypercall::HvVtlReturn,
3681            hv1_hypercall::HvModifyVtlProtectionMask,
3682            hv1_hypercall::HvX64TranslateVirtualAddress,
3683            hv1_hypercall::HvSendSyntheticClusterIpi,
3684            hv1_hypercall::HvSendSyntheticClusterIpiEx,
3685            hv1_hypercall::HvInstallIntercept,
3686            hv1_hypercall::HvAssertVirtualInterrupt,
3687        ]
3688    );
3689
3690    /// Hypercalls that come through a tdg.vp.vmcall tdcall instruction.
3691    ///
3692    /// This is just to handle the proxy synic.
3693    const TDCALL_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3694        Self,
3695        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
3696    );
3697}
3698
3699impl AccessVpState for UhVpStateAccess<'_, '_, TdxBacked> {
3700    type Error = vp_state::Error;
3701
3702    fn caps(&self) -> &virt::x86::X86PartitionCapabilities {
3703        &self.vp.partition.caps
3704    }
3705
3706    fn commit(&mut self) -> Result<(), Self::Error> {
3707        Ok(())
3708    }
3709
3710    fn registers(&mut self) -> Result<Registers, Self::Error> {
3711        let gps = self.vp.runner.tdx_enter_guest_gps();
3712
3713        let cs = self.vp.read_segment(self.vtl, TdxSegmentReg::Cs);
3714        let ds = self.vp.read_segment(self.vtl, TdxSegmentReg::Ds);
3715        let es = self.vp.read_segment(self.vtl, TdxSegmentReg::Es);
3716        let fs = self.vp.read_segment(self.vtl, TdxSegmentReg::Fs);
3717        let gs = self.vp.read_segment(self.vtl, TdxSegmentReg::Gs);
3718        let ss = self.vp.read_segment(self.vtl, TdxSegmentReg::Ss);
3719        let tr = self.vp.read_segment(self.vtl, TdxSegmentReg::Tr);
3720        let ldtr = self.vp.read_segment(self.vtl, TdxSegmentReg::Ldtr);
3721
3722        let gdtr = self.vp.read_table_register(self.vtl, TdxTableReg::Gdtr);
3723        let idtr = self.vp.read_table_register(self.vtl, TdxTableReg::Idtr);
3724
3725        let cr0 = self.vp.read_cr0(self.vtl);
3726        let cr2 = self.vp.runner.cr2();
3727        let cr3 = self
3728            .vp
3729            .runner
3730            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3);
3731        let cr4 = self.vp.read_cr4(self.vtl);
3732
3733        let cr8 = self.vp.runner.tdx_apic_page(self.vtl).tpr.value >> 4;
3734
3735        let efer = self.vp.backing.vtls[self.vtl].efer;
3736
3737        Ok(Registers {
3738            rax: gps[TdxGp::RAX],
3739            rcx: gps[TdxGp::RCX],
3740            rdx: gps[TdxGp::RDX],
3741            rbx: gps[TdxGp::RBX],
3742            rsp: self.vp.backing.vtls[self.vtl].private_regs.rsp,
3743            rbp: gps[TdxGp::RBP],
3744            rsi: gps[TdxGp::RSI],
3745            rdi: gps[TdxGp::RDI],
3746            r8: gps[TdxGp::R8],
3747            r9: gps[TdxGp::R9],
3748            r10: gps[TdxGp::R10],
3749            r11: gps[TdxGp::R11],
3750            r12: gps[TdxGp::R12],
3751            r13: gps[TdxGp::R13],
3752            r14: gps[TdxGp::R14],
3753            r15: gps[TdxGp::R15],
3754            rip: self.vp.backing.vtls[self.vtl].private_regs.rip,
3755            rflags: self.vp.backing.vtls[self.vtl].private_regs.rflags,
3756            cs,
3757            ds,
3758            es,
3759            fs,
3760            gs,
3761            ss,
3762            tr,
3763            ldtr,
3764            gdtr,
3765            idtr,
3766            cr0,
3767            cr2,
3768            cr3,
3769            cr4,
3770            cr8: cr8.into(),
3771            efer,
3772        })
3773    }
3774
3775    fn set_registers(&mut self, value: &Registers) -> Result<(), Self::Error> {
3776        let Registers {
3777            rax,
3778            rcx,
3779            rdx,
3780            rbx,
3781            rsp,
3782            rbp,
3783            rsi,
3784            rdi,
3785            r8,
3786            r9,
3787            r10,
3788            r11,
3789            r12,
3790            r13,
3791            r14,
3792            r15,
3793            rip,
3794            rflags,
3795            cs,
3796            ds,
3797            es,
3798            fs,
3799            gs,
3800            ss,
3801            tr,
3802            ldtr,
3803            gdtr,
3804            idtr,
3805            cr0,
3806            cr2,
3807            cr3,
3808            cr4,
3809            cr8,
3810            efer,
3811        } = value;
3812
3813        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3814        gps[TdxGp::RAX] = *rax;
3815        gps[TdxGp::RCX] = *rcx;
3816        gps[TdxGp::RDX] = *rdx;
3817        gps[TdxGp::RBX] = *rbx;
3818        self.vp.backing.vtls[self.vtl].private_regs.rsp = *rsp;
3819        gps[TdxGp::RBP] = *rbp;
3820        gps[TdxGp::RSI] = *rsi;
3821        gps[TdxGp::RDI] = *rdi;
3822        gps[TdxGp::R8] = *r8;
3823        gps[TdxGp::R9] = *r9;
3824        gps[TdxGp::R10] = *r10;
3825        gps[TdxGp::R11] = *r11;
3826        gps[TdxGp::R12] = *r12;
3827        gps[TdxGp::R13] = *r13;
3828        gps[TdxGp::R14] = *r14;
3829        gps[TdxGp::R15] = *r15;
3830        self.vp.backing.vtls[self.vtl].private_regs.rip = *rip;
3831        // BUGBUG: setting rflags also updates interrupts in hcl.
3832        self.vp.backing.vtls[self.vtl].private_regs.rflags = *rflags;
3833
3834        // Set segment registers
3835        self.vp.write_segment(self.vtl, TdxSegmentReg::Cs, *cs)?;
3836        self.vp.write_segment(self.vtl, TdxSegmentReg::Ds, *ds)?;
3837        self.vp.write_segment(self.vtl, TdxSegmentReg::Es, *es)?;
3838        self.vp.write_segment(self.vtl, TdxSegmentReg::Fs, *fs)?;
3839        self.vp.write_segment(self.vtl, TdxSegmentReg::Gs, *gs)?;
3840        self.vp.write_segment(self.vtl, TdxSegmentReg::Ss, *ss)?;
3841        self.vp.write_segment(self.vtl, TdxSegmentReg::Tr, *tr)?;
3842        self.vp
3843            .write_segment(self.vtl, TdxSegmentReg::Ldtr, *ldtr)?;
3844
3845        // Set table registers
3846        self.vp
3847            .write_table_register(self.vtl, TdxTableReg::Gdtr, *gdtr);
3848        self.vp
3849            .write_table_register(self.vtl, TdxTableReg::Idtr, *idtr);
3850
3851        self.vp.write_cr0(self.vtl, *cr0)?;
3852
3853        // CR2 is shared with the kernel, so set it in the VP run page, which
3854        // will be applied before lower VTL entry.
3855        self.vp.runner.set_cr2(*cr2);
3856
3857        self.vp
3858            .runner
3859            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3, !0, *cr3);
3860
3861        self.vp.write_cr4(self.vtl, *cr4)?;
3862
3863        self.vp.runner.tdx_apic_page_mut(self.vtl).tpr.value = (*cr8 << 4) as u32;
3864
3865        self.vp.write_efer(self.vtl, *efer)?;
3866
3867        // Execution mode must be updated after setting EFER and CR0.
3868        self.vp.update_execution_mode(self.vtl);
3869
3870        Ok(())
3871    }
3872
3873    fn activity(&mut self) -> Result<vp::Activity, Self::Error> {
3874        let lapic = &self.vp.backing.cvm.lapics[self.vtl];
3875        let interruptibility: Interruptibility = self
3876            .vp
3877            .runner
3878            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
3879            .into();
3880        Ok(vp::Activity {
3881            mp_state: lapic.activity,
3882            nmi_pending: lapic.nmi_pending,
3883            nmi_masked: interruptibility.blocked_by_nmi(),
3884            interrupt_shadow: interruptibility.blocked_by_sti()
3885                || interruptibility.blocked_by_movss(),
3886            pending_event: None,        // TODO TDX
3887            pending_interruption: None, // TODO TDX
3888        })
3889    }
3890
3891    fn set_activity(&mut self, value: &vp::Activity) -> Result<(), Self::Error> {
3892        let &vp::Activity {
3893            mp_state,
3894            nmi_pending,
3895            nmi_masked,
3896            interrupt_shadow,
3897            pending_event: _,        // TODO TDX
3898            pending_interruption: _, // TODO TDX
3899        } = value;
3900        self.vp.backing.cvm.lapics[self.vtl].activity = mp_state;
3901        self.vp.backing.cvm.lapics[self.vtl].nmi_pending = nmi_pending;
3902        let interruptibility = Interruptibility::new()
3903            .with_blocked_by_movss(interrupt_shadow)
3904            .with_blocked_by_nmi(nmi_masked);
3905        self.vp.runner.write_vmcs32(
3906            self.vtl,
3907            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
3908            !0,
3909            interruptibility.into(),
3910        );
3911        Ok(())
3912    }
3913
3914    fn xsave(&mut self) -> Result<vp::Xsave, Self::Error> {
3915        // TODO: needed?
3916        Err(vp_state::Error::Unimplemented("xsave"))
3917    }
3918
3919    fn set_xsave(&mut self, _value: &vp::Xsave) -> Result<(), Self::Error> {
3920        // TODO: needed?
3921        Err(vp_state::Error::Unimplemented("xsave"))
3922    }
3923
3924    fn apic(&mut self) -> Result<vp::Apic, Self::Error> {
3925        self.vp.access_apic_without_offload(self.vtl, |vp| {
3926            Ok(vp.backing.cvm.lapics[self.vtl].lapic.save())
3927        })
3928    }
3929
3930    fn set_apic(&mut self, value: &vp::Apic) -> Result<(), Self::Error> {
3931        self.vp.access_apic_without_offload(self.vtl, |vp| {
3932            vp.backing.cvm.lapics[self.vtl]
3933                .lapic
3934                .restore(value)
3935                .map_err(vp_state::Error::InvalidApicBase)?;
3936
3937            Ok(())
3938        })
3939    }
3940
3941    fn xcr(&mut self) -> Result<vp::Xcr0, Self::Error> {
3942        Ok(vp::Xcr0 {
3943            value: self
3944                .vp
3945                .runner
3946                .get_vp_register(self.vtl, HvX64RegisterName::Xfem)
3947                .unwrap()
3948                .as_u64(),
3949        })
3950    }
3951
3952    fn set_xcr(&mut self, _value: &vp::Xcr0) -> Result<(), Self::Error> {
3953        Err(vp_state::Error::Unimplemented("xcr"))
3954    }
3955
3956    fn xss(&mut self) -> Result<vp::Xss, Self::Error> {
3957        Ok(vp::Xss {
3958            value: self.vp.backing.vtls[self.vtl].private_regs.msr_xss,
3959        })
3960    }
3961
3962    fn set_xss(&mut self, value: &vp::Xss) -> Result<(), Self::Error> {
3963        self.vp.backing.vtls[self.vtl].private_regs.msr_xss = value.value;
3964        Ok(())
3965    }
3966
3967    fn mtrrs(&mut self) -> Result<vp::Mtrrs, Self::Error> {
3968        Ok(vp::Mtrrs {
3969            msr_mtrr_def_type: 0, // TODO TDX: MTRRs
3970            fixed: [0; 11],       // TODO TDX: MTRRs
3971            variable: [0; 16],    // TODO TDX: MTRRs
3972        })
3973    }
3974
3975    fn set_mtrrs(&mut self, _value: &vp::Mtrrs) -> Result<(), Self::Error> {
3976        // TODO TDX: MTRRs
3977        Ok(())
3978    }
3979
3980    fn pat(&mut self) -> Result<vp::Pat, Self::Error> {
3981        let msr_cr_pat = self
3982            .vp
3983            .runner
3984            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT);
3985        Ok(vp::Pat { value: msr_cr_pat })
3986    }
3987
3988    fn set_pat(&mut self, value: &vp::Pat) -> Result<(), Self::Error> {
3989        self.vp
3990            .runner
3991            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value.value);
3992        Ok(())
3993    }
3994
3995    fn virtual_msrs(&mut self) -> Result<vp::VirtualMsrs, Self::Error> {
3996        let state = &self.vp.backing.vtls[self.vtl].private_regs;
3997
3998        let sysenter_cs = self
3999            .vp
4000            .runner
4001            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR)
4002            .into();
4003        let sysenter_eip = self
4004            .vp
4005            .runner
4006            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
4007        let sysenter_esp = self
4008            .vp
4009            .runner
4010            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
4011
4012        Ok(vp::VirtualMsrs {
4013            kernel_gs_base: state.msr_kernel_gs_base,
4014            sysenter_cs,
4015            sysenter_eip,
4016            sysenter_esp,
4017            star: state.msr_star,
4018            lstar: state.msr_lstar,
4019            cstar: self.vp.backing.vtls[self.vtl].msr_cstar,
4020            sfmask: state.msr_sfmask,
4021        })
4022    }
4023
4024    fn set_virtual_msrs(&mut self, value: &vp::VirtualMsrs) -> Result<(), Self::Error> {
4025        let &vp::VirtualMsrs {
4026            kernel_gs_base,
4027            sysenter_cs,
4028            sysenter_eip,
4029            sysenter_esp,
4030            star,
4031            lstar,
4032            cstar,
4033            sfmask,
4034        } = value;
4035
4036        let state = &mut self.vp.backing.vtls[self.vtl].private_regs;
4037        state.msr_kernel_gs_base = kernel_gs_base;
4038        state.msr_star = star;
4039        state.msr_lstar = lstar;
4040        state.msr_sfmask = sfmask;
4041
4042        self.vp.runner.write_vmcs32(
4043            self.vtl,
4044            VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
4045            !0,
4046            sysenter_cs as u32,
4047        );
4048        self.vp.runner.write_vmcs64(
4049            self.vtl,
4050            VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
4051            !0,
4052            sysenter_eip,
4053        );
4054        self.vp.runner.write_vmcs64(
4055            self.vtl,
4056            VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
4057            !0,
4058            sysenter_esp,
4059        );
4060
4061        self.vp.backing.vtls[self.vtl].msr_cstar = cstar;
4062
4063        Ok(())
4064    }
4065
4066    fn debug_regs(&mut self) -> Result<vp::DebugRegisters, Self::Error> {
4067        let mut values = [0u64.into(); 5];
4068        self.vp
4069            .runner
4070            .get_vp_registers(
4071                self.vtl,
4072                &[
4073                    HvX64RegisterName::Dr0,
4074                    HvX64RegisterName::Dr1,
4075                    HvX64RegisterName::Dr2,
4076                    HvX64RegisterName::Dr3,
4077                    HvX64RegisterName::Dr6,
4078                ],
4079                &mut values,
4080            )
4081            .map_err(vp_state::Error::GetRegisters)?;
4082
4083        let dr7 = self
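        // DR0-DR6 are accessed through the register interface above; DR7
        // lives in the VMCS guest area.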
4084            .vp
4085            .runner
4086            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7);
4087
4088        Ok(vp::DebugRegisters {
4089            dr0: values[0].as_u64(),
4090            dr1: values[1].as_u64(),
4091            dr2: values[2].as_u64(),
4092            dr3: values[3].as_u64(),
4093            dr6: values[4].as_u64(),
4094            dr7,
4095        })
4096    }
4097
4098    fn set_debug_regs(&mut self, value: &vp::DebugRegisters) -> Result<(), Self::Error> {
4099        let &vp::DebugRegisters {
4100            dr0,
4101            dr1,
4102            dr2,
4103            dr3,
4104            dr6,
4105            dr7,
4106        } = value;
4107        self.vp
4108            .runner
4109            .set_vp_registers(
4110                self.vtl,
4111                [
4112                    (HvX64RegisterName::Dr0, dr0),
4113                    (HvX64RegisterName::Dr1, dr1),
4114                    (HvX64RegisterName::Dr2, dr2),
4115                    (HvX64RegisterName::Dr3, dr3),
4116                    (HvX64RegisterName::Dr6, dr6),
4117                ],
4118            )
4119            .map_err(vp_state::Error::SetRegisters)?;
4120
4121        self.vp
4122            .runner
4123            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7, !0, dr7);
4124
4125        Ok(())
4126    }
4127
4128    fn tsc(&mut self) -> Result<vp::Tsc, Self::Error> {
4129        Err(vp_state::Error::Unimplemented("tsc"))
4130    }
4131
4132    fn set_tsc(&mut self, _value: &vp::Tsc) -> Result<(), Self::Error> {
4133        Err(vp_state::Error::Unimplemented("tsc"))
4134    }
4135
4136    fn tsc_aux(&mut self) -> Result<vp::TscAux, Self::Error> {
4137        Ok(vp::TscAux {
4138            value: self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux,
4139        })
4140    }
4141
4142    fn set_tsc_aux(&mut self, value: &vp::TscAux) -> Result<(), Self::Error> {
4143        self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux = value.value;
4144        Ok(())
4145    }
4146
4147    fn cet(&mut self) -> Result<vp::Cet, Self::Error> {
4148        Err(vp_state::Error::Unimplemented("cet"))
4149    }
4150
4151    fn set_cet(&mut self, _value: &vp::Cet) -> Result<(), Self::Error> {
4152        Err(vp_state::Error::Unimplemented("cet"))
4153    }
4154
4155    fn cet_ss(&mut self) -> Result<vp::CetSs, Self::Error> {
4156        Err(vp_state::Error::Unimplemented("cet_ss"))
4157    }
4158
4159    fn set_cet_ss(&mut self, _value: &vp::CetSs) -> Result<(), Self::Error> {
4160        Err(vp_state::Error::Unimplemented("cet_ss"))
4161    }
4162
4163    fn synic_msrs(&mut self) -> Result<vp::SyntheticMsrs, Self::Error> {
4164        Err(vp_state::Error::Unimplemented("synic_msrs"))
4165    }
4166
4167    fn set_synic_msrs(&mut self, _value: &vp::SyntheticMsrs) -> Result<(), Self::Error> {
4168        Err(vp_state::Error::Unimplemented("synic_msrs"))
4169    }
4170
4171    fn synic_message_page(&mut self) -> Result<vp::SynicMessagePage, Self::Error> {
4172        Err(vp_state::Error::Unimplemented("synic_message_page"))
4173    }
4174
4175    fn set_synic_message_page(&mut self, _value: &vp::SynicMessagePage) -> Result<(), Self::Error> {
4176        Err(vp_state::Error::Unimplemented("synic_message_page"))
4177    }
4178
4179    fn synic_event_flags_page(&mut self) -> Result<vp::SynicEventFlagsPage, Self::Error> {
4180        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4181    }
4182
4183    fn set_synic_event_flags_page(
4184        &mut self,
4185        _value: &vp::SynicEventFlagsPage,
4186    ) -> Result<(), Self::Error> {
4187        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4188    }
4189
4190    fn synic_message_queues(&mut self) -> Result<vp::SynicMessageQueues, Self::Error> {
4191        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4192    }
4193
4194    fn set_synic_message_queues(
4195        &mut self,
4196        _value: &vp::SynicMessageQueues,
4197    ) -> Result<(), Self::Error> {
4198        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4199    }
4200
4201    fn synic_timers(&mut self) -> Result<vp::SynicTimers, Self::Error> {
4202        Err(vp_state::Error::Unimplemented("synic_timers"))
4203    }
4204
4205    fn set_synic_timers(&mut self, _value: &vp::SynicTimers) -> Result<(), Self::Error> {
4206        Err(vp_state::Error::Unimplemented("synic_timers"))
4207    }
4208}
4209
/// Computes the index of the highest vector set in IRR/ISR, or 0 if no vector
/// is set. (Vectors 0-15 are invalid, so the zero return value is not
/// ambiguous.)
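///
/// For example, if `reg[2].value == 0x8000_0001` and every other register is
/// zero, the highest set bit is bit 31 of the third 32-bit chunk, so the
/// result is `2 * 32 + 31 = 95` (vector 0x5f).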
fn top_vector(reg: &[ApicRegister; 8]) -> u8 {
    reg.iter()
        .enumerate()
        .rev()
        .find_map(|(i, r)| {
            (r.value != 0).then(|| (i as u32 * 32 + (31 - r.value.leading_zeros())) as u8)
        })
        .unwrap_or(0)
}

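/// Hypercall I/O access for a TDX guest, backed by the guest register state
/// captured at TD entry/exit (a thin wrapper around [`UhHypercallHandler`]).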
struct TdHypercall<'a, 'b, T>(UhHypercallHandler<'a, 'b, T, TdxBacked>);

impl<'a, 'b, T> AsHandler<UhHypercallHandler<'a, 'b, T, TdxBacked>> for TdHypercall<'a, 'b, T> {
    fn as_handler(&mut self) -> &mut UhHypercallHandler<'a, 'b, T, TdxBacked> {
        &mut self.0
    }
}

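// Register conventions used by this implementation (as consumed below): the
// hypercall control value arrives in R10 and the result is returned in R11;
// RDX and R8 carry the input/output GPAs (or the first fast register pair),
// with any additional fast pairs carried in the XMM registers. Completing a
// hypercall clears R10 and advances RIP past the 4-byte TDCALL instruction,
// while a retry writes the control value back to R10 so the guest reissues
// the call.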
impl<T> HypercallIo for TdHypercall<'_, '_, T> {
    fn advance_ip(&mut self) {
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = 0;
        self.0.vp.backing.vtls[self.0.intercepted_vtl]
            .private_regs
            .rip = self.0.vp.backing.vtls[self.0.intercepted_vtl]
            .private_regs
            .rip
            .wrapping_add(4);
    }

    fn retry(&mut self, control: u64) {
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = control;
        self.set_result(hvdef::hypercall::HypercallOutput::from(HvError::Timeout).into());
    }

    fn control(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R10]
    }

    fn input_gpa(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::RDX]
    }

    fn output_gpa(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R8]
    }

    fn fast_register_pair_count(&mut self) -> usize {
        7
    }

    fn extended_fast_hypercalls_ok(&mut self) -> bool {
        false
    }

    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
        self.fast_regs(0, buf);
        buf.len()
    }

    fn fast_output(&mut self, _starting_pair_index: usize, buf: &[[u64; 2]]) {
        assert!(buf.is_empty());
    }

    fn vtl_input(&mut self) -> u64 {
        unreachable!()
    }

    fn set_result(&mut self, n: u64) {
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = n;
    }

    fn fast_regs(&mut self, starting_pair_index: usize, buf: &mut [[u64; 2]]) {
        let regs = self.0.vp.runner.tdx_enter_guest_gps();
        let fx_state = self.0.vp.runner.fx_state();
        for (i, [low, high]) in buf.iter_mut().enumerate() {
            let index = i + starting_pair_index;
            if index == 0 {
                *low = regs[TdxGp::RDX];
                *high = regs[TdxGp::R8];
            } else {
                let value = u128::from_ne_bytes(fx_state.xmm[index - 1]);
                *low = value as u64;
                *high = (value >> 64) as u64;
            }
        }
    }
}

impl<T> hv1_hypercall::VtlSwitchOps for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn advance_ip(&mut self) {
        let long_mode = self.vp.long_mode(self.intercepted_vtl);
        let mut io = hv1_hypercall::X64RegisterIo::new(self, long_mode);
        io.advance_ip();
    }

    fn inject_invalid_opcode_fault(&mut self) {
        self.vp.backing.vtls[self.intercepted_vtl].interruption_information =
            InterruptionInformation::new()
                .with_valid(true)
                .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
                .with_vector(x86defs::Exception::INVALID_OPCODE.0);
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressList for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn flush_virtual_address_list(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        hv1_hypercall::FlushVirtualAddressListEx::flush_virtual_address_list_ex(
            self,
            processor_set,
            flags,
            gva_ranges,
        )
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressListEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_list_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        self.hcvm_validate_flush_inputs(processor_set, flags, true)
            .map_err(|e| (e, 0))?;

        let vtl = self.intercepted_vtl;
        let flush_state = &self.vp.shared.flush_state[vtl];

        // If we fail to add the ranges to the list for any reason, promote this request to a flush entire.
        if let Err(()) = Self::add_ranges_to_tlb_flush_list(
            flush_state,
            gva_ranges,
            flags.use_extended_range_format(),
        ) {
            if flags.non_global_mappings_only() {
                flush_state
                    .flush_entire_non_global_counter
                    .fetch_add(1, Ordering::Relaxed);
            } else {
                flush_state
                    .flush_entire_counter
                    .fetch_add(1, Ordering::Relaxed);
            }
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpace
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        hv1_hypercall::FlushVirtualAddressSpaceEx::flush_virtual_address_space_ex(
            self,
            processor_set,
            flags,
        )
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpaceEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        self.hcvm_validate_flush_inputs(processor_set, flags, false)?;
        let vtl = self.intercepted_vtl;

        let flush_state = &self.vp.shared.flush_state[vtl];

        // Set flush entire.
        if flags.non_global_mappings_only() {
            flush_state
                .flush_entire_non_global_counter
                .fetch_add(1, Ordering::Relaxed);
        } else {
            flush_state
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
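    /// Attempts to append the given GVA ranges to the partition's pending TLB
    /// flush list. Returns `Err(())` if the ranges cannot be represented, in
    /// which case the caller must promote the request to a flush entire.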
    fn add_ranges_to_tlb_flush_list(
        flush_state: &TdxPartitionFlushState,
        gva_ranges: &[HvGvaRange],
        use_extended_range_format: bool,
    ) -> Result<(), ()> {
        // If there are more GVAs than the list can hold, there's no point in filling the list.
        if gva_ranges.len() > FLUSH_GVA_LIST_SIZE {
            return Err(());
        }

        if use_extended_range_format
            && gva_ranges
                .iter()
                .any(|range| range.as_extended().large_page())
        {
            // TDX does not provide a way to flush large page ranges, so we
            // have to promote this request to a flush entire.
            return Err(());
        }

        flush_state
            .gva_list
            .write()
            .extend(gva_ranges.iter().copied());

        Ok(())
    }
}

impl TdxTlbLockFlushAccess<'_> {
    fn wake_processors_for_tlb_flush(
        &mut self,
        target_vtl: GuestVtl,
        processor_set: Option<ProcessorSet<'_>>,
    ) {
        match processor_set {
            Some(processors) => {
                self.wake_processors_for_tlb_flush_inner(target_vtl, processors);
            }
            None => self.wake_processors_for_tlb_flush_inner(
                target_vtl,
                0..(self.partition.vps.len() as u32),
            ),
        }
    }

    fn wake_processors_for_tlb_flush_inner(
        &mut self,
        target_vtl: GuestVtl,
        processors: impl IntoIterator<Item = u32>,
    ) {
        // Use SeqCst ordering to ensure that we are observing the most
        // up-to-date value from other VPs. Otherwise we might not send a
        // wake to a VP in a lower VTL, which could cause TLB lock holders
        // to be stuck waiting until the target_vp happens to switch into
        // VTL 2.
        // We use a single fence to avoid having to take a SeqCst load
        // for each VP.
        std::sync::atomic::fence(Ordering::SeqCst);
        self.partition.hcl.kick_cpus(
            processors.into_iter().filter(|&vp| {
                self.shared.active_vtl[vp as usize].load(Ordering::Relaxed) == target_vtl as u8
            }),
            true,
            true,
        );
    }
}

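/// Access to the shared TDX TLB flush and TLB lock state, used to request
/// flushes and wake target processors, optionally on behalf of a specific VP
/// (`vp_index`).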
struct TdxTlbLockFlushAccess<'a> {
    vp_index: Option<VpIndex>,
    partition: &'a UhPartitionInner,
    shared: &'a TdxBackedShared,
}

impl TlbFlushLockAccess for TdxTlbLockFlushAccess<'_> {
    fn flush(&mut self, vtl: GuestVtl) {
        self.shared.flush_state[vtl]
            .flush_entire_counter
            .fetch_add(1, Ordering::Relaxed);

        self.wake_processors_for_tlb_flush(vtl, None);
        self.set_wait_for_tlb_locks(vtl);
    }

    fn flush_entire(&mut self) {
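        // Bump the flush-entire counter for both VTLs before sending any
        // wakes, so that a woken VP already observes every pending flush.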
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.shared.flush_state[vtl]
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.wake_processors_for_tlb_flush(vtl, None);
            self.set_wait_for_tlb_locks(vtl);
        }
    }

    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl) {
        if let Some(vp_index) = self.vp_index {
            hardware_cvm::tlb_lock::TlbLockAccess {
                vp_index,
                cvm_partition: &self.shared.cvm,
            }
            .set_wait_for_tlb_locks(vtl);
        }
    }
}

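/// Saved state support for TDX processors. Save/restore is not yet
/// implemented, so saving fails with `SaveError::NotSupported`.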
mod save_restore {
    use super::TdxBacked;
    use super::UhProcessor;
    use vmcore::save_restore::RestoreError;
    use vmcore::save_restore::SaveError;
    use vmcore::save_restore::SaveRestore;
    use vmcore::save_restore::SavedStateNotSupported;

    impl SaveRestore for UhProcessor<'_, TdxBacked> {
        type SavedState = SavedStateNotSupported;

        fn save(&mut self) -> Result<Self::SavedState, SaveError> {
            Err(SaveError::NotSupported)
        }

        fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
            match state {}
        }
    }
}