virt_mshv_vtl/processor/tdx/mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Processor support for TDX partitions.
5
6mod tlb_flush;
7
8use super::BackingPrivate;
9use super::BackingSharedParams;
10use super::HardwareIsolatedBacking;
11use super::UhEmulationState;
12use super::UhHypercallHandler;
13use super::hardware_cvm;
14use super::vp_state;
15use super::vp_state::UhVpStateAccess;
16use crate::BackingShared;
17use crate::GuestVtl;
18use crate::TlbFlushLockAccess;
19use crate::UhCvmPartitionState;
20use crate::UhCvmVpState;
21use crate::UhPartitionInner;
22use crate::UhPartitionNewParams;
23use crate::UhProcessor;
24use crate::WakeReason;
25use cvm_tracing::CVM_ALLOWED;
26use cvm_tracing::CVM_CONFIDENTIAL;
27use guestmem::GuestMemory;
28use hcl::ioctl::ProcessorRunner;
29use hcl::ioctl::tdx::Tdx;
30use hcl::ioctl::tdx::TdxPrivateRegs;
31use hcl::protocol::hcl_intr_offload_flags;
32use hcl::protocol::tdx_tdg_vp_enter_exit_info;
33use hv1_emulator::hv::ProcessorVtlHv;
34use hv1_emulator::synic::GlobalSynic;
35use hv1_emulator::synic::ProcessorSynic;
36use hv1_hypercall::AsHandler;
37use hv1_hypercall::HvRepResult;
38use hv1_hypercall::HypercallIo;
39use hv1_structs::ProcessorSet;
40use hv1_structs::VtlArray;
41use hvdef::HV_PAGE_SIZE;
42use hvdef::HvError;
43use hvdef::HvSynicSimpSiefp;
44use hvdef::HvX64PendingExceptionEvent;
45use hvdef::HvX64RegisterName;
46use hvdef::Vtl;
47use hvdef::hypercall::HvFlushFlags;
48use hvdef::hypercall::HvGvaRange;
49use inspect::Inspect;
50use inspect::InspectMut;
51use inspect_counters::Counter;
52use std::sync::atomic::AtomicU8;
53use std::sync::atomic::Ordering;
54use thiserror::Error;
55use tlb_flush::FLUSH_GVA_LIST_SIZE;
56use tlb_flush::TdxFlushState;
57use tlb_flush::TdxPartitionFlushState;
58use virt::EmulatorMonitorSupport;
59use virt::Processor;
60use virt::VpHaltReason;
61use virt::VpIndex;
62use virt::io::CpuIo;
63use virt::state::StateElement;
64use virt::vp;
65use virt::vp::AccessVpState;
66use virt::vp::MpState;
67use virt::vp::Registers;
68use virt::x86::MsrError;
69use virt::x86::MsrErrorExt;
70use virt::x86::SegmentRegister;
71use virt::x86::TableRegister;
72use virt_support_apic::ApicClient;
73use virt_support_apic::OffloadNotSupported;
74use virt_support_x86emu::emulate::EmulatedMemoryOperation;
75use virt_support_x86emu::emulate::EmulatorSupport as X86EmulatorSupport;
76use virt_support_x86emu::emulate::TranslateMode;
77use virt_support_x86emu::emulate::emulate_insn_memory_op;
78use virt_support_x86emu::emulate::emulate_io;
79use virt_support_x86emu::emulate::emulate_translate_gva;
80use virt_support_x86emu::translate::TranslationRegisters;
81use vmcore::vmtime::VmTimeAccess;
82use x86defs::RFlags;
83use x86defs::X64_CR0_ET;
84use x86defs::X64_CR0_NE;
85use x86defs::X64_CR0_PE;
86use x86defs::X64_CR0_PG;
87use x86defs::X64_CR4_MCE;
88use x86defs::X64_CR4_UMIP;
89use x86defs::X64_CR4_VMXE;
90use x86defs::X64_EFER_FFXSR;
91use x86defs::X64_EFER_LMA;
92use x86defs::X64_EFER_LME;
93use x86defs::X64_EFER_NXE;
94use x86defs::X64_EFER_SVME;
95use x86defs::X86X_MSR_EFER;
96use x86defs::apic::X2APIC_MSR_BASE;
97use x86defs::tdx::TdCallResultCode;
98use x86defs::tdx::TdVmCallR10Result;
99use x86defs::tdx::TdxGp;
100use x86defs::tdx::TdxInstructionInfo;
101use x86defs::tdx::TdxL2Ctls;
102use x86defs::tdx::TdxVpEnterRaxResult;
103use x86defs::vmx::ApicPage;
104use x86defs::vmx::ApicRegister;
105use x86defs::vmx::CR_ACCESS_TYPE_LMSW;
106use x86defs::vmx::CR_ACCESS_TYPE_MOV_TO_CR;
107use x86defs::vmx::CrAccessQualification;
108use x86defs::vmx::ExitQualificationIo;
109use x86defs::vmx::GdtrOrIdtrInstruction;
110use x86defs::vmx::GdtrOrIdtrInstructionInfo;
111use x86defs::vmx::INTERRUPT_TYPE_EXTERNAL;
112use x86defs::vmx::INTERRUPT_TYPE_HARDWARE_EXCEPTION;
113use x86defs::vmx::INTERRUPT_TYPE_NMI;
114use x86defs::vmx::IO_SIZE_8_BIT;
115use x86defs::vmx::IO_SIZE_16_BIT;
116use x86defs::vmx::IO_SIZE_32_BIT;
117use x86defs::vmx::Interruptibility;
118use x86defs::vmx::InterruptionInformation;
119use x86defs::vmx::LdtrOrTrInstruction;
120use x86defs::vmx::LdtrOrTrInstructionInfo;
121use x86defs::vmx::ProcessorControls;
122use x86defs::vmx::SecondaryProcessorControls;
123use x86defs::vmx::VMX_ENTRY_CONTROL_LONG_MODE_GUEST;
124use x86defs::vmx::VMX_FEATURE_CONTROL_LOCKED;
125use x86defs::vmx::VmcsField;
126use x86defs::vmx::VmxEptExitQualification;
127use x86defs::vmx::VmxExit;
128use x86defs::vmx::VmxExitBasic;
129use x86emu::Gp;
130use x86emu::Segment;
131
132/// MSRs that are allowed to be read by the guest without interception.
133const MSR_ALLOWED_READ: &[u32] = &[
134    x86defs::X86X_MSR_TSC,
135    x86defs::X86X_MSR_TSC_AUX,
136    X86X_MSR_EFER,
137    x86defs::X86X_MSR_STAR,
138    x86defs::X86X_MSR_LSTAR,
139    x86defs::X86X_MSR_SFMASK,
140    x86defs::X86X_MSR_SYSENTER_CS,
141    x86defs::X86X_MSR_SYSENTER_ESP,
142    x86defs::X86X_MSR_SYSENTER_EIP,
143];
144
145/// MSRs that are allowed to be read and written by the guest without interception.
146const MSR_ALLOWED_READ_WRITE: &[u32] = &[
147    x86defs::X64_MSR_FS_BASE,
148    x86defs::X64_MSR_GS_BASE,
149    x86defs::X64_MSR_KERNEL_GS_BASE,
150    x86defs::X86X_MSR_SPEC_CTRL,
151    x86defs::X86X_MSR_U_CET,
152    x86defs::X86X_MSR_S_CET,
153    x86defs::X86X_MSR_PL0_SSP,
154    x86defs::X86X_MSR_PL1_SSP,
155    x86defs::X86X_MSR_PL2_SSP,
156    x86defs::X86X_MSR_PL3_SSP,
157    x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
158    x86defs::X86X_IA32_MSR_XFD,
159    x86defs::X86X_IA32_MSR_XFD_ERR,
160];
161
162#[derive(Debug, Error)]
163#[error("unknown exit {0:#x?}")]
164struct UnknownVmxExit(VmxExit);
165
166#[derive(Debug, Error)]
167#[error("bad guest state on VP.ENTER")]
168struct VmxBadGuestState;
169
170#[derive(Debug, Error)]
171#[error("failed to run")]
172struct TdxRunVpError(#[source] hcl::ioctl::Error);
173
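/// Decoded view over the raw TDG.VP.ENTER exit information. The register
/// layout mirrors the accessors below: RAX carries the VP.ENTER result and
/// VMX exit reason, RCX the exit qualification, RDX and R8 the guest linear
/// and physical addresses for EPT violations, R9 and R10 the exit
/// interruption and IDT vectoring information (error codes in the upper 32
/// bits), R11 the instruction information, RSI/RDI the CS segment, and the
/// low bits of R12 the CPL.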
174#[derive(Debug)]
175struct TdxExit<'a>(&'a tdx_tdg_vp_enter_exit_info);
176
177impl TdxExit<'_> {
178    fn code(&self) -> TdxVpEnterRaxResult {
179        self.0.rax.into()
180    }
181    fn qualification(&self) -> u64 {
182        self.0.rcx
183    }
184    fn gla(&self) -> Option<u64> {
185        // Only valid for EPT exits.
186        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
187            Some(self.0.rdx)
188        } else {
189            None
190        }
191    }
192    fn gpa(&self) -> Option<u64> {
193        // Only valid for EPT exits.
194        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
195            Some(self.0.r8)
196        } else {
197            None
198        }
199    }
200    fn _exit_interruption_info(&self) -> InterruptionInformation {
201        (self.0.r9 as u32).into()
202    }
203    fn _exit_interruption_error_code(&self) -> u32 {
204        (self.0.r9 >> 32) as u32
205    }
206    fn idt_vectoring_info(&self) -> InterruptionInformation {
207        (self.0.r10 as u32).into()
208    }
209    fn idt_vectoring_error_code(&self) -> u32 {
210        (self.0.r10 >> 32) as u32
211    }
212    fn instr_info(&self) -> TdxInstructionInfo {
213        self.0.r11.into()
214    }
215    fn cs(&self) -> SegmentRegister {
216        SegmentRegister {
217            selector: self.0.rsi as u16,
218            base: self.0.rdi,
219            limit: (self.0.rsi >> 32) as u32,
220            attributes: (self.0.rsi >> 16) as u16,
221        }
222    }
223    fn cpl(&self) -> u8 {
224        self.0.r12 as u8 & 3
225    }
226}
227
228/// Registers that can be virtual and shadowed.
229#[derive(Debug, Inspect)]
230enum ShadowedRegister {
231    Cr0,
232    Cr4,
233}
234
235impl ShadowedRegister {
236    fn name(&self) -> &'static str {
237        match self {
238            Self::Cr0 => "cr0",
239            Self::Cr4 => "cr4",
240        }
241    }
242
243    fn physical_vmcs_field(&self) -> VmcsField {
244        match self {
245            Self::Cr0 => VmcsField::VMX_VMCS_GUEST_CR0,
246            Self::Cr4 => VmcsField::VMX_VMCS_GUEST_CR4,
247        }
248    }
249
250    fn shadow_vmcs_field(&self) -> VmcsField {
251        match self {
252            Self::Cr0 => VmcsField::VMX_VMCS_CR0_READ_SHADOW,
253            Self::Cr4 => VmcsField::VMX_VMCS_CR4_READ_SHADOW,
254        }
255    }
256
257    fn guest_owned_mask(&self) -> u64 {
258        // Control register bits that are guest owned by default. A bit is guest
259        // owned when the physical register bit is always set to the virtual
260        // register bit (subject to validation of the virtual register).
261        match self {
262            Self::Cr0 => {
263                X64_CR0_ET
264                    | x86defs::X64_CR0_MP
265                    | x86defs::X64_CR0_EM
266                    | x86defs::X64_CR0_TS
267                    | x86defs::X64_CR0_WP
268                    | x86defs::X64_CR0_AM
269                    | X64_CR0_PE
270                    | X64_CR0_PG
271            }
272            Self::Cr4 => {
273                x86defs::X64_CR4_VME
274                    | x86defs::X64_CR4_PVI
275                    | x86defs::X64_CR4_TSD
276                    | x86defs::X64_CR4_DE
277                    | x86defs::X64_CR4_PSE
278                    | x86defs::X64_CR4_PAE
279                    | x86defs::X64_CR4_PGE
280                    | x86defs::X64_CR4_PCE
281                    | x86defs::X64_CR4_FXSR
282                    | x86defs::X64_CR4_XMMEXCPT
283                    | X64_CR4_UMIP
284                    | x86defs::X64_CR4_LA57
285                    | x86defs::X64_CR4_RWFSGS
286                    | x86defs::X64_CR4_PCIDE
287                    | x86defs::X64_CR4_OSXSAVE
288                    | x86defs::X64_CR4_SMEP
289                    | x86defs::X64_CR4_SMAP
290                    | x86defs::X64_CR4_CET
291            }
292        }
293    }
294}
295
296/// A virtual register that is shadowed by the virtstack.
297///
298/// Some bits are owned by the guest while others are owned by the virtstack,
299/// due to TDX requirements.
300#[derive(Inspect)]
301struct VirtualRegister {
302    /// The register being shadowed.
303    register: ShadowedRegister,
304    /// The VTL this register is shadowed for.
305    vtl: GuestVtl,
306    /// The value the guest sees.
307    shadow_value: u64,
308    /// Additional constraints on bits.
309    allowed_bits: u64,
310}
311
312impl VirtualRegister {
313    fn new(reg: ShadowedRegister, vtl: GuestVtl, initial_value: u64, allowed_bits: u64) -> Self {
314        Self {
315            register: reg,
316            vtl,
317            shadow_value: initial_value,
318            allowed_bits,
319        }
320    }
321
322    /// Write a new value to the virtual register. This updates host owned bits
323    /// in the shadowed value, and updates guest owned bits in the physical
324    /// register in the vmcs.
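    ///
    /// For example (illustrative): for CR0, PG is guest owned while NE is host
    /// owned, so a write that toggles PG updates PG in VMX_VMCS_GUEST_CR0 but
    /// leaves NE at whatever the virtstack programmed; the full value is always
    /// written to VMX_VMCS_CR0_READ_SHADOW so that guest reads observe it.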
325    fn write<'a>(
326        &mut self,
327        value: u64,
328        runner: &mut ProcessorRunner<'a, Tdx<'a>>,
329    ) -> Result<(), vp_state::Error> {
330        tracing::trace!(?self.register, value, "write virtual register");
331
332        if value & !self.allowed_bits != 0 {
333            return Err(vp_state::Error::InvalidValue(
334                value,
335                self.register.name(),
336                "disallowed bit set",
337            ));
338        }
339
340        // If guest owned bits of the physical register have changed, then update
341        // the guest owned bits of the physical field.
342        let old_physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
343
344        tracing::trace!(old_physical_reg, "old_physical_reg");
345
346        let guest_owned_mask = self.register.guest_owned_mask();
347        if (old_physical_reg ^ value) & guest_owned_mask != 0 {
348            let new_physical_reg =
349                (old_physical_reg & !guest_owned_mask) | (value & guest_owned_mask);
350
351            tracing::trace!(new_physical_reg, "new_physical_reg");
352
353            runner.write_vmcs64(
354                self.vtl,
355                self.register.physical_vmcs_field(),
356                !0,
357                new_physical_reg,
358            );
359        }
360
361        self.shadow_value = value;
362        runner.write_vmcs64(self.vtl, self.register.shadow_vmcs_field(), !0, value);
363        Ok(())
364    }
365
366    fn read<'a>(&self, runner: &ProcessorRunner<'a, Tdx<'a>>) -> u64 {
367        let physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
368
369        // Get the bits owned by the host from the shadow and the bits owned by the
370        // guest from the physical value.
371        let guest_owned_mask = self.register.guest_owned_mask();
372        (self.shadow_value & !guest_owned_mask) | (physical_reg & guest_owned_mask)
373    }
374}
375
376/// Backing for TDX partitions.
377#[derive(InspectMut)]
378pub struct TdxBacked {
379    #[inspect(mut)]
380    vtls: VtlArray<TdxVtl, 2>,
381
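    /// Per-VP emulation state for the untrusted SINTs; present only when the
    /// partition-wide untrusted synic is enabled (see
    /// [`TdxBackedShared::untrusted_synic`]).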
382    untrusted_synic: Option<ProcessorSynic>,
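    /// Cached copy of the EOI exit bitmap last written to the VMCS, used to
    /// avoid redundant VMCS writes when the TMR has not changed.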
383    #[inspect(hex, iter_by_index)]
384    eoi_exit_bitmap: [u64; 4],
385
386    /// A mapped page used for issuing INVGLA hypercalls.
387    #[inspect(skip)]
388    flush_page: user_driver::memory::MemoryBlock,
389
390    #[inspect(flatten)]
391    cvm: UhCvmVpState,
392}
393
394#[derive(InspectMut)]
395struct TdxVtl {
396    /// The EFER value for this VP.
397    efer: u64,
398    /// Virtual cr0.
399    cr0: VirtualRegister,
400    /// Virtual cr4.
401    cr4: VirtualRegister,
402
403    // CSTAR doesn't exist on TDX, but Windows likes to verify that values are sticky.
404    msr_cstar: u64,
405
406    tpr_threshold: u8,
407    #[inspect(skip)]
408    processor_controls: ProcessorControls,
409    #[inspect(skip)]
410    interruption_information: InterruptionInformation,
411    exception_error_code: u32,
412    interruption_set: bool,
413
414    #[inspect(mut)]
415    private_regs: TdxPrivateRegs,
416
417    /// TDX-only TLB flush state.
418    flush_state: TdxFlushState,
419
420    enter_stats: EnterStats,
421    exit_stats: ExitStats,
422}
423
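/// Lazily populated cache of segment register and CR0 values used while
/// emulating an instruction.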
424#[derive(Default)]
425pub struct TdxEmulationCache {
426    segs: [Option<SegmentRegister>; 6],
427    cr0: Option<u64>,
428}
429
430#[derive(Inspect, Default)]
431struct EnterStats {
432    success: Counter,
433    host_routed_async: Counter,
434    l2_exit_pending_intr: Counter,
435    pending_intr: Counter,
436    host_routed_td_vmcall: Counter,
437}
438
439#[derive(Inspect, Default)]
440struct ExitStats {
441    io: Counter,
442    msr_read: Counter,
443    msr_write: Counter,
444    ept_violation: Counter,
445    cpuid: Counter,
446    cr_access: Counter,
447    xsetbv: Counter,
448    tpr_below_threshold: Counter,
449    interrupt_window: Counter,
450    nmi_window: Counter,
451    vmcall: Counter,
452    smi_intr: Counter,
453    wbinvd: Counter,
454    hw_interrupt: Counter,
455    tdcall: Counter,
456    hlt: Counter,
457    pause: Counter,
458    needs_interrupt_reinject: Counter,
459    exception: Counter,
460    descriptor_table: Counter,
461}
462
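/// Indexes of the per-VP direct overlay pages (SIMP and SIEFP) allocated from
/// the shared pool; `Count` is the total number of pages required (see
/// [`TdxBacked::shared_pages_required_per_cpu`]).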
463enum UhDirectOverlay {
464    Sipp,
465    Sifp,
466    Count,
467}
468
469impl HardwareIsolatedBacking for TdxBacked {
470    fn cvm_state(&self) -> &UhCvmVpState {
471        &self.cvm
472    }
473
474    fn cvm_state_mut(&mut self) -> &mut UhCvmVpState {
475        &mut self.cvm
476    }
477
478    fn cvm_partition_state(shared: &Self::Shared) -> &UhCvmPartitionState {
479        &shared.cvm
480    }
481
482    fn switch_vtl(this: &mut UhProcessor<'_, Self>, _source_vtl: GuestVtl, target_vtl: GuestVtl) {
483        // The GPs, Fxsave, and CR2 are saved in the shared kernel state. No copying needed.
484        // Debug registers and XFEM are shared architecturally. No copying needed.
485
486        this.backing.cvm_state_mut().exit_vtl = target_vtl;
487    }
488
489    fn translation_registers(
490        &self,
491        this: &UhProcessor<'_, Self>,
492        vtl: GuestVtl,
493    ) -> TranslationRegisters {
494        let cr0 = this.backing.vtls[vtl].cr0.read(&this.runner);
495        let cr4 = this.backing.vtls[vtl].cr4.read(&this.runner);
496        let efer = this.backing.vtls[vtl].efer;
497        let cr3 = this.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
498        let ss = this.read_segment(vtl, TdxSegmentReg::Ss).into();
499        let rflags = this.backing.vtls[vtl].private_regs.rflags;
500
501        TranslationRegisters {
502            cr0,
503            cr4,
504            efer,
505            cr3,
506            ss,
507            rflags,
508            encryption_mode: this.partition.caps.vtom.map_or(
509                virt_support_x86emu::translate::EncryptionMode::None,
510                virt_support_x86emu::translate::EncryptionMode::Vtom,
511            ),
512        }
513    }
514
515    fn tlb_flush_lock_access<'a>(
516        vp_index: Option<VpIndex>,
517        partition: &'a UhPartitionInner,
518        shared: &'a Self::Shared,
519    ) -> impl TlbFlushLockAccess + 'a {
520        TdxTlbLockFlushAccess {
521            vp_index,
522            partition,
523            shared,
524        }
525    }
526
527    fn pending_event_vector(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> Option<u8> {
528        let event_inject = this.backing.vtls[vtl].interruption_information;
529        if event_inject.valid() {
530            Some(event_inject.vector())
531        } else {
532            None
533        }
534    }
535
536    fn set_pending_exception(
537        this: &mut UhProcessor<'_, Self>,
538        vtl: GuestVtl,
539        event: HvX64PendingExceptionEvent,
540    ) {
541        let new_intr = InterruptionInformation::new()
542            .with_valid(true)
543            .with_deliver_error_code(event.deliver_error_code())
544            .with_vector(event.vector().try_into().unwrap())
545            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
546
547        this.backing.vtls[vtl].interruption_information = new_intr;
548        this.backing.vtls[vtl].exception_error_code = event.error_code();
549    }
550
551    fn cr0(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
552        this.read_cr0(vtl)
553    }
554
555    fn cr4(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
556        this.read_cr4(vtl)
557    }
558
559    fn intercept_message_state(
560        this: &UhProcessor<'_, Self>,
561        vtl: GuestVtl,
562        include_optional_state: bool,
563    ) -> super::InterceptMessageState {
564        let exit = TdxExit(this.runner.tdx_vp_enter_exit_info());
565        let backing_vtl = &this.backing.vtls[vtl];
566        let shared_gps = this.runner.tdx_enter_guest_gps();
567
568        super::InterceptMessageState {
569            instruction_length_and_cr8: exit.instr_info().length() as u8,
570            cpl: exit.cpl(),
571            efer_lma: backing_vtl.efer & X64_EFER_LMA != 0,
572            cs: exit.cs().into(),
573            rip: backing_vtl.private_regs.rip,
574            rflags: backing_vtl.private_regs.rflags,
575            rax: shared_gps[TdxGp::RAX],
576            rdx: shared_gps[TdxGp::RDX],
577            optional: if include_optional_state {
578                Some(super::InterceptMessageOptionalState {
579                    ds: this.read_segment(vtl, TdxSegmentReg::Ds).into(),
580                    es: this.read_segment(vtl, TdxSegmentReg::Es).into(),
581                })
582            } else {
583                None
584            },
585            rcx: shared_gps[TdxGp::RCX],
586            rsi: shared_gps[TdxGp::RSI],
587            rdi: shared_gps[TdxGp::RDI],
588        }
589    }
590
591    fn cr_intercept_registration(
592        this: &mut UhProcessor<'_, Self>,
593        intercept_control: hvdef::HvRegisterCrInterceptControl,
594    ) {
595        // Today we only support intercepting VTL 0 on behalf of VTL 1.
596        let vtl = GuestVtl::Vtl0;
597        let intercept_masks = &this
598            .backing
599            .cvm_state()
600            .vtl1
601            .as_ref()
602            .unwrap()
603            .reg_intercept;
604
605        // Update CR0 and CR4 intercept masks in the VMCS.
606        this.runner.write_vmcs64(
607            vtl,
608            VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
609            !0,
610            this.shared.cr_guest_host_mask(ShadowedRegister::Cr0)
611                | if intercept_control.cr0_write() {
612                    intercept_masks.cr0_mask
613                } else {
614                    0
615                },
616        );
617        this.runner.write_vmcs64(
618            vtl,
619            VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
620            !0,
621            this.shared.cr_guest_host_mask(ShadowedRegister::Cr4)
622                | if intercept_control.cr4_write() {
623                    intercept_masks.cr4_mask
624                } else {
625                    0
626                },
627        );
628
629        // Update descriptor table intercepts.
630        let intercept_tables = intercept_control.gdtr_write()
631            | intercept_control.idtr_write()
632            | intercept_control.ldtr_write()
633            | intercept_control.tr_write();
634        this.runner.write_vmcs32(
635            vtl,
636            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
637            SecondaryProcessorControls::new()
638                .with_descriptor_table_exiting(true)
639                .into_bits(),
640            SecondaryProcessorControls::new()
641                .with_descriptor_table_exiting(intercept_tables)
642                .into_bits(),
643        );
644
645        // Update MSR intercepts. We only need to update those that are allowed
646        // to be passed through, as the default otherwise is to always intercept.
647        // See [`MSR_ALLOWED_READ_WRITE`].
648        this.runner.set_msr_bit(
649            vtl,
650            x86defs::X86X_MSR_S_CET,
651            true,
652            intercept_control.msr_scet_write(),
653        );
654        this.runner.set_msr_bit(
655            vtl,
656            x86defs::X86X_MSR_PL0_SSP,
657            true,
658            intercept_control.msr_pls_ssp_write(),
659        );
660        this.runner.set_msr_bit(
661            vtl,
662            x86defs::X86X_MSR_PL1_SSP,
663            true,
664            intercept_control.msr_pls_ssp_write(),
665        );
666        this.runner.set_msr_bit(
667            vtl,
668            x86defs::X86X_MSR_PL2_SSP,
669            true,
670            intercept_control.msr_pls_ssp_write(),
671        );
672        this.runner.set_msr_bit(
673            vtl,
674            x86defs::X86X_MSR_PL3_SSP,
675            true,
676            intercept_control.msr_pls_ssp_write(),
677        );
678        this.runner.set_msr_bit(
679            vtl,
680            x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
681            true,
682            intercept_control.msr_pls_ssp_write(),
683        );
684    }
685
686    fn is_interrupt_pending(
687        this: &mut UhProcessor<'_, Self>,
688        vtl: GuestVtl,
689        check_rflags: bool,
690        dev: &impl CpuIo,
691    ) -> bool {
692        let backing_vtl = &this.backing.vtls[vtl];
693        if backing_vtl.interruption_information.valid()
694            && backing_vtl.interruption_information.interruption_type() == INTERRUPT_TYPE_NMI
695        {
696            return true;
697        }
698
699        let (vector, ppr) = if this.backing.cvm.lapics[vtl].lapic.is_offloaded() {
700            let vector = backing_vtl.private_regs.rvi;
701            let ppr = std::cmp::max(
702                backing_vtl.private_regs.svi.into(),
703                this.runner.tdx_apic_page(vtl).tpr.value,
704            );
705            (vector, ppr)
706        } else {
707            let lapic = &mut this.backing.cvm.lapics[vtl].lapic;
708            let vector = lapic.next_irr().unwrap_or(0);
709            let ppr = lapic
710                .access(&mut TdxApicClient {
711                    partition: this.partition,
712                    apic_page: this.runner.tdx_apic_page_mut(vtl),
713                    dev,
714                    vmtime: &this.vmtime,
715                    vtl,
716                })
717                .get_ppr();
718            (vector, ppr)
719        };
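        // The interrupt is only deliverable if its priority class (the upper
        // four bits of the vector) is strictly greater than the processor
        // priority class derived from PPR.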
720        let vector_priority = (vector as u32) >> 4;
721        let ppr_priority = ppr >> 4;
722
723        if vector_priority <= ppr_priority {
724            return false;
725        }
726
727        if check_rflags && !RFlags::from_bits(backing_vtl.private_regs.rflags).interrupt_enable() {
728            return false;
729        }
730
731        let interruptibility: Interruptibility = this
732            .runner
733            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
734            .into();
735
736        if interruptibility.blocked_by_sti() || interruptibility.blocked_by_movss() {
737            return false;
738        }
739
740        true
741    }
742
743    fn untrusted_synic_mut(&mut self) -> Option<&mut ProcessorSynic> {
744        self.untrusted_synic.as_mut()
745    }
746}
747
748/// Partition-wide shared data for TDX VPs.
749#[derive(Inspect)]
750pub struct TdxBackedShared {
751    #[inspect(flatten)]
752    pub(crate) cvm: UhCvmPartitionState,
753    /// The synic state used for untrusted SINTs, that is, the SINTs for which
754    /// the guest thinks it is interacting directly with the untrusted
755    /// hypervisor via an architecture-specific interface.
756    pub(crate) untrusted_synic: Option<GlobalSynic>,
757    flush_state: VtlArray<TdxPartitionFlushState, 2>,
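    /// The VTL each VP is currently running (initialized to 2, i.e. the
    /// paravisor itself). Other VPs consult this when queueing TLB flushes to
    /// decide whether a wake is needed; see the ordering comments in
    /// `run_vp_tdx`.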
758    #[inspect(iter_by_index)]
759    active_vtl: Vec<AtomicU8>,
760    /// CR4 bits that the guest is allowed to set to 1.
761    cr4_allowed_bits: u64,
762}
763
764impl TdxBackedShared {
765    pub(crate) fn new(
766        partition_params: &UhPartitionNewParams<'_>,
767        params: BackingSharedParams<'_>,
768    ) -> Result<Self, crate::Error> {
769        // Create a second synic to fully manage the untrusted SINTs
770        // here. At time of writing, the hypervisor does not support
771        // sharing the untrusted SINTs with the TDX L1. Even if it did,
772        // performance would be poor for cases where the L1 implements
773        // high-performance devices.
774        let untrusted_synic = (partition_params.handle_synic && !partition_params.hide_isolation)
775            .then(|| GlobalSynic::new(partition_params.topology.vp_count()));
776
777        // TODO TDX: Consider just using MSR kernel module instead of explicit ioctl.
778        let cr4_fixed1 = params.hcl.read_vmx_cr4_fixed1();
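        // The guest may set the guest-owned CR4 bits plus MCE, restricted to
        // the bits reported as allowed-to-be-1 by IA32_VMX_CR4_FIXED1.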
779        let cr4_allowed_bits =
780            (ShadowedRegister::Cr4.guest_owned_mask() | X64_CR4_MCE) & cr4_fixed1;
781
782        Ok(Self {
783            untrusted_synic,
784            flush_state: VtlArray::from_fn(|_| TdxPartitionFlushState::new()),
785            cvm: params.cvm_state.unwrap(),
786            // VPs start in VTL 2.
787            active_vtl: std::iter::repeat_n(2, partition_params.topology.vp_count() as usize)
788                .map(AtomicU8::new)
789                .collect(),
790            cr4_allowed_bits,
791        })
792    }
793
794    /// Get the default guest host mask for the specified register.
795    fn cr_guest_host_mask(&self, reg: ShadowedRegister) -> u64 {
796        match reg {
797            ShadowedRegister::Cr0 => {
798                !ShadowedRegister::Cr0.guest_owned_mask() | X64_CR0_PE | X64_CR0_PG
799            }
800            ShadowedRegister::Cr4 => {
801                !(ShadowedRegister::Cr4.guest_owned_mask() & self.cr4_allowed_bits)
802            }
803        }
804    }
805}
806
807impl TdxBacked {
808    /// Gets the number of pages that will be allocated from the shared page pool
809    /// for each CPU.
810    pub fn shared_pages_required_per_cpu() -> u64 {
811        UhDirectOverlay::Count as u64
812    }
813}
814
815// The memory used to back the untrusted synic is not guest-visible, but rather
816// is allocated from our shared pool. Therefore it does not need to go through
817// the normal memory protections path.
818struct UntrustedSynicVtlProts<'a>(&'a GuestMemory);
819
820impl hv1_emulator::VtlProtectAccess for UntrustedSynicVtlProts<'_> {
821    fn check_modify_and_lock_overlay_page(
822        &mut self,
823        gpn: u64,
824        _check_perms: hvdef::HvMapGpaFlags,
825        _new_perms: Option<hvdef::HvMapGpaFlags>,
826    ) -> Result<guestmem::LockedPages, HvError> {
827        self.0
828            .lock_gpns(false, &[gpn])
829            .map_err(|_| HvError::OperationFailed)
830    }
831
832    fn unlock_overlay_page(&mut self, _gpn: u64) -> Result<(), HvError> {
833        Ok(())
834    }
835}
836
837#[expect(private_interfaces)]
838impl BackingPrivate for TdxBacked {
839    type HclBacking<'tdx> = Tdx<'tdx>;
840    type Shared = TdxBackedShared;
841    type EmulationCache = TdxEmulationCache;
842
843    fn shared(shared: &BackingShared) -> &Self::Shared {
844        let BackingShared::Tdx(shared) = shared else {
845            unreachable!()
846        };
847        shared
848    }
849
850    fn new(
851        params: super::BackingParams<'_, '_, Self>,
852        shared: &TdxBackedShared,
853    ) -> Result<Self, crate::Error> {
854        // TODO TDX: ssp is for shadow stack
855        // TODO TDX: direct overlay like snp?
856        // TODO TDX: lapic / APIC setup?
857        // TODO TDX: see ValInitializeVplc
858        // TODO TDX: XCR_XFMEM setup?
859
860        // Turn on MBEC for just VTL 0.
861        params.runner.write_vmcs32(
862            GuestVtl::Vtl0,
863            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
864            SecondaryProcessorControls::new()
865                .with_mode_based_execute_control(true)
866                .into(),
867            SecondaryProcessorControls::new()
868                .with_mode_based_execute_control(true)
869                .into(),
870        );
871
872        let controls = TdxL2Ctls::new()
873            // Configure L2 controls to permit shared memory.
874            .with_enable_shared_ept(!shared.cvm.hide_isolation)
875            // If the synic is to be managed by the hypervisor, then enable TDVMCALLs.
876            .with_enable_tdvmcall(shared.untrusted_synic.is_none() && !shared.cvm.hide_isolation);
877
878        params
879            .runner
880            .set_l2_ctls(GuestVtl::Vtl0, controls)
881            .map_err(crate::Error::FailedToSetL2Ctls)?;
882
883        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
884            // Set guest/host masks for CR0 and CR4. These enable shadowing these
885            // registers since TDX requires certain bits to be set at all times.
886            let initial_cr0 = params
887                .runner
888                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
889            assert_eq!(initial_cr0, X64_CR0_PE | X64_CR0_NE);
890
891            // N.B. CR0.PE and CR0.PG are guest owned but still intercept when they
892            // are changed for caching purposes and to ensure EFER is managed
893            // properly due to the need to change execution state.
894            params.runner.write_vmcs64(
895                vtl,
896                VmcsField::VMX_VMCS_CR0_READ_SHADOW,
897                !0,
898                X64_CR0_PE | X64_CR0_NE,
899            );
900            params.runner.write_vmcs64(
901                vtl,
902                VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
903                !0,
904                shared.cr_guest_host_mask(ShadowedRegister::Cr0),
905            );
906
907            let initial_cr4 = params
908                .runner
909                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
910            assert_eq!(initial_cr4, X64_CR4_MCE | X64_CR4_VMXE);
911
912            params
913                .runner
914                .write_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW, !0, 0);
915            params.runner.write_vmcs64(
916                vtl,
917                VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
918                !0,
919                shared.cr_guest_host_mask(ShadowedRegister::Cr4),
920            );
921
922            // Configure the MSR bitmap for this VP. Since the default MSR bitmap
923            // is set to intercept everything, only the MSRs that we want to allow
924            // to pass through need to be set.
925            for msr in MSR_ALLOWED_READ {
926                params.runner.set_msr_bit(vtl, *msr, false, false);
927            }
928            for msr in MSR_ALLOWED_READ_WRITE {
929                params.runner.set_msr_bit(vtl, *msr, false, false);
930                params.runner.set_msr_bit(vtl, *msr, true, false);
931            }
932
933            // Set the exception bitmap.
934            if params.partition.intercept_debug_exceptions {
935                if cfg!(feature = "gdb") {
936                    let initial_exception_bitmap = params
937                        .runner
938                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
939
940                    let exception_bitmap =
941                        initial_exception_bitmap | (1 << x86defs::Exception::DEBUG.0);
942
943                    params.runner.write_vmcs32(
944                        vtl,
945                        VmcsField::VMX_VMCS_EXCEPTION_BITMAP,
946                        !0,
947                        exception_bitmap,
948                    );
949                } else {
950                    return Err(super::Error::InvalidDebugConfiguration);
951                }
952            }
953        }
954
955        let flush_page = shared
956            .cvm
957            .private_dma_client
958            .allocate_dma_buffer(HV_PAGE_SIZE as usize)
959            .map_err(crate::Error::AllocateTlbFlushPage)?;
960
961        let untrusted_synic = shared
962            .untrusted_synic
963            .as_ref()
964            .map(|synic| synic.add_vp(params.vp_info.base.vp_index));
965
966        Ok(Self {
967            vtls: VtlArray::from_fn(|vtl| {
968                let vtl: GuestVtl = vtl.try_into().unwrap();
969                TdxVtl {
970                    efer: params
971                        .runner
972                        .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER),
973                    cr0: VirtualRegister::new(
974                        ShadowedRegister::Cr0,
975                        vtl,
976                        params
977                            .runner
978                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0),
979                        !0,
980                    ),
981                    cr4: VirtualRegister::new(
982                        ShadowedRegister::Cr4,
983                        vtl,
984                        params
985                            .runner
986                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4),
987                        shared.cr4_allowed_bits,
988                    ),
989                    msr_cstar: 0,
990                    tpr_threshold: 0,
991                    processor_controls: params
992                        .runner
993                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS)
994                        .into(),
995                    interruption_information: Default::default(),
996                    exception_error_code: 0,
997                    interruption_set: false,
998                    flush_state: TdxFlushState::new(),
999                    private_regs: TdxPrivateRegs::new(vtl),
1000                    enter_stats: Default::default(),
1001                    exit_stats: Default::default(),
1002                }
1003            }),
1004            untrusted_synic,
1005            eoi_exit_bitmap: [0; 4],
1006            flush_page,
1007            cvm: UhCvmVpState::new(
1008                &shared.cvm,
1009                params.partition,
1010                params.vp_info,
1011                UhDirectOverlay::Count as usize,
1012            )?,
1013        })
1014    }
1015
1016    type StateAccess<'p, 'a>
1017        = UhVpStateAccess<'a, 'p, Self>
1018    where
1019        Self: 'a + 'p,
1020        'p: 'a;
1021
1022    fn access_vp_state<'a, 'p>(
1023        this: &'a mut UhProcessor<'p, Self>,
1024        vtl: GuestVtl,
1025    ) -> Self::StateAccess<'p, 'a> {
1026        UhVpStateAccess::new(this, vtl)
1027    }
1028
1029    fn init(this: &mut UhProcessor<'_, Self>) {
1030        // Configure the synic direct overlays.
1031        // So far, only VTL 0 is using these (for VMBus).
1032        let pfns = &this.backing.cvm.direct_overlay_handle.pfns();
1033        let reg = |gpn| {
1034            u64::from(
1035                HvSynicSimpSiefp::new()
1036                    .with_base_gpn(gpn)
1037                    .with_enabled(true),
1038            )
1039        };
1040
1041        let values: &[(HvX64RegisterName, u64); 2] = &[
1042            (
1043                HvX64RegisterName::Sifp,
1044                reg(pfns[UhDirectOverlay::Sifp as usize]),
1045            ),
1046            (
1047                HvX64RegisterName::Sipp,
1048                reg(pfns[UhDirectOverlay::Sipp as usize]),
1049            ),
1050        ];
1051
1052        let reg_count = if let Some(synic) = &mut this.backing.untrusted_synic {
1053            let prot_access = &mut UntrustedSynicVtlProts(&this.partition.gm[GuestVtl::Vtl0]);
1054
1055            synic
1056                .set_simp(reg(pfns[UhDirectOverlay::Sipp as usize]), prot_access)
1057                .unwrap();
1058            synic
1059                .set_siefp(reg(pfns[UhDirectOverlay::Sifp as usize]), prot_access)
1060                .unwrap();
1061            // Set the SIEFP in the hypervisor so that the hypervisor can
1062            // directly signal synic events. Don't set the SIMP, since the
1063            // message page is owned by the paravisor.
1064            1
1065        } else {
1066            2
1067        };
1068
1069        this.runner
1070            .set_vp_registers_hvcall(Vtl::Vtl0, &values[..reg_count])
1071            .expect("set_vp_registers hypercall for direct overlays should succeed");
1072
1073        // Enable APIC offload by default for VTL 0.
1074        this.set_apic_offload(GuestVtl::Vtl0, true);
1075        this.backing.cvm.lapics[GuestVtl::Vtl0]
1076            .lapic
1077            .enable_offload();
1078
1079        // But disable it for VTL 1.
1080        this.set_apic_offload(GuestVtl::Vtl1, false);
1081
1082        // Initialize registers to the reset state, since this may be different
1083        // than what's on the VMCS and is certainly different than what's in the
1084        // VP enter and private register state (which was mostly zero
1085        // initialized).
1086        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
1087            let registers = Registers::at_reset(&this.partition.caps, &this.inner.vp_info);
1088
1089            let mut state = this.access_state(vtl.into());
1090            state
1091                .set_registers(&registers)
1092                .expect("Resetting to architectural state should succeed");
1093
1094            state.commit().expect("committing state should succeed");
1095        }
1096
1097        // FX regs and XMM registers are zero-initialized by the kernel. Set
1098        // them to the arch default.
1099        *this.runner.fx_state_mut() =
1100            vp::Xsave::at_reset(&this.partition.caps, &this.inner.vp_info).fxsave();
1101    }
1102
1103    async fn run_vp(
1104        this: &mut UhProcessor<'_, Self>,
1105        dev: &impl CpuIo,
1106        _stop: &mut virt::StopVp<'_>,
1107    ) -> Result<(), VpHaltReason> {
1108        this.run_vp_tdx(dev).await
1109    }
1110
1111    fn poll_apic(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl, scan_irr: bool) {
1112        if !this.try_poll_apic(vtl, scan_irr) {
1113            tracing::info!(CVM_ALLOWED, "disabling APIC offload due to auto EOI");
1114            let page = this.runner.tdx_apic_page_mut(vtl);
1115            let (irr, isr) = pull_apic_offload(page);
1116
1117            this.backing.cvm.lapics[vtl]
1118                .lapic
1119                .disable_offload(&irr, &isr);
1120            this.set_apic_offload(vtl, false);
1121            this.try_poll_apic(vtl, false);
1122        }
1123    }
1124
1125    fn request_extint_readiness(_this: &mut UhProcessor<'_, Self>) {
1126        unreachable!("extint managed through software apic")
1127    }
1128
1129    fn request_untrusted_sint_readiness(this: &mut UhProcessor<'_, Self>, sints: u16) {
1130        if let Some(synic) = &mut this.backing.untrusted_synic {
1131            synic.request_sint_readiness(sints);
1132        } else {
1133            tracelimit::error_ratelimited!(CVM_ALLOWED, "untrusted synic is not configured");
1134        }
1135    }
1136
1137    fn hv(&self, vtl: GuestVtl) -> Option<&ProcessorVtlHv> {
1138        Some(&self.cvm.hv[vtl])
1139    }
1140
1141    fn hv_mut(&mut self, vtl: GuestVtl) -> Option<&mut ProcessorVtlHv> {
1142        Some(&mut self.cvm.hv[vtl])
1143    }
1144
1145    fn handle_vp_start_enable_vtl_wake(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl) {
1146        this.hcvm_handle_vp_start_enable_vtl(vtl)
1147    }
1148
1149    fn vtl1_inspectable(this: &UhProcessor<'_, Self>) -> bool {
1150        this.hcvm_vtl1_inspectable()
1151    }
1152
1153    fn process_interrupts(
1154        this: &mut UhProcessor<'_, Self>,
1155        scan_irr: VtlArray<bool, 2>,
1156        first_scan_irr: &mut bool,
1157        dev: &impl CpuIo,
1158    ) -> bool {
1159        this.cvm_process_interrupts(scan_irr, first_scan_irr, dev)
1160    }
1161}
1162
1163impl UhProcessor<'_, TdxBacked> {
1164    /// Returns `false` if the APIC offload needs to be disabled and the
1165    /// poll retried.
1166    fn try_poll_apic(&mut self, vtl: GuestVtl, scan_irr: bool) -> bool {
1167        let mut scan = TdxApicScanner {
1168            processor_controls: self.backing.vtls[vtl]
1169                .processor_controls
1170                .with_nmi_window_exiting(false)
1171                .with_interrupt_window_exiting(false),
1172            vp: self,
1173            tpr_threshold: 0,
1174        };
1175
1176        // TODO TDX: filter proxy IRRs by setting the `proxy_irr_blocked` field of the run page
1177        hardware_cvm::apic::poll_apic_core(&mut scan, vtl, scan_irr);
1178
1179        let TdxApicScanner {
1180            vp: _,
1181            processor_controls: new_processor_controls,
1182            tpr_threshold: new_tpr_threshold,
1183        } = scan;
1184
1185        // Interrupts are ignored while waiting for SIPI.
1186        if self.backing.cvm.lapics[vtl].activity != MpState::WaitForSipi
1187            && self.backing.vtls[vtl].tpr_threshold != new_tpr_threshold
1188        {
1189            tracing::trace!(new_tpr_threshold, ?vtl, "setting tpr threshold");
1190            self.runner.write_vmcs32(
1191                vtl,
1192                VmcsField::VMX_VMCS_TPR_THRESHOLD,
1193                !0,
1194                new_tpr_threshold.into(),
1195            );
1196            self.backing.vtls[vtl].tpr_threshold = new_tpr_threshold;
1197        }
1198
1199        if self.backing.vtls[vtl].processor_controls != new_processor_controls {
1200            tracing::trace!(?new_processor_controls, ?vtl, "requesting window change");
1201            self.runner.write_vmcs32(
1202                vtl,
1203                VmcsField::VMX_VMCS_PROCESSOR_CONTROLS,
1204                !0,
1205                new_processor_controls.into(),
1206            );
1207            self.backing.vtls[vtl].processor_controls = new_processor_controls;
1208        }
1209
1210        // Offloading and proxying are only done with VTL 0 today.
1211        if vtl == GuestVtl::Vtl0 {
1212            let mut update_rvi = false;
1213            let r: Result<(), OffloadNotSupported> = self.backing.cvm.lapics[vtl]
1214                .lapic
1215                .push_to_offload(|irr, isr, tmr| {
1216                    let apic_page = self.runner.tdx_apic_page_mut(vtl);
1217
1218                    for (((irr, page_irr), isr), page_isr) in irr
1219                        .iter()
1220                        .zip(&mut apic_page.irr)
1221                        .zip(isr)
1222                        .zip(&mut apic_page.isr)
1223                    {
1224                        page_irr.value |= *irr;
1225                        page_isr.value |= *isr;
1226                    }
1227
1228                    // Update SVI and RVI.
1229                    let svi = top_vector(&apic_page.isr);
1230                    self.backing.vtls[vtl].private_regs.svi = svi;
1231                    update_rvi = true;
1232
1233                    // Ensure the EOI exit bitmap is up to date.
1234                    let fields = [
1235                        VmcsField::VMX_VMCS_EOI_EXIT_0,
1236                        VmcsField::VMX_VMCS_EOI_EXIT_1,
1237                        VmcsField::VMX_VMCS_EOI_EXIT_2,
1238                        VmcsField::VMX_VMCS_EOI_EXIT_3,
1239                    ];
1240                    for ((&field, eoi_exit), (i, tmr)) in fields
1241                        .iter()
1242                        .zip(&mut self.backing.eoi_exit_bitmap)
1243                        .zip(tmr.chunks_exact(2).enumerate())
1244                    {
1245                        let tmr = tmr[0] as u64 | ((tmr[1] as u64) << 32);
1246                        if *eoi_exit != tmr {
1247                            self.runner.write_vmcs64(vtl, field, !0, tmr);
1248                            *eoi_exit = tmr;
1249                            // The kernel driver supports some common APIC functionality (ICR writes,
1250                            // interrupt injection). When the kernel driver handles an interrupt, it
1251                            // must know if that interrupt was previously level-triggered. Otherwise,
1252                            // the EOI will be incorrectly treated as level-triggered. We keep a copy
1253                            // of the tmr in the kernel so it knows when this scenario occurs.
1254                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2] = tmr as u32;
1255                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2 + 1] = (tmr >> 32) as u32;
1256                        }
1257                    }
1258                });
1259
1260            if let Err(OffloadNotSupported) = r {
1261                // APIC needs offloading to be disabled to support auto-EOI. The caller
1262                // will disable offload and try again.
1263                return false;
1264            }
1265
1266            if update_rvi {
1267                let page = self.runner.tdx_apic_page_mut(vtl);
1268                let rvi = top_vector(&page.irr);
1269                self.backing.vtls[vtl].private_regs.rvi = rvi;
1270            }
1271        }
1272
1273        // If there is a pending interrupt, clear the halted and idle state.
1274        if (self.backing.cvm.lapics[vtl].activity != MpState::Running)
1275            && self.backing.cvm.lapics[vtl].lapic.is_offloaded()
1276            && self.backing.vtls[vtl].private_regs.rvi != 0
1277        {
1278            // To model a non-virtualized processor, we should only do this if
1279            // TPR and IF and interrupt shadow allow. However, fetching the
1280            // interrupt shadow state is expensive (tdcall). This shouldn't
1281            // matter much, because real guests don't issue hlt while in
1282            // interrupt shadow or with interrupts disabled or with a non-zero
1283            // TPR.
1284            //
1285            // Note that the processor will not actually inject the interrupt
1286            // until conditions hold. So, unless the guest fails to loop around
1287            // and hlt again (which we already treat as a guest bug, since
1288            // Hyper-V in general does not guarantee hlt will stick until an
1289            // interrupt is pending), at worst this will just burn some CPU.
1290            self.backing.cvm.lapics[vtl].activity = MpState::Running;
1291        }
1292
1293        true
1294    }
1295
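    /// Runs `f` with the APIC offload temporarily disabled: pending IRR/ISR
    /// state is first pulled out of the hardware APIC page into the software
    /// LAPIC, and offload is re-enabled afterwards if it was enabled on entry.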
1296    fn access_apic_without_offload<R>(
1297        &mut self,
1298        vtl: GuestVtl,
1299        f: impl FnOnce(&mut Self) -> R,
1300    ) -> R {
1301        let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded();
1302        if offloaded {
1303            let (irr, isr) = pull_apic_offload(self.runner.tdx_apic_page_mut(vtl));
1304            self.backing.cvm.lapics[vtl]
1305                .lapic
1306                .disable_offload(&irr, &isr);
1307        }
1308        let r = f(self);
1309        if offloaded {
1310            self.backing.cvm.lapics[vtl].lapic.enable_offload();
1311        }
1312        r
1313    }
1314
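    /// Enables or disables APIC offload for `vtl` by updating the x2APIC MSR
    /// intercepts (TPR, EOI, and SELF_IPI are passed through when offloaded)
    /// and the virtual-interrupt delivery secondary processor control.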
1315    fn set_apic_offload(&mut self, vtl: GuestVtl, offload: bool) {
1316        // Update the APIC portion of the MSR bitmap.
1317        let offload_bitmap = if offload {
1318            (1 << x86defs::apic::ApicRegister::TPR.0)
1319                | (1 << x86defs::apic::ApicRegister::EOI.0)
1320                | (1 << x86defs::apic::ApicRegister::SELF_IPI.0)
1321        } else {
1322            0
1323        };
1324        // Once for read and once for write.
1325        for offset in [0, 0x100] {
1326            self.runner
1327                .write_msr_bitmap(vtl, offset + X2APIC_MSR_BASE / 64, !0, !offload_bitmap);
1328        }
1329
1330        // Update virtual-interrupt delivery.
1331        self.runner.write_vmcs32(
1332            vtl,
1333            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
1334            SecondaryProcessorControls::new()
1335                .with_virtual_interrupt_delivery(true)
1336                .into(),
1337            SecondaryProcessorControls::new()
1338                .with_virtual_interrupt_delivery(offload)
1339                .into(),
1340        );
1341
1342        // Clear any pending external interrupt when enabling the APIC offload.
1343        if offload
1344            && self.backing.vtls[vtl]
1345                .interruption_information
1346                .interruption_type()
1347                == INTERRUPT_TYPE_EXTERNAL
1348        {
1349            self.backing.vtls[vtl]
1350                .interruption_information
1351                .set_valid(false);
1352        }
1353    }
1354}
1355
1356struct TdxApicScanner<'a, 'b> {
1357    vp: &'a mut UhProcessor<'b, TdxBacked>,
1358    processor_controls: ProcessorControls,
1359    tpr_threshold: u8,
1360}
1361
1362impl<'b> hardware_cvm::apic::ApicBacking<'b, TdxBacked> for TdxApicScanner<'_, 'b> {
1363    fn vp(&mut self) -> &mut UhProcessor<'b, TdxBacked> {
1364        self.vp
1365    }
1366
1367    fn handle_interrupt(&mut self, vtl: GuestVtl, vector: u8) {
1368        // Exit idle when an interrupt is received, regardless of IF
1369        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1370            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1371        }
1372        // If there is a higher-priority pending event of some kind, then
1373        // just request an exit after it has resolved, after which we will
1374        // try again.
1375        if self.vp.backing.vtls[vtl].interruption_information.valid()
1376            && self.vp.backing.vtls[vtl]
1377                .interruption_information
1378                .interruption_type()
1379                != INTERRUPT_TYPE_EXTERNAL
1380        {
1381            self.processor_controls.set_interrupt_window_exiting(true);
1382            return;
1383        }
1384
1385        // Ensure the interrupt is not blocked by RFLAGS.IF or interrupt shadow.
1386        let interruptibility: Interruptibility = self
1387            .vp
1388            .runner
1389            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1390            .into();
1391
1392        let rflags = RFlags::from(self.vp.backing.vtls[vtl].private_regs.rflags);
1393        if !rflags.interrupt_enable()
1394            || interruptibility.blocked_by_sti()
1395            || interruptibility.blocked_by_movss()
1396        {
1397            self.processor_controls.set_interrupt_window_exiting(true);
1398            return;
1399        }
1400
1401        let priority = vector >> 4;
1402        let apic = self.vp.runner.tdx_apic_page(vtl);
1403        if (apic.tpr.value as u8 >> 4) >= priority {
1404            self.tpr_threshold = priority;
1405            return;
1406        }
1407
1408        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1409            .with_valid(true)
1410            .with_vector(vector)
1411            .with_interruption_type(INTERRUPT_TYPE_EXTERNAL);
1412
1413        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1414    }
1415
1416    fn handle_nmi(&mut self, vtl: GuestVtl) {
1417        // Exit idle when an interrupt is received, regardless of IF
1418        // TODO: Investigate lifting more activity management into poll_apic_core
1419        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1420            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1421        }
1422        // If there is a higher-priority pending event of some kind, then
1423        // just request an exit after it has resolved, after which we will
1424        // try again.
1425        if self.vp.backing.vtls[vtl].interruption_information.valid()
1426            && self.vp.backing.vtls[vtl]
1427                .interruption_information
1428                .interruption_type()
1429                != INTERRUPT_TYPE_EXTERNAL
1430        {
1431            self.processor_controls.set_nmi_window_exiting(true);
1432            return;
1433        }
1434
1435        let interruptibility: Interruptibility = self
1436            .vp
1437            .runner
1438            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1439            .into();
1440
1441        if interruptibility.blocked_by_nmi()
1442            || interruptibility.blocked_by_sti()
1443            || interruptibility.blocked_by_movss()
1444        {
1445            self.processor_controls.set_nmi_window_exiting(true);
1446            return;
1447        }
1448
1449        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1450            .with_valid(true)
1451            .with_vector(2)
1452            .with_interruption_type(INTERRUPT_TYPE_NMI);
1453
1454        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1455    }
1456
1457    fn handle_sipi(&mut self, vtl: GuestVtl, cs: SegmentRegister) {
1458        self.vp.write_segment(vtl, TdxSegmentReg::Cs, cs).unwrap();
1459        self.vp.backing.vtls[vtl].private_regs.rip = 0;
1460        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1461    }
1462}
1463
1464impl UhProcessor<'_, TdxBacked> {
1465    async fn run_vp_tdx(&mut self, dev: &impl CpuIo) -> Result<(), VpHaltReason> {
1466        let next_vtl = self.backing.cvm.exit_vtl;
1467
1468        if self.backing.vtls[next_vtl].interruption_information.valid() {
1469            tracing::trace!(
1470                vector = self.backing.vtls[next_vtl]
1471                    .interruption_information
1472                    .vector(),
1473                vp_index = self.vp_index().index(),
1474                ?next_vtl,
1475                "injecting interrupt"
1476            );
1477
1478            self.runner.write_vmcs32(
1479                next_vtl,
1480                VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO,
1481                !0,
1482                self.backing.vtls[next_vtl].interruption_information.into(),
1483            );
1484            if self.backing.vtls[next_vtl]
1485                .interruption_information
1486                .deliver_error_code()
1487            {
1488                self.runner.write_vmcs32(
1489                    next_vtl,
1490                    VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE,
1491                    !0,
1492                    self.backing.vtls[next_vtl].exception_error_code,
1493                );
1494            }
1495            self.backing.vtls[next_vtl].interruption_set = true;
1496        } else if self.backing.vtls[next_vtl].interruption_set {
1497            self.runner
1498                .write_vmcs32(next_vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO, !0, 0);
1499            self.backing.vtls[next_vtl].interruption_set = false;
1500        }
1501
1502        // We're about to return to a lower VTL, so set active_vtl for other VPs,
1503        // do any pending flushes, unlock our TLB locks, and wait for any others
1504        // we're supposed to.
1505
1506        // active_vtl needs SeqCst ordering here in order to correctly synchronize
1507        // access with the TLB address flush list. We need to ensure that, when
1508        // other VPs are adding entries to the list, they always observe the
1509        // correct lower active VTL. Otherwise they might choose not to send this
1510        // VP a wake, leading to a stall until this VP happens to exit to VTL 2 again.
1511        //
1512        // This does technically leave open a small window for potential spurious
1513        // wakes, but that's preferable and will cause no problems besides a
1514        // small amount of wasted time.
1515        self.shared.active_vtl[self.vp_index().index() as usize]
1516            .store(next_vtl as u8, Ordering::SeqCst);
1517
1518        self.do_tlb_flush(next_vtl);
1519        self.unlock_tlb_lock(Vtl::Vtl2);
1520        let tlb_halt = self.should_halt_for_tlb_unlock(next_vtl);
1521
1522        // If we are halted in the kernel due to hlt or idle and we receive an interrupt,
1523        // we'd like to unhalt, inject the interrupt, and resume vtl0 without returning
1524        // to user mode. To enable this, the kernel must know why we are halted.
1525        let activity = self.backing.cvm.lapics[next_vtl].activity;
1526        let kernel_known_state =
1527            matches!(activity, MpState::Running | MpState::Halted | MpState::Idle);
1528        let halted_other = tlb_halt || !kernel_known_state;
1529
1530        self.runner
1531            .set_halted(activity != MpState::Running || tlb_halt);
1532
1533        // Turn on kernel interrupt handling if possible. This will cause the
1534        // kernel to handle some exits internally, without returning to user
1535        // mode, to improve performance.
1536        //
1537        // Do not do this if there is a pending interruption, since we need to
1538        // run code on the next exit to clear it. If we miss this opportunity,
1539        // we will probably double-inject the interruption, wreaking havoc.
1540        //
1541        // Also do not do this if there is a pending TLB flush, since we need to
1542        // run code on the next exit to clear it. If we miss this opportunity,
1543        // we could double-inject the TLB flush unnecessarily.
1544        let offload_enabled = self.backing.cvm.lapics[next_vtl].lapic.can_offload_irr()
1545            && !self.backing.vtls[next_vtl].interruption_information.valid()
1546            && self.backing.vtls[next_vtl]
1547                .private_regs
1548                .vp_entry_flags
1549                .invd_translations()
1550                == 0;
1551        let x2apic_enabled = self.backing.cvm.lapics[next_vtl].lapic.x2apic_enabled();
1552
1553        let offload_flags = hcl_intr_offload_flags::new()
1554            .with_offload_intr_inject(offload_enabled)
1555            .with_offload_x2apic(offload_enabled && x2apic_enabled)
1556            .with_halted_other(halted_other)
1557            .with_halted_hlt(activity == MpState::Halted)
1558            .with_halted_idle(activity == MpState::Idle);
1559
1560        *self.runner.offload_flags_mut() = offload_flags;
1561
1562        self.runner
1563            .write_private_regs(&self.backing.vtls[next_vtl].private_regs);
1564
1565        let has_intercept = self
1566            .runner
1567            .run()
1568            .map_err(|e| VpHaltReason::Hypervisor(TdxRunVpError(e).into()))?;
1569
1570        // TLB flushes can only target lower VTLs, so it is fine to use a relaxed
1571        // ordering here. The worst that can happen is some spurious wakes, due
1572        // to another VP observing that this VP is still in a lower VTL.
1573        self.shared.active_vtl[self.vp_index().index() as usize].store(2, Ordering::Relaxed);
1574
1575        let entered_from_vtl = next_vtl;
1576        self.runner
1577            .read_private_regs(&mut self.backing.vtls[entered_from_vtl].private_regs);
1578
1579        // Kernel offload may have set or cleared the halt/idle states
1580        if offload_enabled && kernel_known_state {
1581            let offload_flags = self.runner.offload_flags_mut();
1582
1583            self.backing.cvm.lapics[entered_from_vtl].activity =
1584                match (offload_flags.halted_hlt(), offload_flags.halted_idle()) {
1585                    (false, false) => MpState::Running,
1586                    (true, false) => MpState::Halted,
1587                    (false, true) => MpState::Idle,
1588                    (true, true) => {
1589                        tracelimit::warn_ratelimited!(
1590                            CVM_ALLOWED,
1591                            "Kernel indicates VP is both halted and idle!"
1592                        );
1593                        activity
1594                    }
1595                };
1596        }
1597
1598        if !has_intercept {
1599            return Ok(());
1600        }
1601
1602        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1603
1604        // Result codes at or above PENDING_INTERRUPT indicate the L2 was never entered.
1605        if exit_info.code().tdx_exit() >= TdCallResultCode::PENDING_INTERRUPT {
1606            self.backing.vtls[entered_from_vtl]
1607                .enter_stats
1608                .pending_intr
1609                .increment();
1610            return Ok(());
1611        }
1612
1613        // Since the L2 was entered, we can clear any TLB flush requests.
1614        self.backing.vtls[entered_from_vtl]
1615            .private_regs
1616            .vp_entry_flags
1617            .set_invd_translations(0);
1618
1619        // The L2 was entered, so process the exit.
1620        let stat = match exit_info.code().tdx_exit() {
1621            TdCallResultCode::SUCCESS => {
1622                &mut self.backing.vtls[entered_from_vtl].enter_stats.success
1623            }
1624            TdCallResultCode::L2_EXIT_HOST_ROUTED_ASYNC => {
1625                &mut self.backing.vtls[entered_from_vtl]
1626                    .enter_stats
1627                    .host_routed_async
1628            }
1629            TdCallResultCode::L2_EXIT_PENDING_INTERRUPT => {
1630                &mut self.backing.vtls[entered_from_vtl]
1631                    .enter_stats
1632                    .l2_exit_pending_intr
1633            }
1634            TdCallResultCode::L2_EXIT_HOST_ROUTED_TDVMCALL => {
1635                // This is expected, and means that the hypervisor completed a
1636                // TD.VMCALL from the L2 and has requested to resume the L2 to
1637                // the L1.
1638                //
1639                // There is nothing to do here.
1640                assert_eq!(
1641                    exit_info.code().vmx_exit(),
1642                    VmxExit::new().with_basic_reason(VmxExitBasic::TDCALL)
1643                );
1644                &mut self.backing.vtls[entered_from_vtl]
1645                    .enter_stats
1646                    .host_routed_td_vmcall
1647            }
1648            _ => panic!("unexpected tdx exit code {:?}", exit_info.code()),
1649        };
1650
1651        stat.increment();
1652        self.handle_vmx_exit(dev, entered_from_vtl).await?;
1653        Ok(())
1654    }
1655
1656    async fn handle_vmx_exit(
1657        &mut self,
1658        dev: &impl CpuIo,
1659        intercepted_vtl: GuestVtl,
1660    ) -> Result<(), VpHaltReason> {
1661        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1662
1663        // First, check that the VM entry was even successful.
1664        let vmx_exit = exit_info.code().vmx_exit();
1665        if vmx_exit.vm_enter_failed() {
1666            return Err(self.handle_vm_enter_failed(intercepted_vtl, vmx_exit));
1667        }
1668
1669        let next_interruption = exit_info.idt_vectoring_info();
1670
1671        // Acknowledge the APIC interrupt/NMI if it was delivered.
1672        if self.backing.vtls[intercepted_vtl]
1673            .interruption_information
1674            .valid()
1675            && (!next_interruption.valid()
1676                || self.backing.vtls[intercepted_vtl]
1677                    .interruption_information
1678                    .interruption_type()
1679                    != next_interruption.interruption_type())
1680        {
1681            match self.backing.vtls[intercepted_vtl]
1682                .interruption_information
1683                .interruption_type()
1684            {
1685                INTERRUPT_TYPE_EXTERNAL
1686                    if !self.backing.cvm.lapics[intercepted_vtl]
1687                        .lapic
1688                        .is_offloaded() =>
1689                {
1690                    // This must be a pending APIC interrupt. Acknowledge it.
1691                    tracing::trace!(
1692                        vector = self.backing.vtls[intercepted_vtl]
1693                            .interruption_information
1694                            .vector(),
1695                        "acknowledging interrupt"
1696                    );
1697                    self.backing.cvm.lapics[intercepted_vtl]
1698                        .lapic
1699                        .acknowledge_interrupt(
1700                            self.backing.vtls[intercepted_vtl]
1701                                .interruption_information
1702                                .vector(),
1703                        );
1704                }
1705                INTERRUPT_TYPE_NMI => {
1706                    // This must be a pending NMI.
1707                    tracing::debug!("acknowledging NMI");
1708                    self.backing.cvm.lapics[intercepted_vtl].nmi_pending = false;
1709                }
1710                _ => {}
1711            }
1712        }
1713
1714        if self.backing.cvm.lapics[intercepted_vtl]
1715            .lapic
1716            .is_offloaded()
1717        {
1718            // It's possible with vAPIC that we take an exit in the window where
1719            // hardware has moved a bit from IRR to ISR, but has not injected
1720            // the interrupt into the guest. In this case, we need to track that
1721            // we must inject the interrupt before we return to the guest,
1722            // otherwise the interrupt will be lost and the guest left in a bad
1723            // state.
1724            //
1725            // TODO TDX: Unclear what kind of exits these would be, but they
1726            // should be spurious EPT exits. Can we validate or assert that
1727            // somehow? If we were to somehow call some other path which would
1728            // set interruption_information before we inject this one, we would
1729            // lose this interrupt.
1730            if next_interruption.valid() {
1731                tracing::debug!(
1732                    ?next_interruption,
1733                    vp_index = self.vp_index().index(),
1734                    "exit requires reinjecting interrupt"
1735                );
1736                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1737                self.backing.vtls[intercepted_vtl].exception_error_code =
1738                    exit_info.idt_vectoring_error_code();
1739                self.backing.vtls[intercepted_vtl]
1740                    .exit_stats
1741                    .needs_interrupt_reinject
1742                    .increment();
1743            } else {
1744                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1745            }
1746        } else {
1747            // Ignore (and later recalculate) the next interruption if it is an
1748            // external interrupt or NMI, since it may change if the APIC state
1749            // changes.
1750            if next_interruption.valid()
1751                && !matches!(
1752                    next_interruption.interruption_type(),
1753                    INTERRUPT_TYPE_EXTERNAL | INTERRUPT_TYPE_NMI
1754                )
1755            {
1756                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1757                self.backing.vtls[intercepted_vtl].exception_error_code =
1758                    exit_info.idt_vectoring_error_code();
1759            } else {
1760                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1761            }
1762        }
1763
1764        let mut breakpoint_debug_exception = false;
1765        let stat = match vmx_exit.basic_reason() {
1766            VmxExitBasic::IO_INSTRUCTION => {
1767                let io_qual = ExitQualificationIo::from(exit_info.qualification() as u32);
1768
1769                let len = match io_qual.access_size() {
1770                    IO_SIZE_8_BIT => 1,
1771                    IO_SIZE_16_BIT => 2,
1772                    IO_SIZE_32_BIT => 4,
1773                    _ => panic!(
1774                        "tdx module returned invalid io instr size {}",
1775                        io_qual.access_size()
1776                    ),
1777                };
1778
1779                let port_access_protected = self.cvm_try_protect_io_port_access(
1780                    intercepted_vtl,
1781                    io_qual.port(),
1782                    io_qual.is_in(),
1783                    len,
1784                    io_qual.is_string(),
1785                    io_qual.rep_prefix(),
1786                );
1787
1788                if !port_access_protected {
1789                    if io_qual.is_string() || io_qual.rep_prefix() {
1790                        // TODO GUEST VSM: consider changing the emulation path
1791                        // to also check for io port installation, mainly for
1792                        // handling rep instructions.
1793
1794                        self.emulate(
1795                            dev,
1796                            self.backing.vtls[intercepted_vtl]
1797                                .interruption_information
1798                                .valid(),
1799                            intercepted_vtl,
1800                            TdxEmulationCache::default(),
1801                        )
1802                        .await?;
1803                    } else {
1804                        let mut rax = self.runner.tdx_enter_guest_gps()[TdxGp::RAX];
1805                        emulate_io(
1806                            self.inner.vp_info.base.vp_index,
1807                            !io_qual.is_in(),
1808                            io_qual.port(),
1809                            &mut rax,
1810                            len,
1811                            dev,
1812                        )
1813                        .await;
1814                        self.runner.tdx_enter_guest_gps_mut()[TdxGp::RAX] = rax;
1815
1816                        self.advance_to_next_instruction(intercepted_vtl);
1817                    }
1818                }
1819
1820                &mut self.backing.vtls[intercepted_vtl].exit_stats.io
1821            }
1822            VmxExitBasic::MSR_READ => {
1823                let msr = self.runner.tdx_enter_guest_gps()[TdxGp::RCX] as u32;
1824
1825                let result = self.backing.cvm.lapics[intercepted_vtl]
1826                    .lapic
1827                    .access(&mut TdxApicClient {
1828                        partition: self.partition,
1829                        vmtime: &self.vmtime,
1830                        apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1831                        dev,
1832                        vtl: intercepted_vtl,
1833                    })
1834                    .msr_read(msr)
1835                    .or_else_if_unknown(|| self.read_msr_cvm(msr, intercepted_vtl))
1836                    .or_else_if_unknown(|| self.read_msr_tdx(msr, intercepted_vtl));
1837
1838                let value = match result {
1839                    Ok(v) => Some(v),
1840                    Err(MsrError::Unknown) => {
1841                        tracelimit::warn_ratelimited!(CVM_ALLOWED, msr, "unknown tdx vm msr read");
1842                        Some(0)
1843                    }
1844                    Err(MsrError::InvalidAccess) => None,
1845                };
1846
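                // RDMSR returns the value in EDX:EAX, so write the two halves back
                // into the guest GPRs.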
1847                let inject_gp = if let Some(value) = value {
1848                    let gps = self.runner.tdx_enter_guest_gps_mut();
1849                    gps[TdxGp::RAX] = (value as u32).into();
1850                    gps[TdxGp::RDX] = ((value >> 32) as u32).into();
1851                    false
1852                } else {
1853                    true
1854                };
1855
1856                if inject_gp {
1857                    self.inject_gpf(intercepted_vtl);
1858                } else {
1859                    self.advance_to_next_instruction(intercepted_vtl);
1860                }
1861                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_read
1862            }
1863            VmxExitBasic::MSR_WRITE => {
1864                let gps = self.runner.tdx_enter_guest_gps();
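                // WRMSR passes the MSR index in ECX and the 64-bit value in EDX:EAX.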
1865                let msr = gps[TdxGp::RCX] as u32;
1866                let value =
1867                    (gps[TdxGp::RAX] as u32 as u64) | ((gps[TdxGp::RDX] as u32 as u64) << 32);
1868
1869                if !self.cvm_try_protect_msr_write(intercepted_vtl, msr) {
1870                    let result = self.backing.cvm.lapics[intercepted_vtl]
1871                        .lapic
1872                        .access(&mut TdxApicClient {
1873                            partition: self.partition,
1874                            vmtime: &self.vmtime,
1875                            apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1876                            dev,
1877                            vtl: intercepted_vtl,
1878                        })
1879                        .msr_write(msr, value)
1880                        .or_else_if_unknown(|| self.write_msr_cvm(msr, value, intercepted_vtl))
1881                        .or_else_if_unknown(|| self.write_msr_tdx(msr, value, intercepted_vtl))
1882                        .or_else_if_unknown(|| {
1883                            // Sanity check
1884                            if MSR_ALLOWED_READ_WRITE.contains(&msr) {
1885                                unreachable!("intercepted a write to MSR {msr}, configured for passthrough by default, that wasn't registered for intercepts by a higher VTL");
1886                            }
1887                            Err(MsrError::Unknown)
1888                        });
1889
1890                    let inject_gp = match result {
1891                        Ok(()) => false,
1892                        Err(MsrError::Unknown) => {
1893                            tracelimit::warn_ratelimited!(
1894                                CVM_ALLOWED,
1895                                msr,
1896                                "unknown tdx vm msr write"
1897                            );
1898                            tracelimit::warn_ratelimited!(
1899                                CVM_CONFIDENTIAL,
1900                                value,
1901                                "unknown tdx vm msr write"
1902                            );
1903                            false
1904                        }
1905                        Err(MsrError::InvalidAccess) => true,
1906                    };
1907
1908                    if inject_gp {
1909                        self.inject_gpf(intercepted_vtl);
1910                    } else {
1911                        self.advance_to_next_instruction(intercepted_vtl);
1912                    }
1913                }
1914                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_write
1915            }
1916            VmxExitBasic::CPUID => {
1917                let gps = self.runner.tdx_enter_guest_gps();
1918                let leaf = gps[TdxGp::RAX] as u32;
1919                let subleaf = gps[TdxGp::RCX] as u32;
1920                let [eax, ebx, ecx, edx] = self.cvm_cpuid_result(intercepted_vtl, leaf, subleaf);
1921                let gps = self.runner.tdx_enter_guest_gps_mut();
1922                gps[TdxGp::RAX] = eax.into();
1923                gps[TdxGp::RBX] = ebx.into();
1924                gps[TdxGp::RCX] = ecx.into();
1925                gps[TdxGp::RDX] = edx.into();
1926                self.advance_to_next_instruction(intercepted_vtl);
1927                &mut self.backing.vtls[intercepted_vtl].exit_stats.cpuid
1928            }
1929            VmxExitBasic::VMCALL_INSTRUCTION => {
1930                if exit_info.cpl() != 0 {
1931                    self.inject_gpf(intercepted_vtl);
1932                } else {
1933                    let is_64bit = self.long_mode(intercepted_vtl);
1934                    let guest_memory = &self.partition.gm[intercepted_vtl];
1935                    let handler = UhHypercallHandler {
1936                        trusted: !self.cvm_partition().hide_isolation,
1937                        vp: &mut *self,
1938                        bus: dev,
1939                        intercepted_vtl,
1940                    };
1941
1942                    UhHypercallHandler::TDX_DISPATCHER.dispatch(
1943                        guest_memory,
1944                        hv1_hypercall::X64RegisterIo::new(handler, is_64bit),
1945                    );
1946                }
1947                &mut self.backing.vtls[intercepted_vtl].exit_stats.vmcall
1948            }
1949            VmxExitBasic::HLT_INSTRUCTION => {
1950                self.backing.cvm.lapics[intercepted_vtl].activity = MpState::Halted;
1951                self.clear_interrupt_shadow(intercepted_vtl);
1952                self.advance_to_next_instruction(intercepted_vtl);
1953                &mut self.backing.vtls[intercepted_vtl].exit_stats.hlt
1954            }
1955            VmxExitBasic::CR_ACCESS => {
1956                let qual = CrAccessQualification::from(exit_info.qualification());
1957                let cr;
1958                let value;
1959                match qual.access_type() {
1960                    CR_ACCESS_TYPE_MOV_TO_CR => {
1961                        cr = qual.cr();
1962                        value = self.runner.tdx_enter_guest_gps()[qual.gp_register() as usize];
1963                    }
1964                    CR_ACCESS_TYPE_LMSW => {
1965                        cr = 0;
1966                        let cr0 = self.backing.vtls[intercepted_vtl].cr0.read(&self.runner);
1967                        // LMSW updates the low four bits only.
1968                        value = (qual.lmsw_source_data() as u64 & 0xf) | (cr0 & !0xf);
1969                    }
1970                    access_type => unreachable!("not registered for cr access type {access_type}"),
1971                }
1972
1973                let cr = match cr {
1974                    0 => HvX64RegisterName::Cr0,
1975                    4 => HvX64RegisterName::Cr4,
1976                    _ => unreachable!("not registered for cr{cr} accesses"),
1977                };
1978
1979                if !self.cvm_try_protect_secure_register_write(intercepted_vtl, cr, value) {
1980                    let r = match cr {
1981                        HvX64RegisterName::Cr0 => self.backing.vtls[intercepted_vtl]
1982                            .cr0
1983                            .write(value, &mut self.runner),
1984                        HvX64RegisterName::Cr4 => self.backing.vtls[intercepted_vtl]
1985                            .cr4
1986                            .write(value, &mut self.runner),
1987                        _ => unreachable!(),
1988                    };
1989                    if r.is_ok() {
1990                        self.update_execution_mode(intercepted_vtl);
1991                        self.advance_to_next_instruction(intercepted_vtl);
1992                    } else {
1993                        tracelimit::warn_ratelimited!(
1994                            CVM_ALLOWED,
1995                            ?cr,
1996                            value,
1997                            "failed to write cr"
1998                        );
1999                        self.inject_gpf(intercepted_vtl);
2000                    }
2001                }
2002                &mut self.backing.vtls[intercepted_vtl].exit_stats.cr_access
2003            }
2004            VmxExitBasic::XSETBV => {
2005                let gps = self.runner.tdx_enter_guest_gps();
2006                if let Some(value) =
2007                    hardware_cvm::validate_xsetbv_exit(hardware_cvm::XsetbvExitInput {
2008                        rax: gps[TdxGp::RAX],
2009                        rcx: gps[TdxGp::RCX],
2010                        rdx: gps[TdxGp::RDX],
2011                        cr4: self.backing.vtls[intercepted_vtl].cr4.read(&self.runner),
2012                        cpl: exit_info.cpl(),
2013                    })
2014                {
2015                    if !self.cvm_try_protect_secure_register_write(
2016                        intercepted_vtl,
2017                        HvX64RegisterName::Xfem,
2018                        value,
2019                    ) {
2020                        self.runner
2021                            .set_vp_register(intercepted_vtl, HvX64RegisterName::Xfem, value.into())
2022                            .unwrap();
2023                        self.advance_to_next_instruction(intercepted_vtl);
2024                    }
2025                } else {
2026                    self.inject_gpf(intercepted_vtl);
2027                }
2028                &mut self.backing.vtls[intercepted_vtl].exit_stats.xsetbv
2029            }
2030            VmxExitBasic::WBINVD_INSTRUCTION => {
2031                // Ask the kernel to flush the cache before issuing VP.ENTER.
2032                let no_invalidate = exit_info.qualification() != 0;
2033                if no_invalidate {
2034                    self.runner.tdx_vp_state_flags_mut().set_wbnoinvd(true);
2035                } else {
2036                    self.runner.tdx_vp_state_flags_mut().set_wbinvd(true);
2037                }
2038
2039                self.advance_to_next_instruction(intercepted_vtl);
2040                &mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
2041            }
2042            VmxExitBasic::EPT_VIOLATION => {
2043                let gpa = exit_info.gpa().expect("is EPT exit");
2044                let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2045                // If this was an EPT violation while handling an iret, and
2046                // that iret cleared the NMI blocking state, restore it.
2047                if !next_interruption.valid() && ept_info.nmi_unmasking_due_to_iret() {
2048                    let mask = Interruptibility::new().with_blocked_by_nmi(true);
2049                    let value = Interruptibility::new().with_blocked_by_nmi(true);
2050                    let old_interruptibility: Interruptibility = self
2051                        .runner
2052                        .write_vmcs32(
2053                            intercepted_vtl,
2054                            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2055                            mask.into(),
2056                            value.into(),
2057                        )
2058                        .into();
2059                    assert!(!old_interruptibility.blocked_by_nmi());
2060                } else {
2061                    let is_write = ept_info.access_mask() & 0b10 != 0;
2062                    if self.check_mem_fault(intercepted_vtl, gpa, is_write, ept_info) {
2063                        self.emulate(
2064                            dev,
2065                            self.backing.vtls[intercepted_vtl]
2066                                .interruption_information
2067                                .valid(),
2068                            intercepted_vtl,
2069                            TdxEmulationCache::default(),
2070                        )
2071                        .await?;
2072                    }
2073                }
2074
2075                &mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
2076            }
2077            VmxExitBasic::TPR_BELOW_THRESHOLD => {
2078                // Loop around to reevaluate the APIC.
2079                &mut self.backing.vtls[intercepted_vtl]
2080                    .exit_stats
2081                    .tpr_below_threshold
2082            }
2083            VmxExitBasic::INTERRUPT_WINDOW => {
2084                // Loop around to reevaluate the APIC.
2085                &mut self.backing.vtls[intercepted_vtl]
2086                    .exit_stats
2087                    .interrupt_window
2088            }
2089            VmxExitBasic::NMI_WINDOW => {
2090                // Loop around to reevaluate pending NMIs.
2091                &mut self.backing.vtls[intercepted_vtl].exit_stats.nmi_window
2092            }
2093            VmxExitBasic::HW_INTERRUPT => {
2094                if cfg!(feature = "gdb") {
2095                    // Check if the interrupt was triggered by a hardware breakpoint.
2096                    let debug_regs = self
2097                        .access_state(intercepted_vtl.into())
2098                        .debug_regs()
2099                        .expect("register query should not fail");
2100                    // The lowest four bits of DR6 indicate which of the
2101                    // four breakpoints triggered.
2102                    breakpoint_debug_exception = debug_regs.dr6.trailing_zeros() < 4;
2103                }
2104                &mut self.backing.vtls[intercepted_vtl].exit_stats.hw_interrupt
2105            }
2106            VmxExitBasic::SMI_INTR => &mut self.backing.vtls[intercepted_vtl].exit_stats.smi_intr,
2107            VmxExitBasic::PAUSE_INSTRUCTION => {
2108                &mut self.backing.vtls[intercepted_vtl].exit_stats.pause
2109            }
2110            VmxExitBasic::TDCALL => {
2111                // If the proxy synic is local, then the host did not get this
2112                // instruction, and we need to handle it.
2113                if self.backing.untrusted_synic.is_some() {
2114                    assert_eq!(intercepted_vtl, GuestVtl::Vtl0);
2115                    self.handle_tdvmcall(dev, intercepted_vtl);
2116                } else if self.cvm_partition().hide_isolation {
2117                    // TDCALL is not valid when hiding isolation. Inject a #UD.
2118                    self.backing.vtls[intercepted_vtl].interruption_information =
2119                        InterruptionInformation::new()
2120                            .with_valid(true)
2121                            .with_vector(x86defs::Exception::INVALID_OPCODE.0)
2122                            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2123                }
2124                &mut self.backing.vtls[intercepted_vtl].exit_stats.tdcall
2125            }
2126            VmxExitBasic::EXCEPTION => {
2127                tracing::trace!(
2128                    "Caught Exception: {:?}",
2129                    exit_info._exit_interruption_info()
2130                );
2131                if cfg!(feature = "gdb") {
2132                    breakpoint_debug_exception = true;
2133                }
2134                &mut self.backing.vtls[intercepted_vtl].exit_stats.exception
2135            }
2136            VmxExitBasic::TRIPLE_FAULT => {
2137                return Err(VpHaltReason::TripleFault {
2138                    vtl: intercepted_vtl.into(),
2139                });
2140            }
2141            VmxExitBasic::GDTR_OR_IDTR => {
2142                let info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
2143                tracing::trace!("Intercepted GDT or IDT instruction: {:?}", info);
2144                let reg = match info.instruction() {
2145                    GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Lidt => {
2146                        HvX64RegisterName::Idtr
2147                    }
2148                    GdtrOrIdtrInstruction::Sgdt | GdtrOrIdtrInstruction::Lgdt => {
2149                        HvX64RegisterName::Gdtr
2150                    }
2151                };
2152                // We only support forwarding intercepts for descriptor table loads today.
2153                if (info.instruction().is_load()
2154                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2155                    || !info.instruction().is_load()
2156                {
2157                    self.emulate_gdtr_or_idtr(intercepted_vtl, dev).await?;
2158                }
2159                &mut self.backing.vtls[intercepted_vtl]
2160                    .exit_stats
2161                    .descriptor_table
2162            }
2163            VmxExitBasic::LDTR_OR_TR => {
2164                let info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
2165                tracing::trace!("Intercepted LDT or TR instruction: {:?}", info);
2166                let reg = match info.instruction() {
2167                    LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Lldt => {
2168                        HvX64RegisterName::Ldtr
2169                    }
2170                    LdtrOrTrInstruction::Str | LdtrOrTrInstruction::Ltr => HvX64RegisterName::Tr,
2171                };
2172                // We only support forwarding intercepts for descriptor table loads today.
2173                if (info.instruction().is_load()
2174                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2175                    || !info.instruction().is_load()
2176                {
2177                    self.emulate_ldtr_or_tr(intercepted_vtl, dev).await?;
2178                }
2179                &mut self.backing.vtls[intercepted_vtl]
2180                    .exit_stats
2181                    .descriptor_table
2182            }
2183            _ => {
2184                return Err(VpHaltReason::InvalidVmState(
2185                    UnknownVmxExit(exit_info.code().vmx_exit()).into(),
2186                ));
2187            }
2188        };
2189        stat.increment();
2190
2191        // Breakpoint exceptions may return a non-fatal error.
2192        // We dispatch here to correctly increment the counter.
2193        if cfg!(feature = "gdb") && breakpoint_debug_exception {
2194            self.handle_debug_exception(intercepted_vtl)?;
2195        }
2196
2197        Ok(())
2198    }
2199
2200    /// Trace processor state for debugging purposes.
2201    fn trace_processor_state(&self, vtl: GuestVtl) {
2202        let raw_exit = self.runner.tdx_vp_enter_exit_info();
2203        tracing::error!(CVM_CONFIDENTIAL, ?raw_exit, "raw tdx vp enter exit info");
2204
2205        let gprs = self.runner.tdx_enter_guest_gps();
2206        tracing::error!(CVM_CONFIDENTIAL, ?gprs, "guest gpr list");
2207
2208        let TdxPrivateRegs {
2209            rflags,
2210            rip,
2211            rsp,
2212            ssp,
2213            rvi,
2214            svi,
2215            msr_kernel_gs_base,
2216            msr_star,
2217            msr_lstar,
2218            msr_sfmask,
2219            msr_xss,
2220            msr_tsc_aux,
2221            vp_entry_flags,
2222        } = self.backing.vtls[vtl].private_regs;
2223        tracing::error!(
2224            CVM_CONFIDENTIAL,
2225            rflags,
2226            rip,
2227            rsp,
2228            ssp,
2229            rvi,
2230            svi,
2231            msr_kernel_gs_base,
2232            msr_star,
2233            msr_lstar,
2234            msr_sfmask,
2235            msr_xss,
2236            msr_tsc_aux,
2237            ?vp_entry_flags,
2238            "private registers"
2239        );
2240
2241        let physical_cr0 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
2242        let shadow_cr0 = self
2243            .runner
2244            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_READ_SHADOW);
2245        let cr0_guest_host_mask: u64 = self
2246            .runner
2247            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK);
2248        tracing::error!(
2249            CVM_CONFIDENTIAL,
2250            physical_cr0,
2251            shadow_cr0,
2252            cr0_guest_host_mask,
2253            "cr0 values"
2254        );
2255
2256        let physical_cr4 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
2257        let shadow_cr4 = self
2258            .runner
2259            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW);
2260        let cr4_guest_host_mask = self
2261            .runner
2262            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK);
2263        tracing::error!(
2264            CVM_CONFIDENTIAL,
2265            physical_cr4,
2266            shadow_cr4,
2267            cr4_guest_host_mask,
2268            "cr4 values"
2269        );
2270
2271        let cr3 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
2272        tracing::error!(CVM_CONFIDENTIAL, cr3, "cr3");
2273
2274        let cached_efer = self.backing.vtls[vtl].efer;
2275        let vmcs_efer = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER);
2276        let entry_controls = self
2277            .runner
2278            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_CONTROLS);
2279        tracing::error!(CVM_CONFIDENTIAL, cached_efer, vmcs_efer, "efer");
2280        tracing::error!(CVM_CONFIDENTIAL, entry_controls, "entry controls");
2281
2282        let cs = self.read_segment(vtl, TdxSegmentReg::Cs);
2283        let ds = self.read_segment(vtl, TdxSegmentReg::Ds);
2284        let es = self.read_segment(vtl, TdxSegmentReg::Es);
2285        let fs = self.read_segment(vtl, TdxSegmentReg::Fs);
2286        let gs = self.read_segment(vtl, TdxSegmentReg::Gs);
2287        let ss = self.read_segment(vtl, TdxSegmentReg::Ss);
2288        let tr = self.read_segment(vtl, TdxSegmentReg::Tr);
2289        let ldtr = self.read_segment(vtl, TdxSegmentReg::Ldtr);
2290
2291        tracing::error!(
2292            CVM_CONFIDENTIAL,
2293            ?cs,
2294            ?ds,
2295            ?es,
2296            ?fs,
2297            ?gs,
2298            ?ss,
2299            ?tr,
2300            ?ldtr,
2301            "segment values"
2302        );
2303
2304        let exception_bitmap = self
2305            .runner
2306            .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
2307        tracing::error!(CVM_CONFIDENTIAL, exception_bitmap, "exception bitmap");
2308
2309        let cached_processor_controls = self.backing.vtls[vtl].processor_controls;
2310        let vmcs_processor_controls = ProcessorControls::from(
2311            self.runner
2312                .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS),
2313        );
2314        let vmcs_secondary_processor_controls = SecondaryProcessorControls::from(
2315            self.runner
2316                .read_vmcs32(vtl, VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS),
2317        );
2318        tracing::error!(
2319            CVM_CONFIDENTIAL,
2320            ?cached_processor_controls,
2321            ?vmcs_processor_controls,
2322            ?vmcs_secondary_processor_controls,
2323            "processor controls"
2324        );
2325
2326        if cached_processor_controls != vmcs_processor_controls {
2327            tracing::error!(CVM_ALLOWED, "BUGBUG: processor controls mismatch");
2328        }
2329
2330        let cached_tpr_threshold = self.backing.vtls[vtl].tpr_threshold;
2331        let vmcs_tpr_threshold = self
2332            .runner
2333            .read_vmcs32(vtl, VmcsField::VMX_VMCS_TPR_THRESHOLD);
2334        tracing::error!(
2335            CVM_CONFIDENTIAL,
2336            cached_tpr_threshold,
2337            vmcs_tpr_threshold,
2338            "tpr threshold"
2339        );
2340
2341        let cached_eoi_exit_bitmap = self.backing.eoi_exit_bitmap;
2342        let vmcs_eoi_exit_bitmap = {
2343            let fields = [
2344                VmcsField::VMX_VMCS_EOI_EXIT_0,
2345                VmcsField::VMX_VMCS_EOI_EXIT_1,
2346                VmcsField::VMX_VMCS_EOI_EXIT_2,
2347                VmcsField::VMX_VMCS_EOI_EXIT_3,
2348            ];
2349            fields
2350                .iter()
2351                .map(|field| self.runner.read_vmcs64(vtl, *field))
2352                .collect::<Vec<_>>()
2353        };
2354        tracing::error!(
2355            CVM_CONFIDENTIAL,
2356            ?cached_eoi_exit_bitmap,
2357            ?vmcs_eoi_exit_bitmap,
2358            "eoi exit bitmap"
2359        );
2360
2361        let cached_interrupt_information = self.backing.vtls[vtl].interruption_information;
2362        let cached_interruption_set = self.backing.vtls[vtl].interruption_set;
2363        let vmcs_interrupt_information = self
2364            .runner
2365            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO);
2366        let vmcs_entry_exception_code = self
2367            .runner
2368            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE);
2369        tracing::error!(
2370            CVM_CONFIDENTIAL,
2371            ?cached_interrupt_information,
2372            cached_interruption_set,
2373            vmcs_interrupt_information,
2374            vmcs_entry_exception_code,
2375            "interrupt information"
2376        );
2377
2378        let guest_interruptibility = self
2379            .runner
2380            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY);
2381        tracing::error!(
2382            CVM_CONFIDENTIAL,
2383            guest_interruptibility,
2384            "guest interruptibility"
2385        );
2386
2387        let vmcs_sysenter_cs = self
2388            .runner
2389            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR);
2390        let vmcs_sysenter_esp = self
2391            .runner
2392            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
2393        let vmcs_sysenter_eip = self
2394            .runner
2395            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
2396        tracing::error!(
2397            CVM_CONFIDENTIAL,
2398            vmcs_sysenter_cs,
2399            vmcs_sysenter_esp,
2400            vmcs_sysenter_eip,
2401            "sysenter values"
2402        );
2403
2404        let vmcs_pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2405        tracing::error!(CVM_CONFIDENTIAL, vmcs_pat, "guest PAT");
2406    }
2407
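    /// Converts a failed VM entry into a `VpHaltReason`, logging detailed processor
    /// state when the failure is due to bad guest state.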
2408    fn handle_vm_enter_failed(&self, vtl: GuestVtl, vmx_exit: VmxExit) -> VpHaltReason {
2409        assert!(vmx_exit.vm_enter_failed());
2410        match vmx_exit.basic_reason() {
2411            VmxExitBasic::BAD_GUEST_STATE => {
2412                // Log system register state for debugging why we were
2413                // unable to enter the guest. This is a VMM bug.
2414                tracing::error!(CVM_ALLOWED, "VP.ENTER failed with bad guest state");
2415                self.trace_processor_state(vtl);
2416
2417                VpHaltReason::InvalidVmState(VmxBadGuestState.into())
2418            }
2419            _ => VpHaltReason::InvalidVmState(UnknownVmxExit(vmx_exit).into()),
2420        }
2421    }
2422
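    /// Advances RIP past the intercepted instruction, using the instruction length
    /// reported in the VP.ENTER exit info.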
2423    fn advance_to_next_instruction(&mut self, vtl: GuestVtl) {
2424        let instr_info = TdxExit(self.runner.tdx_vp_enter_exit_info()).instr_info();
2425        let rip = &mut self.backing.vtls[vtl].private_regs.rip;
2426        *rip = rip.wrapping_add(instr_info.length().into());
2427    }
2428
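    /// Clears the STI-induced interrupt shadow in the guest interruptibility state.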
2429    fn clear_interrupt_shadow(&mut self, vtl: GuestVtl) {
2430        let mask = Interruptibility::new().with_blocked_by_sti(true);
2431        let value = Interruptibility::new().with_blocked_by_sti(false);
2432        self.runner.write_vmcs32(
2433            vtl,
2434            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2435            mask.into(),
2436            value.into(),
2437        );
2438    }
2439
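    /// Queues a #GP(0) fault for injection on the next entry to the given VTL.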
2440    fn inject_gpf(&mut self, vtl: GuestVtl) {
2441        self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2442            .with_valid(true)
2443            .with_vector(x86defs::Exception::GENERAL_PROTECTION_FAULT.0)
2444            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
2445            .with_deliver_error_code(true);
2446        self.backing.vtls[vtl].exception_error_code = 0;
2447    }
2448
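    /// Handles a TDVMCALL intercepted because the proxy synic is local: architectural
    /// VMCALLs for synthetic MSR access are emulated here, and everything else is
    /// dispatched as a hypercall through shared memory.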
2449    fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
2450        let regs = self.runner.tdx_enter_guest_gps();
2451        if regs[TdxGp::R10] == 0 {
2452            // Architectural VMCALL.
2453            let result = match VmxExitBasic(regs[TdxGp::R11] as u16) {
2454                VmxExitBasic::MSR_WRITE => {
2455                    let msr = regs[TdxGp::R12] as u32;
2456                    let value = regs[TdxGp::R13];
2457                    match self.write_tdvmcall_msr(msr, value, intercepted_vtl) {
2458                        Ok(()) => {
2459                            tracing::debug!(msr, value, "tdvmcall msr write");
2460                            TdVmCallR10Result::SUCCESS
2461                        }
2462                        Err(err) => {
2463                            tracelimit::warn_ratelimited!(
2464                                CVM_ALLOWED,
2465                                msr,
2466                                ?err,
2467                                "failed tdvmcall msr write"
2468                            );
2469                            tracelimit::warn_ratelimited!(
2470                                CVM_CONFIDENTIAL,
2471                                value,
2472                                "failed tdvmcall msr write"
2473                            );
2474                            TdVmCallR10Result::OPERAND_INVALID
2475                        }
2476                    }
2477                }
2478                VmxExitBasic::MSR_READ => {
2479                    let msr = regs[TdxGp::R12] as u32;
2480                    match self.read_tdvmcall_msr(msr, intercepted_vtl) {
2481                        Ok(value) => {
2482                            tracing::debug!(msr, value, "tdvmcall msr read");
2483                            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = value;
2484                            TdVmCallR10Result::SUCCESS
2485                        }
2486                        Err(err) => {
2487                            tracelimit::warn_ratelimited!(
2488                                CVM_ALLOWED,
2489                                msr,
2490                                ?err,
2491                                "failed tdvmcall msr read"
2492                            );
2493                            TdVmCallR10Result::OPERAND_INVALID
2494                        }
2495                    }
2496                }
2497                subfunction => {
2498                    tracelimit::warn_ratelimited!(
2499                        CVM_ALLOWED,
2500                        ?subfunction,
2501                        "architectural vmcall not supported"
2502                    );
2503                    TdVmCallR10Result::OPERAND_INVALID
2504                }
2505            };
2506            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = result.0;
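            // TDCALL is a 4-byte instruction; advance RIP past it manually.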
2507            self.backing.vtls[intercepted_vtl].private_regs.rip = self.backing.vtls
2508                [intercepted_vtl]
2509                .private_regs
2510                .rip
2511                .wrapping_add(4);
2512        } else {
2513            // This hypercall is normally handled by the hypervisor, so the gpas
2514            // given by the guest should all be shared. The hypervisor allows
2515            // gpas to be set with or without the shared gpa boundary bit, which
2516            // untrusted_dma_memory correctly models. Note that some Linux
2517            // guests will issue hypercalls without the boundary bit set,
2518            // whereas UEFI will issue them with the bit set.
2519            let guest_memory = &self.shared.cvm.shared_memory;
2520            let handler = UhHypercallHandler {
2521                vp: &mut *self,
2522                bus: dev,
2523                trusted: false,
2524                intercepted_vtl,
2525            };
2526
2527            UhHypercallHandler::TDCALL_DISPATCHER.dispatch(guest_memory, TdHypercall(handler));
2528        }
2529    }
2530
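    /// Reads a synthetic MSR on behalf of a TDVMCALL: the guest OS ID and VP index
    /// come from the local hv emulator, everything else from the untrusted synic.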
2531    fn read_tdvmcall_msr(&mut self, msr: u32, intercepted_vtl: GuestVtl) -> Result<u64, MsrError> {
2532        match msr {
2533            msr @ (hvdef::HV_X64_MSR_GUEST_OS_ID | hvdef::HV_X64_MSR_VP_INDEX) => {
2534                self.backing.cvm.hv[intercepted_vtl].msr_read(msr)
2535            }
2536            _ => self
2537                .backing
2538                .untrusted_synic
2539                .as_mut()
2540                .unwrap()
2541                .read_nontimer_msr(msr),
2542        }
2543    }
2544
2545    fn write_tdvmcall_msr(
2546        &mut self,
2547        msr: u32,
2548        value: u64,
2549        intercepted_vtl: GuestVtl,
2550    ) -> Result<(), MsrError> {
2551        match msr {
2552            hvdef::HV_X64_MSR_GUEST_OS_ID => {
2553                self.backing.cvm.hv[intercepted_vtl].msr_write_guest_os_id(value)
2554            }
2555            _ => {
2556                // If we get here, we must have an untrusted synic, as otherwise
2557                // we wouldn't be handling the TDVMCALL that ends up here, so the
2558                // unwrap is fine.
2559                self.backing
2560                    .untrusted_synic
2561                    .as_mut()
2562                    .unwrap()
2563                    .write_nontimer_msr(
2564                        msr,
2565                        value,
2566                        &mut UntrustedSynicVtlProts(&self.partition.gm[GuestVtl::Vtl0]),
2567                    )?;
2568                // Propagate sint MSR writes to the hypervisor as well
2569                // so that the hypervisor can directly inject events.
2570                if matches!(msr, hvdef::HV_X64_MSR_SINT0..=hvdef::HV_X64_MSR_SINT15) {
2571                    if let Err(err) = self.runner.set_vp_register(
2572                        intercepted_vtl,
2573                        HvX64RegisterName(
2574                            HvX64RegisterName::Sint0.0 + (msr - hvdef::HV_X64_MSR_SINT0),
2575                        ),
2576                        value.into(),
2577                    ) {
2578                        tracelimit::warn_ratelimited!(
2579                            CVM_ALLOWED,
2580                            error = &err as &dyn std::error::Error,
2581                            "failed to set sint register"
2582                        );
2583                    }
2584                }
2585            }
2586        }
2587
2588        Ok(())
2589    }
2590
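    /// Handles TDX-specific MSR reads not satisfied by the APIC or the common CVM
    /// MSR paths.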
2591    fn read_msr_tdx(&mut self, msr: u32, vtl: GuestVtl) -> Result<u64, MsrError> {
2592        // TODO TDX: port remaining tdx and common values
2593        //
2594        // TODO TDX: consider if this can be shared with SnpBacked's
2595        // implementation. Aside from Intel/TDX-specific registers, MSR handling
2596        // should be largely the same.
2597
2598        match msr {
2599            // TODO TDX: LIFTED FROM WHP
2600            x86defs::X86X_IA32_MSR_PLATFORM_ID => {
2601                // Windows requires accessing this to boot. WHP
2602                // used to pass this through to the hardware,
2603                // but this regressed. Zero seems to work fine
2604                // for Windows.
2605                //
2606                // TODO: Pass through the host value if it can
2607                //       be retrieved.
2608                Ok(0)
2609            }
2610
2611            x86defs::X86X_MSR_MTRR_CAP => {
2612                // Advertise the absence of MTRR capabilities, but include the availability of write
2613                // combining.
2614                Ok(0x400)
2615            }
2616            x86defs::X86X_MSR_MTRR_DEF_TYPE => {
2617                // Because the MTRR registers are advertised via CPUID, a guest may choose to
2618                // write to this MSR even though no actual ranges are supported. Implement it
2619                // as read-as-zero/write-ignore.
2620                Ok(0)
2621            }
2622            x86defs::X86X_MSR_CSTAR => Ok(self.backing.vtls[vtl].msr_cstar),
2623            x86defs::X86X_MSR_MCG_CAP => Ok(0),
2624            x86defs::X86X_MSR_MCG_STATUS => Ok(0),
2625            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => Ok(0xFFFFFFFF),
2626            x86defs::X86X_MSR_XSS => Ok(self.backing.vtls[vtl].private_regs.msr_xss),
2627            x86defs::X86X_IA32_MSR_MISC_ENABLE => Ok(hv1_emulator::x86::MISC_ENABLE.into()),
2628            x86defs::X86X_IA32_MSR_FEATURE_CONTROL => Ok(VMX_FEATURE_CONTROL_LOCKED),
2629            x86defs::X86X_MSR_CR_PAT => {
2630                let pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2631                Ok(pat)
2632            }
2633
2634            // The following MSRs are unconditionally read by Linux guests.
2635            // They are not virtualized and are unsupported for L2 VMs.
2636            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2637            | x86defs::X86X_MSR_PLATFORM_INFO
2638            | x86defs::X86X_MSR_PPIN_CTL
2639            | x86defs::X86X_IA32_MSR_SMI_COUNT
2640            | x86defs::X86X_MSR_UMWAIT_CONTROL
2641            | x86defs::X86X_AMD_MSR_DE_CFG
2642            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2643            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2644            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2645            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => Ok(0),
2646
2647            hvdef::HV_X64_MSR_GUEST_IDLE => {
2648                self.backing.cvm.lapics[vtl].activity = MpState::Idle;
2649                self.clear_interrupt_shadow(vtl);
2650                Ok(0)
2651            }
2652            X86X_MSR_EFER => Ok(self.backing.vtls[vtl].efer),
2653
2654            _ => Err(MsrError::Unknown),
2655        }
2656    }
2657
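    /// Handles TDX-specific MSR writes not satisfied by the APIC or the common CVM
    /// MSR paths.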
2658    fn write_msr_tdx(&mut self, msr: u32, value: u64, vtl: GuestVtl) -> Result<(), MsrError> {
2659        let state = &mut self.backing.vtls[vtl].private_regs;
2660
2661        match msr {
2662            X86X_MSR_EFER => {
2663                self.write_efer(vtl, value)
2664                    .map_err(|_| MsrError::InvalidAccess)?;
2665                self.update_execution_mode(vtl);
2666            }
2667            x86defs::X86X_MSR_STAR => state.msr_star = value,
2668            x86defs::X86X_MSR_CSTAR => self.backing.vtls[vtl].msr_cstar = value,
2669            x86defs::X86X_MSR_LSTAR => state.msr_lstar = value,
2670            x86defs::X86X_MSR_SFMASK => state.msr_sfmask = value,
2671            x86defs::X86X_MSR_TSC_AUX => state.msr_tsc_aux = value,
2672            x86defs::X86X_MSR_SYSENTER_CS => {
2673                self.runner.write_vmcs32(
2674                    vtl,
2675                    VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
2676                    !0,
2677                    value as u32,
2678                );
2679            }
2680            x86defs::X86X_MSR_SYSENTER_EIP => {
2681                self.runner.write_vmcs64(
2682                    vtl,
2683                    VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
2684                    !0,
2685                    value,
2686                );
2687            }
2688            x86defs::X86X_MSR_SYSENTER_ESP => {
2689                self.runner.write_vmcs64(
2690                    vtl,
2691                    VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
2692                    !0,
2693                    value,
2694                );
2695            }
2696            x86defs::X86X_MSR_XSS => state.msr_xss = value,
2697            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => {
2698                // Writing zero on Intel platforms is allowed and ignored.
2699                if value != 0 {
2700                    return Err(MsrError::InvalidAccess);
2701                }
2702            }
2703            x86defs::X86X_IA32_MSR_MISC_ENABLE => {}
2704            x86defs::X86X_MSR_CR_PAT => {
2705                self.runner
2706                    .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value);
2707            }
2708
2709            x86defs::X86X_MSR_MCG_STATUS => {
2710                // Writes are swallowed, except for reserved-bit violations.
2711                if x86defs::X86xMcgStatusRegister::from(value).reserved0() != 0 {
2712                    return Err(MsrError::InvalidAccess);
2713                }
2714            }
2715
2716            // Ignore writes to this MSR
2717            x86defs::X86X_MSR_MTRR_DEF_TYPE => {}
2718
2719            // The following MSRs are sometimes written by Windows guests.
2720            // They are not virtualized and are unsupported for L2 VMs.
2721            x86defs::X86X_MSR_BIOS_UPDT_TRIG => {}
2722
2723            // The following MSRs are unconditionally written by Linux guests.
2724            // They are not virtualized and are unsupported for L2 VMs.
2725            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2726            | x86defs::X86X_MSR_PLATFORM_INFO
2727            | x86defs::X86X_MSR_PPIN_CTL
2728            | x86defs::X86X_IA32_MSR_SMI_COUNT
2729            | x86defs::X86X_MSR_UMWAIT_CONTROL
2730            | x86defs::X86X_AMD_MSR_DE_CFG
2731            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2732            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2733            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2734            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => {}
2735
2736            _ => return Err(MsrError::Unknown),
2737        }
2738
2739        Ok(())
2740    }
2741
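    /// Writes a segment register (selector, base, limit, and attributes) to
    /// the VMCS for the given VTL.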
2742    fn write_segment(
2743        &mut self,
2744        vtl: GuestVtl,
2745        seg: TdxSegmentReg,
2746        reg: SegmentRegister,
2747    ) -> Result<(), vp_state::Error> {
2748        // Write the selector, base, and limit.
2749        self.runner
2750            .write_vmcs16(vtl, seg.selector(), !0, reg.selector);
2751        self.runner.write_vmcs64(vtl, seg.base(), !0, reg.base);
2752        self.runner.write_vmcs32(vtl, seg.limit(), !0, reg.limit);
2753
2754        // Mark the segment as not valid if its attributes indicate it is not present.
2755        let mut attributes = x86defs::vmx::VmxSegmentAttributes::from(reg.attributes as u32);
2756        attributes.set_null(!attributes.present());
2757
2758        self.runner
2759            .write_vmcs32(vtl, seg.attributes(), !0, attributes.into());
2760
2761        Ok(())
2762    }
2763
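    /// Reads a segment register from the VMCS for the given VTL.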
2764    fn read_segment(&self, vtl: GuestVtl, seg: TdxSegmentReg) -> SegmentRegister {
2765        let selector = self.runner.read_vmcs16(vtl, seg.selector());
2766        let base = self.runner.read_vmcs64(vtl, seg.base());
2767        let limit = self.runner.read_vmcs32(vtl, seg.limit());
2768        let attributes = self.runner.read_vmcs32(vtl, seg.attributes());
2769
2770        SegmentRegister {
2771            selector,
2772            base,
2773            limit,
2774            attributes: attributes as u16,
2775        }
2776    }
2777
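    /// Returns whether the guest is in long mode (CR0.PE and EFER.LMA both
    /// set) for the given VTL.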
2778    fn long_mode(&self, vtl: GuestVtl) -> bool {
2779        let backing = &self.backing.vtls[vtl];
2780        backing.cr0.read(&self.runner) & X64_CR0_PE != 0 && backing.efer & X64_EFER_LMA != 0
2781    }
2782}
2783
2784impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
2785    fn vp_index(&self) -> VpIndex {
2786        self.vp.vp_index()
2787    }
2788
2789    fn flush(&mut self) {
2790        // No cached registers are modifiable by the emulator for TDX.
2791    }
2792
2793    fn vendor(&self) -> x86defs::cpuid::Vendor {
2794        self.vp.partition.caps.vendor
2795    }
2796
2797    fn gp(&mut self, reg: Gp) -> u64 {
2798        self.vp.runner.tdx_enter_guest_gps()[reg as usize]
2799    }
2800
2801    fn set_gp(&mut self, reg: Gp, v: u64) {
2802        self.vp.runner.tdx_enter_guest_gps_mut()[reg as usize] = v;
2803    }
2804
2805    fn xmm(&mut self, index: usize) -> u128 {
2806        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[index])
2807    }
2808
2809    fn set_xmm(&mut self, index: usize, v: u128) {
2810        self.vp.runner.fx_state_mut().xmm[index] = v.to_ne_bytes();
2811    }
2812
2813    fn rip(&mut self) -> u64 {
2814        self.vp.backing.vtls[self.vtl].private_regs.rip
2815    }
2816
2817    fn set_rip(&mut self, v: u64) {
2818        self.vp.backing.vtls[self.vtl].private_regs.rip = v;
2819    }
2820
2821    fn segment(&mut self, index: Segment) -> x86defs::SegmentRegister {
2822        let tdx_segment_index = match index {
2823            Segment::CS => TdxSegmentReg::Cs,
2824            Segment::ES => TdxSegmentReg::Es,
2825            Segment::SS => TdxSegmentReg::Ss,
2826            Segment::DS => TdxSegmentReg::Ds,
2827            Segment::FS => TdxSegmentReg::Fs,
2828            Segment::GS => TdxSegmentReg::Gs,
2829        };
2830        let reg = match tdx_segment_index {
2831            TdxSegmentReg::Cs => self.cache.segs[index as usize]
2832                .get_or_insert_with(|| TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).cs()),
2833            _ => self.cache.segs[index as usize]
2834                .get_or_insert_with(|| self.vp.read_segment(self.vtl, tdx_segment_index)),
2835        };
2836        (*reg).into()
2837    }
2838
2839    fn efer(&mut self) -> u64 {
2840        self.vp.backing.vtls[self.vtl].efer
2841    }
2842
2843    fn cr0(&mut self) -> u64 {
2844        let reg = self
2845            .cache
2846            .cr0
2847            .get_or_insert_with(|| self.vp.backing.vtls[self.vtl].cr0.read(&self.vp.runner));
2848        *reg
2849    }
2850
2851    fn rflags(&mut self) -> RFlags {
2852        self.vp.backing.vtls[self.vtl].private_regs.rflags.into()
2853    }
2854
2855    fn set_rflags(&mut self, v: RFlags) {
2856        self.vp.backing.vtls[self.vtl].private_regs.rflags = v.into();
2857    }
2858
2859    fn instruction_bytes(&self) -> &[u8] {
2860        &[]
2861    }
2862
2863    fn physical_address(&self) -> Option<u64> {
2864        TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
2865    }
2866
2867    fn initial_gva_translation(
2868        &mut self,
2869    ) -> Option<virt_support_x86emu::emulate::InitialTranslation> {
2870        let exit_info = TdxExit(self.vp.runner.tdx_vp_enter_exit_info());
2871        let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2872
2873        if exit_info.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION
2874            && ept_info.gva_valid()
2875        {
2876            Some(virt_support_x86emu::emulate::InitialTranslation {
2877                gva: exit_info.gla().expect("already validated EPT exit"),
2878                gpa: exit_info.gpa().expect("already validated EPT exit"),
2879                translate_mode: match ept_info.access_mask() {
2880                    0x1 => TranslateMode::Read,
2881                    // As defined in "Table 28-7. Exit Qualification for EPT
2882                    // Violations" in the Intel SDM, the processor may set both
2883                    // the read and write bits in certain conditions:
2884                    //
2885                    // If accessed and dirty flags for EPT are enabled,
2886                    // processor accesses to guest paging-structure entries are
2887                    // treated as writes with regard to EPT violations (see
2888                    // Section 29.3.3.2). If such an access causes an EPT
2889                    // violation, the processor sets both bit 0 and bit 1 of the
2890                    // exit qualification.
2891                    //
2892                    // Treat both 0x2 and 0x3 as writes.
2893                    0x2 | 0x3 => TranslateMode::Write,
2894                    0x4 => TranslateMode::Execute,
2895                    _ => panic!("unexpected ept access mask 0x{:x}", ept_info.access_mask()),
2896                },
2897            })
2898        } else {
2899            None
2900        }
2901    }
2902
2903    fn interruption_pending(&self) -> bool {
2904        self.interruption_pending
2905    }
2906
2907    fn check_vtl_access(
2908        &mut self,
2909        _gpa: u64,
2910        _mode: TranslateMode,
2911    ) -> Result<(), virt_support_x86emu::emulate::EmuCheckVtlAccessError> {
2912        // Nothing to do here, the guest memory object will handle the check.
2913        Ok(())
2914    }
2915
2916    fn translate_gva(
2917        &mut self,
2918        gva: u64,
2919        mode: TranslateMode,
2920    ) -> Result<
2921        virt_support_x86emu::emulate::EmuTranslateResult,
2922        virt_support_x86emu::emulate::EmuTranslateError,
2923    > {
2924        emulate_translate_gva(self, gva, mode)
2925    }
2926
2927    fn inject_pending_event(&mut self, event_info: hvdef::HvX64PendingEvent) {
2928        assert!(event_info.reg_0.event_pending());
2929        assert_eq!(
2930            event_info.reg_0.event_type(),
2931            hvdef::HV_X64_PENDING_EVENT_EXCEPTION
2932        );
2933        assert!(!self.interruption_pending);
2934
2935        // There's no interruption pending, so just inject the exception
2936        // directly without checking for double fault.
2937        TdxBacked::set_pending_exception(
2938            self.vp,
2939            self.vtl,
2940            HvX64PendingExceptionEvent::from(event_info.reg_0.into_bits()),
2941        );
2942    }
2943
2944    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
2945        // Ignore the VTOM address bit when checking, since memory is mirrored
2946        // across the VTOM.
2947        let vtom = self.vp.partition.caps.vtom.unwrap_or(0);
2948        debug_assert!(vtom == 0 || vtom.is_power_of_two());
2949        self.vp.partition.is_gpa_mapped(gpa & !vtom, write)
2950    }
2951
2952    fn lapic_base_address(&self) -> Option<u64> {
2953        self.vp.backing.cvm.lapics[self.vtl].lapic.base_address()
2954    }
2955
2956    fn lapic_read(&mut self, address: u64, data: &mut [u8]) {
2957        self.vp.backing.cvm.lapics[self.vtl]
2958            .lapic
2959            .access(&mut TdxApicClient {
2960                partition: self.vp.partition,
2961                dev: self.devices,
2962                vmtime: &self.vp.vmtime,
2963                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
2964                vtl: self.vtl,
2965            })
2966            .mmio_read(address, data);
2967    }
2968
2969    fn lapic_write(&mut self, address: u64, data: &[u8]) {
2970        self.vp.backing.cvm.lapics[self.vtl]
2971            .lapic
2972            .access(&mut TdxApicClient {
2973                partition: self.vp.partition,
2974                dev: self.devices,
2975                vmtime: &self.vp.vmtime,
2976                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
2977                vtl: self.vtl,
2978            })
2979            .mmio_write(address, data);
2980    }
2981
2982    fn monitor_support(&self) -> Option<&dyn EmulatorMonitorSupport> {
2983        Some(self)
2984    }
2985}
2986
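/// Segment registers addressable through VMCS fields.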
2987#[derive(Debug)]
2988enum TdxSegmentReg {
2989    Es,
2990    Cs,
2991    Ss,
2992    Ds,
2993    Fs,
2994    Gs,
2995    Ldtr,
2996    Tr,
2997}
2998
2999impl TdxSegmentReg {
3000    /// The selector vmcs field code.
3001    fn selector(&self) -> VmcsField {
3002        match self {
3003            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_SELECTOR,
3004            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_SELECTOR,
3005            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_SELECTOR,
3006            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_SELECTOR,
3007            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_SELECTOR,
3008            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_SELECTOR,
3009            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_SELECTOR,
3010            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_SELECTOR,
3011        }
3012    }
3013
3014    /// The base vmcs field code.
3015    fn base(&self) -> VmcsField {
3016        match self {
3017            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_BASE,
3018            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_BASE,
3019            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_BASE,
3020            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_BASE,
3021            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_BASE,
3022            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_BASE,
3023            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_BASE,
3024            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_BASE,
3025        }
3026    }
3027
3028    /// The limit vmcs field code.
3029    fn limit(&self) -> VmcsField {
3030        match self {
3031            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_LIMIT,
3032            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_LIMIT,
3033            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_LIMIT,
3034            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_LIMIT,
3035            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_LIMIT,
3036            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_LIMIT,
3037            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_LIMIT,
3038            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_LIMIT,
3039        }
3040    }
3041
3042    /// The attributes vmcs field code.
3043    fn attributes(&self) -> VmcsField {
3044        match self {
3045            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_AR,
3046            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_AR,
3047            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_AR,
3048            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_AR,
3049            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_AR,
3050            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_AR,
3051            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_AR,
3052            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_AR,
3053        }
3054    }
3055}
3056
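/// Descriptor table registers (IDTR and GDTR) addressable through VMCS fields.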
3057#[derive(Debug)]
3058enum TdxTableReg {
3059    Idtr,
3060    Gdtr,
3061}
3062
3063impl TdxTableReg {
3064    fn base_code(&self) -> VmcsField {
3065        match self {
3066            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_BASE,
3067            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_BASE,
3068        }
3069    }
3070
3071    fn limit_code(&self) -> VmcsField {
3072        match self {
3073            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_LIMIT,
3074            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_LIMIT,
3075        }
3076    }
3077}
3078
3079impl UhProcessor<'_, TdxBacked> {
3080    /// Handle a write to EFER, which requires special handling on TDX due to
3081    /// required bits and state updates.
3082    ///
3083    /// Note that a caller must also call [`Self::update_execution_mode`] after
3084    /// updating EFER.
3085    fn write_efer(&mut self, vtl: GuestVtl, efer: u64) -> Result<(), vp_state::Error> {
3086        if efer & (X64_EFER_SVME | X64_EFER_FFXSR) != 0 {
3087            return Err(vp_state::Error::InvalidValue(
3088                efer,
3089                "EFER",
3090                "SVME or FFXSR set",
3091            ));
3092        }
3093
3094        // EFER.NXE must be 1.
3095        if efer & X64_EFER_NXE == 0 {
3096            return Err(vp_state::Error::InvalidValue(efer, "EFER", "NXE not set"));
3097        }
3098
3099        // Update the local value of EFER and the VMCS.
3100        if self.backing.vtls[vtl].efer != efer {
3101            self.backing.vtls[vtl].efer = efer;
3102            self.runner
3103                .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER, !0, efer);
3104        }
3105
3106        Ok(())
3107    }
3108
3109    /// Read CR0, including guest-shadowed bits. This is the value the guest
3110    /// sees.
3111    fn read_cr0(&self, vtl: GuestVtl) -> u64 {
3112        self.backing.vtls[vtl].cr0.read(&self.runner)
3113    }
3114
3115    /// Write to the guest CR0.
3116    fn write_cr0(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3117        self.backing.vtls[vtl]
3118            .cr0
3119            .write(value | X64_CR0_ET, &mut self.runner)
3120    }
3121
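    /// Read CR4, including guest-shadowed bits. This is the value the guest
    /// sees.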
3122    fn read_cr4(&self, vtl: GuestVtl) -> u64 {
3123        self.backing.vtls[vtl].cr4.read(&self.runner)
3124    }
3125
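    /// Write to the guest CR4.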
3126    fn write_cr4(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3127        self.backing.vtls[vtl].cr4.write(value, &mut self.runner)
3128    }
3129
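    /// Writes a descriptor table register (base and limit) to the VMCS.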
3130    fn write_table_register(&mut self, vtl: GuestVtl, table: TdxTableReg, reg: TableRegister) {
3131        self.runner
3132            .write_vmcs64(vtl, table.base_code(), !0, reg.base);
3133        self.runner
3134            .write_vmcs32(vtl, table.limit_code(), !0, reg.limit.into());
3135    }
3136
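    /// Reads a descriptor table register (base and limit) from the VMCS.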
3137    fn read_table_register(&self, vtl: GuestVtl, table: TdxTableReg) -> TableRegister {
3138        let base = self.runner.read_vmcs64(vtl, table.base_code());
3139        let limit = self.runner.read_vmcs32(vtl, table.limit_code());
3140
3141        TableRegister {
3142            base,
3143            limit: limit as u16,
3144        }
3145    }
3146
3147    /// Update execution mode when CR0 or EFER is changed.
3148    fn update_execution_mode(&mut self, vtl: GuestVtl) {
3149        let lme = self.backing.vtls[vtl].efer & X64_EFER_LME == X64_EFER_LME;
3150        let pg = self.read_cr0(vtl) & X64_CR0_PG == X64_CR0_PG;
3151        let efer_lma = self.backing.vtls[vtl].efer & X64_EFER_LMA == X64_EFER_LMA;
3152        let lma = lme && pg;
3153
3154        if lma != efer_lma {
3155            // Flip only the LMA bit.
3156            let new_efer = self.backing.vtls[vtl].efer ^ X64_EFER_LMA;
3157            self.write_efer(vtl, new_efer)
3158                .expect("EFER was valid before, it should still be valid");
3159        }
3160
3161        self.runner.write_vmcs32(
3162            vtl,
3163            VmcsField::VMX_VMCS_ENTRY_CONTROLS,
3164            VMX_ENTRY_CONTROL_LONG_MODE_GUEST,
3165            if lma {
3166                VMX_ENTRY_CONTROL_LONG_MODE_GUEST
3167            } else {
3168                0
3169            },
3170        );
3171    }
3172
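    /// Emulates an intercepted LGDT/LIDT/SGDT/SIDT instruction
    /// (a GDTR_OR_IDTR VM exit).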
3173    async fn emulate_gdtr_or_idtr(
3174        &mut self,
3175        vtl: GuestVtl,
3176        dev: &impl CpuIo,
3177    ) -> Result<(), VpHaltReason> {
3178        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3179        assert_eq!(
3180            exit_info.code().vmx_exit().basic_reason(),
3181            VmxExitBasic::GDTR_OR_IDTR
3182        );
3183        let instr_info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
3184
3185        // Inject #GP if a load instruction is executed outside of kernel mode,
3186        // or if a store instruction is blocked by UMIP.
3187        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3188            || (!instr_info.instruction().is_load()
3189                && exit_info.cpl() > 0
3190                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3191        {
3192            self.inject_gpf(vtl);
3193            return Ok(());
3194        }
3195
3196        let (gva, segment) = self.compute_gva_for_table_access_emulation(
3197            exit_info.qualification(),
3198            (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3199            (!instr_info.index_register_invalid()).then_some(instr_info.index_register()),
3200            instr_info.scaling(),
3201            instr_info.address_size(),
3202            instr_info.segment_register(),
3203        );
3204
3205        let gm = &self.partition.gm[vtl];
3206        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
3207        let len = 2 + if self.long_mode(vtl) { 8 } else { 4 };
3208        let mut buf = [0u8; 10];
3209
3210        match instr_info.instruction() {
3211            GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Sgdt => {
3212                let table = self.read_table_register(
3213                    vtl,
3214                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Sidt) {
3215                        TdxTableReg::Idtr
3216                    } else {
3217                        TdxTableReg::Gdtr
3218                    },
3219                );
3220                buf[..2].copy_from_slice(&table.limit.to_le_bytes());
3221                buf[2..].copy_from_slice(&table.base.to_le_bytes());
3222                let mut emulation_state = UhEmulationState {
3223                    vp: &mut *self,
3224                    interruption_pending,
3225                    devices: dev,
3226                    vtl,
3227                    cache: TdxEmulationCache::default(),
3228                };
3229                emulate_insn_memory_op(
3230                    &mut emulation_state,
3231                    gm,
3232                    dev,
3233                    gva,
3234                    segment,
3235                    x86emu::AlignmentMode::Unaligned,
3236                    EmulatedMemoryOperation::Write(&buf[..len]),
3237                )
3238                .await?;
3239            }
3240
3241            GdtrOrIdtrInstruction::Lgdt | GdtrOrIdtrInstruction::Lidt => {
3242                let mut emulation_state = UhEmulationState {
3243                    vp: &mut *self,
3244                    interruption_pending,
3245                    devices: dev,
3246                    vtl,
3247                    cache: TdxEmulationCache::default(),
3248                };
3249                emulate_insn_memory_op(
3250                    &mut emulation_state,
3251                    gm,
3252                    dev,
3253                    gva,
3254                    segment,
3255                    x86emu::AlignmentMode::Unaligned,
3256                    EmulatedMemoryOperation::Read(&mut buf[..len]),
3257                )
3258                .await?;
3259                let table = TableRegister {
3260                    limit: u16::from_le_bytes(buf[..2].try_into().unwrap()),
3261                    base: u64::from_le_bytes(buf[2..len].try_into().unwrap()),
3262                };
3263                self.write_table_register(
3264                    vtl,
3265                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Lidt) {
3266                        TdxTableReg::Idtr
3267                    } else {
3268                        TdxTableReg::Gdtr
3269                    },
3270                    table,
3271                );
3272            }
3273        }
3274
3275        self.advance_to_next_instruction(vtl);
3276        Ok(())
3277    }
3278
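    /// Emulates an intercepted LLDT/LTR/SLDT/STR instruction
    /// (an LDTR_OR_TR VM exit).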
3279    async fn emulate_ldtr_or_tr(
3280        &mut self,
3281        vtl: GuestVtl,
3282        dev: &impl CpuIo,
3283    ) -> Result<(), VpHaltReason> {
3284        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3285        assert_eq!(
3286            exit_info.code().vmx_exit().basic_reason(),
3287            VmxExitBasic::LDTR_OR_TR
3288        );
3289        let instr_info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
3290
3291        // Inject #GP if a load instruction is executed outside of kernel mode,
3292        // or if a store instruction is blocked by UMIP.
3293        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3294            || (!instr_info.instruction().is_load()
3295                && exit_info.cpl() > 0
3296                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3297        {
3298            self.inject_gpf(vtl);
3299            return Ok(());
3300        }
3301
3302        let gm = &self.partition.gm[vtl];
3303        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
3304
3305        match instr_info.instruction() {
3306            LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Str => {
3307                let value = self.runner.read_vmcs16(
3308                    vtl,
3309                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Sldt) {
3310                        TdxSegmentReg::Ldtr
3311                    } else {
3312                        TdxSegmentReg::Tr
3313                    }
3314                    .selector(),
3315                );
3316
3317                if instr_info.memory_or_register() {
3318                    let gps = self.runner.tdx_enter_guest_gps_mut();
3319                    gps[instr_info.register_1() as usize] = value.into();
3320                } else {
3321                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3322                        exit_info.qualification(),
3323                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3324                        (!instr_info.index_register_invalid())
3325                            .then_some(instr_info.index_register()),
3326                        instr_info.scaling(),
3327                        instr_info.address_size(),
3328                        instr_info.segment_register(),
3329                    );
3330                    let mut emulation_state = UhEmulationState {
3331                        vp: &mut *self,
3332                        interruption_pending,
3333                        devices: dev,
3334                        vtl,
3335                        cache: TdxEmulationCache::default(),
3336                    };
3337                    emulate_insn_memory_op(
3338                        &mut emulation_state,
3339                        gm,
3340                        dev,
3341                        gva,
3342                        segment,
3343                        x86emu::AlignmentMode::Standard,
3344                        EmulatedMemoryOperation::Write(&value.to_le_bytes()),
3345                    )
3346                    .await?;
3347                }
3348            }
3349
3350            LdtrOrTrInstruction::Lldt | LdtrOrTrInstruction::Ltr => {
3351                let value = if instr_info.memory_or_register() {
3352                    let gps = self.runner.tdx_enter_guest_gps();
3353                    gps[instr_info.register_1() as usize] as u16
3354                } else {
3355                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3356                        exit_info.qualification(),
3357                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3358                        (!instr_info.index_register_invalid())
3359                            .then_some(instr_info.index_register()),
3360                        instr_info.scaling(),
3361                        instr_info.address_size(),
3362                        instr_info.segment_register(),
3363                    );
3364                    let mut emulation_state = UhEmulationState {
3365                        vp: &mut *self,
3366                        interruption_pending,
3367                        devices: dev,
3368                        vtl,
3369                        cache: TdxEmulationCache::default(),
3370                    };
3371                    let mut buf = [0u8; 2];
3372                    emulate_insn_memory_op(
3373                        &mut emulation_state,
3374                        gm,
3375                        dev,
3376                        gva,
3377                        segment,
3378                        x86emu::AlignmentMode::Standard,
3379                        EmulatedMemoryOperation::Read(&mut buf),
3380                    )
3381                    .await?;
3382                    u16::from_le_bytes(buf)
3383                };
3384                self.runner.write_vmcs16(
3385                    vtl,
3386                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Lldt) {
3387                        TdxSegmentReg::Ldtr
3388                    } else {
3389                        TdxSegmentReg::Tr
3390                    }
3391                    .selector(),
3392                    !0,
3393                    value,
3394                );
3395            }
3396        }
3397
3398        self.advance_to_next_instruction(vtl);
3399        Ok(())
3400    }
3401
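    /// Computes the guest virtual address and segment for a descriptor table
    /// access, combining the displacement in the exit qualification with the
    /// base register, scaled index register, address size, and segment
    /// encoded in the instruction information.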
3402    fn compute_gva_for_table_access_emulation(
3403        &self,
3404        qualification: u64,
3405        base_reg: Option<u8>,
3406        index_reg: Option<u8>,
3407        scaling: u8,
3408        address_size: u8,
3409        segment_register: u8,
3410    ) -> (u64, Segment) {
3411        let gps = self.runner.tdx_enter_guest_gps();
3412
3413        // Displacement is stored in the qualification field for these instructions.
3414        let mut gva = qualification;
3415        if let Some(base_register) = base_reg {
3416            gva += gps[base_register as usize];
3417        }
3418        if let Some(index_register) = index_reg {
3419            gva += gps[index_register as usize] << scaling;
3420        }
3421        match address_size {
3422            // 16-bit address size
3423            0 => gva &= 0xFFFF,
3424            // 32-bit address size
3425            1 => gva &= 0xFFFFFFFF,
3426            // 64-bit address size
3427            2 => {}
3428            _ => unreachable!(),
3429        }
3430
3431        let segment = match segment_register {
3432            0 => Segment::ES,
3433            1 => Segment::CS,
3434            2 => Segment::SS,
3435            3 => Segment::DS,
3436            4 => Segment::FS,
3437            5 => Segment::GS,
3438            _ => unreachable!(),
3439        };
3440
3441        (gva, segment)
3442    }
3443}
3444
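/// [`ApicClient`] implementation that operates directly on the TDX APIC page.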
3445struct TdxApicClient<'a, T> {
3446    partition: &'a UhPartitionInner,
3447    apic_page: &'a mut ApicPage,
3448    dev: &'a T,
3449    vmtime: &'a VmTimeAccess,
3450    vtl: GuestVtl,
3451}
3452
3453impl<T: CpuIo> ApicClient for TdxApicClient<'_, T> {
3454    fn cr8(&mut self) -> u32 {
3455        self.apic_page.tpr.value >> 4
3456    }
3457
3458    fn set_cr8(&mut self, value: u32) {
3459        self.apic_page.tpr.value = value << 4;
3460    }
3461
3462    fn set_apic_base(&mut self, _value: u64) {
3463        // No-op--the APIC base is stored in the APIC itself.
3464    }
3465
3466    fn wake(&mut self, vp_index: VpIndex) {
3467        self.partition.vps[vp_index.index() as usize].wake(self.vtl, WakeReason::INTCON);
3468    }
3469
3470    fn eoi(&mut self, vector: u8) {
3471        self.dev.handle_eoi(vector.into())
3472    }
3473
3474    fn now(&mut self) -> vmcore::vmtime::VmTime {
3475        self.vmtime.now()
3476    }
3477
3478    fn pull_offload(&mut self) -> ([u32; 8], [u32; 8]) {
3479        pull_apic_offload(self.apic_page)
3480    }
3481}
3482
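/// Pulls the IRR and ISR state out of the APIC page, clearing the page's
/// copies in the process.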
3483fn pull_apic_offload(page: &mut ApicPage) -> ([u32; 8], [u32; 8]) {
3484    let mut irr = [0; 8];
3485    let mut isr = [0; 8];
3486    for (((irr, page_irr), isr), page_isr) in irr
3487        .iter_mut()
3488        .zip(page.irr.iter_mut())
3489        .zip(isr.iter_mut())
3490        .zip(page.isr.iter_mut())
3491    {
3492        *irr = std::mem::take(&mut page_irr.value);
3493        *isr = std::mem::take(&mut page_isr.value);
3494    }
3495    (irr, isr)
3496}
3497
3498impl<T> hv1_hypercall::X64RegisterState for UhHypercallHandler<'_, '_, T, TdxBacked> {
3499    fn rip(&mut self) -> u64 {
3500        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip
3501    }
3502
3503    fn set_rip(&mut self, rip: u64) {
3504        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip = rip;
3505    }
3506
3507    fn gp(&mut self, n: hv1_hypercall::X64HypercallRegister) -> u64 {
3508        let gps = self.vp.runner.tdx_enter_guest_gps();
3509        match n {
3510            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX],
3511            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX],
3512            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX],
3513            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX],
3514            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI],
3515            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI],
3516            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8],
3517        }
3518    }
3519
3520    fn set_gp(&mut self, n: hv1_hypercall::X64HypercallRegister, value: u64) {
3521        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3522        match n {
3523            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX] = value,
3524            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX] = value,
3525            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX] = value,
3526            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX] = value,
3527            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI] = value,
3528            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI] = value,
3529            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8] = value,
3530        }
3531    }
3532
3533    // TODO: clean up xmm handling so it does not share the mshv implementation.
3534    fn xmm(&mut self, n: usize) -> u128 {
3535        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[n])
3536    }
3537
3538    fn set_xmm(&mut self, n: usize, value: u128) {
3539        self.vp.runner.fx_state_mut().xmm[n] = value.to_ne_bytes();
3540    }
3541}
3542
3543impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
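    /// The hypercall dispatcher for hypercalls intercepted from the guest
    /// (compare with [`Self::TDCALL_DISPATCHER`]).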
3544    const TDX_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3545        Self,
3546        [
3547            hv1_hypercall::HvModifySparseGpaPageHostVisibility,
3548            hv1_hypercall::HvQuerySparseGpaPageHostVisibility,
3549            hv1_hypercall::HvX64StartVirtualProcessor,
3550            hv1_hypercall::HvGetVpIndexFromApicId,
3551            hv1_hypercall::HvRetargetDeviceInterrupt,
3552            hv1_hypercall::HvFlushVirtualAddressList,
3553            hv1_hypercall::HvFlushVirtualAddressListEx,
3554            hv1_hypercall::HvFlushVirtualAddressSpace,
3555            hv1_hypercall::HvFlushVirtualAddressSpaceEx,
3556            hv1_hypercall::HvPostMessage,
3557            hv1_hypercall::HvSignalEvent,
3558            hv1_hypercall::HvExtQueryCapabilities,
3559            hv1_hypercall::HvGetVpRegisters,
3560            hv1_hypercall::HvSetVpRegisters,
3561            hv1_hypercall::HvEnablePartitionVtl,
3562            hv1_hypercall::HvX64EnableVpVtl,
3563            hv1_hypercall::HvVtlCall,
3564            hv1_hypercall::HvVtlReturn,
3565            hv1_hypercall::HvModifyVtlProtectionMask,
3566            hv1_hypercall::HvX64TranslateVirtualAddress,
3567            hv1_hypercall::HvSendSyntheticClusterIpi,
3568            hv1_hypercall::HvSendSyntheticClusterIpiEx,
3569            hv1_hypercall::HvInstallIntercept,
3570            hv1_hypercall::HvAssertVirtualInterrupt,
3571        ]
3572    );
3573
3574    /// Hypercalls that come through a tdg.vp.vmcall tdcall instruction.
3575    ///
3576    /// This is just to handle the proxy synic.
3577    const TDCALL_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3578        Self,
3579        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
3580    );
3581}
3582
3583impl AccessVpState for UhVpStateAccess<'_, '_, TdxBacked> {
3584    type Error = vp_state::Error;
3585
3586    fn caps(&self) -> &virt::x86::X86PartitionCapabilities {
3587        &self.vp.partition.caps
3588    }
3589
3590    fn commit(&mut self) -> Result<(), Self::Error> {
3591        Ok(())
3592    }
3593
3594    fn registers(&mut self) -> Result<Registers, Self::Error> {
3595        let gps = self.vp.runner.tdx_enter_guest_gps();
3596
3597        let cs = self.vp.read_segment(self.vtl, TdxSegmentReg::Cs);
3598        let ds = self.vp.read_segment(self.vtl, TdxSegmentReg::Ds);
3599        let es = self.vp.read_segment(self.vtl, TdxSegmentReg::Es);
3600        let fs = self.vp.read_segment(self.vtl, TdxSegmentReg::Fs);
3601        let gs = self.vp.read_segment(self.vtl, TdxSegmentReg::Gs);
3602        let ss = self.vp.read_segment(self.vtl, TdxSegmentReg::Ss);
3603        let tr = self.vp.read_segment(self.vtl, TdxSegmentReg::Tr);
3604        let ldtr = self.vp.read_segment(self.vtl, TdxSegmentReg::Ldtr);
3605
3606        let gdtr = self.vp.read_table_register(self.vtl, TdxTableReg::Gdtr);
3607        let idtr = self.vp.read_table_register(self.vtl, TdxTableReg::Idtr);
3608
3609        let cr0 = self.vp.read_cr0(self.vtl);
3610        let cr2 = self.vp.runner.cr2();
3611        let cr3 = self
3612            .vp
3613            .runner
3614            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3);
3615        let cr4 = self.vp.read_cr4(self.vtl);
3616
3617        let cr8 = self.vp.runner.tdx_apic_page(self.vtl).tpr.value >> 4;
3618
3619        let efer = self.vp.backing.vtls[self.vtl].efer;
3620
3621        Ok(Registers {
3622            rax: gps[TdxGp::RAX],
3623            rcx: gps[TdxGp::RCX],
3624            rdx: gps[TdxGp::RDX],
3625            rbx: gps[TdxGp::RBX],
3626            rsp: self.vp.backing.vtls[self.vtl].private_regs.rsp,
3627            rbp: gps[TdxGp::RBP],
3628            rsi: gps[TdxGp::RSI],
3629            rdi: gps[TdxGp::RDI],
3630            r8: gps[TdxGp::R8],
3631            r9: gps[TdxGp::R9],
3632            r10: gps[TdxGp::R10],
3633            r11: gps[TdxGp::R11],
3634            r12: gps[TdxGp::R12],
3635            r13: gps[TdxGp::R13],
3636            r14: gps[TdxGp::R14],
3637            r15: gps[TdxGp::R15],
3638            rip: self.vp.backing.vtls[self.vtl].private_regs.rip,
3639            rflags: self.vp.backing.vtls[self.vtl].private_regs.rflags,
3640            cs,
3641            ds,
3642            es,
3643            fs,
3644            gs,
3645            ss,
3646            tr,
3647            ldtr,
3648            gdtr,
3649            idtr,
3650            cr0,
3651            cr2,
3652            cr3,
3653            cr4,
3654            cr8: cr8.into(),
3655            efer,
3656        })
3657    }
3658
3659    fn set_registers(&mut self, value: &Registers) -> Result<(), Self::Error> {
3660        let Registers {
3661            rax,
3662            rcx,
3663            rdx,
3664            rbx,
3665            rsp,
3666            rbp,
3667            rsi,
3668            rdi,
3669            r8,
3670            r9,
3671            r10,
3672            r11,
3673            r12,
3674            r13,
3675            r14,
3676            r15,
3677            rip,
3678            rflags,
3679            cs,
3680            ds,
3681            es,
3682            fs,
3683            gs,
3684            ss,
3685            tr,
3686            ldtr,
3687            gdtr,
3688            idtr,
3689            cr0,
3690            cr2,
3691            cr3,
3692            cr4,
3693            cr8,
3694            efer,
3695        } = value;
3696
3697        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3698        gps[TdxGp::RAX] = *rax;
3699        gps[TdxGp::RCX] = *rcx;
3700        gps[TdxGp::RDX] = *rdx;
3701        gps[TdxGp::RBX] = *rbx;
3702        self.vp.backing.vtls[self.vtl].private_regs.rsp = *rsp;
3703        gps[TdxGp::RBP] = *rbp;
3704        gps[TdxGp::RSI] = *rsi;
3705        gps[TdxGp::RDI] = *rdi;
3706        gps[TdxGp::R8] = *r8;
3707        gps[TdxGp::R9] = *r9;
3708        gps[TdxGp::R10] = *r10;
3709        gps[TdxGp::R11] = *r11;
3710        gps[TdxGp::R12] = *r12;
3711        gps[TdxGp::R13] = *r13;
3712        gps[TdxGp::R14] = *r14;
3713        gps[TdxGp::R15] = *r15;
3714        self.vp.backing.vtls[self.vtl].private_regs.rip = *rip;
3715        // BUGBUG: rflags set also updates interrupts in hcl
3716        self.vp.backing.vtls[self.vtl].private_regs.rflags = *rflags;
3717
3718        // Set segment registers
3719        self.vp.write_segment(self.vtl, TdxSegmentReg::Cs, *cs)?;
3720        self.vp.write_segment(self.vtl, TdxSegmentReg::Ds, *ds)?;
3721        self.vp.write_segment(self.vtl, TdxSegmentReg::Es, *es)?;
3722        self.vp.write_segment(self.vtl, TdxSegmentReg::Fs, *fs)?;
3723        self.vp.write_segment(self.vtl, TdxSegmentReg::Gs, *gs)?;
3724        self.vp.write_segment(self.vtl, TdxSegmentReg::Ss, *ss)?;
3725        self.vp.write_segment(self.vtl, TdxSegmentReg::Tr, *tr)?;
3726        self.vp
3727            .write_segment(self.vtl, TdxSegmentReg::Ldtr, *ldtr)?;
3728
3729        // Set table registers
3730        self.vp
3731            .write_table_register(self.vtl, TdxTableReg::Gdtr, *gdtr);
3732        self.vp
3733            .write_table_register(self.vtl, TdxTableReg::Idtr, *idtr);
3734
3735        self.vp.write_cr0(self.vtl, *cr0)?;
3736
3737        // CR2 is shared with the kernel, so set it in the VP run page, which
3738        // will be applied before lower VTL entry.
3739        self.vp.runner.set_cr2(*cr2);
3740
3741        self.vp
3742            .runner
3743            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3, !0, *cr3);
3744
3745        self.vp.write_cr4(self.vtl, *cr4)?;
3746
3747        self.vp.runner.tdx_apic_page_mut(self.vtl).tpr.value = (*cr8 << 4) as u32;
3748
3749        self.vp.write_efer(self.vtl, *efer)?;
3750
3751        // Execution mode must be updated after setting EFER and CR0.
3752        self.vp.update_execution_mode(self.vtl);
3753
3754        Ok(())
3755    }
3756
3757    fn activity(&mut self) -> Result<vp::Activity, Self::Error> {
3758        let lapic = &self.vp.backing.cvm.lapics[self.vtl];
3759        let interruptibility: Interruptibility = self
3760            .vp
3761            .runner
3762            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
3763            .into();
3764        Ok(vp::Activity {
3765            mp_state: lapic.activity,
3766            nmi_pending: lapic.nmi_pending,
3767            nmi_masked: interruptibility.blocked_by_nmi(),
3768            interrupt_shadow: interruptibility.blocked_by_sti()
3769                || interruptibility.blocked_by_movss(),
3770            pending_event: None,        // TODO TDX
3771            pending_interruption: None, // TODO TDX
3772        })
3773    }
3774
3775    fn set_activity(&mut self, value: &vp::Activity) -> Result<(), Self::Error> {
3776        let &vp::Activity {
3777            mp_state,
3778            nmi_pending,
3779            nmi_masked,
3780            interrupt_shadow,
3781            pending_event: _,        // TODO TDX
3782            pending_interruption: _, // TODO TDX
3783        } = value;
3784        self.vp.backing.cvm.lapics[self.vtl].activity = mp_state;
3785        self.vp.backing.cvm.lapics[self.vtl].nmi_pending = nmi_pending;
3786        let interruptibility = Interruptibility::new()
3787            .with_blocked_by_movss(interrupt_shadow)
3788            .with_blocked_by_nmi(nmi_masked);
3789        self.vp.runner.write_vmcs32(
3790            self.vtl,
3791            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
3792            !0,
3793            interruptibility.into(),
3794        );
3795        Ok(())
3796    }
3797
3798    fn xsave(&mut self) -> Result<vp::Xsave, Self::Error> {
3799        // TODO: needed?
3800        Err(vp_state::Error::Unimplemented("xsave"))
3801    }
3802
3803    fn set_xsave(&mut self, _value: &vp::Xsave) -> Result<(), Self::Error> {
3804        // TODO: needed?
3805        Err(vp_state::Error::Unimplemented("xsave"))
3806    }
3807
3808    fn apic(&mut self) -> Result<vp::Apic, Self::Error> {
3809        self.vp.access_apic_without_offload(self.vtl, |vp| {
3810            Ok(vp.backing.cvm.lapics[self.vtl].lapic.save())
3811        })
3812    }
3813
3814    fn set_apic(&mut self, value: &vp::Apic) -> Result<(), Self::Error> {
3815        self.vp.access_apic_without_offload(self.vtl, |vp| {
3816            vp.backing.cvm.lapics[self.vtl]
3817                .lapic
3818                .restore(value)
3819                .map_err(vp_state::Error::InvalidApicBase)?;
3820
3821            Ok(())
3822        })
3823    }
3824
3825    fn xcr(&mut self) -> Result<vp::Xcr0, Self::Error> {
3826        Ok(vp::Xcr0 {
3827            value: self
3828                .vp
3829                .runner
3830                .get_vp_register(self.vtl, HvX64RegisterName::Xfem)
3831                .unwrap()
3832                .as_u64(),
3833        })
3834    }
3835
3836    fn set_xcr(&mut self, _value: &vp::Xcr0) -> Result<(), Self::Error> {
3837        Err(vp_state::Error::Unimplemented("xcr"))
3838    }
3839
3840    fn xss(&mut self) -> Result<vp::Xss, Self::Error> {
3841        Ok(vp::Xss {
3842            value: self.vp.backing.vtls[self.vtl].private_regs.msr_xss,
3843        })
3844    }
3845
3846    fn set_xss(&mut self, value: &vp::Xss) -> Result<(), Self::Error> {
3847        self.vp.backing.vtls[self.vtl].private_regs.msr_xss = value.value;
3848        Ok(())
3849    }
3850
3851    fn mtrrs(&mut self) -> Result<vp::Mtrrs, Self::Error> {
3852        Ok(vp::Mtrrs {
3853            msr_mtrr_def_type: 0, // TODO TDX: MTRRs
3854            fixed: [0; 11],       // TODO TDX: MTRRs
3855            variable: [0; 16],    // TODO TDX: MTRRs
3856        })
3857    }
3858
3859    fn set_mtrrs(&mut self, _value: &vp::Mtrrs) -> Result<(), Self::Error> {
3860        // TODO TDX: MTRRs
3861        Ok(())
3862    }
3863
3864    fn pat(&mut self) -> Result<vp::Pat, Self::Error> {
3865        let msr_cr_pat = self
3866            .vp
3867            .runner
3868            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT);
3869        Ok(vp::Pat { value: msr_cr_pat })
3870    }
3871
3872    fn set_pat(&mut self, value: &vp::Pat) -> Result<(), Self::Error> {
3873        self.vp
3874            .runner
3875            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value.value);
3876        Ok(())
3877    }
3878
3879    fn virtual_msrs(&mut self) -> Result<vp::VirtualMsrs, Self::Error> {
3880        let state = &self.vp.backing.vtls[self.vtl].private_regs;
3881
3882        let sysenter_cs = self
3883            .vp
3884            .runner
3885            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR)
3886            .into();
3887        let sysenter_eip = self
3888            .vp
3889            .runner
3890            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
3891        let sysenter_esp = self
3892            .vp
3893            .runner
3894            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
3895
3896        Ok(vp::VirtualMsrs {
3897            kernel_gs_base: state.msr_kernel_gs_base,
3898            sysenter_cs,
3899            sysenter_eip,
3900            sysenter_esp,
3901            star: state.msr_star,
3902            lstar: state.msr_lstar,
3903            cstar: self.vp.backing.vtls[self.vtl].msr_cstar,
3904            sfmask: state.msr_sfmask,
3905        })
3906    }
3907
3908    fn set_virtual_msrs(&mut self, value: &vp::VirtualMsrs) -> Result<(), Self::Error> {
3909        let &vp::VirtualMsrs {
3910            kernel_gs_base,
3911            sysenter_cs,
3912            sysenter_eip,
3913            sysenter_esp,
3914            star,
3915            lstar,
3916            cstar,
3917            sfmask,
3918        } = value;
3919
3920        let state = &mut self.vp.backing.vtls[self.vtl].private_regs;
3921        state.msr_kernel_gs_base = kernel_gs_base;
3922        state.msr_star = star;
3923        state.msr_lstar = lstar;
3924        state.msr_sfmask = sfmask;
3925
3926        self.vp.runner.write_vmcs32(
3927            self.vtl,
3928            VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
3929            !0,
3930            sysenter_cs as u32,
3931        );
3932        self.vp.runner.write_vmcs64(
3933            self.vtl,
3934            VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
3935            !0,
3936            sysenter_eip,
3937        );
3938        self.vp.runner.write_vmcs64(
3939            self.vtl,
3940            VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
3941            !0,
3942            sysenter_esp,
3943        );
3944
3945        self.vp.backing.vtls[self.vtl].msr_cstar = cstar;
3946
3947        Ok(())
3948    }
3949
3950    fn debug_regs(&mut self) -> Result<vp::DebugRegisters, Self::Error> {
3951        let mut values = [0u64.into(); 5];
3952        self.vp
3953            .runner
3954            .get_vp_registers(
3955                self.vtl,
3956                &[
3957                    HvX64RegisterName::Dr0,
3958                    HvX64RegisterName::Dr1,
3959                    HvX64RegisterName::Dr2,
3960                    HvX64RegisterName::Dr3,
3961                    HvX64RegisterName::Dr6,
3962                ],
3963                &mut values,
3964            )
3965            .map_err(vp_state::Error::GetRegisters)?;
3966
3967        let dr7 = self
3968            .vp
3969            .runner
3970            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7);
3971
3972        Ok(vp::DebugRegisters {
3973            dr0: values[0].as_u64(),
3974            dr1: values[1].as_u64(),
3975            dr2: values[2].as_u64(),
3976            dr3: values[3].as_u64(),
3977            dr6: values[4].as_u64(),
3978            dr7,
3979        })
3980    }
3981
3982    fn set_debug_regs(&mut self, value: &vp::DebugRegisters) -> Result<(), Self::Error> {
3983        let &vp::DebugRegisters {
3984            dr0,
3985            dr1,
3986            dr2,
3987            dr3,
3988            dr6,
3989            dr7,
3990        } = value;
3991        self.vp
3992            .runner
3993            .set_vp_registers(
3994                self.vtl,
3995                [
3996                    (HvX64RegisterName::Dr0, dr0),
3997                    (HvX64RegisterName::Dr1, dr1),
3998                    (HvX64RegisterName::Dr2, dr2),
3999                    (HvX64RegisterName::Dr3, dr3),
4000                    (HvX64RegisterName::Dr6, dr6),
4001                ],
4002            )
4003            .map_err(vp_state::Error::SetRegisters)?;
4004
4005        self.vp
4006            .runner
4007            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7, !0, dr7);
4008
4009        Ok(())
4010    }
4011
4012    fn tsc(&mut self) -> Result<vp::Tsc, Self::Error> {
4013        Err(vp_state::Error::Unimplemented("tsc"))
4014    }
4015
4016    fn set_tsc(&mut self, _value: &vp::Tsc) -> Result<(), Self::Error> {
4017        Err(vp_state::Error::Unimplemented("tsc"))
4018    }
4019
4020    fn tsc_aux(&mut self) -> Result<vp::TscAux, Self::Error> {
4021        Ok(vp::TscAux {
4022            value: self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux,
4023        })
4024    }
4025
4026    fn set_tsc_aux(&mut self, value: &vp::TscAux) -> Result<(), Self::Error> {
4027        self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux = value.value;
4028        Ok(())
4029    }
4030
4031    fn cet(&mut self) -> Result<vp::Cet, Self::Error> {
4032        Err(vp_state::Error::Unimplemented("cet"))
4033    }
4034
4035    fn set_cet(&mut self, _value: &vp::Cet) -> Result<(), Self::Error> {
4036        Err(vp_state::Error::Unimplemented("cet"))
4037    }
4038
4039    fn cet_ss(&mut self) -> Result<vp::CetSs, Self::Error> {
4040        Err(vp_state::Error::Unimplemented("cet_ss"))
4041    }
4042
4043    fn set_cet_ss(&mut self, _value: &vp::CetSs) -> Result<(), Self::Error> {
4044        Err(vp_state::Error::Unimplemented("cet_ss"))
4045    }
4046
4047    fn synic_msrs(&mut self) -> Result<vp::SyntheticMsrs, Self::Error> {
4048        Err(vp_state::Error::Unimplemented("synic_msrs"))
4049    }
4050
4051    fn set_synic_msrs(&mut self, _value: &vp::SyntheticMsrs) -> Result<(), Self::Error> {
4052        Err(vp_state::Error::Unimplemented("synic_msrs"))
4053    }
4054
4055    fn synic_message_page(&mut self) -> Result<vp::SynicMessagePage, Self::Error> {
4056        Err(vp_state::Error::Unimplemented("synic_message_page"))
4057    }
4058
4059    fn set_synic_message_page(&mut self, _value: &vp::SynicMessagePage) -> Result<(), Self::Error> {
4060        Err(vp_state::Error::Unimplemented("synic_message_page"))
4061    }
4062
4063    fn synic_event_flags_page(&mut self) -> Result<vp::SynicEventFlagsPage, Self::Error> {
4064        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4065    }
4066
4067    fn set_synic_event_flags_page(
4068        &mut self,
4069        _value: &vp::SynicEventFlagsPage,
4070    ) -> Result<(), Self::Error> {
4071        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4072    }
4073
4074    fn synic_message_queues(&mut self) -> Result<vp::SynicMessageQueues, Self::Error> {
4075        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4076    }
4077
4078    fn set_synic_message_queues(
4079        &mut self,
4080        _value: &vp::SynicMessageQueues,
4081    ) -> Result<(), Self::Error> {
4082        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4083    }
4084
4085    fn synic_timers(&mut self) -> Result<vp::SynicTimers, Self::Error> {
4086        Err(vp_state::Error::Unimplemented("synic_timers"))
4087    }
4088
4089    fn set_synic_timers(&mut self, _value: &vp::SynicTimers) -> Result<(), Self::Error> {
4090        Err(vp_state::Error::Unimplemented("synic_timers"))
4091    }
4092}
4093
4094/// Compute the index of the highest vector set in IRR/ISR, or 0
4095/// if no vector is set. (Vectors 0-15 are invalid so this is not
4096/// ambiguous.)
4097fn top_vector(reg: &[ApicRegister; 8]) -> u8 {
4098    reg.iter()
4099        .enumerate()
4100        .rev()
4101        .find_map(|(i, r)| {
4102            (r.value != 0).then(|| (i as u32 * 32 + (31 - r.value.leading_zeros())) as u8)
4103        })
4104        .unwrap_or(0)
4105}
4106
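/// [`HypercallIo`] implementation for hypercalls issued via a tdcall
/// (tdg.vp.vmcall), which pass the control value in R10 and return the
/// hypercall result in R11.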
4107struct TdHypercall<'a, 'b, T>(UhHypercallHandler<'a, 'b, T, TdxBacked>);
4108
4109impl<'a, 'b, T> AsHandler<UhHypercallHandler<'a, 'b, T, TdxBacked>> for TdHypercall<'a, 'b, T> {
4110    fn as_handler(&mut self) -> &mut UhHypercallHandler<'a, 'b, T, TdxBacked> {
4111        &mut self.0
4112    }
4113}
4114
4115impl<T> HypercallIo for TdHypercall<'_, '_, T> {
4116    fn advance_ip(&mut self) {
4117        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = 0;
4118        self.0.vp.backing.vtls[self.0.intercepted_vtl]
4119            .private_regs
4120            .rip = self.0.vp.backing.vtls[self.0.intercepted_vtl]
4121            .private_regs
4122            .rip
4123            .wrapping_add(4);
4124    }
4125
4126    fn retry(&mut self, control: u64) {
4127        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = control;
4128        self.set_result(hvdef::hypercall::HypercallOutput::from(HvError::Timeout).into());
4129    }
4130
4131    fn control(&mut self) -> u64 {
4132        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R10]
4133    }
4134
4135    fn input_gpa(&mut self) -> u64 {
4136        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::RDX]
4137    }
4138
4139    fn output_gpa(&mut self) -> u64 {
4140        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R8]
4141    }
4142
4143    fn fast_register_pair_count(&mut self) -> usize {
4144        7
4145    }
4146
4147    fn extended_fast_hypercalls_ok(&mut self) -> bool {
4148        false
4149    }
4150
4151    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
4152        self.fast_regs(0, buf);
4153        buf.len()
4154    }
4155
4156    fn fast_output(&mut self, _starting_pair_index: usize, buf: &[[u64; 2]]) {
4157        assert!(buf.is_empty());
4158    }
4159
4160    fn vtl_input(&mut self) -> u64 {
4161        unreachable!()
4162    }
4163
4164    fn set_result(&mut self, n: u64) {
4165        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = n;
4166    }
4167
4168    fn fast_regs(&mut self, starting_pair_index: usize, buf: &mut [[u64; 2]]) {
4169        let regs = self.0.vp.runner.tdx_enter_guest_gps();
4170        let fx_state = self.0.vp.runner.fx_state();
4171        for (i, [low, high]) in buf.iter_mut().enumerate() {
4172            let index = i + starting_pair_index;
4173            if index == 0 {
4174                *low = regs[TdxGp::RDX];
4175                *high = regs[TdxGp::R8];
4176            } else {
4177                let value = u128::from_ne_bytes(fx_state.xmm[index - 1]);
4178                *low = value as u64;
4179                *high = (value >> 64) as u64;
4180            }
4181        }
4182    }
4183}
4184
impl<T> hv1_hypercall::VtlSwitchOps for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn advance_ip(&mut self) {
        let long_mode = self.vp.long_mode(self.intercepted_vtl);
        let mut io = hv1_hypercall::X64RegisterIo::new(self, long_mode);
        io.advance_ip();
    }

    fn inject_invalid_opcode_fault(&mut self) {
        self.vp.backing.vtls[self.intercepted_vtl].interruption_information =
            InterruptionInformation::new()
                .with_valid(true)
                .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
                .with_vector(x86defs::Exception::INVALID_OPCODE.0);
    }
}

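// HvFlushVirtualAddressList forwards to the extended implementation below.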
impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressList for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn flush_virtual_address_list(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        hv1_hypercall::FlushVirtualAddressListEx::flush_virtual_address_list_ex(
            self,
            processor_set,
            flags,
            gva_ranges,
        )
    }
}

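// HvFlushVirtualAddressListEx: validate the request, queue the GVA ranges on
// the per-VTL flush list (promoting to a flush entire if they cannot be
// queued), kick the targeted VPs, and mark this VP to wait for TLB locks
// before returning to the guest.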
impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressListEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_list_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        self.hcvm_validate_flush_inputs(processor_set, flags, true)
            .map_err(|e| (e, 0))?;

        let vtl = self.intercepted_vtl;
        let flush_state = &self.vp.shared.flush_state[vtl];

        // If the ranges cannot be added to the flush list for any reason, promote this request to a flush entire.
        if let Err(()) = Self::add_ranges_to_tlb_flush_list(
            flush_state,
            gva_ranges,
            flags.use_extended_range_format(),
        ) {
            if flags.non_global_mappings_only() {
                flush_state
                    .flush_entire_non_global_counter
                    .fetch_add(1, Ordering::Relaxed);
            } else {
                flush_state
                    .flush_entire_counter
                    .fetch_add(1, Ordering::Relaxed);
            }
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpace
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        hv1_hypercall::FlushVirtualAddressSpaceEx::flush_virtual_address_space_ex(
            self,
            processor_set,
            flags,
        )
    }
}

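// HvFlushVirtualAddressSpaceEx is always a full flush: bump the appropriate
// flush-entire counter for the intercepted VTL, kick the targeted VPs, and
// mark this VP to wait for TLB locks.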
impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpaceEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        self.hcvm_validate_flush_inputs(processor_set, flags, false)?;
        let vtl = self.intercepted_vtl;

        let flush_state = &self.vp.shared.flush_state[vtl];

        // Set flush entire.
        if flags.non_global_mappings_only() {
            flush_state
                .flush_entire_non_global_counter
                .fetch_add(1, Ordering::Relaxed);
        } else {
            flush_state
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

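// Helper for the list flush hypercalls. Returns Err(()) when the requested
// ranges cannot be represented in the fixed-size flush list (too many ranges,
// or large-page ranges, which TDX cannot flush individually); the caller then
// promotes the request to a flush entire.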
impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn add_ranges_to_tlb_flush_list(
        flush_state: &TdxPartitionFlushState,
        gva_ranges: &[HvGvaRange],
        use_extended_range_format: bool,
    ) -> Result<(), ()> {
        // If there are more GVAs than the list can hold, there's no point in filling it.
        if gva_ranges.len() > FLUSH_GVA_LIST_SIZE {
            return Err(());
        }

        if use_extended_range_format
            && gva_ranges
                .iter()
                .any(|range| range.as_extended().large_page())
        {
            // TDX does not provide a way to flush large page ranges, so
            // this request has to be promoted to a flush entire.
            return Err(());
        }

        flush_state
            .gva_list
            .write()
            .extend(gva_ranges.iter().copied());

        Ok(())
    }
}

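// Waking support for TLB flushes. Only VPs currently running in the VTL being
// flushed are kicked; the SeqCst fence below ensures the active VTL values
// observed here are up to date.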
impl TdxTlbLockFlushAccess<'_> {
    fn wake_processors_for_tlb_flush(
        &mut self,
        target_vtl: GuestVtl,
        processor_set: Option<ProcessorSet<'_>>,
    ) {
        match processor_set {
            Some(processors) => {
                self.wake_processors_for_tlb_flush_inner(target_vtl, processors);
            }
            None => self.wake_processors_for_tlb_flush_inner(
                target_vtl,
                0..(self.partition.vps.len() as u32),
            ),
        }
    }

    fn wake_processors_for_tlb_flush_inner(
        &mut self,
        target_vtl: GuestVtl,
        processors: impl IntoIterator<Item = u32>,
    ) {
        // Use SeqCst ordering to ensure that we are observing the most
        // up-to-date value from other VPs. Otherwise we might not send a
        // wake to a VP in a lower VTL, which could cause TLB lock holders
        // to be stuck waiting until the target_vp happens to switch into
        // VTL 2.
        // We use a single fence to avoid having to take a SeqCst load
        // for each VP.
        std::sync::atomic::fence(Ordering::SeqCst);
        self.partition.hcl.kick_cpus(
            processors.into_iter().filter(|&vp| {
                self.shared.active_vtl[vp as usize].load(Ordering::Relaxed) == target_vtl as u8
            }),
            true,
            true,
        );
    }
}

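/// Access to the partition-wide TDX TLB flush and TLB lock state.
///
/// `vp_index` is `None` when the flush is not issued on behalf of a specific
/// VP, in which case no VP is marked as waiting for TLB locks.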
struct TdxTlbLockFlushAccess<'a> {
    vp_index: Option<VpIndex>,
    partition: &'a UhPartitionInner,
    shared: &'a TdxBackedShared,
}

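// TlbFlushLockAccess implementation shared with common CVM code: flushes are
// modeled as flush-entire requests on the target VTL(s), followed by waking
// the affected VPs and, when issued on behalf of a VP, waiting for TLB locks.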
impl TlbFlushLockAccess for TdxTlbLockFlushAccess<'_> {
    fn flush(&mut self, vtl: GuestVtl) {
        self.shared.flush_state[vtl]
            .flush_entire_counter
            .fetch_add(1, Ordering::Relaxed);

        self.wake_processors_for_tlb_flush(vtl, None);
        self.set_wait_for_tlb_locks(vtl);
    }

    fn flush_entire(&mut self) {
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.shared.flush_state[vtl]
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.wake_processors_for_tlb_flush(vtl, None);
            self.set_wait_for_tlb_locks(vtl);
        }
    }

    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl) {
        if let Some(vp_index) = self.vp_index {
            hardware_cvm::tlb_lock::TlbLockAccess {
                vp_index,
                cvm_partition: &self.shared.cvm,
            }
            .set_wait_for_tlb_locks(vtl);
        }
    }
}

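/// Save/restore is not supported for TDX processors: the saved state type is
/// uninhabited and `save` always fails with `NotSupported`.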
mod save_restore {
    use super::TdxBacked;
    use super::UhProcessor;
    use vmcore::save_restore::RestoreError;
    use vmcore::save_restore::SaveError;
    use vmcore::save_restore::SaveRestore;
    use vmcore::save_restore::SavedStateNotSupported;

    impl SaveRestore for UhProcessor<'_, TdxBacked> {
        type SavedState = SavedStateNotSupported;

        fn save(&mut self) -> Result<Self::SavedState, SaveError> {
            Err(SaveError::NotSupported)
        }

        fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
            match state {}
        }
    }
}