virt_mshv_vtl/processor/tdx/mod.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Processor support for TDX partitions.

mod tlb_flush;

use super::BackingPrivate;
use super::BackingSharedParams;
use super::HardwareIsolatedBacking;
use super::UhEmulationState;
use super::UhHypercallHandler;
use super::hardware_cvm;
use super::vp_state;
use super::vp_state::UhVpStateAccess;
use crate::BackingShared;
use crate::GuestVtl;
use crate::TlbFlushLockAccess;
use crate::UhCvmPartitionState;
use crate::UhCvmVpState;
use crate::UhPartitionInner;
use crate::UhPartitionNewParams;
use crate::UhProcessor;
use crate::WakeReason;
use cvm_tracing::CVM_ALLOWED;
use cvm_tracing::CVM_CONFIDENTIAL;
use guestmem::GuestMemory;
use hcl::ioctl::ProcessorRunner;
use hcl::ioctl::tdx::Tdx;
use hcl::ioctl::tdx::TdxPrivateRegs;
use hcl::protocol::hcl_intr_offload_flags;
use hcl::protocol::tdx_tdg_vp_enter_exit_info;
use hv1_emulator::hv::ProcessorVtlHv;
use hv1_emulator::synic::GlobalSynic;
use hv1_emulator::synic::ProcessorSynic;
use hv1_hypercall::AsHandler;
use hv1_hypercall::HvRepResult;
use hv1_hypercall::HypercallIo;
use hv1_structs::ProcessorSet;
use hv1_structs::VtlArray;
use hvdef::HV_PAGE_SIZE;
use hvdef::HvError;
use hvdef::HvSynicSimpSiefp;
use hvdef::HvX64PendingExceptionEvent;
use hvdef::HvX64RegisterName;
use hvdef::Vtl;
use hvdef::hypercall::HvFlushFlags;
use hvdef::hypercall::HvGvaRange;
use inspect::Inspect;
use inspect::InspectMut;
use inspect_counters::Counter;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::Ordering;
use thiserror::Error;
use tlb_flush::FLUSH_GVA_LIST_SIZE;
use tlb_flush::TdxFlushState;
use tlb_flush::TdxPartitionFlushState;
use virt::EmulatorMonitorSupport;
use virt::Processor;
use virt::VpHaltReason;
use virt::VpIndex;
use virt::io::CpuIo;
use virt::state::StateElement;
use virt::vp;
use virt::vp::AccessVpState;
use virt::vp::MpState;
use virt::vp::Registers;
use virt::x86::MsrError;
use virt::x86::MsrErrorExt;
use virt::x86::SegmentRegister;
use virt::x86::TableRegister;
use virt_support_apic::ApicClient;
use virt_support_apic::OffloadNotSupported;
use virt_support_x86emu::emulate::EmulatedMemoryOperation;
use virt_support_x86emu::emulate::EmulatorSupport as X86EmulatorSupport;
use virt_support_x86emu::emulate::TranslateMode;
use virt_support_x86emu::emulate::emulate_insn_memory_op;
use virt_support_x86emu::emulate::emulate_io;
use virt_support_x86emu::emulate::emulate_translate_gva;
use virt_support_x86emu::translate::TranslationRegisters;
use vmcore::vmtime::VmTimeAccess;
use x86defs::RFlags;
use x86defs::X64_CR0_ET;
use x86defs::X64_CR0_NE;
use x86defs::X64_CR0_PE;
use x86defs::X64_CR0_PG;
use x86defs::X64_CR4_MCE;
use x86defs::X64_CR4_UMIP;
use x86defs::X64_CR4_VMXE;
use x86defs::X64_EFER_FFXSR;
use x86defs::X64_EFER_LMA;
use x86defs::X64_EFER_LME;
use x86defs::X64_EFER_NXE;
use x86defs::X64_EFER_SVME;
use x86defs::X86X_MSR_EFER;
use x86defs::apic::X2APIC_MSR_BASE;
use x86defs::tdx::TdCallResultCode;
use x86defs::tdx::TdVmCallR10Result;
use x86defs::tdx::TdxGp;
use x86defs::tdx::TdxInstructionInfo;
use x86defs::tdx::TdxL2Ctls;
use x86defs::tdx::TdxVpEnterRaxResult;
use x86defs::vmx::ApicPage;
use x86defs::vmx::ApicRegister;
use x86defs::vmx::CR_ACCESS_TYPE_LMSW;
use x86defs::vmx::CR_ACCESS_TYPE_MOV_TO_CR;
use x86defs::vmx::CrAccessQualification;
use x86defs::vmx::ExitQualificationIo;
use x86defs::vmx::GdtrOrIdtrInstruction;
use x86defs::vmx::GdtrOrIdtrInstructionInfo;
use x86defs::vmx::INTERRUPT_TYPE_EXTERNAL;
use x86defs::vmx::INTERRUPT_TYPE_HARDWARE_EXCEPTION;
use x86defs::vmx::INTERRUPT_TYPE_NMI;
use x86defs::vmx::IO_SIZE_8_BIT;
use x86defs::vmx::IO_SIZE_16_BIT;
use x86defs::vmx::IO_SIZE_32_BIT;
use x86defs::vmx::Interruptibility;
use x86defs::vmx::InterruptionInformation;
use x86defs::vmx::LdtrOrTrInstruction;
use x86defs::vmx::LdtrOrTrInstructionInfo;
use x86defs::vmx::ProcessorControls;
use x86defs::vmx::SecondaryProcessorControls;
use x86defs::vmx::VMX_ENTRY_CONTROL_LONG_MODE_GUEST;
use x86defs::vmx::VMX_FEATURE_CONTROL_LOCKED;
use x86defs::vmx::VmcsField;
use x86defs::vmx::VmxEptExitQualification;
use x86defs::vmx::VmxExit;
use x86defs::vmx::VmxExitBasic;
use x86emu::Gp;
use x86emu::Segment;

/// MSRs that are allowed to be read by the guest without interception.
const MSR_ALLOWED_READ: &[u32] = &[
    x86defs::X86X_MSR_TSC,
    x86defs::X86X_MSR_TSC_AUX,
    X86X_MSR_EFER,
    x86defs::X86X_MSR_STAR,
    x86defs::X86X_MSR_LSTAR,
    x86defs::X86X_MSR_SFMASK,
    x86defs::X86X_MSR_SYSENTER_CS,
    x86defs::X86X_MSR_SYSENTER_ESP,
    x86defs::X86X_MSR_SYSENTER_EIP,
];

/// MSRs that are allowed to be read and written by the guest without interception.
const MSR_ALLOWED_READ_WRITE: &[u32] = &[
    x86defs::X64_MSR_FS_BASE,
    x86defs::X64_MSR_GS_BASE,
    x86defs::X64_MSR_KERNEL_GS_BASE,
    x86defs::X86X_MSR_SPEC_CTRL,
    x86defs::X86X_MSR_U_CET,
    x86defs::X86X_MSR_S_CET,
    x86defs::X86X_MSR_PL0_SSP,
    x86defs::X86X_MSR_PL1_SSP,
    x86defs::X86X_MSR_PL2_SSP,
    x86defs::X86X_MSR_PL3_SSP,
    x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
    x86defs::X86X_IA32_MSR_XFD,
    x86defs::X86X_IA32_MSR_XFD_ERR,
];
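// Any MSR not listed in `MSR_ALLOWED_READ` or `MSR_ALLOWED_READ_WRITE` is
// intercepted by default; the per-VP MSR bitmap is built from these lists when
// the backing is constructed in `BackingPrivate::new`.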

#[derive(Debug, Error)]
#[error("unknown exit {0:#x?}")]
struct UnknownVmxExit(VmxExit);

#[derive(Debug, Error)]
#[error("bad guest state on VP.ENTER")]
struct VmxBadGuestState;

#[derive(Debug, Error)]
#[error("failed to run")]
struct TdxRunVpError(#[source] hcl::ioctl::Error);

#[derive(Debug)]
struct TdxExit<'a>(&'a tdx_tdg_vp_enter_exit_info);

impl TdxExit<'_> {
    fn code(&self) -> TdxVpEnterRaxResult {
        self.0.rax.into()
    }
    fn qualification(&self) -> u64 {
        self.0.rcx
    }
    fn gla(&self) -> Option<u64> {
        // Only valid for EPT exits.
        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
            Some(self.0.rdx)
        } else {
            None
        }
    }
    fn gpa(&self) -> Option<u64> {
        // Only valid for EPT exits.
        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
            Some(self.0.r8)
        } else {
            None
        }
    }
    fn _exit_interruption_info(&self) -> InterruptionInformation {
        (self.0.r9 as u32).into()
    }
    fn _exit_interruption_error_code(&self) -> u32 {
        (self.0.r9 >> 32) as u32
    }
    fn idt_vectoring_info(&self) -> InterruptionInformation {
        (self.0.r10 as u32).into()
    }
    fn idt_vectoring_error_code(&self) -> u32 {
        (self.0.r10 >> 32) as u32
    }
    fn instr_info(&self) -> TdxInstructionInfo {
        self.0.r11.into()
    }
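    // The TDG.VP.ENTER exit info packs CS into rsi/rdi: the selector is in
    // rsi[15:0], attributes in rsi[31:16], limit in rsi[63:32], and base in rdi.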
    fn cs(&self) -> SegmentRegister {
        SegmentRegister {
            selector: self.0.rsi as u16,
            base: self.0.rdi,
            limit: (self.0.rsi >> 32) as u32,
            attributes: (self.0.rsi >> 16) as u16,
        }
    }
    fn cpl(&self) -> u8 {
        self.0.r12 as u8 & 3
    }
}

/// Registers that can be virtual and shadowed.
#[derive(Debug, Inspect)]
enum ShadowedRegister {
    Cr0,
    Cr4,
}

impl ShadowedRegister {
    fn name(&self) -> &'static str {
        match self {
            Self::Cr0 => "cr0",
            Self::Cr4 => "cr4",
        }
    }

    fn physical_vmcs_field(&self) -> VmcsField {
        match self {
            Self::Cr0 => VmcsField::VMX_VMCS_GUEST_CR0,
            Self::Cr4 => VmcsField::VMX_VMCS_GUEST_CR4,
        }
    }

    fn shadow_vmcs_field(&self) -> VmcsField {
        match self {
            Self::Cr0 => VmcsField::VMX_VMCS_CR0_READ_SHADOW,
            Self::Cr4 => VmcsField::VMX_VMCS_CR4_READ_SHADOW,
        }
    }

    fn guest_owned_mask(&self) -> u64 {
        // Control register bits that are guest owned by default. A bit is guest
        // owned when the physical register bit is always set to the virtual
        // register bit (subject to validation of the virtual register).
        match self {
            Self::Cr0 => {
                X64_CR0_ET
                    | x86defs::X64_CR0_MP
                    | x86defs::X64_CR0_EM
                    | x86defs::X64_CR0_TS
                    | x86defs::X64_CR0_WP
                    | x86defs::X64_CR0_AM
                    | X64_CR0_PE
                    | X64_CR0_PG
            }
            Self::Cr4 => {
                x86defs::X64_CR4_VME
                    | x86defs::X64_CR4_PVI
                    | x86defs::X64_CR4_TSD
                    | x86defs::X64_CR4_DE
                    | x86defs::X64_CR4_PSE
                    | x86defs::X64_CR4_PAE
                    | x86defs::X64_CR4_PGE
                    | x86defs::X64_CR4_PCE
                    | x86defs::X64_CR4_FXSR
                    | x86defs::X64_CR4_XMMEXCPT
                    | X64_CR4_UMIP
                    | x86defs::X64_CR4_LA57
                    | x86defs::X64_CR4_RWFSGS
                    | x86defs::X64_CR4_PCIDE
                    | x86defs::X64_CR4_OSXSAVE
                    | x86defs::X64_CR4_SMEP
                    | x86defs::X64_CR4_SMAP
                    | x86defs::X64_CR4_CET
            }
        }
    }
}

/// A virtual register that is shadowed by the virtstack.
///
/// Some bits are owned by the guest while others are owned by the virtstack,
/// due to TDX requirements.
#[derive(Inspect)]
struct VirtualRegister {
    /// The register being shadowed.
    register: ShadowedRegister,
    /// The VTL this register is shadowed for.
    vtl: GuestVtl,
    /// The value the guest sees.
    shadow_value: u64,
    /// Additional constraints on bits.
    allowed_bits: u64,
}

impl VirtualRegister {
    fn new(reg: ShadowedRegister, vtl: GuestVtl, initial_value: u64, allowed_bits: u64) -> Self {
        Self {
            register: reg,
            vtl,
            shadow_value: initial_value,
            allowed_bits,
        }
    }

    /// Write a new value to the virtual register. This updates host owned bits
    /// in the shadowed value, and updates guest owned bits in the physical
    /// register in the vmcs.
    fn write<'a>(
        &mut self,
        value: u64,
        runner: &mut ProcessorRunner<'a, Tdx<'a>>,
    ) -> Result<(), vp_state::Error> {
        tracing::trace!(?self.register, value, "write virtual register");

        if value & !self.allowed_bits != 0 {
            return Err(vp_state::Error::InvalidValue(
                value,
                self.register.name(),
                "disallowed bit set",
            ));
        }

        // If guest owned bits of the physical register have changed, then update
        // the guest owned bits of the physical field.
        let old_physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());

        tracing::trace!(old_physical_reg, "old_physical_reg");

        let guest_owned_mask = self.register.guest_owned_mask();
        if (old_physical_reg ^ value) & guest_owned_mask != 0 {
            let new_physical_reg =
                (old_physical_reg & !guest_owned_mask) | (value & guest_owned_mask);

            tracing::trace!(new_physical_reg, "new_physical_reg");

            runner.write_vmcs64(
                self.vtl,
                self.register.physical_vmcs_field(),
                !0,
                new_physical_reg,
            );
        }

        self.shadow_value = value;
        runner.write_vmcs64(self.vtl, self.register.shadow_vmcs_field(), !0, value);
        Ok(())
    }

    fn read<'a>(&self, runner: &ProcessorRunner<'a, Tdx<'a>>) -> u64 {
        let physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());

        // Get the bits owned by the host from the shadow and the bits owned by the
        // guest from the physical value.
        let guest_owned_mask = self.register.guest_owned_mask();
        (self.shadow_value & !self.register.guest_owned_mask()) | (physical_reg & guest_owned_mask)
    }
}

/// Backing for TDX partitions.
#[derive(InspectMut)]
pub struct TdxBacked {
    #[inspect(mut)]
    vtls: VtlArray<TdxVtl, 2>,

    untrusted_synic: Option<ProcessorSynic>,
    #[inspect(hex, iter_by_index)]
    eoi_exit_bitmap: [u64; 4],

    /// A mapped page used for issuing INVGLA hypercalls.
    #[inspect(skip)]
    flush_page: user_driver::memory::MemoryBlock,

    #[inspect(flatten)]
    cvm: UhCvmVpState,
}

#[derive(InspectMut)]
struct TdxVtl {
    /// The EFER value for this VP.
    efer: u64,
    /// Virtual cr0.
    cr0: VirtualRegister,
    /// Virtual cr4.
    cr4: VirtualRegister,

    // CSTAR doesn't exist on TDX, but Windows likes to verify that values are sticky.
    msr_cstar: u64,

    tpr_threshold: u8,
    #[inspect(skip)]
    processor_controls: ProcessorControls,
    #[inspect(skip)]
    interruption_information: InterruptionInformation,
    exception_error_code: u32,
    interruption_set: bool,

    #[inspect(mut)]
    private_regs: TdxPrivateRegs,

    /// TDX only TLB flush state.
    flush_state: TdxFlushState,

    enter_stats: EnterStats,
    exit_stats: ExitStats,
}

#[derive(Default)]
pub struct TdxEmulationCache {
    segs: [Option<SegmentRegister>; 6],
    cr0: Option<u64>,
}

#[derive(Inspect, Default)]
struct EnterStats {
    success: Counter,
    host_routed_async: Counter,
    l2_exit_pending_intr: Counter,
    pending_intr: Counter,
    host_routed_td_vmcall: Counter,
}

#[derive(Inspect, Default)]
struct ExitStats {
    io: Counter,
    msr_read: Counter,
    msr_write: Counter,
    ept_violation: Counter,
    cpuid: Counter,
    cr_access: Counter,
    xsetbv: Counter,
    tpr_below_threshold: Counter,
    interrupt_window: Counter,
    nmi_window: Counter,
    vmcall: Counter,
    smi_intr: Counter,
    wbinvd: Counter,
    hw_interrupt: Counter,
    tdcall: Counter,
    hlt: Counter,
    pause: Counter,
    needs_interrupt_reinject: Counter,
    exception: Counter,
    descriptor_table: Counter,
}

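/// Index of each per-VP direct overlay page within the shared pool allocation.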
enum UhDirectOverlay {
    Sipp,
    Sifp,
    Count,
}

impl HardwareIsolatedBacking for TdxBacked {
    fn cvm_state(&self) -> &UhCvmVpState {
        &self.cvm
    }

    fn cvm_state_mut(&mut self) -> &mut UhCvmVpState {
        &mut self.cvm
    }

    fn cvm_partition_state(shared: &Self::Shared) -> &UhCvmPartitionState {
        &shared.cvm
    }

    fn switch_vtl(this: &mut UhProcessor<'_, Self>, _source_vtl: GuestVtl, target_vtl: GuestVtl) {
        // The GPs, Fxsave, and CR2 are saved in the shared kernel state. No copying needed.
        // Debug registers and XFEM are shared architecturally. No copying needed.

        this.backing.cvm_state_mut().exit_vtl = target_vtl;
    }

    fn translation_registers(
        &self,
        this: &UhProcessor<'_, Self>,
        vtl: GuestVtl,
    ) -> TranslationRegisters {
        let cr0 = this.backing.vtls[vtl].cr0.read(&this.runner);
        let cr4 = this.backing.vtls[vtl].cr4.read(&this.runner);
        let efer = this.backing.vtls[vtl].efer;
        let cr3 = this.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
        let ss = this.read_segment(vtl, TdxSegmentReg::Ss).into();
        let rflags = this.backing.vtls[vtl].private_regs.rflags;

        TranslationRegisters {
            cr0,
            cr4,
            efer,
            cr3,
            ss,
            rflags,
            encryption_mode: this.partition.caps.vtom.map_or(
                virt_support_x86emu::translate::EncryptionMode::None,
                virt_support_x86emu::translate::EncryptionMode::Vtom,
            ),
        }
    }

    fn tlb_flush_lock_access<'a>(
        vp_index: Option<VpIndex>,
        partition: &'a UhPartitionInner,
        shared: &'a Self::Shared,
    ) -> impl TlbFlushLockAccess + 'a {
        TdxTlbLockFlushAccess {
            vp_index,
            partition,
            shared,
        }
    }

    fn pending_event_vector(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> Option<u8> {
        let event_inject = this.backing.vtls[vtl].interruption_information;
        if event_inject.valid() {
            Some(event_inject.vector())
        } else {
            None
        }
    }

    fn set_pending_exception(
        this: &mut UhProcessor<'_, Self>,
        vtl: GuestVtl,
        event: HvX64PendingExceptionEvent,
    ) {
        let new_intr = InterruptionInformation::new()
            .with_valid(true)
            .with_deliver_error_code(event.deliver_error_code())
            .with_vector(event.vector().try_into().unwrap())
            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);

        this.backing.vtls[vtl].interruption_information = new_intr;
        this.backing.vtls[vtl].exception_error_code = event.error_code();
    }

    fn cr0(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
        this.read_cr0(vtl)
    }

    fn cr4(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
        this.read_cr4(vtl)
    }

    fn intercept_message_state(
        this: &UhProcessor<'_, Self>,
        vtl: GuestVtl,
        include_optional_state: bool,
    ) -> super::InterceptMessageState {
        let exit = TdxExit(this.runner.tdx_vp_enter_exit_info());
        let backing_vtl = &this.backing.vtls[vtl];
        let shared_gps = this.runner.tdx_enter_guest_gps();

        super::InterceptMessageState {
            instruction_length_and_cr8: exit.instr_info().length() as u8,
            cpl: exit.cpl(),
            efer_lma: backing_vtl.efer & X64_EFER_LMA != 0,
            cs: exit.cs().into(),
            rip: backing_vtl.private_regs.rip,
            rflags: backing_vtl.private_regs.rflags,
            rax: shared_gps[TdxGp::RAX],
            rdx: shared_gps[TdxGp::RDX],
            optional: if include_optional_state {
                Some(super::InterceptMessageOptionalState {
                    ds: this.read_segment(vtl, TdxSegmentReg::Ds).into(),
                    es: this.read_segment(vtl, TdxSegmentReg::Es).into(),
                })
            } else {
                None
            },
            rcx: shared_gps[TdxGp::RCX],
            rsi: shared_gps[TdxGp::RSI],
            rdi: shared_gps[TdxGp::RDI],
        }
    }

    fn cr_intercept_registration(
        this: &mut UhProcessor<'_, Self>,
        intercept_control: hvdef::HvRegisterCrInterceptControl,
    ) {
        // Today we only support intercepting VTL 0 on behalf of VTL 1.
        let vtl = GuestVtl::Vtl0;
        let intercept_masks = &this
            .backing
            .cvm_state()
            .vtl1
            .as_ref()
            .unwrap()
            .reg_intercept;

        // Update CR0 and CR4 intercept masks in the VMCS.
        this.runner.write_vmcs64(
            vtl,
            VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
            !0,
            this.shared.cr_guest_host_mask(ShadowedRegister::Cr0)
                | if intercept_control.cr0_write() {
                    intercept_masks.cr0_mask
                } else {
                    0
                },
        );
        this.runner.write_vmcs64(
            vtl,
            VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
            !0,
            this.shared.cr_guest_host_mask(ShadowedRegister::Cr4)
                | if intercept_control.cr4_write() {
                    intercept_masks.cr4_mask
                } else {
                    0
                },
        );

        // Update descriptor table intercepts.
        let intercept_tables = intercept_control.gdtr_write()
            | intercept_control.idtr_write()
            | intercept_control.ldtr_write()
            | intercept_control.tr_write();
        this.runner.write_vmcs32(
            vtl,
            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
            SecondaryProcessorControls::new()
                .with_descriptor_table_exiting(true)
                .into_bits(),
            SecondaryProcessorControls::new()
                .with_descriptor_table_exiting(intercept_tables)
                .into_bits(),
        );

        // Update MSR intercepts. We only need to update those that are allowed
        // to be passed through, as the default otherwise is to always intercept.
        // See [`MSR_ALLOWED_READ_WRITE`].
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_S_CET,
            true,
            intercept_control.msr_scet_write(),
        );
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_PL0_SSP,
            true,
            intercept_control.msr_pls_ssp_write(),
        );
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_PL1_SSP,
            true,
            intercept_control.msr_pls_ssp_write(),
        );
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_PL2_SSP,
            true,
            intercept_control.msr_pls_ssp_write(),
        );
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_PL3_SSP,
            true,
            intercept_control.msr_pls_ssp_write(),
        );
        this.runner.set_msr_bit(
            vtl,
            x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
            true,
            intercept_control.msr_pls_ssp_write(),
        );
    }

    fn is_interrupt_pending(
        this: &mut UhProcessor<'_, Self>,
        vtl: GuestVtl,
        check_rflags: bool,
        dev: &impl CpuIo,
    ) -> bool {
        let backing_vtl = &this.backing.vtls[vtl];
        if backing_vtl.interruption_information.valid()
            && backing_vtl.interruption_information.interruption_type() == INTERRUPT_TYPE_NMI
        {
            return true;
        }

        let (vector, ppr) = if this.backing.cvm.lapics[vtl].lapic.is_offloaded() {
            let vector = backing_vtl.private_regs.rvi;
            let ppr = std::cmp::max(
                backing_vtl.private_regs.svi.into(),
                this.runner.tdx_apic_page(vtl).tpr.value,
            );
            (vector, ppr)
        } else {
            let lapic = &mut this.backing.cvm.lapics[vtl].lapic;
            let vector = lapic.next_irr().unwrap_or(0);
            let ppr = lapic
                .access(&mut TdxApicClient {
                    partition: this.partition,
                    apic_page: this.runner.tdx_apic_page_mut(vtl),
                    dev,
                    vmtime: &this.vmtime,
                    vtl,
                })
                .get_ppr();
            (vector, ppr)
        };
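        // The interrupt is deliverable only if its priority class (the upper
        // four bits of the vector) exceeds the current processor priority class.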
        let vector_priority = (vector as u32) >> 4;
        let ppr_priority = ppr >> 4;

        if vector_priority <= ppr_priority {
            return false;
        }

        if check_rflags && !RFlags::from_bits(backing_vtl.private_regs.rflags).interrupt_enable() {
            return false;
        }

        let interruptibility: Interruptibility = this
            .runner
            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
            .into();

        if interruptibility.blocked_by_sti() || interruptibility.blocked_by_movss() {
            return false;
        }

        true
    }

    fn untrusted_synic_mut(&mut self) -> Option<&mut ProcessorSynic> {
        self.untrusted_synic.as_mut()
    }
}

/// Partition-wide shared data for TDX VPs.
#[derive(Inspect)]
pub struct TdxBackedShared {
    #[inspect(flatten)]
    pub(crate) cvm: UhCvmPartitionState,
    /// The synic state used for untrusted SINTs, that is, the SINTs for which
    /// the guest thinks it is interacting directly with the untrusted
    /// hypervisor via an architecture-specific interface.
    pub(crate) untrusted_synic: Option<GlobalSynic>,
    flush_state: VtlArray<TdxPartitionFlushState, 2>,
    #[inspect(iter_by_index)]
    active_vtl: Vec<AtomicU8>,
    /// CR4 bits that the guest is allowed to set to 1.
    cr4_allowed_bits: u64,
}

impl TdxBackedShared {
    pub(crate) fn new(
        partition_params: &UhPartitionNewParams<'_>,
        params: BackingSharedParams<'_>,
    ) -> Result<Self, crate::Error> {
        // Create a second synic to fully manage the untrusted SINTs
        // here. At time of writing, the hypervisor does not support
        // sharing the untrusted SINTs with the TDX L1. Even if it did,
        // performance would be poor for cases where the L1 implements
        // high-performance devices.
        let untrusted_synic = (partition_params.handle_synic && !partition_params.hide_isolation)
            .then(|| GlobalSynic::new(partition_params.topology.vp_count()));

        // TODO TDX: Consider just using MSR kernel module instead of explicit ioctl.
        let cr4_fixed1 = params.hcl.read_vmx_cr4_fixed1();
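        // The guest may set only those CR4 bits that are guest owned (plus MCE)
        // and that the hardware reports as allowed to be 1 in CR4 fixed1.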
        let cr4_allowed_bits =
            (ShadowedRegister::Cr4.guest_owned_mask() | X64_CR4_MCE) & cr4_fixed1;

        Ok(Self {
            untrusted_synic,
            flush_state: VtlArray::from_fn(|_| TdxPartitionFlushState::new()),
            cvm: params.cvm_state.unwrap(),
            // VPs start in VTL 2.
            active_vtl: std::iter::repeat_n(2, partition_params.topology.vp_count() as usize)
                .map(AtomicU8::new)
                .collect(),
            cr4_allowed_bits,
        })
    }

    /// Get the default guest host mask for the specified register.
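    ///
    /// Bits set in the mask are host owned: guest reads of those bits come from
    /// the read shadow, and guest writes to them cause a VM exit.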
    fn cr_guest_host_mask(&self, reg: ShadowedRegister) -> u64 {
        match reg {
            ShadowedRegister::Cr0 => {
                !ShadowedRegister::Cr0.guest_owned_mask() | X64_CR0_PE | X64_CR0_PG
            }
            ShadowedRegister::Cr4 => {
                !(ShadowedRegister::Cr4.guest_owned_mask() & self.cr4_allowed_bits)
            }
        }
    }
}

impl TdxBacked {
    /// Gets the number of pages that will be allocated from the shared page pool
    /// for each CPU.
    pub fn shared_pages_required_per_cpu() -> u64 {
        UhDirectOverlay::Count as u64
    }
}

// The memory used to back the untrusted synic is not guest-visible, but rather
// is allocated from our shared pool. Therefore it does not need to go through
// the normal memory protections path.
struct UntrustedSynicVtlProts<'a>(&'a GuestMemory);

impl hv1_emulator::VtlProtectAccess for UntrustedSynicVtlProts<'_> {
    fn check_modify_and_lock_overlay_page(
        &mut self,
        gpn: u64,
        _check_perms: hvdef::HvMapGpaFlags,
        _new_perms: Option<hvdef::HvMapGpaFlags>,
    ) -> Result<guestmem::LockedPages, HvError> {
        self.0
            .lock_gpns(false, &[gpn])
            .map_err(|_| HvError::OperationFailed)
    }

    fn unlock_overlay_page(&mut self, _gpn: u64) -> Result<(), HvError> {
        Ok(())
    }
}

#[expect(private_interfaces)]
impl BackingPrivate for TdxBacked {
    type HclBacking<'tdx> = Tdx<'tdx>;
    type Shared = TdxBackedShared;
    type EmulationCache = TdxEmulationCache;

    fn shared(shared: &BackingShared) -> &Self::Shared {
        let BackingShared::Tdx(shared) = shared else {
            unreachable!()
        };
        shared
    }

    fn new(
        params: super::BackingParams<'_, '_, Self>,
        shared: &TdxBackedShared,
    ) -> Result<Self, crate::Error> {
        // TODO TDX: ssp is for shadow stack
        // TODO TDX: direct overlay like snp?
        // TODO TDX: lapic / APIC setup?
        // TODO TDX: see ValInitializeVplc
        // TODO TDX: XCR_XFMEM setup?

        // Turn on MBEC for just VTL 0.
        params.runner.write_vmcs32(
            GuestVtl::Vtl0,
            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
            SecondaryProcessorControls::new()
                .with_mode_based_execute_control(true)
                .into(),
            SecondaryProcessorControls::new()
                .with_mode_based_execute_control(true)
                .into(),
        );

        let controls = TdxL2Ctls::new()
            // Configure L2 controls to permit shared memory.
            .with_enable_shared_ept(!shared.cvm.hide_isolation)
            // If the synic is to be managed by the hypervisor, then enable TDVMCALLs.
            .with_enable_tdvmcall(shared.untrusted_synic.is_none() && !shared.cvm.hide_isolation);

        params
            .runner
            .set_l2_ctls(GuestVtl::Vtl0, controls)
            .map_err(crate::Error::FailedToSetL2Ctls)?;

        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            // Set guest/host masks for CR0 and CR4. These enable shadowing these
            // registers since TDX requires certain bits to be set at all times.
            let initial_cr0 = params
                .runner
                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
            assert_eq!(initial_cr0, X64_CR0_PE | X64_CR0_NE);

            // N.B. CR0.PE and CR0.PG are guest owned but still intercept when they
            // are changed for caching purposes and to ensure EFER is managed
            // properly due to the need to change execution state.
            params.runner.write_vmcs64(
                vtl,
                VmcsField::VMX_VMCS_CR0_READ_SHADOW,
                !0,
                X64_CR0_PE | X64_CR0_NE,
            );
            params.runner.write_vmcs64(
                vtl,
                VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
                !0,
                shared.cr_guest_host_mask(ShadowedRegister::Cr0),
            );

            let initial_cr4 = params
                .runner
                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
            assert_eq!(initial_cr4, X64_CR4_MCE | X64_CR4_VMXE);

            params
                .runner
                .write_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW, !0, 0);
            params.runner.write_vmcs64(
                vtl,
                VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
                !0,
                shared.cr_guest_host_mask(ShadowedRegister::Cr4),
            );

            // Configure the MSR bitmap for this VP. Since the default MSR bitmap
            // is set to intercept everything, only the MSRs that we want to pass
            // through need to be set.
            for msr in MSR_ALLOWED_READ {
                params.runner.set_msr_bit(vtl, *msr, false, false);
            }
            for msr in MSR_ALLOWED_READ_WRITE {
                params.runner.set_msr_bit(vtl, *msr, false, false);
                params.runner.set_msr_bit(vtl, *msr, true, false);
            }

            // Set the exception bitmap.
            if params.partition.intercept_debug_exceptions {
                if cfg!(feature = "gdb") {
                    let initial_exception_bitmap = params
                        .runner
                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);

                    let exception_bitmap =
                        initial_exception_bitmap | (1 << x86defs::Exception::DEBUG.0);

                    params.runner.write_vmcs32(
                        vtl,
                        VmcsField::VMX_VMCS_EXCEPTION_BITMAP,
                        !0,
                        exception_bitmap,
                    );
                } else {
                    return Err(super::Error::InvalidDebugConfiguration);
                }
            }
        }

        let flush_page = shared
            .cvm
            .private_dma_client
            .allocate_dma_buffer(HV_PAGE_SIZE as usize)
            .map_err(crate::Error::AllocateTlbFlushPage)?;

        let untrusted_synic = shared
            .untrusted_synic
            .as_ref()
            .map(|synic| synic.add_vp(params.vp_info.base.vp_index));

        Ok(Self {
            vtls: VtlArray::from_fn(|vtl| {
                let vtl: GuestVtl = vtl.try_into().unwrap();
                TdxVtl {
                    efer: params
                        .runner
                        .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER),
                    cr0: VirtualRegister::new(
                        ShadowedRegister::Cr0,
                        vtl,
                        params
                            .runner
                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0),
                        !0,
                    ),
                    cr4: VirtualRegister::new(
                        ShadowedRegister::Cr4,
                        vtl,
                        params
                            .runner
                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4),
                        shared.cr4_allowed_bits,
                    ),
                    msr_cstar: 0,
                    tpr_threshold: 0,
                    processor_controls: params
                        .runner
                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS)
                        .into(),
                    interruption_information: Default::default(),
                    exception_error_code: 0,
                    interruption_set: false,
                    flush_state: TdxFlushState::new(),
                    private_regs: TdxPrivateRegs::new(vtl),
                    enter_stats: Default::default(),
                    exit_stats: Default::default(),
                }
            }),
            untrusted_synic,
            eoi_exit_bitmap: [0; 4],
            flush_page,
            cvm: UhCvmVpState::new(
                &shared.cvm,
                params.partition,
                params.vp_info,
                UhDirectOverlay::Count as usize,
            )?,
        })
    }

    type StateAccess<'p, 'a>
        = UhVpStateAccess<'a, 'p, Self>
    where
        Self: 'a + 'p,
        'p: 'a;

    fn access_vp_state<'a, 'p>(
        this: &'a mut UhProcessor<'p, Self>,
        vtl: GuestVtl,
    ) -> Self::StateAccess<'p, 'a> {
        UhVpStateAccess::new(this, vtl)
    }

    fn init(this: &mut UhProcessor<'_, Self>) {
        // Configure the synic direct overlays.
        // So far, only VTL 0 is using these (for VMBus).
        let pfns = &this.backing.cvm.direct_overlay_handle.pfns();
        let reg = |gpn| {
            u64::from(
                HvSynicSimpSiefp::new()
                    .with_base_gpn(gpn)
                    .with_enabled(true),
            )
        };

        let values: &[(HvX64RegisterName, u64); 2] = &[
            (
                HvX64RegisterName::Sifp,
                reg(pfns[UhDirectOverlay::Sifp as usize]),
            ),
            (
                HvX64RegisterName::Sipp,
                reg(pfns[UhDirectOverlay::Sipp as usize]),
            ),
        ];

        let reg_count = if let Some(synic) = &mut this.backing.untrusted_synic {
            let prot_access = &mut UntrustedSynicVtlProts(&this.partition.gm[GuestVtl::Vtl0]);

            synic
                .set_simp(reg(pfns[UhDirectOverlay::Sipp as usize]), prot_access)
                .unwrap();
            synic
                .set_siefp(reg(pfns[UhDirectOverlay::Sifp as usize]), prot_access)
                .unwrap();
            // Set the SIEFP in the hypervisor so that the hypervisor can
            // directly signal synic events. Don't set the SIMP, since the
            // message page is owned by the paravisor.
            1
        } else {
            2
        };

        this.runner
            .set_vp_registers_hvcall(Vtl::Vtl0, &values[..reg_count])
            .expect("set_vp_registers hypercall for direct overlays should succeed");

        // Enable APIC offload by default for VTL 0.
        this.set_apic_offload(GuestVtl::Vtl0, true);
        this.backing.cvm.lapics[GuestVtl::Vtl0]
            .lapic
            .enable_offload();

        // But disable it for VTL 1.
        this.set_apic_offload(GuestVtl::Vtl1, false);

        // Initialize registers to the reset state, since this may be different
        // than what's on the VMCS and is certainly different than what's in the
        // VP enter and private register state (which was mostly zero
        // initialized).
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            let registers = Registers::at_reset(&this.partition.caps, &this.inner.vp_info);

            let mut state = this.access_state(vtl.into());
            state
                .set_registers(&registers)
                .expect("Resetting to architectural state should succeed");

            state.commit().expect("committing state should succeed");
        }

        // FX regs and XMM registers are zero-initialized by the kernel. Set
        // them to the arch default.
        *this.runner.fx_state_mut() =
            vp::Xsave::at_reset(&this.partition.caps, &this.inner.vp_info).fxsave();
    }

    async fn run_vp(
        this: &mut UhProcessor<'_, Self>,
        dev: &impl CpuIo,
        _stop: &mut virt::StopVp<'_>,
    ) -> Result<(), VpHaltReason> {
        this.run_vp_tdx(dev).await
    }

    fn poll_apic(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl, scan_irr: bool) {
        if !this.try_poll_apic(vtl, scan_irr) {
            tracing::info!(CVM_ALLOWED, "disabling APIC offload due to auto EOI");
            let page = this.runner.tdx_apic_page_mut(vtl);
            let (irr, isr) = pull_apic_offload(page);

            this.backing.cvm.lapics[vtl]
                .lapic
                .disable_offload(&irr, &isr);
            this.set_apic_offload(vtl, false);
            this.try_poll_apic(vtl, false);
        }
    }

    fn request_extint_readiness(_this: &mut UhProcessor<'_, Self>) {
        unreachable!("extint managed through software apic")
    }

    fn request_untrusted_sint_readiness(this: &mut UhProcessor<'_, Self>, sints: u16) {
        if let Some(synic) = &mut this.backing.untrusted_synic {
            synic.request_sint_readiness(sints);
        } else {
            tracelimit::error_ratelimited!(CVM_ALLOWED, "untrusted synic is not configured");
        }
    }

    fn hv(&self, vtl: GuestVtl) -> Option<&ProcessorVtlHv> {
        Some(&self.cvm.hv[vtl])
    }

    fn hv_mut(&mut self, vtl: GuestVtl) -> Option<&mut ProcessorVtlHv> {
        Some(&mut self.cvm.hv[vtl])
    }

    fn handle_vp_start_enable_vtl_wake(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl) {
        this.hcvm_handle_vp_start_enable_vtl(vtl)
    }

    fn vtl1_inspectable(this: &UhProcessor<'_, Self>) -> bool {
        this.hcvm_vtl1_inspectable()
    }

    fn process_interrupts(
        this: &mut UhProcessor<'_, Self>,
        scan_irr: VtlArray<bool, 2>,
        first_scan_irr: &mut bool,
        dev: &impl CpuIo,
    ) -> bool {
        this.cvm_process_interrupts(scan_irr, first_scan_irr, dev)
    }
}

impl UhProcessor<'_, TdxBacked> {
    /// Returns `false` if the APIC offload needs to be disabled and the
    /// poll retried.
    fn try_poll_apic(&mut self, vtl: GuestVtl, scan_irr: bool) -> bool {
        let mut scan = TdxApicScanner {
            processor_controls: self.backing.vtls[vtl]
                .processor_controls
                .with_nmi_window_exiting(false)
                .with_interrupt_window_exiting(false),
            vp: self,
            tpr_threshold: 0,
        };

        // TODO TDX: filter proxy IRRs by setting the `proxy_irr_blocked` field of the run page
        hardware_cvm::apic::poll_apic_core(&mut scan, vtl, scan_irr);

        let TdxApicScanner {
            vp: _,
            processor_controls: new_processor_controls,
            tpr_threshold: new_tpr_threshold,
        } = scan;

        // Interrupts are ignored while waiting for SIPI.
        if self.backing.cvm.lapics[vtl].activity != MpState::WaitForSipi
            && self.backing.vtls[vtl].tpr_threshold != new_tpr_threshold
        {
            tracing::trace!(new_tpr_threshold, ?vtl, "setting tpr threshold");
            self.runner.write_vmcs32(
                vtl,
                VmcsField::VMX_VMCS_TPR_THRESHOLD,
                !0,
                new_tpr_threshold.into(),
            );
            self.backing.vtls[vtl].tpr_threshold = new_tpr_threshold;
        }

        if self.backing.vtls[vtl].processor_controls != new_processor_controls {
            tracing::trace!(?new_processor_controls, ?vtl, "requesting window change");
            self.runner.write_vmcs32(
                vtl,
                VmcsField::VMX_VMCS_PROCESSOR_CONTROLS,
                !0,
                new_processor_controls.into(),
            );
            self.backing.vtls[vtl].processor_controls = new_processor_controls;
        }

        // Offloading and proxying is only done with VTL 0 today.
        if vtl == GuestVtl::Vtl0 {
            let mut update_rvi = false;
            let r: Result<(), OffloadNotSupported> = self.backing.cvm.lapics[vtl]
                .lapic
                .push_to_offload(|irr, isr, tmr| {
                    let apic_page = self.runner.tdx_apic_page_mut(vtl);

                    for (((irr, page_irr), isr), page_isr) in irr
                        .iter()
                        .zip(&mut apic_page.irr)
                        .zip(isr)
                        .zip(&mut apic_page.isr)
                    {
                        page_irr.value |= *irr;
                        page_isr.value |= *isr;
                    }

                    // Update SVI and RVI.
                    let svi = top_vector(&apic_page.isr);
                    self.backing.vtls[vtl].private_regs.svi = svi;
                    update_rvi = true;

                    // Ensure the EOI exit bitmap is up to date.
                    let fields = [
                        VmcsField::VMX_VMCS_EOI_EXIT_0,
                        VmcsField::VMX_VMCS_EOI_EXIT_1,
                        VmcsField::VMX_VMCS_EOI_EXIT_2,
                        VmcsField::VMX_VMCS_EOI_EXIT_3,
                    ];
                    for ((&field, eoi_exit), (i, tmr)) in fields
                        .iter()
                        .zip(&mut self.backing.eoi_exit_bitmap)
                        .zip(tmr.chunks_exact(2).enumerate())
                    {
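                        // Each EOI exit field covers 64 vectors, so two 32-bit
                        // TMR words are combined into one value per field.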
                        let tmr = tmr[0] as u64 | ((tmr[1] as u64) << 32);
                        if *eoi_exit != tmr {
                            self.runner.write_vmcs64(vtl, field, !0, tmr);
                            *eoi_exit = tmr;
                            // The kernel driver supports some common APIC functionality (ICR writes,
                            // interrupt injection). When the kernel driver handles an interrupt, it
                            // must know if that interrupt was previously level-triggered. Otherwise,
                            // the EOI will be incorrectly treated as level-triggered. We keep a copy
                            // of the tmr in the kernel so it knows when this scenario occurs.
                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2] = tmr as u32;
                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2 + 1] = (tmr >> 32) as u32;
                        }
                    }
                });

            if let Err(OffloadNotSupported) = r {
                // APIC needs offloading to be disabled to support auto-EOI. The caller
                // will disable offload and try again.
                return false;
            }

            if update_rvi {
                let page = self.runner.tdx_apic_page_mut(vtl);
                let rvi = top_vector(&page.irr);
                self.backing.vtls[vtl].private_regs.rvi = rvi;
            }
        }

        // If there is a pending interrupt, clear the halted and idle state.
        if (self.backing.cvm.lapics[vtl].activity != MpState::Running)
            && self.backing.cvm.lapics[vtl].lapic.is_offloaded()
            && self.backing.vtls[vtl].private_regs.rvi != 0
        {
            // To model a non-virtualized processor, we should only do this if
            // TPR and IF and interrupt shadow allow. However, fetching the
            // interrupt shadow state is expensive (tdcall). This shouldn't
            // matter much, because real guests don't issue hlt while in
            // interrupt shadow or with interrupts disabled or with a non-zero
            // TPR.
            //
            // Note that the processor will not actually inject the interrupt
            // until conditions hold. So, unless the guest fails to loop around
            // and hlt again (which we already treat as a guest bug, since
            // Hyper-V in general does not guarantee hlt will stick until an
            // interrupt is pending), at worst this will just burn some CPU.
            self.backing.cvm.lapics[vtl].activity = MpState::Running;
        }

        true
    }

    fn access_apic_without_offload<R>(
        &mut self,
        vtl: GuestVtl,
        f: impl FnOnce(&mut Self) -> R,
    ) -> R {
        let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded();
        if offloaded {
            let (irr, isr) = pull_apic_offload(self.runner.tdx_apic_page_mut(vtl));
            self.backing.cvm.lapics[vtl]
                .lapic
                .disable_offload(&irr, &isr);
        }
        let r = f(self);
        if offloaded {
            self.backing.cvm.lapics[vtl].lapic.enable_offload();
        }
        r
    }

    fn set_apic_offload(&mut self, vtl: GuestVtl, offload: bool) {
        // Update the APIC portion of the MSR bitmap.
        let offload_bitmap = if offload {
            (1 << x86defs::apic::ApicRegister::TPR.0)
                | (1 << x86defs::apic::ApicRegister::EOI.0)
                | (1 << x86defs::apic::ApicRegister::SELF_IPI.0)
        } else {
            0
        };
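        // A set bit in the MSR bitmap intercepts the access, so the offloaded
        // registers are passed through by clearing their bits (note the
        // `!offload_bitmap` below).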
        // Once for read and once for write.
        for offset in [0, 0x100] {
            self.runner
                .write_msr_bitmap(vtl, offset + X2APIC_MSR_BASE / 64, !0, !offload_bitmap);
        }

        // Update virtual-interrupt delivery.
        self.runner.write_vmcs32(
            vtl,
            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
            SecondaryProcessorControls::new()
                .with_virtual_interrupt_delivery(true)
                .into(),
            SecondaryProcessorControls::new()
                .with_virtual_interrupt_delivery(offload)
                .into(),
        );

        // Clear any pending external interrupt when enabling the APIC offload.
        if offload
            && self.backing.vtls[vtl]
                .interruption_information
                .interruption_type()
                == INTERRUPT_TYPE_EXTERNAL
        {
            self.backing.vtls[vtl]
                .interruption_information
                .set_valid(false);
        }
    }
}

struct TdxApicScanner<'a, 'b> {
    vp: &'a mut UhProcessor<'b, TdxBacked>,
    processor_controls: ProcessorControls,
    tpr_threshold: u8,
}

impl<'b> hardware_cvm::apic::ApicBacking<'b, TdxBacked> for TdxApicScanner<'_, 'b> {
    fn vp(&mut self) -> &mut UhProcessor<'b, TdxBacked> {
        self.vp
    }

    fn handle_interrupt(&mut self, vtl: GuestVtl, vector: u8) {
        // Exit idle when an interrupt is received, regardless of IF
        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
        }
        // If there is a higher-priority pending event of some kind, then
        // just request an exit after it has resolved, after which we will
        // try again.
        if self.vp.backing.vtls[vtl].interruption_information.valid()
            && self.vp.backing.vtls[vtl]
                .interruption_information
                .interruption_type()
                != INTERRUPT_TYPE_EXTERNAL
        {
            self.processor_controls.set_interrupt_window_exiting(true);
            return;
        }

        // Ensure the interrupt is not blocked by RFLAGS.IF or interrupt shadow.
        let interruptibility: Interruptibility = self
            .vp
            .runner
            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
            .into();

        let rflags = RFlags::from(self.vp.backing.vtls[vtl].private_regs.rflags);
        if !rflags.interrupt_enable()
            || interruptibility.blocked_by_sti()
            || interruptibility.blocked_by_movss()
        {
            self.processor_controls.set_interrupt_window_exiting(true);
            return;
        }

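        // If the TPR currently masks this vector, remember its priority as the
        // TPR threshold so that an exit occurs once the guest lowers the TPR.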
        let priority = vector >> 4;
        let apic = self.vp.runner.tdx_apic_page(vtl);
        if (apic.tpr.value as u8 >> 4) >= priority {
            self.tpr_threshold = priority;
            return;
        }

        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
            .with_valid(true)
            .with_vector(vector)
            .with_interruption_type(INTERRUPT_TYPE_EXTERNAL);

        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
    }

    fn handle_nmi(&mut self, vtl: GuestVtl) {
        // Exit idle when an interrupt is received, regardless of IF
        // TODO: Investigate lifting more activity management into poll_apic_core
        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
        }
        // If there is a higher-priority pending event of some kind, then
        // just request an exit after it has resolved, after which we will
        // try again.
        if self.vp.backing.vtls[vtl].interruption_information.valid()
            && self.vp.backing.vtls[vtl]
                .interruption_information
                .interruption_type()
                != INTERRUPT_TYPE_EXTERNAL
        {
            self.processor_controls.set_nmi_window_exiting(true);
            return;
        }

        let interruptibility: Interruptibility = self
            .vp
            .runner
            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
            .into();

        if interruptibility.blocked_by_nmi()
            || interruptibility.blocked_by_sti()
            || interruptibility.blocked_by_movss()
        {
            self.processor_controls.set_nmi_window_exiting(true);
            return;
        }

        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
            .with_valid(true)
            .with_vector(2)
            .with_interruption_type(INTERRUPT_TYPE_NMI);

        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
    }

    fn handle_sipi(&mut self, vtl: GuestVtl, cs: SegmentRegister) {
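        // Start the VP at offset zero within the code segment delivered by the
        // SIPI.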
1458        self.vp.write_segment(vtl, TdxSegmentReg::Cs, cs).unwrap();
1459        self.vp.backing.vtls[vtl].private_regs.rip = 0;
1460        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1461    }
1462}
1463
1464impl UhProcessor<'_, TdxBacked> {
1465    async fn run_vp_tdx(&mut self, dev: &impl CpuIo) -> Result<(), VpHaltReason> {
1466        let next_vtl = self.backing.cvm.exit_vtl;
1467
1468        if self.backing.vtls[next_vtl].interruption_information.valid() {
1469            tracing::trace!(
1470                vector = self.backing.vtls[next_vtl]
1471                    .interruption_information
1472                    .vector(),
1473                vp_index = self.vp_index().index(),
1474                ?next_vtl,
1475                "injecting interrupt"
1476            );
1477
1478            self.runner.write_vmcs32(
1479                next_vtl,
1480                VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO,
1481                !0,
1482                self.backing.vtls[next_vtl].interruption_information.into(),
1483            );
1484            if self.backing.vtls[next_vtl]
1485                .interruption_information
1486                .deliver_error_code()
1487            {
1488                self.runner.write_vmcs32(
1489                    next_vtl,
1490                    VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE,
1491                    !0,
1492                    self.backing.vtls[next_vtl].exception_error_code,
1493                );
1494            }
1495            self.backing.vtls[next_vtl].interruption_set = true;
1496        } else if self.backing.vtls[next_vtl].interruption_set {
1497            self.runner
1498                .write_vmcs32(next_vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO, !0, 0);
1499            self.backing.vtls[next_vtl].interruption_set = false;
1500        }
1501
1502        // We're about to return to a lower VTL, so set active_vtl for other VPs,
1503        // do any pending flushes, unlock our TLB locks, and wait for any others
1504        // we're supposed to.
1505
1506        // active_vtl needs SeqCst ordering here in order to correctly synchronize
1507        // access with the TLB address flush list. We need to ensure that, when
1508        // other VPs are adding entries to the list, they always observe the
1509        // correct lower active VTL. Otherwise they might choose not to send this
1510        // VP a wake, leading to a stall until this VP happens to exit to VTL 2 again.
1511        //
1512        // This does technically leave open a small window for potential spurious
1513        // wakes, but that's preferable, and will cause no problems besides a
1514        // small amount of time waste.
1515        self.shared.active_vtl[self.vp_index().index() as usize]
1516            .store(next_vtl as u8, Ordering::SeqCst);
1517
1518        self.do_tlb_flush(next_vtl);
1519        self.unlock_tlb_lock(Vtl::Vtl2);
1520        let tlb_halt = self.should_halt_for_tlb_unlock(next_vtl);
1521
1522        // If we are halted in the kernel due to hlt or idle and we receive an interrupt,
1523        // we'd like to unhalt, inject the interrupt, and resume vtl0 without returning
1524        // to user mode. To enable this, the kernel must know why we are halted.
1525        let activity = self.backing.cvm.lapics[next_vtl].activity;
1526        let kernel_known_state =
1527            matches!(activity, MpState::Running | MpState::Halted | MpState::Idle);
1528        let halted_other = tlb_halt || !kernel_known_state;
1529
1530        self.runner
1531            .set_halted(activity != MpState::Running || tlb_halt);
1532
1533        // Turn on kernel interrupt handling if possible. This will cause the
1534        // kernel to handle some exits internally, without returning to user
1535        // mode, to improve performance.
1536        //
1537        // Do not do this if there is a pending interruption, since we need to
1538        // run code on the next exit to clear it. If we miss this opportunity,
1539        // we will probably double-inject the interruption, wreaking havoc.
1540        //
1541        // Also do not do this if there is a pending TLB flush, since we need to
1542        // run code on the next exit to clear it. If we miss this opportunity,
1543        // we could double-inject the TLB flush unnecessarily.
1544        let offload_enabled = self.backing.cvm.lapics[next_vtl].lapic.can_offload_irr()
1545            && !self.backing.vtls[next_vtl].interruption_information.valid()
1546            && self.backing.vtls[next_vtl]
1547                .private_regs
1548                .vp_entry_flags
1549                .invd_translations()
1550                == 0;
1551        let x2apic_enabled = self.backing.cvm.lapics[next_vtl].lapic.x2apic_enabled();
1552
1553        let offload_flags = hcl_intr_offload_flags::new()
1554            .with_offload_intr_inject(offload_enabled)
1555            .with_offload_x2apic(offload_enabled && x2apic_enabled)
1556            .with_halted_other(halted_other)
1557            .with_halted_hlt(activity == MpState::Halted)
1558            .with_halted_idle(activity == MpState::Idle);
1559
1560        *self.runner.offload_flags_mut() = offload_flags;
1561
1562        self.runner
1563            .write_private_regs(&self.backing.vtls[next_vtl].private_regs);
1564
1565        let has_intercept = self
1566            .runner
1567            .run()
1568            .map_err(|e| dev.fatal_error(TdxRunVpError(e).into()))?;
1569
1570        // TLB flushes can only target lower VTLs, so it is fine to use a relaxed
1571        // ordering here. The worst that can happen is some spurious wakes, due
1572        // to another VP observing that this VP is still in a lower VTL.
1573        self.shared.active_vtl[self.vp_index().index() as usize].store(2, Ordering::Relaxed);
1574
1575        let entered_from_vtl = next_vtl;
1576        self.runner
1577            .read_private_regs(&mut self.backing.vtls[entered_from_vtl].private_regs);
1578
1579        // Kernel offload may have set or cleared the halt/idle states
1580        if offload_enabled && kernel_known_state {
1581            let offload_flags = self.runner.offload_flags_mut();
1582
1583            self.backing.cvm.lapics[entered_from_vtl].activity =
1584                match (offload_flags.halted_hlt(), offload_flags.halted_idle()) {
1585                    (false, false) => MpState::Running,
1586                    (true, false) => MpState::Halted,
1587                    (false, true) => MpState::Idle,
1588                    (true, true) => {
1589                        tracelimit::warn_ratelimited!(
1590                            CVM_ALLOWED,
1591                            "Kernel indicates VP is both halted and idle!"
1592                        );
1593                        activity
1594                    }
1595                };
1596        }
1597
1598        if !has_intercept {
1599            return Ok(());
1600        }
1601
1602        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1603
1604        // Result codes at or above PENDING_INTERRUPT indicate the L2 was never entered.
1605        if exit_info.code().tdx_exit() >= TdCallResultCode::PENDING_INTERRUPT {
1606            self.backing.vtls[entered_from_vtl]
1607                .enter_stats
1608                .pending_intr
1609                .increment();
1610            return Ok(());
1611        }
1612
1613        // Since the L2 was entered we can clear any TLB flush requests
1614        self.backing.vtls[entered_from_vtl]
1615            .private_regs
1616            .vp_entry_flags
1617            .set_invd_translations(0);
1618
1619        // The L2 was entered, so process the exit.
1620        let stat = match exit_info.code().tdx_exit() {
1621            TdCallResultCode::SUCCESS => {
1622                &mut self.backing.vtls[entered_from_vtl].enter_stats.success
1623            }
1624            TdCallResultCode::L2_EXIT_HOST_ROUTED_ASYNC => {
1625                &mut self.backing.vtls[entered_from_vtl]
1626                    .enter_stats
1627                    .host_routed_async
1628            }
1629            TdCallResultCode::L2_EXIT_PENDING_INTERRUPT => {
1630                &mut self.backing.vtls[entered_from_vtl]
1631                    .enter_stats
1632                    .l2_exit_pending_intr
1633            }
1634            TdCallResultCode::L2_EXIT_HOST_ROUTED_TDVMCALL => {
1635                // This is expected, and means that the hypervisor completed a
1636                // TD.VMCALL from the L2 and has requested to resume the L2 to
1637                // the L1.
1638                //
1639                // There is nothing to do here.
1640                assert_eq!(
1641                    exit_info.code().vmx_exit(),
1642                    VmxExit::new().with_basic_reason(VmxExitBasic::TDCALL)
1643                );
1644                &mut self.backing.vtls[entered_from_vtl]
1645                    .enter_stats
1646                    .host_routed_td_vmcall
1647            }
1648            _ => panic!("unexpected tdx exit code {:?}", exit_info.code()),
1649        };
1650
1651        stat.increment();
1652        self.handle_vmx_exit(dev, entered_from_vtl).await?;
1653        Ok(())
1654    }
1655
1656    async fn handle_vmx_exit(
1657        &mut self,
1658        dev: &impl CpuIo,
1659        intercepted_vtl: GuestVtl,
1660    ) -> Result<(), VpHaltReason> {
1661        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1662
1663        // First, check that the VM entry was even successful.
1664        let vmx_exit = exit_info.code().vmx_exit();
1665        if vmx_exit.vm_enter_failed() {
1666            return Err(self.handle_vm_enter_failed(dev, intercepted_vtl, vmx_exit));
1667        }
1668
1669        let next_interruption = exit_info.idt_vectoring_info();
1670
1671        // Acknowledge the APIC interrupt/NMI if it was delivered.
1672        if self.backing.vtls[intercepted_vtl]
1673            .interruption_information
1674            .valid()
1675            && (!next_interruption.valid()
1676                || self.backing.vtls[intercepted_vtl]
1677                    .interruption_information
1678                    .interruption_type()
1679                    != next_interruption.interruption_type())
1680        {
1681            match self.backing.vtls[intercepted_vtl]
1682                .interruption_information
1683                .interruption_type()
1684            {
1685                INTERRUPT_TYPE_EXTERNAL
1686                    if !self.backing.cvm.lapics[intercepted_vtl]
1687                        .lapic
1688                        .is_offloaded() =>
1689                {
1690                    // This must be a pending APIC interrupt. Acknowledge it.
1691                    tracing::trace!(
1692                        vector = self.backing.vtls[intercepted_vtl]
1693                            .interruption_information
1694                            .vector(),
1695                        "acknowledging interrupt"
1696                    );
1697                    self.backing.cvm.lapics[intercepted_vtl]
1698                        .lapic
1699                        .acknowledge_interrupt(
1700                            self.backing.vtls[intercepted_vtl]
1701                                .interruption_information
1702                                .vector(),
1703                        );
1704                }
1705                INTERRUPT_TYPE_NMI => {
1706                    // This must be a pending NMI.
1707                    tracing::debug!("acknowledging NMI");
1708                    self.backing.cvm.lapics[intercepted_vtl].nmi_pending = false;
1709                }
1710                _ => {}
1711            }
1712        }
1713
1714        if self.backing.cvm.lapics[intercepted_vtl]
1715            .lapic
1716            .is_offloaded()
1717        {
1718            // It's possible with vAPIC that we take an exit in the window where
1719            // hardware has moved a bit from IRR to ISR, but has not injected
1720            // the interrupt into the guest. In this case, we need to track that
1721            // we must inject the interrupt before we return to the guest,
1722            // otherwise the interrupt will be lost and the guest left in a bad
1723            // state.
1724            //
1725            // TODO TDX: Unclear what kind of exits these would be, but they
1726            // should be spurious EPT exits. Can we validate or assert that
1727            // somehow? If we were to somehow call some other path which would
1728            // set interruption_information before we inject this one, we would
1729            // lose this interrupt.
1730            if next_interruption.valid() {
1731                tracing::debug!(
1732                    ?next_interruption,
1733                    vp_index = self.vp_index().index(),
1734                    "exit requires reinjecting interrupt"
1735                );
1736                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1737                self.backing.vtls[intercepted_vtl].exception_error_code =
1738                    exit_info.idt_vectoring_error_code();
1739                self.backing.vtls[intercepted_vtl]
1740                    .exit_stats
1741                    .needs_interrupt_reinject
1742                    .increment();
1743            } else {
1744                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1745            }
1746        } else {
1747            // Ignore (and later recalculate) the next interruption if it is an
1748            // external interrupt or NMI, since it may change if the APIC state
1749            // changes.
1750            if next_interruption.valid()
1751                && !matches!(
1752                    next_interruption.interruption_type(),
1753                    INTERRUPT_TYPE_EXTERNAL | INTERRUPT_TYPE_NMI
1754                )
1755            {
1756                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1757                self.backing.vtls[intercepted_vtl].exception_error_code =
1758                    exit_info.idt_vectoring_error_code();
1759            } else {
1760                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1761            }
1762        }
1763
1764        let mut breakpoint_debug_exception = false;
1765        let stat = match vmx_exit.basic_reason() {
1766            VmxExitBasic::IO_INSTRUCTION => {
1767                let io_qual = ExitQualificationIo::from(exit_info.qualification() as u32);
1768
1769                let len = match io_qual.access_size() {
1770                    IO_SIZE_8_BIT => 1,
1771                    IO_SIZE_16_BIT => 2,
1772                    IO_SIZE_32_BIT => 4,
1773                    _ => panic!(
1774                        "tdx module returned invalid io instr size {}",
1775                        io_qual.access_size()
1776                    ),
1777                };
1778
1779                let port_access_protected = self.cvm_try_protect_io_port_access(
1780                    intercepted_vtl,
1781                    io_qual.port(),
1782                    io_qual.is_in(),
1783                    len,
1784                    io_qual.is_string(),
1785                    io_qual.rep_prefix(),
1786                );
1787
1788                if !port_access_protected {
1789                    if io_qual.is_string() || io_qual.rep_prefix() {
1790                        // TODO GUEST VSM: consider changing the emulation path
1791                        // to also check for io port installation, mainly for
1792                        // handling rep instructions.
1793
1794                        self.emulate(
1795                            dev,
1796                            self.backing.vtls[intercepted_vtl]
1797                                .interruption_information
1798                                .valid(),
1799                            intercepted_vtl,
1800                            TdxEmulationCache::default(),
1801                        )
1802                        .await?;
1803                    } else {
1804                        let mut rax = self.runner.tdx_enter_guest_gps()[TdxGp::RAX];
1805                        emulate_io(
1806                            self.inner.vp_info.base.vp_index,
1807                            !io_qual.is_in(),
1808                            io_qual.port(),
1809                            &mut rax,
1810                            len,
1811                            dev,
1812                        )
1813                        .await;
1814                        self.runner.tdx_enter_guest_gps_mut()[TdxGp::RAX] = rax;
1815
1816                        self.advance_to_next_instruction(intercepted_vtl);
1817                    }
1818                }
1819
1820                &mut self.backing.vtls[intercepted_vtl].exit_stats.io
1821            }
1822            VmxExitBasic::MSR_READ => {
1823                let msr = self.runner.tdx_enter_guest_gps()[TdxGp::RCX] as u32;
1824
1825                let result = self.backing.cvm.lapics[intercepted_vtl]
1826                    .lapic
1827                    .access(&mut TdxApicClient {
1828                        partition: self.partition,
1829                        vmtime: &self.vmtime,
1830                        apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1831                        dev,
1832                        vtl: intercepted_vtl,
1833                    })
1834                    .msr_read(msr)
1835                    .or_else_if_unknown(|| self.read_msr_cvm(msr, intercepted_vtl))
1836                    .or_else_if_unknown(|| self.read_msr_tdx(msr, intercepted_vtl));
1837
1838                let value = match result {
1839                    Ok(v) => Some(v),
1840                    Err(MsrError::Unknown) => {
1841                        tracelimit::warn_ratelimited!(CVM_ALLOWED, msr, "unknown tdx vm msr read");
1842                        Some(0)
1843                    }
1844                    Err(MsrError::InvalidAccess) => None,
1845                };
1846
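                // RDMSR returns its result split across EDX:EAX (high:low 32 bits).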
1847                let inject_gp = if let Some(value) = value {
1848                    let gps = self.runner.tdx_enter_guest_gps_mut();
1849                    gps[TdxGp::RAX] = (value as u32).into();
1850                    gps[TdxGp::RDX] = ((value >> 32) as u32).into();
1851                    false
1852                } else {
1853                    true
1854                };
1855
1856                if inject_gp {
1857                    self.inject_gpf(intercepted_vtl);
1858                } else {
1859                    self.advance_to_next_instruction(intercepted_vtl);
1860                }
1861                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_read
1862            }
1863            VmxExitBasic::MSR_WRITE => {
1864                let gps = self.runner.tdx_enter_guest_gps();
1865                let msr = gps[TdxGp::RCX] as u32;
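                // WRMSR supplies the value in EDX:EAX (high:low 32 bits).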
1866                let value =
1867                    (gps[TdxGp::RAX] as u32 as u64) | ((gps[TdxGp::RDX] as u32 as u64) << 32);
1868
1869                if !self.cvm_try_protect_msr_write(intercepted_vtl, msr) {
1870                    let result = self.backing.cvm.lapics[intercepted_vtl]
1871                        .lapic
1872                        .access(&mut TdxApicClient {
1873                            partition: self.partition,
1874                            vmtime: &self.vmtime,
1875                            apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
1876                            dev,
1877                            vtl: intercepted_vtl,
1878                        })
1879                        .msr_write(msr, value)
1880                        .or_else_if_unknown(|| self.write_msr_cvm(msr, value, intercepted_vtl))
1881                        .or_else_if_unknown(|| self.write_msr_tdx(msr, value, intercepted_vtl))
1882                        .or_else_if_unknown(|| {
1883                            // Sanity check
1884                            if MSR_ALLOWED_READ_WRITE.contains(&msr) {
1885                                unreachable!("intercepted a write to MSR {msr}, configured for passthrough by default, that wasn't registered for intercepts by a higher VTL");
1886                            }
1887                            Err(MsrError::Unknown)
1888                        });
1889
1890                    let inject_gp = match result {
1891                        Ok(()) => false,
1892                        Err(MsrError::Unknown) => {
1893                            tracelimit::warn_ratelimited!(
1894                                CVM_ALLOWED,
1895                                msr,
1896                                "unknown tdx vm msr write"
1897                            );
1898                            tracelimit::warn_ratelimited!(
1899                                CVM_CONFIDENTIAL,
1900                                value,
1901                                "unknown tdx vm msr write"
1902                            );
1903                            false
1904                        }
1905                        Err(MsrError::InvalidAccess) => true,
1906                    };
1907
1908                    if inject_gp {
1909                        self.inject_gpf(intercepted_vtl);
1910                    } else {
1911                        self.advance_to_next_instruction(intercepted_vtl);
1912                    }
1913                }
1914                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_write
1915            }
1916            VmxExitBasic::CPUID => {
1917                let gps = self.runner.tdx_enter_guest_gps();
1918                let leaf = gps[TdxGp::RAX] as u32;
1919                let subleaf = gps[TdxGp::RCX] as u32;
1920                let [eax, ebx, ecx, edx] = self.cvm_cpuid_result(intercepted_vtl, leaf, subleaf);
1921                let gps = self.runner.tdx_enter_guest_gps_mut();
1922                gps[TdxGp::RAX] = eax.into();
1923                gps[TdxGp::RBX] = ebx.into();
1924                gps[TdxGp::RCX] = ecx.into();
1925                gps[TdxGp::RDX] = edx.into();
1926                self.advance_to_next_instruction(intercepted_vtl);
1927                &mut self.backing.vtls[intercepted_vtl].exit_stats.cpuid
1928            }
1929            VmxExitBasic::VMCALL_INSTRUCTION => {
1930                if exit_info.cpl() != 0 {
1931                    self.inject_gpf(intercepted_vtl);
1932                } else {
1933                    let is_64bit = self.long_mode(intercepted_vtl);
1934                    let guest_memory = &self.partition.gm[intercepted_vtl];
1935                    let handler = UhHypercallHandler {
1936                        trusted: !self.cvm_partition().hide_isolation,
1937                        vp: &mut *self,
1938                        bus: dev,
1939                        intercepted_vtl,
1940                    };
1941
1942                    UhHypercallHandler::TDX_DISPATCHER.dispatch(
1943                        guest_memory,
1944                        hv1_hypercall::X64RegisterIo::new(handler, is_64bit),
1945                    );
1946                }
1947                &mut self.backing.vtls[intercepted_vtl].exit_stats.vmcall
1948            }
1949            VmxExitBasic::HLT_INSTRUCTION => {
1950                self.backing.cvm.lapics[intercepted_vtl].activity = MpState::Halted;
1951                self.clear_interrupt_shadow(intercepted_vtl);
1952                self.advance_to_next_instruction(intercepted_vtl);
1953                &mut self.backing.vtls[intercepted_vtl].exit_stats.hlt
1954            }
1955            VmxExitBasic::CR_ACCESS => {
1956                let qual = CrAccessQualification::from(exit_info.qualification());
1957                let cr;
1958                let value;
1959                match qual.access_type() {
1960                    CR_ACCESS_TYPE_MOV_TO_CR => {
1961                        cr = qual.cr();
1962                        value = self.runner.tdx_enter_guest_gps()[qual.gp_register() as usize];
1963                    }
1964                    CR_ACCESS_TYPE_LMSW => {
1965                        cr = 0;
1966                        let cr0 = self.backing.vtls[intercepted_vtl].cr0.read(&self.runner);
1967                        // LMSW updates the low four bits only.
1968                        value = (qual.lmsw_source_data() as u64 & 0xf) | (cr0 & !0xf);
1969                    }
1970                    access_type => unreachable!("not registered for cr access type {access_type}"),
1971                }
1972
1973                let cr = match cr {
1974                    0 => HvX64RegisterName::Cr0,
1975                    4 => HvX64RegisterName::Cr4,
1976                    _ => unreachable!("not registered for cr{cr} accesses"),
1977                };
1978
1979                if !self.cvm_try_protect_secure_register_write(intercepted_vtl, cr, value) {
1980                    let r = match cr {
1981                        HvX64RegisterName::Cr0 => self.backing.vtls[intercepted_vtl]
1982                            .cr0
1983                            .write(value, &mut self.runner),
1984                        HvX64RegisterName::Cr4 => self.backing.vtls[intercepted_vtl]
1985                            .cr4
1986                            .write(value, &mut self.runner),
1987                        _ => unreachable!(),
1988                    };
1989                    if r.is_ok() {
1990                        self.update_execution_mode(intercepted_vtl);
1991                        self.advance_to_next_instruction(intercepted_vtl);
1992                    } else {
1993                        tracelimit::warn_ratelimited!(
1994                            CVM_ALLOWED,
1995                            ?cr,
1996                            value,
1997                            "failed to write cr"
1998                        );
1999                        self.inject_gpf(intercepted_vtl);
2000                    }
2001                }
2002                &mut self.backing.vtls[intercepted_vtl].exit_stats.cr_access
2003            }
2004            VmxExitBasic::XSETBV => {
2005                let gps = self.runner.tdx_enter_guest_gps();
2006                if let Some(value) =
2007                    hardware_cvm::validate_xsetbv_exit(hardware_cvm::XsetbvExitInput {
2008                        rax: gps[TdxGp::RAX],
2009                        rcx: gps[TdxGp::RCX],
2010                        rdx: gps[TdxGp::RDX],
2011                        cr4: self.backing.vtls[intercepted_vtl].cr4.read(&self.runner),
2012                        cpl: exit_info.cpl(),
2013                    })
2014                {
2015                    if !self.cvm_try_protect_secure_register_write(
2016                        intercepted_vtl,
2017                        HvX64RegisterName::Xfem,
2018                        value,
2019                    ) {
2020                        self.runner
2021                            .set_vp_register(intercepted_vtl, HvX64RegisterName::Xfem, value.into())
2022                            .unwrap();
2023                        self.advance_to_next_instruction(intercepted_vtl);
2024                    }
2025                } else {
2026                    self.inject_gpf(intercepted_vtl);
2027                }
2028                &mut self.backing.vtls[intercepted_vtl].exit_stats.xsetbv
2029            }
2030            VmxExitBasic::WBINVD_INSTRUCTION => {
2031                // Ask the kernel to flush the cache before issuing VP.ENTER.
2032                let no_invalidate = exit_info.qualification() != 0;
2033                if no_invalidate {
2034                    self.runner.tdx_vp_state_flags_mut().set_wbnoinvd(true);
2035                } else {
2036                    self.runner.tdx_vp_state_flags_mut().set_wbinvd(true);
2037                }
2038
2039                self.advance_to_next_instruction(intercepted_vtl);
2040                &mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
2041            }
2042            VmxExitBasic::EPT_VIOLATION => {
2043                let gpa = exit_info.gpa().expect("is EPT exit");
2044                let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2045                // If this was an EPT violation while handling an iret, and
2046                // that iret cleared the NMI blocking state, restore it.
2047                if !next_interruption.valid() && ept_info.nmi_unmasking_due_to_iret() {
2048                    let mask = Interruptibility::new().with_blocked_by_nmi(true);
2049                    let value = Interruptibility::new().with_blocked_by_nmi(true);
2050                    let old_interruptibility: Interruptibility = self
2051                        .runner
2052                        .write_vmcs32(
2053                            intercepted_vtl,
2054                            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2055                            mask.into(),
2056                            value.into(),
2057                        )
2058                        .into();
2059                    assert!(!old_interruptibility.blocked_by_nmi());
2060                } else {
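                    // Bit 1 of the EPT access mask indicates a write access.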
2061                    let is_write = ept_info.access_mask() & 0b10 != 0;
2062                    if self.check_mem_fault(intercepted_vtl, gpa, is_write, ept_info) {
2063                        self.emulate(
2064                            dev,
2065                            self.backing.vtls[intercepted_vtl]
2066                                .interruption_information
2067                                .valid(),
2068                            intercepted_vtl,
2069                            TdxEmulationCache::default(),
2070                        )
2071                        .await?;
2072                    }
2073                }
2074
2075                &mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
2076            }
2077            VmxExitBasic::TPR_BELOW_THRESHOLD => {
2078                // Loop around to reevaluate the APIC.
2079                &mut self.backing.vtls[intercepted_vtl]
2080                    .exit_stats
2081                    .tpr_below_threshold
2082            }
2083            VmxExitBasic::INTERRUPT_WINDOW => {
2084                // Loop around to reevaluate the APIC.
2085                &mut self.backing.vtls[intercepted_vtl]
2086                    .exit_stats
2087                    .interrupt_window
2088            }
2089            VmxExitBasic::NMI_WINDOW => {
2090                // Loop around to reevaluate pending NMIs.
2091                &mut self.backing.vtls[intercepted_vtl].exit_stats.nmi_window
2092            }
2093            VmxExitBasic::HW_INTERRUPT => {
2094                if cfg!(feature = "gdb") {
2095                    // Check if the interrupt was triggered by a hardware breakpoint.
2096                    let debug_regs = self
2097                        .access_state(intercepted_vtl.into())
2098                        .debug_regs()
2099                        .expect("register query should not fail");
2100                    // The lowest four bits of DR6 indicate which of the
2101                    // four breakpoints triggered.
2102                    breakpoint_debug_exception = debug_regs.dr6.trailing_zeros() < 4;
2103                }
2104                &mut self.backing.vtls[intercepted_vtl].exit_stats.hw_interrupt
2105            }
2106            VmxExitBasic::SMI_INTR => &mut self.backing.vtls[intercepted_vtl].exit_stats.smi_intr,
2107            VmxExitBasic::PAUSE_INSTRUCTION => {
2108                &mut self.backing.vtls[intercepted_vtl].exit_stats.pause
2109            }
2110            VmxExitBasic::TDCALL => {
2111                // If the proxy synic is local, then the host did not get this
2112                // instruction, and we need to handle it.
2113                if self.backing.untrusted_synic.is_some() {
2114                    assert_eq!(intercepted_vtl, GuestVtl::Vtl0);
2115                    self.handle_tdvmcall(dev, intercepted_vtl);
2116                } else if self.cvm_partition().hide_isolation {
2117                    // TDCALL is not valid when hiding isolation. Inject a #UD.
2118                    self.backing.vtls[intercepted_vtl].interruption_information =
2119                        InterruptionInformation::new()
2120                            .with_valid(true)
2121                            .with_vector(x86defs::Exception::INVALID_OPCODE.0)
2122                            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2123                }
2124                &mut self.backing.vtls[intercepted_vtl].exit_stats.tdcall
2125            }
2126            VmxExitBasic::EXCEPTION => {
2127                tracing::trace!(
2128                    "Caught Exception: {:?}",
2129                    exit_info._exit_interruption_info()
2130                );
2131                if cfg!(feature = "gdb") {
2132                    breakpoint_debug_exception = true;
2133                }
2134                &mut self.backing.vtls[intercepted_vtl].exit_stats.exception
2135            }
2136            VmxExitBasic::TRIPLE_FAULT => {
2137                return Err(VpHaltReason::TripleFault {
2138                    vtl: intercepted_vtl.into(),
2139                });
2140            }
2141            VmxExitBasic::GDTR_OR_IDTR => {
2142                let info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
2143                tracing::trace!("Intercepted GDT or IDT instruction: {:?}", info);
2144                let reg = match info.instruction() {
2145                    GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Lidt => {
2146                        HvX64RegisterName::Idtr
2147                    }
2148                    GdtrOrIdtrInstruction::Sgdt | GdtrOrIdtrInstruction::Lgdt => {
2149                        HvX64RegisterName::Gdtr
2150                    }
2151                };
2152                // We only support forwarding intercepts for descriptor table loads today.
2153                if (info.instruction().is_load()
2154                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2155                    || !info.instruction().is_load()
2156                {
2157                    self.emulate_gdtr_or_idtr(intercepted_vtl, dev).await?;
2158                }
2159                &mut self.backing.vtls[intercepted_vtl]
2160                    .exit_stats
2161                    .descriptor_table
2162            }
2163            VmxExitBasic::LDTR_OR_TR => {
2164                let info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
2165                tracing::trace!("Intercepted LDT or TR instruction: {:?}", info);
2166                let reg = match info.instruction() {
2167                    LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Lldt => {
2168                        HvX64RegisterName::Ldtr
2169                    }
2170                    LdtrOrTrInstruction::Str | LdtrOrTrInstruction::Ltr => HvX64RegisterName::Tr,
2171                };
2172                // We only support forwarding intercepts for descriptor table loads today.
2173                if (info.instruction().is_load()
2174                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2175                    || !info.instruction().is_load()
2176                {
2177                    self.emulate_ldtr_or_tr(intercepted_vtl, dev).await?;
2178                }
2179                &mut self.backing.vtls[intercepted_vtl]
2180                    .exit_stats
2181                    .descriptor_table
2182            }
2183            _ => {
2184                return Err(dev.fatal_error(UnknownVmxExit(exit_info.code().vmx_exit()).into()));
2185            }
2186        };
2187        stat.increment();
2188
2189        // Breakpoint exceptions may return a non-fatal error.
2190        // We dispatch here to correctly increment the counter.
2191        if cfg!(feature = "gdb") && breakpoint_debug_exception {
2192            self.handle_debug_exception(dev, intercepted_vtl)?;
2193        }
2194
2195        Ok(())
2196    }
2197
2198    /// Trace processor state for debugging purposes.
2199    fn trace_processor_state(&self, vtl: GuestVtl) {
2200        let raw_exit = self.runner.tdx_vp_enter_exit_info();
2201        tracing::error!(CVM_CONFIDENTIAL, ?raw_exit, "raw tdx vp enter exit info");
2202
2203        let gprs = self.runner.tdx_enter_guest_gps();
2204        tracing::error!(CVM_CONFIDENTIAL, ?gprs, "guest gpr list");
2205
2206        let TdxPrivateRegs {
2207            rflags,
2208            rip,
2209            rsp,
2210            ssp,
2211            rvi,
2212            svi,
2213            msr_kernel_gs_base,
2214            msr_star,
2215            msr_lstar,
2216            msr_sfmask,
2217            msr_xss,
2218            msr_tsc_aux,
2219            vp_entry_flags,
2220        } = self.backing.vtls[vtl].private_regs;
2221        tracing::error!(
2222            CVM_CONFIDENTIAL,
2223            rflags,
2224            rip,
2225            rsp,
2226            ssp,
2227            rvi,
2228            svi,
2229            msr_kernel_gs_base,
2230            msr_star,
2231            msr_lstar,
2232            msr_sfmask,
2233            msr_xss,
2234            msr_tsc_aux,
2235            ?vp_entry_flags,
2236            "private registers"
2237        );
2238
2239        let physical_cr0 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
2240        let shadow_cr0 = self
2241            .runner
2242            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_READ_SHADOW);
2243        let cr0_guest_host_mask: u64 = self
2244            .runner
2245            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK);
2246        tracing::error!(
2247            CVM_CONFIDENTIAL,
2248            physical_cr0,
2249            shadow_cr0,
2250            cr0_guest_host_mask,
2251            "cr0 values"
2252        );
2253
2254        let physical_cr4 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
2255        let shadow_cr4 = self
2256            .runner
2257            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW);
2258        let cr4_guest_host_mask = self
2259            .runner
2260            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK);
2261        tracing::error!(
2262            CVM_CONFIDENTIAL,
2263            physical_cr4,
2264            shadow_cr4,
2265            cr4_guest_host_mask,
2266            "cr4 values"
2267        );
2268
2269        let cr3 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
2270        tracing::error!(CVM_CONFIDENTIAL, cr3, "cr3");
2271
2272        let cached_efer = self.backing.vtls[vtl].efer;
2273        let vmcs_efer = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER);
2274        let entry_controls = self
2275            .runner
2276            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_CONTROLS);
2277        tracing::error!(CVM_CONFIDENTIAL, cached_efer, vmcs_efer, "efer");
2278        tracing::error!(CVM_CONFIDENTIAL, entry_controls, "entry controls");
2279
2280        let cs = self.read_segment(vtl, TdxSegmentReg::Cs);
2281        let ds = self.read_segment(vtl, TdxSegmentReg::Ds);
2282        let es = self.read_segment(vtl, TdxSegmentReg::Es);
2283        let fs = self.read_segment(vtl, TdxSegmentReg::Fs);
2284        let gs = self.read_segment(vtl, TdxSegmentReg::Gs);
2285        let ss = self.read_segment(vtl, TdxSegmentReg::Ss);
2286        let tr = self.read_segment(vtl, TdxSegmentReg::Tr);
2287        let ldtr = self.read_segment(vtl, TdxSegmentReg::Ldtr);
2288
2289        tracing::error!(
2290            CVM_CONFIDENTIAL,
2291            ?cs,
2292            ?ds,
2293            ?es,
2294            ?fs,
2295            ?gs,
2296            ?ss,
2297            ?tr,
2298            ?ldtr,
2299            "segment values"
2300        );
2301
2302        let exception_bitmap = self
2303            .runner
2304            .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
2305        tracing::error!(CVM_CONFIDENTIAL, exception_bitmap, "exception bitmap");
2306
2307        let cached_processor_controls = self.backing.vtls[vtl].processor_controls;
2308        let vmcs_processor_controls = ProcessorControls::from(
2309            self.runner
2310                .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS),
2311        );
2312        let vmcs_secondary_processor_controls = SecondaryProcessorControls::from(
2313            self.runner
2314                .read_vmcs32(vtl, VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS),
2315        );
2316        tracing::error!(
2317            CVM_CONFIDENTIAL,
2318            ?cached_processor_controls,
2319            ?vmcs_processor_controls,
2320            ?vmcs_secondary_processor_controls,
2321            "processor controls"
2322        );
2323
2324        if cached_processor_controls != vmcs_processor_controls {
2325            tracing::error!(CVM_ALLOWED, "BUGBUG: processor controls mismatch");
2326        }
2327
2328        let cached_tpr_threshold = self.backing.vtls[vtl].tpr_threshold;
2329        let vmcs_tpr_threshold = self
2330            .runner
2331            .read_vmcs32(vtl, VmcsField::VMX_VMCS_TPR_THRESHOLD);
2332        tracing::error!(
2333            CVM_CONFIDENTIAL,
2334            cached_tpr_threshold,
2335            vmcs_tpr_threshold,
2336            "tpr threshold"
2337        );
2338
2339        let cached_eoi_exit_bitmap = self.backing.eoi_exit_bitmap;
2340        let vmcs_eoi_exit_bitmap = {
2341            let fields = [
2342                VmcsField::VMX_VMCS_EOI_EXIT_0,
2343                VmcsField::VMX_VMCS_EOI_EXIT_1,
2344                VmcsField::VMX_VMCS_EOI_EXIT_2,
2345                VmcsField::VMX_VMCS_EOI_EXIT_3,
2346            ];
2347            fields
2348                .iter()
2349                .map(|field| self.runner.read_vmcs64(vtl, *field))
2350                .collect::<Vec<_>>()
2351        };
2352        tracing::error!(
2353            CVM_CONFIDENTIAL,
2354            ?cached_eoi_exit_bitmap,
2355            ?vmcs_eoi_exit_bitmap,
2356            "eoi exit bitmap"
2357        );
2358
2359        let cached_interrupt_information = self.backing.vtls[vtl].interruption_information;
2360        let cached_interruption_set = self.backing.vtls[vtl].interruption_set;
2361        let vmcs_interrupt_information = self
2362            .runner
2363            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO);
2364        let vmcs_entry_exception_code = self
2365            .runner
2366            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE);
2367        tracing::error!(
2368            CVM_CONFIDENTIAL,
2369            ?cached_interrupt_information,
2370            cached_interruption_set,
2371            vmcs_interrupt_information,
2372            vmcs_entry_exception_code,
2373            "interrupt information"
2374        );
2375
2376        let guest_interruptibility = self
2377            .runner
2378            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY);
2379        tracing::error!(
2380            CVM_CONFIDENTIAL,
2381            guest_interruptibility,
2382            "guest interruptibility"
2383        );
2384
2385        let vmcs_sysenter_cs = self
2386            .runner
2387            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR);
2388        let vmcs_sysenter_esp = self
2389            .runner
2390            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
2391        let vmcs_sysenter_eip = self
2392            .runner
2393            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
2394        tracing::error!(
2395            CVM_CONFIDENTIAL,
2396            vmcs_sysenter_cs,
2397            vmcs_sysenter_esp,
2398            vmcs_sysenter_eip,
2399            "sysenter values"
2400        );
2401
2402        let vmcs_pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2403        tracing::error!(CVM_CONFIDENTIAL, vmcs_pat, "guest PAT");
2404    }
2405
2406    fn handle_vm_enter_failed(
2407        &self,
2408        dev: &impl CpuIo,
2409        vtl: GuestVtl,
2410        vmx_exit: VmxExit,
2411    ) -> VpHaltReason {
2412        assert!(vmx_exit.vm_enter_failed());
2413        match vmx_exit.basic_reason() {
2414            VmxExitBasic::BAD_GUEST_STATE => {
2415                // Log system register state for debugging why we were
2416                // unable to enter the guest. This is a VMM bug.
2417                tracing::error!(CVM_ALLOWED, "VP.ENTER failed with bad guest state");
2418                self.trace_processor_state(vtl);
2419
2420                dev.fatal_error(VmxBadGuestState.into())
2421            }
2422            _ => dev.fatal_error(UnknownVmxExit(vmx_exit).into()),
2423        }
2424    }
2425
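    /// Advances RIP past the instruction that caused the current exit, using the
    /// instruction length reported in the TDX exit information.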
2426    fn advance_to_next_instruction(&mut self, vtl: GuestVtl) {
2427        let instr_info = TdxExit(self.runner.tdx_vp_enter_exit_info()).instr_info();
2428        let rip = &mut self.backing.vtls[vtl].private_regs.rip;
2429        *rip = rip.wrapping_add(instr_info.length().into());
2430    }
2431
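    /// Clears the STI-induced interrupt shadow in the guest interruptibility state.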
2432    fn clear_interrupt_shadow(&mut self, vtl: GuestVtl) {
2433        let mask = Interruptibility::new().with_blocked_by_sti(true);
2434        let value = Interruptibility::new().with_blocked_by_sti(false);
2435        self.runner.write_vmcs32(
2436            vtl,
2437            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2438            mask.into(),
2439            value.into(),
2440        );
2441    }
2442
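    /// Queues injection of a general protection fault (#GP) with a zero error code.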
2443    fn inject_gpf(&mut self, vtl: GuestVtl) {
2444        self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2445            .with_valid(true)
2446            .with_vector(x86defs::Exception::GENERAL_PROTECTION_FAULT.0)
2447            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
2448            .with_deliver_error_code(true);
2449        self.backing.vtls[vtl].exception_error_code = 0;
2450    }
2451
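    /// Handles a TDVMCALL from the guest. This is only reached when the proxy
    /// (untrusted) synic is emulated locally, so the host never saw the TDCALL.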
2452    fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
2453        let regs = self.runner.tdx_enter_guest_gps();
2454        if regs[TdxGp::R10] == 0 {
2455            // Architectural VMCALL.
2456            let result = match VmxExitBasic(regs[TdxGp::R11] as u16) {
2457                VmxExitBasic::MSR_WRITE => {
2458                    let msr = regs[TdxGp::R12] as u32;
2459                    let value = regs[TdxGp::R13];
2460                    match self.write_tdvmcall_msr(msr, value, intercepted_vtl) {
2461                        Ok(()) => {
2462                            tracing::debug!(msr, value, "tdvmcall msr write");
2463                            TdVmCallR10Result::SUCCESS
2464                        }
2465                        Err(err) => {
2466                            tracelimit::warn_ratelimited!(
2467                                CVM_ALLOWED,
2468                                msr,
2469                                ?err,
2470                                "failed tdvmcall msr write"
2471                            );
2472                            tracelimit::warn_ratelimited!(
2473                                CVM_CONFIDENTIAL,
2474                                value,
2475                                "failed tdvmcall msr write"
2476                            );
2477                            TdVmCallR10Result::OPERAND_INVALID
2478                        }
2479                    }
2480                }
2481                VmxExitBasic::MSR_READ => {
2482                    let msr = regs[TdxGp::R12] as u32;
2483                    match self.read_tdvmcall_msr(msr, intercepted_vtl) {
2484                        Ok(value) => {
2485                            tracing::debug!(msr, value, "tdvmcall msr read");
2486                            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = value;
2487                            TdVmCallR10Result::SUCCESS
2488                        }
2489                        Err(err) => {
2490                            tracelimit::warn_ratelimited!(
2491                                CVM_ALLOWED,
2492                                msr,
2493                                ?err,
2494                                "failed tdvmcall msr read"
2495                            );
2496                            TdVmCallR10Result::OPERAND_INVALID
2497                        }
2498                    }
2499                }
2500                subfunction => {
2501                    tracelimit::warn_ratelimited!(
2502                        CVM_ALLOWED,
2503                        ?subfunction,
2504                        "architectural vmcall not supported"
2505                    );
2506                    TdVmCallR10Result::OPERAND_INVALID
2507                }
2508            };
2509            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = result.0;
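            // Advance RIP past the TDCALL instruction (4 bytes).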
2510            self.backing.vtls[intercepted_vtl].private_regs.rip = self.backing.vtls
2511                [intercepted_vtl]
2512                .private_regs
2513                .rip
2514                .wrapping_add(4);
2515        } else {
2516            // This hypercall is normally handled by the hypervisor, so the gpas
2517            // given by the guest should all be shared. The hypervisor allows
2518            // gpas to be set with or without the shared gpa boundary bit, which
2519            // untrusted_dma_memory correctly models. Note that some Linux
2520            // guests will issue hypercalls without the boundary bit set,
2521            // whereas UEFI will issue with the bit set.
2522            let guest_memory = &self.shared.cvm.shared_memory;
2523            let handler = UhHypercallHandler {
2524                vp: &mut *self,
2525                bus: dev,
2526                trusted: false,
2527                intercepted_vtl,
2528            };
2529
2530            UhHypercallHandler::TDCALL_DISPATCHER.dispatch(guest_memory, TdHypercall(handler));
2531        }
2532    }
2533
2534    fn read_tdvmcall_msr(&mut self, msr: u32, intercepted_vtl: GuestVtl) -> Result<u64, MsrError> {
2535        match msr {
2536            msr @ (hvdef::HV_X64_MSR_GUEST_OS_ID | hvdef::HV_X64_MSR_VP_INDEX) => {
2537                self.backing.cvm.hv[intercepted_vtl].msr_read(msr)
2538            }
2539            _ => self
2540                .backing
2541                .untrusted_synic
2542                .as_mut()
2543                .unwrap()
2544                .read_nontimer_msr(msr),
2545        }
2546    }
2547
2548    fn write_tdvmcall_msr(
2549        &mut self,
2550        msr: u32,
2551        value: u64,
2552        intercepted_vtl: GuestVtl,
2553    ) -> Result<(), MsrError> {
2554        match msr {
2555            hvdef::HV_X64_MSR_GUEST_OS_ID => {
2556                self.backing.cvm.hv[intercepted_vtl].msr_write_guest_os_id(value)
2557            }
2558            _ => {
2559                // If we get here we must have an untrusted synic, as otherwise
2560                // we wouldn't be handling the TDVMCALL that ends up here. Therefore
2561                // this is fine to unwrap.
2562                self.backing
2563                    .untrusted_synic
2564                    .as_mut()
2565                    .unwrap()
2566                    .write_nontimer_msr(
2567                        msr,
2568                        value,
2569                        &mut UntrustedSynicVtlProts(&self.partition.gm[GuestVtl::Vtl0]),
2570                    )?;
2571                // Propagate sint MSR writes to the hypervisor as well
2572                // so that the hypervisor can directly inject events.
2573                if matches!(msr, hvdef::HV_X64_MSR_SINT0..=hvdef::HV_X64_MSR_SINT15) {
2574                    if let Err(err) = self.runner.set_vp_register(
2575                        intercepted_vtl,
2576                        HvX64RegisterName(
2577                            HvX64RegisterName::Sint0.0 + (msr - hvdef::HV_X64_MSR_SINT0),
2578                        ),
2579                        value.into(),
2580                    ) {
2581                        tracelimit::warn_ratelimited!(
2582                            CVM_ALLOWED,
2583                            error = &err as &dyn std::error::Error,
2584                            "failed to set sint register"
2585                        );
2586                    }
2587                }
2588            }
2589        }
2590
2591        Ok(())
2592    }
2593
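    /// Handles MSR reads for the TDX backing, after the APIC and common CVM
    /// handlers have declined the MSR.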
2594    fn read_msr_tdx(&mut self, msr: u32, vtl: GuestVtl) -> Result<u64, MsrError> {
2595        // TODO TDX: port remaining tdx and common values
2596        //
2597        // TODO TDX: consider if this can be shared with SnpBacked's
2598        // implementation. For the most part other than Intel/TDX specific
2599        // registers, MSR handling should be the same.
2600
2601        match msr {
2602            // TODO TDX: LIFTED FROM WHP
2603            x86defs::X86X_IA32_MSR_PLATFORM_ID => {
2604                // Windows requires accessing this to boot. WHP
2605                // used to pass this through to the hardware,
2606                // but this regressed. Zero seems to work fine
2607                // for Windows.
2608                //
2609                // TODO: Pass through the host value if it can
2610                //       be retrieved.
2611                Ok(0)
2612            }
2613
2614            x86defs::X86X_MSR_MTRR_CAP => {
2615                // Advertise the absence of MTRR capabilities, but include the availability of write
2616                // combining.
2617                Ok(0x400)
2618            }
2619            x86defs::X86X_MSR_MTRR_DEF_TYPE => {
2620                // Because the MTRR registers are advertised via CPUID, a guest may choose to
2621                // write to this MSR even though no actual ranges are supported. Implement it
2622                // as read-as-zero/write-ignore.
2623                Ok(0)
2624            }
2625            x86defs::X86X_MSR_CSTAR => Ok(self.backing.vtls[vtl].msr_cstar),
2626            x86defs::X86X_MSR_MCG_CAP => Ok(0),
2627            x86defs::X86X_MSR_MCG_STATUS => Ok(0),
2628            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => Ok(0xFFFFFFFF),
2629            x86defs::X86X_MSR_XSS => Ok(self.backing.vtls[vtl].private_regs.msr_xss),
2630            x86defs::X86X_IA32_MSR_MISC_ENABLE => Ok(hv1_emulator::x86::MISC_ENABLE.into()),
2631            x86defs::X86X_IA32_MSR_FEATURE_CONTROL => Ok(VMX_FEATURE_CONTROL_LOCKED),
2632            x86defs::X86X_MSR_CR_PAT => {
2633                let pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2634                Ok(pat)
2635            }
2636
2637            // The following MSRs are unconditionally read by Linux guests. They are
2638            // not virtualized and are unsupported for L2 VMs.
2639            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2640            | x86defs::X86X_MSR_PLATFORM_INFO
2641            | x86defs::X86X_MSR_PPIN_CTL
2642            | x86defs::X86X_IA32_MSR_SMI_COUNT
2643            | x86defs::X86X_MSR_UMWAIT_CONTROL
2644            | x86defs::X86X_AMD_MSR_DE_CFG
2645            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2646            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2647            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2648            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => Ok(0),
2649
2650            hvdef::HV_X64_MSR_GUEST_IDLE => {
2651                self.backing.cvm.lapics[vtl].activity = MpState::Idle;
2652                self.clear_interrupt_shadow(vtl);
2653                Ok(0)
2654            }
2655            X86X_MSR_EFER => Ok(self.backing.vtls[vtl].efer),
2656
2657            _ => Err(MsrError::Unknown),
2658        }
2659    }
2660
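    /// Handles MSR writes for the TDX backing, after the APIC and common CVM
    /// handlers have declined the MSR.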
2661    fn write_msr_tdx(&mut self, msr: u32, value: u64, vtl: GuestVtl) -> Result<(), MsrError> {
2662        let state = &mut self.backing.vtls[vtl].private_regs;
2663
2664        match msr {
2665            X86X_MSR_EFER => {
2666                self.write_efer(vtl, value)
2667                    .map_err(|_| MsrError::InvalidAccess)?;
2668                self.update_execution_mode(vtl);
2669            }
2670            x86defs::X86X_MSR_STAR => state.msr_star = value,
2671            x86defs::X86X_MSR_CSTAR => self.backing.vtls[vtl].msr_cstar = value,
2672            x86defs::X86X_MSR_LSTAR => state.msr_lstar = value,
2673            x86defs::X86X_MSR_SFMASK => state.msr_sfmask = value,
2674            x86defs::X86X_MSR_TSC_AUX => state.msr_tsc_aux = value,
2675            x86defs::X86X_MSR_SYSENTER_CS => {
2676                self.runner.write_vmcs32(
2677                    vtl,
2678                    VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
2679                    !0,
2680                    value as u32,
2681                );
2682            }
2683            x86defs::X86X_MSR_SYSENTER_EIP => {
2684                self.runner.write_vmcs64(
2685                    vtl,
2686                    VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
2687                    !0,
2688                    value,
2689                );
2690            }
2691            x86defs::X86X_MSR_SYSENTER_ESP => {
2692                self.runner.write_vmcs64(
2693                    vtl,
2694                    VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
2695                    !0,
2696                    value,
2697                );
2698            }
2699            x86defs::X86X_MSR_XSS => state.msr_xss = value,
2700            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => {
2701                // Writing zero on Intel platforms is allowed and ignored.
2702                if value != 0 {
2703                    return Err(MsrError::InvalidAccess);
2704                }
2705            }
2706            x86defs::X86X_IA32_MSR_MISC_ENABLE => {}
2707            x86defs::X86X_MSR_CR_PAT => {
2708                self.runner
2709                    .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value);
2710            }
2711
2712            x86defs::X86X_MSR_MCG_STATUS => {
2713                // Writes are swallowed, except for reserved-bit violations.
2714                if x86defs::X86xMcgStatusRegister::from(value).reserved0() != 0 {
2715                    return Err(MsrError::InvalidAccess);
2716                }
2717            }
2718
2719            // Ignore writes to this MSR
2720            x86defs::X86X_MSR_MTRR_DEF_TYPE => {}
2721
2722            // The following MSRs are sometimes written by Windows guests.
2723            // They are not virtualized and are unsupported for L2 VMs.
2724            x86defs::X86X_MSR_BIOS_UPDT_TRIG => {}
2725
2726            // The following MSRs are unconditionally written by Linux guests.
2727            // They are not virtualized and are unsupported for L2 VMs.
2728            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2729            | x86defs::X86X_MSR_PLATFORM_INFO
2730            | x86defs::X86X_MSR_PPIN_CTL
2731            | x86defs::X86X_IA32_MSR_SMI_COUNT
2732            | x86defs::X86X_MSR_UMWAIT_CONTROL
2733            | x86defs::X86X_AMD_MSR_DE_CFG
2734            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2735            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2736            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2737            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => {}
2738
2739            _ => return Err(MsrError::Unknown),
2740        }
2741
2742        Ok(())
2743    }
2744
2745    fn write_segment(
2746        &mut self,
2747        vtl: GuestVtl,
2748        seg: TdxSegmentReg,
2749        reg: SegmentRegister,
2750    ) -> Result<(), vp_state::Error> {
2751        // Write the selector, base, and limit.
2752        self.runner
2753            .write_vmcs16(vtl, seg.selector(), !0, reg.selector);
2754        self.runner.write_vmcs64(vtl, seg.base(), !0, reg.base);
2755        self.runner.write_vmcs32(vtl, seg.limit(), !0, reg.limit);
2756
2757        // Mark the segment as not valid if its attributes indicate it is not present.
2758        let mut attributes = x86defs::vmx::VmxSegmentAttributes::from(reg.attributes as u32);
2759        attributes.set_null(!attributes.present());
2760
2761        self.runner
2762            .write_vmcs32(vtl, seg.attributes(), !0, attributes.into());
2763
2764        Ok(())
2765    }
2766
2767    fn read_segment(&self, vtl: GuestVtl, seg: TdxSegmentReg) -> SegmentRegister {
2768        let selector = self.runner.read_vmcs16(vtl, seg.selector());
2769        let base = self.runner.read_vmcs64(vtl, seg.base());
2770        let limit = self.runner.read_vmcs32(vtl, seg.limit());
2771        let attributes = self.runner.read_vmcs32(vtl, seg.attributes());
2772
2773        SegmentRegister {
2774            selector,
2775            base,
2776            limit,
2777            attributes: attributes as u16,
2778        }
2779    }
2780
2781    fn long_mode(&self, vtl: GuestVtl) -> bool {
2782        let backing = &self.backing.vtls[vtl];
2783        backing.cr0.read(&self.runner) & X64_CR0_PE != 0 && backing.efer & X64_EFER_LMA != 0
2784    }
2785}
2786
2787impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
2788    fn vp_index(&self) -> VpIndex {
2789        self.vp.vp_index()
2790    }
2791
2792    fn flush(&mut self) {
2793        // No cached registers are modifiable by the emulator for TDX.
2794    }
2795
2796    fn vendor(&self) -> x86defs::cpuid::Vendor {
2797        self.vp.partition.caps.vendor
2798    }
2799
2800    fn gp(&mut self, reg: Gp) -> u64 {
2801        self.vp.runner.tdx_enter_guest_gps()[reg as usize]
2802    }
2803
2804    fn set_gp(&mut self, reg: Gp, v: u64) {
2805        self.vp.runner.tdx_enter_guest_gps_mut()[reg as usize] = v;
2806    }
2807
2808    fn xmm(&mut self, index: usize) -> u128 {
2809        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[index])
2810    }
2811
2812    fn set_xmm(&mut self, index: usize, v: u128) {
2813        self.vp.runner.fx_state_mut().xmm[index] = v.to_ne_bytes();
2814    }
2815
2816    fn rip(&mut self) -> u64 {
2817        self.vp.backing.vtls[self.vtl].private_regs.rip
2818    }
2819
2820    fn set_rip(&mut self, v: u64) {
2821        self.vp.backing.vtls[self.vtl].private_regs.rip = v;
2822    }
2823
2824    fn segment(&mut self, index: Segment) -> x86defs::SegmentRegister {
2825        let tdx_segment_index = match index {
2826            Segment::CS => TdxSegmentReg::Cs,
2827            Segment::ES => TdxSegmentReg::Es,
2828            Segment::SS => TdxSegmentReg::Ss,
2829            Segment::DS => TdxSegmentReg::Ds,
2830            Segment::FS => TdxSegmentReg::Fs,
2831            Segment::GS => TdxSegmentReg::Gs,
2832        };
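        // CS is reported in the TDG.VP.ENTER exit information, so read it from
        // there; the other segments are read from the VMCS.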
2833        let reg = match tdx_segment_index {
2834            TdxSegmentReg::Cs => self.cache.segs[index as usize]
2835                .get_or_insert_with(|| TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).cs()),
2836            _ => self.cache.segs[index as usize]
2837                .get_or_insert_with(|| self.vp.read_segment(self.vtl, tdx_segment_index)),
2838        };
2839        (*reg).into()
2840    }
2841
2842    fn efer(&mut self) -> u64 {
2843        self.vp.backing.vtls[self.vtl].efer
2844    }
2845
2846    fn cr0(&mut self) -> u64 {
2847        let reg = self
2848            .cache
2849            .cr0
2850            .get_or_insert_with(|| self.vp.backing.vtls[self.vtl].cr0.read(&self.vp.runner));
2851        *reg
2852    }
2853
2854    fn rflags(&mut self) -> RFlags {
2855        self.vp.backing.vtls[self.vtl].private_regs.rflags.into()
2856    }
2857
2858    fn set_rflags(&mut self, v: RFlags) {
2859        self.vp.backing.vtls[self.vtl].private_regs.rflags = v.into();
2860    }
2861
2862    fn instruction_bytes(&self) -> &[u8] {
2863        &[]
2864    }
2865
2866    fn physical_address(&self) -> Option<u64> {
2867        TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
2868    }
2869
2870    fn initial_gva_translation(
2871        &mut self,
2872    ) -> Option<virt_support_x86emu::emulate::InitialTranslation> {
2873        let exit_info = TdxExit(self.vp.runner.tdx_vp_enter_exit_info());
2874        let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2875
2876        if exit_info.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION
2877            && ept_info.gva_valid()
2878        {
2879            Some(virt_support_x86emu::emulate::InitialTranslation {
2880                gva: exit_info.gla().expect("already validated EPT exit"),
2881                gpa: exit_info.gpa().expect("already validated EPT exit"),
2882                translate_mode: match ept_info.access_mask() {
2883                    0x1 => TranslateMode::Read,
2884                    // As defined in "Table 28-7. Exit Qualification for EPT
2885                    // Violations" in the Intel SDM, the processor may set both
2886                    // the read and write bits in certain conditions:
2887                    //
2888                    // If accessed and dirty flags for EPT are enabled,
2889                    // processor accesses to guest paging-structure entries are
2890                    // treated as writes with regard to EPT violations (see
2891                    // Section 29.3.3.2). If such an access causes an EPT
2892                    // violation, the processor sets both bit 0 and bit 1 of the
2893                    // exit qualification.
2894                    //
2895                    // Treat both 0x2 and 0x3 as writes.
2896                    0x2 | 0x3 => TranslateMode::Write,
2897                    0x4 => TranslateMode::Execute,
2898                    _ => panic!("unexpected ept access mask 0x{:x}", ept_info.access_mask()),
2899                },
2900            })
2901        } else {
2902            None
2903        }
2904    }
2905
2906    fn interruption_pending(&self) -> bool {
2907        self.interruption_pending
2908    }
2909
2910    fn check_vtl_access(
2911        &mut self,
2912        _gpa: u64,
2913        _mode: TranslateMode,
2914    ) -> Result<(), virt_support_x86emu::emulate::EmuCheckVtlAccessError> {
2915        // Nothing to do here, the guest memory object will handle the check.
2916        Ok(())
2917    }
2918
2919    fn translate_gva(
2920        &mut self,
2921        gva: u64,
2922        mode: TranslateMode,
2923    ) -> Result<
2924        virt_support_x86emu::emulate::EmuTranslateResult,
2925        virt_support_x86emu::emulate::EmuTranslateError,
2926    > {
2927        emulate_translate_gva(self, gva, mode)
2928    }
2929
2930    fn inject_pending_event(&mut self, event_info: hvdef::HvX64PendingEvent) {
2931        assert!(event_info.reg_0.event_pending());
2932        assert_eq!(
2933            event_info.reg_0.event_type(),
2934            hvdef::HV_X64_PENDING_EVENT_EXCEPTION
2935        );
2936        assert!(!self.interruption_pending);
2937
2938        // There's no interruption pending, so just inject the exception
2939        // directly without checking for double fault.
2940        TdxBacked::set_pending_exception(
2941            self.vp,
2942            self.vtl,
2943            HvX64PendingExceptionEvent::from(event_info.reg_0.into_bits()),
2944        );
2945    }
2946
2947    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
2948        // Ignore the VTOM address bit when checking, since memory is mirrored
2949        // across the VTOM.
2950        let vtom = self.vp.partition.caps.vtom.unwrap_or(0);
2951        debug_assert!(vtom == 0 || vtom.is_power_of_two());
2952        self.vp.partition.is_gpa_mapped(gpa & !vtom, write)
2953    }
2954
2955    fn lapic_base_address(&self) -> Option<u64> {
2956        self.vp.backing.cvm.lapics[self.vtl].lapic.base_address()
2957    }
2958
2959    fn lapic_read(&mut self, address: u64, data: &mut [u8]) {
2960        self.vp.backing.cvm.lapics[self.vtl]
2961            .lapic
2962            .access(&mut TdxApicClient {
2963                partition: self.vp.partition,
2964                dev: self.devices,
2965                vmtime: &self.vp.vmtime,
2966                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
2967                vtl: self.vtl,
2968            })
2969            .mmio_read(address, data);
2970    }
2971
2972    fn lapic_write(&mut self, address: u64, data: &[u8]) {
2973        self.vp.backing.cvm.lapics[self.vtl]
2974            .lapic
2975            .access(&mut TdxApicClient {
2976                partition: self.vp.partition,
2977                dev: self.devices,
2978                vmtime: &self.vp.vmtime,
2979                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
2980                vtl: self.vtl,
2981            })
2982            .mmio_write(address, data);
2983    }
2984
2985    fn monitor_support(&self) -> Option<&dyn EmulatorMonitorSupport> {
2986        Some(self)
2987    }
2988}
2989
2990#[derive(Debug)]
2991enum TdxSegmentReg {
2992    Es,
2993    Cs,
2994    Ss,
2995    Ds,
2996    Fs,
2997    Gs,
2998    Ldtr,
2999    Tr,
3000}
3001
3002impl TdxSegmentReg {
3003    /// The selector vmcs field code.
3004    fn selector(&self) -> VmcsField {
3005        match self {
3006            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_SELECTOR,
3007            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_SELECTOR,
3008            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_SELECTOR,
3009            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_SELECTOR,
3010            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_SELECTOR,
3011            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_SELECTOR,
3012            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_SELECTOR,
3013            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_SELECTOR,
3014        }
3015    }
3016
3017    /// The base vmcs field code.
3018    fn base(&self) -> VmcsField {
3019        match self {
3020            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_BASE,
3021            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_BASE,
3022            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_BASE,
3023            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_BASE,
3024            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_BASE,
3025            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_BASE,
3026            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_BASE,
3027            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_BASE,
3028        }
3029    }
3030
3031    /// The limit vmcs field code.
3032    fn limit(&self) -> VmcsField {
3033        match self {
3034            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_LIMIT,
3035            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_LIMIT,
3036            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_LIMIT,
3037            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_LIMIT,
3038            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_LIMIT,
3039            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_LIMIT,
3040            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_LIMIT,
3041            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_LIMIT,
3042        }
3043    }
3044
3045    /// The attributes vmcs field code.
3046    fn attributes(&self) -> VmcsField {
3047        match self {
3048            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_AR,
3049            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_AR,
3050            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_AR,
3051            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_AR,
3052            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_AR,
3053            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_AR,
3054            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_AR,
3055            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_AR,
3056        }
3057    }
3058}
3059
3060#[derive(Debug)]
3061enum TdxTableReg {
3062    Idtr,
3063    Gdtr,
3064}
3065
3066impl TdxTableReg {
3067    fn base_code(&self) -> VmcsField {
3068        match self {
3069            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_BASE,
3070            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_BASE,
3071        }
3072    }
3073
3074    fn limit_code(&self) -> VmcsField {
3075        match self {
3076            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_LIMIT,
3077            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_LIMIT,
3078        }
3079    }
3080}
3081
3082impl UhProcessor<'_, TdxBacked> {
3083    /// Handle a write to EFER, which requires special handling on TDX due to
3084    /// required bits and state updates.
3085    ///
3086    /// Note that a caller must also call [`Self::update_execution_mode`] after
3087    /// updating EFER.
3088    fn write_efer(&mut self, vtl: GuestVtl, efer: u64) -> Result<(), vp_state::Error> {
3089        if efer & (X64_EFER_SVME | X64_EFER_FFXSR) != 0 {
3090            return Err(vp_state::Error::InvalidValue(
3091                efer,
3092                "EFER",
3093                "SVME or FFXSR set",
3094            ));
3095        }
3096
3097        // EFER.NXE must be 1.
3098        if efer & X64_EFER_NXE == 0 {
3099            return Err(vp_state::Error::InvalidValue(efer, "EFER", "NXE not set"));
3100        }
3101
3102        // Update the local value of EFER and the VMCS.
3103        if self.backing.vtls[vtl].efer != efer {
3104            self.backing.vtls[vtl].efer = efer;
3105            self.runner
3106                .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER, !0, efer);
3107        }
3108
3109        Ok(())
3110    }
3111
3112    /// Read CR0, including the guest-shadowed bits. This is the value the
3113    /// guest sees.
3114    fn read_cr0(&self, vtl: GuestVtl) -> u64 {
3115        self.backing.vtls[vtl].cr0.read(&self.runner)
3116    }
3117
3118    /// Write to the guest CR0.
3119    fn write_cr0(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3120        self.backing.vtls[vtl]
3121            .cr0
3122            .write(value | X64_CR0_ET, &mut self.runner)
3123    }
3124
3125    fn read_cr4(&self, vtl: GuestVtl) -> u64 {
3126        self.backing.vtls[vtl].cr4.read(&self.runner)
3127    }
3128
3129    fn write_cr4(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3130        self.backing.vtls[vtl].cr4.write(value, &mut self.runner)
3131    }
3132
3133    fn write_table_register(&mut self, vtl: GuestVtl, table: TdxTableReg, reg: TableRegister) {
3134        self.runner
3135            .write_vmcs64(vtl, table.base_code(), !0, reg.base);
3136        self.runner
3137            .write_vmcs32(vtl, table.limit_code(), !0, reg.limit.into());
3138    }
3139
3140    fn read_table_register(&self, vtl: GuestVtl, table: TdxTableReg) -> TableRegister {
3141        let base = self.runner.read_vmcs64(vtl, table.base_code());
3142        let limit = self.runner.read_vmcs32(vtl, table.limit_code());
3143
3144        TableRegister {
3145            base,
3146            limit: limit as u16,
3147        }
3148    }
3149
3150    /// Update execution mode when CR0 or EFER is changed.
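    ///
    /// Long mode is active (EFER.LMA) only when both EFER.LME and CR0.PG are
    /// set; the VM-entry long-mode-guest control
    /// (`VMX_ENTRY_CONTROL_LONG_MODE_GUEST`) is kept in sync so the next entry
    /// resumes in the correct mode.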
3151    fn update_execution_mode(&mut self, vtl: GuestVtl) {
3152        let lme = self.backing.vtls[vtl].efer & X64_EFER_LME == X64_EFER_LME;
3153        let pg = self.read_cr0(vtl) & X64_CR0_PG == X64_CR0_PG;
3154        let efer_lma = self.backing.vtls[vtl].efer & X64_EFER_LMA == X64_EFER_LMA;
3155        let lma = lme && pg;
3156
3157        if lma != efer_lma {
3158            // Flip only the LMA bit.
3159            let new_efer = self.backing.vtls[vtl].efer ^ X64_EFER_LMA;
3160            self.write_efer(vtl, new_efer)
3161                .expect("EFER was valid before, it should still be valid");
3162        }
3163
3164        self.runner.write_vmcs32(
3165            vtl,
3166            VmcsField::VMX_VMCS_ENTRY_CONTROLS,
3167            VMX_ENTRY_CONTROL_LONG_MODE_GUEST,
3168            if lma {
3169                VMX_ENTRY_CONTROL_LONG_MODE_GUEST
3170            } else {
3171                0
3172            },
3173        );
3174    }
3175
3176    async fn emulate_gdtr_or_idtr(
3177        &mut self,
3178        vtl: GuestVtl,
3179        dev: &impl CpuIo,
3180    ) -> Result<(), VpHaltReason> {
3181        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3182        assert_eq!(
3183            exit_info.code().vmx_exit().basic_reason(),
3184            VmxExitBasic::GDTR_OR_IDTR
3185        );
3186        let instr_info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
3187
3188        // Inject #GP if a load instruction is executed outside of kernel mode,
3189        // or if a store instruction is blocked by UMIP at CPL > 0.
3190        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3191            || (!instr_info.instruction().is_load()
3192                && exit_info.cpl() > 0
3193                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3194        {
3195            self.inject_gpf(vtl);
3196            return Ok(());
3197        }
3198
3199        let (gva, segment) = self.compute_gva_for_table_access_emulation(
3200            exit_info.qualification(),
3201            (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3202            (!instr_info.index_register_invalid()).then_some(instr_info.index_register()),
3203            instr_info.scaling(),
3204            instr_info.address_size(),
3205            instr_info.segment_register(),
3206        );
3207
3208        let gm = &self.partition.gm[vtl];
3209        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
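        // The descriptor-table image in memory is a 2-byte limit followed by a
        // 4-byte base (legacy mode) or an 8-byte base (long mode).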
3210        let len = 2 + if self.long_mode(vtl) { 8 } else { 4 };
3211        let mut buf = [0u8; 10];
3212
3213        match instr_info.instruction() {
3214            GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Sgdt => {
3215                let table = self.read_table_register(
3216                    vtl,
3217                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Sidt) {
3218                        TdxTableReg::Idtr
3219                    } else {
3220                        TdxTableReg::Gdtr
3221                    },
3222                );
3223                buf[..2].copy_from_slice(&table.limit.to_le_bytes());
3224                buf[2..].copy_from_slice(&table.base.to_le_bytes());
3225                let mut emulation_state = UhEmulationState {
3226                    vp: &mut *self,
3227                    interruption_pending,
3228                    devices: dev,
3229                    vtl,
3230                    cache: TdxEmulationCache::default(),
3231                };
3232                emulate_insn_memory_op(
3233                    &mut emulation_state,
3234                    gm,
3235                    dev,
3236                    gva,
3237                    segment,
3238                    x86emu::AlignmentMode::Unaligned,
3239                    EmulatedMemoryOperation::Write(&buf[..len]),
3240                )
3241                .await?;
3242            }
3243
3244            GdtrOrIdtrInstruction::Lgdt | GdtrOrIdtrInstruction::Lidt => {
3245                let mut emulation_state = UhEmulationState {
3246                    vp: &mut *self,
3247                    interruption_pending,
3248                    devices: dev,
3249                    vtl,
3250                    cache: TdxEmulationCache::default(),
3251                };
3252                emulate_insn_memory_op(
3253                    &mut emulation_state,
3254                    gm,
3255                    dev,
3256                    gva,
3257                    segment,
3258                    x86emu::AlignmentMode::Unaligned,
3259                    EmulatedMemoryOperation::Read(&mut buf[..len]),
3260                )
3261                .await?;
3262                let table = TableRegister {
3263                    limit: u16::from_le_bytes(buf[..2].try_into().unwrap()),
3264                    base: u64::from_le_bytes(buf[2..len].try_into().unwrap()),
3265                };
3266                self.write_table_register(
3267                    vtl,
3268                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Lidt) {
3269                        TdxTableReg::Idtr
3270                    } else {
3271                        TdxTableReg::Gdtr
3272                    },
3273                    table,
3274                );
3275            }
3276        }
3277
3278        self.advance_to_next_instruction(vtl);
3279        Ok(())
3280    }
3281
3282    async fn emulate_ldtr_or_tr(
3283        &mut self,
3284        vtl: GuestVtl,
3285        dev: &impl CpuIo,
3286    ) -> Result<(), VpHaltReason> {
3287        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3288        assert_eq!(
3289            exit_info.code().vmx_exit().basic_reason(),
3290            VmxExitBasic::LDTR_OR_TR
3291        );
3292        let instr_info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
3293
3294        // Inject #GP if a load instruction is executed outside of kernel mode,
3295        // or if a store instruction is blocked by UMIP at CPL > 0.
3296        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3297            || (!instr_info.instruction().is_load()
3298                && exit_info.cpl() > 0
3299                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3300        {
3301            self.inject_gpf(vtl);
3302            return Ok(());
3303        }
3304
3305        let gm = &self.partition.gm[vtl];
3306        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
3307
3308        match instr_info.instruction() {
3309            LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Str => {
3310                let value = self.runner.read_vmcs16(
3311                    vtl,
3312                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Sldt) {
3313                        TdxSegmentReg::Ldtr
3314                    } else {
3315                        TdxSegmentReg::Tr
3316                    }
3317                    .selector(),
3318                );
3319
3320                if instr_info.memory_or_register() {
3321                    let gps = self.runner.tdx_enter_guest_gps_mut();
3322                    gps[instr_info.register_1() as usize] = value.into();
3323                } else {
3324                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3325                        exit_info.qualification(),
3326                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3327                        (!instr_info.index_register_invalid())
3328                            .then_some(instr_info.index_register()),
3329                        instr_info.scaling(),
3330                        instr_info.address_size(),
3331                        instr_info.segment_register(),
3332                    );
3333                    let mut emulation_state = UhEmulationState {
3334                        vp: &mut *self,
3335                        interruption_pending,
3336                        devices: dev,
3337                        vtl,
3338                        cache: TdxEmulationCache::default(),
3339                    };
3340                    emulate_insn_memory_op(
3341                        &mut emulation_state,
3342                        gm,
3343                        dev,
3344                        gva,
3345                        segment,
3346                        x86emu::AlignmentMode::Standard,
3347                        EmulatedMemoryOperation::Write(&value.to_le_bytes()),
3348                    )
3349                    .await?;
3350                }
3351            }
3352
3353            LdtrOrTrInstruction::Lldt | LdtrOrTrInstruction::Ltr => {
3354                let value = if instr_info.memory_or_register() {
3355                    let gps = self.runner.tdx_enter_guest_gps();
3356                    gps[instr_info.register_1() as usize] as u16
3357                } else {
3358                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3359                        exit_info.qualification(),
3360                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3361                        (!instr_info.index_register_invalid())
3362                            .then_some(instr_info.index_register()),
3363                        instr_info.scaling(),
3364                        instr_info.address_size(),
3365                        instr_info.segment_register(),
3366                    );
3367                    let mut emulation_state = UhEmulationState {
3368                        vp: &mut *self,
3369                        interruption_pending,
3370                        devices: dev,
3371                        vtl,
3372                        cache: TdxEmulationCache::default(),
3373                    };
3374                    let mut buf = [0u8; 2];
3375                    emulate_insn_memory_op(
3376                        &mut emulation_state,
3377                        gm,
3378                        dev,
3379                        gva,
3380                        segment,
3381                        x86emu::AlignmentMode::Standard,
3382                        EmulatedMemoryOperation::Read(&mut buf),
3383                    )
3384                    .await?;
3385                    u16::from_le_bytes(buf)
3386                };
3387                self.runner.write_vmcs16(
3388                    vtl,
3389                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Lldt) {
3390                        TdxSegmentReg::Ldtr
3391                    } else {
3392                        TdxSegmentReg::Tr
3393                    }
3394                    .selector(),
3395                    !0,
3396                    value,
3397                );
3398            }
3399        }
3400
3401        self.advance_to_next_instruction(vtl);
3402        Ok(())
3403    }
3404
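    /// Computes the effective guest virtual address for a descriptor-table or
    /// LDTR/TR access from the VMX exit qualification (which carries the
    /// displacement) and the decoded instruction information:
    /// `gva = displacement + base + (index << scaling)`, truncated to the
    /// instruction's address size. For example, `sgdt [rbx + rcx*4 + 0x10]`
    /// would be reported with displacement 0x10, base = RBX, index = RCX, and
    /// scaling = 2.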
3405    fn compute_gva_for_table_access_emulation(
3406        &self,
3407        qualification: u64,
3408        base_reg: Option<u8>,
3409        index_reg: Option<u8>,
3410        scaling: u8,
3411        address_size: u8,
3412        segment_register: u8,
3413    ) -> (u64, Segment) {
3414        let gps = self.runner.tdx_enter_guest_gps();
3415
3416        // Displacement is stored in the qualification field for these instructions.
3417        let mut gva = qualification;
3418        if let Some(base_register) = base_reg {
3419            gva += gps[base_register as usize];
3420        }
3421        if let Some(index_register) = index_reg {
3422            gva += gps[index_register as usize] << scaling;
3423        }
3424        match address_size {
3425            // 16-bit address size
3426            0 => gva &= 0xFFFF,
3427            // 32-bit address size
3428            1 => gva &= 0xFFFFFFFF,
3429            // 64-bit address size
3430            2 => {}
3431            _ => unreachable!(),
3432        }
3433
3434        let segment = match segment_register {
3435            0 => Segment::ES,
3436            1 => Segment::CS,
3437            2 => Segment::SS,
3438            3 => Segment::DS,
3439            4 => Segment::FS,
3440            5 => Segment::GS,
3441            _ => unreachable!(),
3442        };
3443
3444        (gva, segment)
3445    }
3446}
3447
3448struct TdxApicClient<'a, T> {
3449    partition: &'a UhPartitionInner,
3450    apic_page: &'a mut ApicPage,
3451    dev: &'a T,
3452    vmtime: &'a VmTimeAccess,
3453    vtl: GuestVtl,
3454}
3455
3456impl<T: CpuIo> ApicClient for TdxApicClient<'_, T> {
3457    fn cr8(&mut self) -> u32 {
3458        self.apic_page.tpr.value >> 4
3459    }
3460
3461    fn set_cr8(&mut self, value: u32) {
3462        self.apic_page.tpr.value = value << 4;
3463    }
3464
3465    fn set_apic_base(&mut self, _value: u64) {
3466        // No-op; the APIC base is stored in the APIC itself.
3467    }
3468
3469    fn wake(&mut self, vp_index: VpIndex) {
3470        self.partition.vps[vp_index.index() as usize].wake(self.vtl, WakeReason::INTCON);
3471    }
3472
3473    fn eoi(&mut self, vector: u8) {
3474        self.dev.handle_eoi(vector.into())
3475    }
3476
3477    fn now(&mut self) -> vmcore::vmtime::VmTime {
3478        self.vmtime.now()
3479    }
3480
3481    fn pull_offload(&mut self) -> ([u32; 8], [u32; 8]) {
3482        pull_apic_offload(self.apic_page)
3483    }
3484}
3485
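/// Takes (and clears) the IRR and ISR banks from the virtual APIC page so the
/// software APIC model can fold in state that was previously offloaded to
/// hardware.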
3486fn pull_apic_offload(page: &mut ApicPage) -> ([u32; 8], [u32; 8]) {
3487    let mut irr = [0; 8];
3488    let mut isr = [0; 8];
3489    for (((irr, page_irr), isr), page_isr) in irr
3490        .iter_mut()
3491        .zip(page.irr.iter_mut())
3492        .zip(isr.iter_mut())
3493        .zip(page.isr.iter_mut())
3494    {
3495        *irr = std::mem::take(&mut page_irr.value);
3496        *isr = std::mem::take(&mut page_isr.value);
3497    }
3498    (irr, isr)
3499}
3500
3501impl<T> hv1_hypercall::X64RegisterState for UhHypercallHandler<'_, '_, T, TdxBacked> {
3502    fn rip(&mut self) -> u64 {
3503        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip
3504    }
3505
3506    fn set_rip(&mut self, rip: u64) {
3507        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip = rip;
3508    }
3509
3510    fn gp(&mut self, n: hv1_hypercall::X64HypercallRegister) -> u64 {
3511        let gps = self.vp.runner.tdx_enter_guest_gps();
3512        match n {
3513            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX],
3514            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX],
3515            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX],
3516            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX],
3517            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI],
3518            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI],
3519            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8],
3520        }
3521    }
3522
3523    fn set_gp(&mut self, n: hv1_hypercall::X64HypercallRegister, value: u64) {
3524        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3525        match n {
3526            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX] = value,
3527            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX] = value,
3528            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX] = value,
3529            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX] = value,
3530            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI] = value,
3531            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI] = value,
3532            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8] = value,
3533        }
3534    }
3535
3536    // TODO: clean up xmm so it does not use the same implementation as mshv.
3537    fn xmm(&mut self, n: usize) -> u128 {
3538        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[n])
3539    }
3540
3541    fn set_xmm(&mut self, n: usize, value: u128) {
3542        self.vp.runner.fx_state_mut().xmm[n] = value.to_ne_bytes();
3543    }
3544}
3545
3546impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
3547    const TDX_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3548        Self,
3549        [
3550            hv1_hypercall::HvModifySparseGpaPageHostVisibility,
3551            hv1_hypercall::HvQuerySparseGpaPageHostVisibility,
3552            hv1_hypercall::HvX64StartVirtualProcessor,
3553            hv1_hypercall::HvGetVpIndexFromApicId,
3554            hv1_hypercall::HvRetargetDeviceInterrupt,
3555            hv1_hypercall::HvFlushVirtualAddressList,
3556            hv1_hypercall::HvFlushVirtualAddressListEx,
3557            hv1_hypercall::HvFlushVirtualAddressSpace,
3558            hv1_hypercall::HvFlushVirtualAddressSpaceEx,
3559            hv1_hypercall::HvPostMessage,
3560            hv1_hypercall::HvSignalEvent,
3561            hv1_hypercall::HvExtQueryCapabilities,
3562            hv1_hypercall::HvGetVpRegisters,
3563            hv1_hypercall::HvSetVpRegisters,
3564            hv1_hypercall::HvEnablePartitionVtl,
3565            hv1_hypercall::HvX64EnableVpVtl,
3566            hv1_hypercall::HvVtlCall,
3567            hv1_hypercall::HvVtlReturn,
3568            hv1_hypercall::HvModifyVtlProtectionMask,
3569            hv1_hypercall::HvX64TranslateVirtualAddress,
3570            hv1_hypercall::HvSendSyntheticClusterIpi,
3571            hv1_hypercall::HvSendSyntheticClusterIpiEx,
3572            hv1_hypercall::HvInstallIntercept,
3573            hv1_hypercall::HvAssertVirtualInterrupt,
3574        ]
3575    );
3576
3577    /// Hypercalls that come through a TDG.VP.VMCALL tdcall instruction.
3578    ///
3579    /// This is just to handle the proxy synic.
3580    const TDCALL_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3581        Self,
3582        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
3583    );
3584}
3585
3586impl AccessVpState for UhVpStateAccess<'_, '_, TdxBacked> {
3587    type Error = vp_state::Error;
3588
3589    fn caps(&self) -> &virt::x86::X86PartitionCapabilities {
3590        &self.vp.partition.caps
3591    }
3592
3593    fn commit(&mut self) -> Result<(), Self::Error> {
3594        Ok(())
3595    }
3596
3597    fn registers(&mut self) -> Result<Registers, Self::Error> {
3598        let gps = self.vp.runner.tdx_enter_guest_gps();
3599
3600        let cs = self.vp.read_segment(self.vtl, TdxSegmentReg::Cs);
3601        let ds = self.vp.read_segment(self.vtl, TdxSegmentReg::Ds);
3602        let es = self.vp.read_segment(self.vtl, TdxSegmentReg::Es);
3603        let fs = self.vp.read_segment(self.vtl, TdxSegmentReg::Fs);
3604        let gs = self.vp.read_segment(self.vtl, TdxSegmentReg::Gs);
3605        let ss = self.vp.read_segment(self.vtl, TdxSegmentReg::Ss);
3606        let tr = self.vp.read_segment(self.vtl, TdxSegmentReg::Tr);
3607        let ldtr = self.vp.read_segment(self.vtl, TdxSegmentReg::Ldtr);
3608
3609        let gdtr = self.vp.read_table_register(self.vtl, TdxTableReg::Gdtr);
3610        let idtr = self.vp.read_table_register(self.vtl, TdxTableReg::Idtr);
3611
3612        let cr0 = self.vp.read_cr0(self.vtl);
3613        let cr2 = self.vp.runner.cr2();
3614        let cr3 = self
3615            .vp
3616            .runner
3617            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3);
3618        let cr4 = self.vp.read_cr4(self.vtl);
3619
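        // CR8 mirrors the upper four bits of the APIC TPR.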
3620        let cr8 = self.vp.runner.tdx_apic_page(self.vtl).tpr.value >> 4;
3621
3622        let efer = self.vp.backing.vtls[self.vtl].efer;
3623
3624        Ok(Registers {
3625            rax: gps[TdxGp::RAX],
3626            rcx: gps[TdxGp::RCX],
3627            rdx: gps[TdxGp::RDX],
3628            rbx: gps[TdxGp::RBX],
3629            rsp: self.vp.backing.vtls[self.vtl].private_regs.rsp,
3630            rbp: gps[TdxGp::RBP],
3631            rsi: gps[TdxGp::RSI],
3632            rdi: gps[TdxGp::RDI],
3633            r8: gps[TdxGp::R8],
3634            r9: gps[TdxGp::R9],
3635            r10: gps[TdxGp::R10],
3636            r11: gps[TdxGp::R11],
3637            r12: gps[TdxGp::R12],
3638            r13: gps[TdxGp::R13],
3639            r14: gps[TdxGp::R14],
3640            r15: gps[TdxGp::R15],
3641            rip: self.vp.backing.vtls[self.vtl].private_regs.rip,
3642            rflags: self.vp.backing.vtls[self.vtl].private_regs.rflags,
3643            cs,
3644            ds,
3645            es,
3646            fs,
3647            gs,
3648            ss,
3649            tr,
3650            ldtr,
3651            gdtr,
3652            idtr,
3653            cr0,
3654            cr2,
3655            cr3,
3656            cr4,
3657            cr8: cr8.into(),
3658            efer,
3659        })
3660    }
3661
3662    fn set_registers(&mut self, value: &Registers) -> Result<(), Self::Error> {
3663        let Registers {
3664            rax,
3665            rcx,
3666            rdx,
3667            rbx,
3668            rsp,
3669            rbp,
3670            rsi,
3671            rdi,
3672            r8,
3673            r9,
3674            r10,
3675            r11,
3676            r12,
3677            r13,
3678            r14,
3679            r15,
3680            rip,
3681            rflags,
3682            cs,
3683            ds,
3684            es,
3685            fs,
3686            gs,
3687            ss,
3688            tr,
3689            ldtr,
3690            gdtr,
3691            idtr,
3692            cr0,
3693            cr2,
3694            cr3,
3695            cr4,
3696            cr8,
3697            efer,
3698        } = value;
3699
3700        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3701        gps[TdxGp::RAX] = *rax;
3702        gps[TdxGp::RCX] = *rcx;
3703        gps[TdxGp::RDX] = *rdx;
3704        gps[TdxGp::RBX] = *rbx;
3705        self.vp.backing.vtls[self.vtl].private_regs.rsp = *rsp;
3706        gps[TdxGp::RBP] = *rbp;
3707        gps[TdxGp::RSI] = *rsi;
3708        gps[TdxGp::RDI] = *rdi;
3709        gps[TdxGp::R8] = *r8;
3710        gps[TdxGp::R9] = *r9;
3711        gps[TdxGp::R10] = *r10;
3712        gps[TdxGp::R11] = *r11;
3713        gps[TdxGp::R12] = *r12;
3714        gps[TdxGp::R13] = *r13;
3715        gps[TdxGp::R14] = *r14;
3716        gps[TdxGp::R15] = *r15;
3717        self.vp.backing.vtls[self.vtl].private_regs.rip = *rip;
3718        // BUGBUG: setting rflags also updates interrupt state in the hcl.
3719        self.vp.backing.vtls[self.vtl].private_regs.rflags = *rflags;
3720
3721        // Set segment registers
3722        self.vp.write_segment(self.vtl, TdxSegmentReg::Cs, *cs)?;
3723        self.vp.write_segment(self.vtl, TdxSegmentReg::Ds, *ds)?;
3724        self.vp.write_segment(self.vtl, TdxSegmentReg::Es, *es)?;
3725        self.vp.write_segment(self.vtl, TdxSegmentReg::Fs, *fs)?;
3726        self.vp.write_segment(self.vtl, TdxSegmentReg::Gs, *gs)?;
3727        self.vp.write_segment(self.vtl, TdxSegmentReg::Ss, *ss)?;
3728        self.vp.write_segment(self.vtl, TdxSegmentReg::Tr, *tr)?;
3729        self.vp
3730            .write_segment(self.vtl, TdxSegmentReg::Ldtr, *ldtr)?;
3731
3732        // Set table registers
3733        self.vp
3734            .write_table_register(self.vtl, TdxTableReg::Gdtr, *gdtr);
3735        self.vp
3736            .write_table_register(self.vtl, TdxTableReg::Idtr, *idtr);
3737
3738        self.vp.write_cr0(self.vtl, *cr0)?;
3739
3740        // CR2 is shared with the kernel, so set it in the VP run page, from
3741        // which it will be loaded before lower-VTL entry.
3742        self.vp.runner.set_cr2(*cr2);
3743
3744        self.vp
3745            .runner
3746            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3, !0, *cr3);
3747
3748        self.vp.write_cr4(self.vtl, *cr4)?;
3749
3750        self.vp.runner.tdx_apic_page_mut(self.vtl).tpr.value = (*cr8 << 4) as u32;
3751
3752        self.vp.write_efer(self.vtl, *efer)?;
3753
3754        // Execution mode must be updated after setting EFER and CR0.
3755        self.vp.update_execution_mode(self.vtl);
3756
3757        Ok(())
3758    }
3759
3760    fn activity(&mut self) -> Result<vp::Activity, Self::Error> {
3761        let lapic = &self.vp.backing.cvm.lapics[self.vtl];
3762        let interruptibility: Interruptibility = self
3763            .vp
3764            .runner
3765            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
3766            .into();
3767        Ok(vp::Activity {
3768            mp_state: lapic.activity,
3769            nmi_pending: lapic.nmi_pending,
3770            nmi_masked: interruptibility.blocked_by_nmi(),
3771            interrupt_shadow: interruptibility.blocked_by_sti()
3772                || interruptibility.blocked_by_movss(),
3773            pending_event: None,        // TODO TDX
3774            pending_interruption: None, // TODO TDX
3775        })
3776    }
3777
3778    fn set_activity(&mut self, value: &vp::Activity) -> Result<(), Self::Error> {
3779        let &vp::Activity {
3780            mp_state,
3781            nmi_pending,
3782            nmi_masked,
3783            interrupt_shadow,
3784            pending_event: _,        // TODO TDX
3785            pending_interruption: _, // TODO TDX
3786        } = value;
3787        self.vp.backing.cvm.lapics[self.vtl].activity = mp_state;
3788        self.vp.backing.cvm.lapics[self.vtl].nmi_pending = nmi_pending;
3789        let interruptibility = Interruptibility::new()
3790            .with_blocked_by_movss(interrupt_shadow)
3791            .with_blocked_by_nmi(nmi_masked);
3792        self.vp.runner.write_vmcs32(
3793            self.vtl,
3794            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
3795            !0,
3796            interruptibility.into(),
3797        );
3798        Ok(())
3799    }
3800
3801    fn xsave(&mut self) -> Result<vp::Xsave, Self::Error> {
3802        // TODO: needed?
3803        Err(vp_state::Error::Unimplemented("xsave"))
3804    }
3805
3806    fn set_xsave(&mut self, _value: &vp::Xsave) -> Result<(), Self::Error> {
3807        // TODO: needed?
3808        Err(vp_state::Error::Unimplemented("xsave"))
3809    }
3810
3811    fn apic(&mut self) -> Result<vp::Apic, Self::Error> {
3812        self.vp.access_apic_without_offload(self.vtl, |vp| {
3813            Ok(vp.backing.cvm.lapics[self.vtl].lapic.save())
3814        })
3815    }
3816
3817    fn set_apic(&mut self, value: &vp::Apic) -> Result<(), Self::Error> {
3818        self.vp.access_apic_without_offload(self.vtl, |vp| {
3819            vp.backing.cvm.lapics[self.vtl]
3820                .lapic
3821                .restore(value)
3822                .map_err(vp_state::Error::InvalidApicBase)?;
3823
3824            Ok(())
3825        })
3826    }
3827
3828    fn xcr(&mut self) -> Result<vp::Xcr0, Self::Error> {
3829        Ok(vp::Xcr0 {
3830            value: self
3831                .vp
3832                .runner
3833                .get_vp_register(self.vtl, HvX64RegisterName::Xfem)
3834                .unwrap()
3835                .as_u64(),
3836        })
3837    }
3838
3839    fn set_xcr(&mut self, _value: &vp::Xcr0) -> Result<(), Self::Error> {
3840        Err(vp_state::Error::Unimplemented("xcr"))
3841    }
3842
3843    fn xss(&mut self) -> Result<vp::Xss, Self::Error> {
3844        Ok(vp::Xss {
3845            value: self.vp.backing.vtls[self.vtl].private_regs.msr_xss,
3846        })
3847    }
3848
3849    fn set_xss(&mut self, value: &vp::Xss) -> Result<(), Self::Error> {
3850        self.vp.backing.vtls[self.vtl].private_regs.msr_xss = value.value;
3851        Ok(())
3852    }
3853
3854    fn mtrrs(&mut self) -> Result<vp::Mtrrs, Self::Error> {
3855        Ok(vp::Mtrrs {
3856            msr_mtrr_def_type: 0, // TODO TDX: MTRRs
3857            fixed: [0; 11],       // TODO TDX: MTRRs
3858            variable: [0; 16],    // TODO TDX: MTRRs
3859        })
3860    }
3861
3862    fn set_mtrrs(&mut self, _value: &vp::Mtrrs) -> Result<(), Self::Error> {
3863        // TODO TDX: MTRRs
3864        Ok(())
3865    }
3866
3867    fn pat(&mut self) -> Result<vp::Pat, Self::Error> {
3868        let msr_cr_pat = self
3869            .vp
3870            .runner
3871            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT);
3872        Ok(vp::Pat { value: msr_cr_pat })
3873    }
3874
3875    fn set_pat(&mut self, value: &vp::Pat) -> Result<(), Self::Error> {
3876        self.vp
3877            .runner
3878            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value.value);
3879        Ok(())
3880    }
3881
3882    fn virtual_msrs(&mut self) -> Result<vp::VirtualMsrs, Self::Error> {
3883        let state = &self.vp.backing.vtls[self.vtl].private_regs;
3884
3885        let sysenter_cs = self
3886            .vp
3887            .runner
3888            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR)
3889            .into();
3890        let sysenter_eip = self
3891            .vp
3892            .runner
3893            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
3894        let sysenter_esp = self
3895            .vp
3896            .runner
3897            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
3898
3899        Ok(vp::VirtualMsrs {
3900            kernel_gs_base: state.msr_kernel_gs_base,
3901            sysenter_cs,
3902            sysenter_eip,
3903            sysenter_esp,
3904            star: state.msr_star,
3905            lstar: state.msr_lstar,
3906            cstar: self.vp.backing.vtls[self.vtl].msr_cstar,
3907            sfmask: state.msr_sfmask,
3908        })
3909    }
3910
3911    fn set_virtual_msrs(&mut self, value: &vp::VirtualMsrs) -> Result<(), Self::Error> {
3912        let &vp::VirtualMsrs {
3913            kernel_gs_base,
3914            sysenter_cs,
3915            sysenter_eip,
3916            sysenter_esp,
3917            star,
3918            lstar,
3919            cstar,
3920            sfmask,
3921        } = value;
3922
3923        let state = &mut self.vp.backing.vtls[self.vtl].private_regs;
3924        state.msr_kernel_gs_base = kernel_gs_base;
3925        state.msr_star = star;
3926        state.msr_lstar = lstar;
3927        state.msr_sfmask = sfmask;
3928
3929        self.vp.runner.write_vmcs32(
3930            self.vtl,
3931            VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
3932            !0,
3933            sysenter_cs as u32,
3934        );
3935        self.vp.runner.write_vmcs64(
3936            self.vtl,
3937            VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
3938            !0,
3939            sysenter_eip,
3940        );
3941        self.vp.runner.write_vmcs64(
3942            self.vtl,
3943            VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
3944            !0,
3945            sysenter_esp,
3946        );
3947
3948        self.vp.backing.vtls[self.vtl].msr_cstar = cstar;
3949
3950        Ok(())
3951    }
3952
3953    fn debug_regs(&mut self) -> Result<vp::DebugRegisters, Self::Error> {
3954        let mut values = [0u64.into(); 5];
3955        self.vp
3956            .runner
3957            .get_vp_registers(
3958                self.vtl,
3959                &[
3960                    HvX64RegisterName::Dr0,
3961                    HvX64RegisterName::Dr1,
3962                    HvX64RegisterName::Dr2,
3963                    HvX64RegisterName::Dr3,
3964                    HvX64RegisterName::Dr6,
3965                ],
3966                &mut values,
3967            )
3968            .map_err(vp_state::Error::GetRegisters)?;
3969
3970        let dr7 = self
3971            .vp
3972            .runner
3973            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7);
3974
3975        Ok(vp::DebugRegisters {
3976            dr0: values[0].as_u64(),
3977            dr1: values[1].as_u64(),
3978            dr2: values[2].as_u64(),
3979            dr3: values[3].as_u64(),
3980            dr6: values[4].as_u64(),
3981            dr7,
3982        })
3983    }
3984
3985    fn set_debug_regs(&mut self, value: &vp::DebugRegisters) -> Result<(), Self::Error> {
3986        let &vp::DebugRegisters {
3987            dr0,
3988            dr1,
3989            dr2,
3990            dr3,
3991            dr6,
3992            dr7,
3993        } = value;
3994        self.vp
3995            .runner
3996            .set_vp_registers(
3997                self.vtl,
3998                [
3999                    (HvX64RegisterName::Dr0, dr0),
4000                    (HvX64RegisterName::Dr1, dr1),
4001                    (HvX64RegisterName::Dr2, dr2),
4002                    (HvX64RegisterName::Dr3, dr3),
4003                    (HvX64RegisterName::Dr6, dr6),
4004                ],
4005            )
4006            .map_err(vp_state::Error::SetRegisters)?;
4007
4008        self.vp
4009            .runner
4010            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7, !0, dr7);
4011
4012        Ok(())
4013    }
4014
4015    fn tsc(&mut self) -> Result<vp::Tsc, Self::Error> {
4016        Err(vp_state::Error::Unimplemented("tsc"))
4017    }
4018
4019    fn set_tsc(&mut self, _value: &vp::Tsc) -> Result<(), Self::Error> {
4020        Err(vp_state::Error::Unimplemented("tsc"))
4021    }
4022
4023    fn tsc_aux(&mut self) -> Result<vp::TscAux, Self::Error> {
4024        Ok(vp::TscAux {
4025            value: self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux,
4026        })
4027    }
4028
4029    fn set_tsc_aux(&mut self, value: &vp::TscAux) -> Result<(), Self::Error> {
4030        self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux = value.value;
4031        Ok(())
4032    }
4033
4034    fn cet(&mut self) -> Result<vp::Cet, Self::Error> {
4035        Err(vp_state::Error::Unimplemented("cet"))
4036    }
4037
4038    fn set_cet(&mut self, _value: &vp::Cet) -> Result<(), Self::Error> {
4039        Err(vp_state::Error::Unimplemented("cet"))
4040    }
4041
4042    fn cet_ss(&mut self) -> Result<vp::CetSs, Self::Error> {
4043        Err(vp_state::Error::Unimplemented("cet_ss"))
4044    }
4045
4046    fn set_cet_ss(&mut self, _value: &vp::CetSs) -> Result<(), Self::Error> {
4047        Err(vp_state::Error::Unimplemented("cet_ss"))
4048    }
4049
4050    fn synic_msrs(&mut self) -> Result<vp::SyntheticMsrs, Self::Error> {
4051        Err(vp_state::Error::Unimplemented("synic_msrs"))
4052    }
4053
4054    fn set_synic_msrs(&mut self, _value: &vp::SyntheticMsrs) -> Result<(), Self::Error> {
4055        Err(vp_state::Error::Unimplemented("synic_msrs"))
4056    }
4057
4058    fn synic_message_page(&mut self) -> Result<vp::SynicMessagePage, Self::Error> {
4059        Err(vp_state::Error::Unimplemented("synic_message_page"))
4060    }
4061
4062    fn set_synic_message_page(&mut self, _value: &vp::SynicMessagePage) -> Result<(), Self::Error> {
4063        Err(vp_state::Error::Unimplemented("synic_message_page"))
4064    }
4065
4066    fn synic_event_flags_page(&mut self) -> Result<vp::SynicEventFlagsPage, Self::Error> {
4067        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4068    }
4069
4070    fn set_synic_event_flags_page(
4071        &mut self,
4072        _value: &vp::SynicEventFlagsPage,
4073    ) -> Result<(), Self::Error> {
4074        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
4075    }
4076
4077    fn synic_message_queues(&mut self) -> Result<vp::SynicMessageQueues, Self::Error> {
4078        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4079    }
4080
4081    fn set_synic_message_queues(
4082        &mut self,
4083        _value: &vp::SynicMessageQueues,
4084    ) -> Result<(), Self::Error> {
4085        Err(vp_state::Error::Unimplemented("synic_message_queues"))
4086    }
4087
4088    fn synic_timers(&mut self) -> Result<vp::SynicTimers, Self::Error> {
4089        Err(vp_state::Error::Unimplemented("synic_timers"))
4090    }
4091
4092    fn set_synic_timers(&mut self, _value: &vp::SynicTimers) -> Result<(), Self::Error> {
4093        Err(vp_state::Error::Unimplemented("synic_timers"))
4094    }
4095}
4096
4097/// Compute the index of the highest vector set in IRR/ISR, or 0
4098/// if no vector is set. (Vectors 0-15 are invalid so this is not
4099/// ambiguous.)
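/// For example, a register bank where only bit 7 of `reg[1]` is set yields
/// 1 * 32 + 7 = vector 39.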
4100fn top_vector(reg: &[ApicRegister; 8]) -> u8 {
4101    reg.iter()
4102        .enumerate()
4103        .rev()
4104        .find_map(|(i, r)| {
4105            (r.value != 0).then(|| (i as u32 * 32 + (31 - r.value.leading_zeros())) as u8)
4106        })
4107        .unwrap_or(0)
4108}
4109
4110struct TdHypercall<'a, 'b, T>(UhHypercallHandler<'a, 'b, T, TdxBacked>);
4111
4112impl<'a, 'b, T> AsHandler<UhHypercallHandler<'a, 'b, T, TdxBacked>> for TdHypercall<'a, 'b, T> {
4113    fn as_handler(&mut self) -> &mut UhHypercallHandler<'a, 'b, T, TdxBacked> {
4114        &mut self.0
4115    }
4116}
4117
4118impl<T> HypercallIo for TdHypercall<'_, '_, T> {
4119    fn advance_ip(&mut self) {
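        // Report success in R10 for the TDG.VP.VMCALL and advance past the
        // 4-byte TDCALL instruction.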
4120        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = 0;
4121        self.0.vp.backing.vtls[self.0.intercepted_vtl]
4122            .private_regs
4123            .rip = self.0.vp.backing.vtls[self.0.intercepted_vtl]
4124            .private_regs
4125            .rip
4126            .wrapping_add(4);
4127    }
4128
4129    fn retry(&mut self, control: u64) {
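        // Restore the hypercall control word in R10 and report
        // HV_STATUS_TIMEOUT so the guest retries the call.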
4130        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = control;
4131        self.set_result(hvdef::hypercall::HypercallOutput::from(HvError::Timeout).into());
4132    }
4133
4134    fn control(&mut self) -> u64 {
4135        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R10]
4136    }
4137
4138    fn input_gpa(&mut self) -> u64 {
4139        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::RDX]
4140    }
4141
4142    fn output_gpa(&mut self) -> u64 {
4143        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R8]
4144    }
4145
4146    fn fast_register_pair_count(&mut self) -> usize {
4147        7
4148    }
4149
4150    fn extended_fast_hypercalls_ok(&mut self) -> bool {
4151        false
4152    }
4153
4154    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
4155        self.fast_regs(0, buf);
4156        buf.len()
4157    }
4158
4159    fn fast_output(&mut self, _starting_pair_index: usize, buf: &[[u64; 2]]) {
4160        assert!(buf.is_empty());
4161    }
4162
4163    fn vtl_input(&mut self) -> u64 {
4164        unreachable!()
4165    }
4166
4167    fn set_result(&mut self, n: u64) {
4168        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = n;
4169    }
4170
4171    fn fast_regs(&mut self, starting_pair_index: usize, buf: &mut [[u64; 2]]) {
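        // Fast hypercall input: register pair 0 is RDX/R8; subsequent pairs
        // come from XMM0..XMM5, split into low and high 64-bit halves.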
4172        let regs = self.0.vp.runner.tdx_enter_guest_gps();
4173        let fx_state = self.0.vp.runner.fx_state();
4174        for (i, [low, high]) in buf.iter_mut().enumerate() {
4175            let index = i + starting_pair_index;
4176            if index == 0 {
4177                *low = regs[TdxGp::RDX];
4178                *high = regs[TdxGp::R8];
4179            } else {
4180                let value = u128::from_ne_bytes(fx_state.xmm[index - 1]);
4181                *low = value as u64;
4182                *high = (value >> 64) as u64;
4183            }
4184        }
4185    }
4186}

impl<T> hv1_hypercall::VtlSwitchOps for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn advance_ip(&mut self) {
        let long_mode = self.vp.long_mode(self.intercepted_vtl);
        let mut io = hv1_hypercall::X64RegisterIo::new(self, long_mode);
        io.advance_ip();
    }

    fn inject_invalid_opcode_fault(&mut self) {
        self.vp.backing.vtls[self.intercepted_vtl].interruption_information =
            InterruptionInformation::new()
                .with_valid(true)
                .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
                .with_vector(x86defs::Exception::INVALID_OPCODE.0);
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressList for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn flush_virtual_address_list(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        hv1_hypercall::FlushVirtualAddressListEx::flush_virtual_address_list_ex(
            self,
            processor_set,
            flags,
            gva_ranges,
        )
    }
}

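// The flush hypercalls below record the request in the per-VTL shared flush
// state (either as individual GVA ranges or by bumping a "flush entire"
// counter), kick the VPs currently running the target VTL, and then mark the
// issuing VP to wait for all TLB locks to be released before returning.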
impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressListEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_list_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        self.hcvm_validate_flush_inputs(processor_set, flags, true)
            .map_err(|e| (e, 0))?;

        let vtl = self.intercepted_vtl;
        let flush_state = &self.vp.shared.flush_state[vtl];

        // If the ranges cannot be added to the list for any reason, promote this request to a flush entire.
        if let Err(()) = Self::add_ranges_to_tlb_flush_list(
            flush_state,
            gva_ranges,
            flags.use_extended_range_format(),
        ) {
            if flags.non_global_mappings_only() {
                flush_state
                    .flush_entire_non_global_counter
                    .fetch_add(1, Ordering::Relaxed);
            } else {
                flush_state
                    .flush_entire_counter
                    .fetch_add(1, Ordering::Relaxed);
            }
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpace
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        hv1_hypercall::FlushVirtualAddressSpaceEx::flush_virtual_address_space_ex(
            self,
            processor_set,
            flags,
        )
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpaceEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        self.hcvm_validate_flush_inputs(processor_set, flags, false)?;
        let vtl = self.intercepted_vtl;

        let flush_state = &self.vp.shared.flush_state[vtl];

        // Set flush entire.
        if flags.non_global_mappings_only() {
            flush_state
                .flush_entire_non_global_counter
                .fetch_add(1, Ordering::Relaxed);
        } else {
            flush_state
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn add_ranges_to_tlb_flush_list(
        flush_state: &TdxPartitionFlushState,
        gva_ranges: &[HvGvaRange],
        use_extended_range_format: bool,
    ) -> Result<(), ()> {
        // If there are more GVAs than the list size, there's no point in filling the list.
        if gva_ranges.len() > FLUSH_GVA_LIST_SIZE {
            return Err(());
        }

        if use_extended_range_format
            && gva_ranges
                .iter()
                .any(|range| range.as_extended().large_page())
        {
            // TDX does not provide a way to flush large-page ranges,
            // so this request has to be promoted to a flush entire.
            return Err(());
        }

        flush_state
            .gva_list
            .write()
            .extend(gva_ranges.iter().copied());

        Ok(())
    }
}

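// Wakes processors so that a pending TLB flush request is observed. Passing
// `None` for the processor set targets every VP in the partition.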
impl TdxTlbLockFlushAccess<'_> {
    fn wake_processors_for_tlb_flush(
        &mut self,
        target_vtl: GuestVtl,
        processor_set: Option<ProcessorSet<'_>>,
    ) {
        match processor_set {
            Some(processors) => {
                self.wake_processors_for_tlb_flush_inner(target_vtl, processors);
            }
            None => self.wake_processors_for_tlb_flush_inner(
                target_vtl,
                0..(self.partition.vps.len() as u32),
            ),
        }
    }

    fn wake_processors_for_tlb_flush_inner(
        &mut self,
        target_vtl: GuestVtl,
        processors: impl IntoIterator<Item = u32>,
    ) {
        // Use SeqCst ordering to ensure that we are observing the most
        // up-to-date value from other VPs. Otherwise we might not send a
        // wake to a VP in a lower VTL, which could cause TLB lock holders
        // to be stuck waiting until the target_vp happens to switch into
        // VTL 2.
        // We use a single fence to avoid having to take a SeqCst load
        // for each VP.
        std::sync::atomic::fence(Ordering::SeqCst);
        self.partition.hcl.kick_cpus(
            processors.into_iter().filter(|&vp| {
                self.shared.active_vtl[vp as usize].load(Ordering::Relaxed) == target_vtl as u8
            }),
            true,
            true,
        );
    }
}

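/// Access to the TDX TLB flush and lock state shared across the partition.
/// When `vp_index` is `None`, there is no requesting VP to mark as waiting
/// for TLB locks, so `set_wait_for_tlb_locks` is a no-op.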
struct TdxTlbLockFlushAccess<'a> {
    vp_index: Option<VpIndex>,
    partition: &'a UhPartitionInner,
    shared: &'a TdxBackedShared,
}

impl TlbFlushLockAccess for TdxTlbLockFlushAccess<'_> {
    fn flush(&mut self, vtl: GuestVtl) {
        self.shared.flush_state[vtl]
            .flush_entire_counter
            .fetch_add(1, Ordering::Relaxed);

        self.wake_processors_for_tlb_flush(vtl, None);
        self.set_wait_for_tlb_locks(vtl);
    }

    fn flush_entire(&mut self) {
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.shared.flush_state[vtl]
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.wake_processors_for_tlb_flush(vtl, None);
            self.set_wait_for_tlb_locks(vtl);
        }
    }

    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl) {
        if let Some(vp_index) = self.vp_index {
            hardware_cvm::tlb_lock::TlbLockAccess {
                vp_index,
                cvm_partition: &self.shared.cvm,
            }
            .set_wait_for_tlb_locks(vtl);
        }
    }
}

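// Saving and restoring TDX processor state is not supported, so
// `SaveRestore` is implemented in terms of `SavedStateNotSupported`.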
mod save_restore {
    use super::TdxBacked;
    use super::UhProcessor;
    use vmcore::save_restore::RestoreError;
    use vmcore::save_restore::SaveError;
    use vmcore::save_restore::SaveRestore;
    use vmcore::save_restore::SavedStateNotSupported;

    impl SaveRestore for UhProcessor<'_, TdxBacked> {
        type SavedState = SavedStateNotSupported;

        fn save(&mut self) -> Result<Self::SavedState, SaveError> {
            Err(SaveError::NotSupported)
        }

        fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
            match state {}
        }
    }
}