virt_mshv_vtl/processor/tdx/mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Processor support for TDX partitions.
5
6mod tlb_flush;
7
8use super::BackingPrivate;
9use super::BackingSharedParams;
10use super::HardwareIsolatedBacking;
11use super::UhEmulationState;
12use super::UhHypercallHandler;
13use super::hardware_cvm;
14use super::vp_state;
15use super::vp_state::UhVpStateAccess;
16use crate::BackingShared;
17use crate::GuestVtl;
18use crate::IsolationType;
19use crate::TlbFlushLockAccess;
20use crate::UhCvmPartitionState;
21use crate::UhCvmVpState;
22use crate::UhPartitionInner;
23use crate::UhPartitionNewParams;
24use crate::UhProcessor;
25use crate::WakeReason;
26use crate::get_tsc_frequency;
27use cvm_tracing::CVM_ALLOWED;
28use cvm_tracing::CVM_CONFIDENTIAL;
29use guestmem::GuestMemory;
30use hcl::ioctl::ProcessorRunner;
31use hcl::ioctl::tdx::Tdx;
32use hcl::ioctl::tdx::TdxPrivateRegs;
33use hcl::protocol::hcl_intr_offload_flags;
34use hcl::protocol::tdx_tdg_vp_enter_exit_info;
35use hv1_emulator::hv::ProcessorVtlHv;
36use hv1_emulator::synic::GlobalSynic;
37use hv1_emulator::synic::ProcessorSynic;
38use hv1_hypercall::AsHandler;
39use hv1_hypercall::HvRepResult;
40use hv1_hypercall::HypercallIo;
41use hv1_structs::ProcessorSet;
42use hv1_structs::VtlArray;
43use hvdef::HV_PAGE_SIZE;
44use hvdef::HvError;
45use hvdef::HvSynicSimpSiefp;
46use hvdef::HvX64PendingExceptionEvent;
47use hvdef::HvX64RegisterName;
48use hvdef::Vtl;
49use hvdef::hypercall::HvFlushFlags;
50use hvdef::hypercall::HvGvaRange;
51use inspect::Inspect;
52use inspect::InspectMut;
53use inspect_counters::Counter;
54use std::sync::atomic::AtomicU8;
55use std::sync::atomic::Ordering;
56use thiserror::Error;
57use tlb_flush::FLUSH_GVA_LIST_SIZE;
58use tlb_flush::TdxFlushState;
59use tlb_flush::TdxPartitionFlushState;
60use virt::EmulatorMonitorSupport;
61use virt::Processor;
62use virt::VpHaltReason;
63use virt::VpIndex;
64use virt::io::CpuIo;
65use virt::state::StateElement;
66use virt::vp;
67use virt::vp::AccessVpState;
68use virt::vp::MpState;
69use virt::vp::Registers;
70use virt::x86::MsrError;
71use virt::x86::MsrErrorExt;
72use virt::x86::SegmentRegister;
73use virt::x86::TableRegister;
74use virt_support_apic::ApicClient;
75use virt_support_apic::OffloadNotSupported;
76use virt_support_x86emu::emulate::EmulatedMemoryOperation;
77use virt_support_x86emu::emulate::EmulatorSupport as X86EmulatorSupport;
78use virt_support_x86emu::emulate::TranslateMode;
79use virt_support_x86emu::emulate::emulate_insn_memory_op;
80use virt_support_x86emu::emulate::emulate_io;
81use virt_support_x86emu::emulate::emulate_translate_gva;
82use virt_support_x86emu::translate::TranslationRegisters;
83use vmcore::vmtime::VmTimeAccess;
84use x86defs::RFlags;
85use x86defs::X64_CR0_ET;
86use x86defs::X64_CR0_NE;
87use x86defs::X64_CR0_PE;
88use x86defs::X64_CR0_PG;
89use x86defs::X64_CR4_MCE;
90use x86defs::X64_CR4_UMIP;
91use x86defs::X64_CR4_VMXE;
92use x86defs::X64_EFER_FFXSR;
93use x86defs::X64_EFER_LMA;
94use x86defs::X64_EFER_LME;
95use x86defs::X64_EFER_NXE;
96use x86defs::X64_EFER_SVME;
97use x86defs::X86X_MSR_EFER;
98use x86defs::apic::X2APIC_MSR_BASE;
99use x86defs::tdx::TdCallResultCode;
100use x86defs::tdx::TdVmCallR10Result;
101use x86defs::tdx::TdxGp;
102use x86defs::tdx::TdxInstructionInfo;
103use x86defs::tdx::TdxL2Ctls;
104use x86defs::tdx::TdxVpEnterRaxResult;
105use x86defs::vmx::ApicPage;
106use x86defs::vmx::ApicRegister;
107use x86defs::vmx::CR_ACCESS_TYPE_LMSW;
108use x86defs::vmx::CR_ACCESS_TYPE_MOV_TO_CR;
109use x86defs::vmx::CrAccessQualification;
110use x86defs::vmx::ExitQualificationIo;
111use x86defs::vmx::GdtrOrIdtrInstruction;
112use x86defs::vmx::GdtrOrIdtrInstructionInfo;
113use x86defs::vmx::INTERRUPT_TYPE_EXTERNAL;
114use x86defs::vmx::INTERRUPT_TYPE_HARDWARE_EXCEPTION;
115use x86defs::vmx::INTERRUPT_TYPE_NMI;
116use x86defs::vmx::IO_SIZE_8_BIT;
117use x86defs::vmx::IO_SIZE_16_BIT;
118use x86defs::vmx::IO_SIZE_32_BIT;
119use x86defs::vmx::Interruptibility;
120use x86defs::vmx::InterruptionInformation;
121use x86defs::vmx::LdtrOrTrInstruction;
122use x86defs::vmx::LdtrOrTrInstructionInfo;
123use x86defs::vmx::ProcessorControls;
124use x86defs::vmx::SecondaryProcessorControls;
125use x86defs::vmx::VMX_ENTRY_CONTROL_LONG_MODE_GUEST;
126use x86defs::vmx::VMX_FEATURE_CONTROL_LOCKED;
127use x86defs::vmx::VmcsField;
128use x86defs::vmx::VmxEptExitQualification;
129use x86defs::vmx::VmxExit;
130use x86defs::vmx::VmxExitBasic;
131use x86emu::Gp;
132use x86emu::Segment;
133
134/// MSRs that are allowed to be read by the guest without interception.
135const MSR_ALLOWED_READ: &[u32] = &[
136    x86defs::X86X_MSR_TSC,
137    x86defs::X86X_MSR_TSC_AUX,
138    X86X_MSR_EFER,
139    x86defs::X86X_MSR_STAR,
140    x86defs::X86X_MSR_LSTAR,
141    x86defs::X86X_MSR_SFMASK,
142    x86defs::X86X_MSR_SYSENTER_CS,
143    x86defs::X86X_MSR_SYSENTER_ESP,
144    x86defs::X86X_MSR_SYSENTER_EIP,
145];
146
147/// MSRs that are allowed to be read and written by the guest without interception.
148const MSR_ALLOWED_READ_WRITE: &[u32] = &[
149    x86defs::X64_MSR_FS_BASE,
150    x86defs::X64_MSR_GS_BASE,
151    x86defs::X64_MSR_KERNEL_GS_BASE,
152    x86defs::X86X_MSR_SPEC_CTRL,
153    x86defs::X86X_MSR_U_CET,
154    x86defs::X86X_MSR_S_CET,
155    x86defs::X86X_MSR_PL0_SSP,
156    x86defs::X86X_MSR_PL1_SSP,
157    x86defs::X86X_MSR_PL2_SSP,
158    x86defs::X86X_MSR_PL3_SSP,
159    x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
160    x86defs::X86X_IA32_MSR_XFD,
161    x86defs::X86X_IA32_MSR_XFD_ERR,
162];
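
// Both allow-lists above are consumed in `BackingPrivate::new` below, which
// clears the corresponding read (and, for the second list, write) intercepts
// for each VTL via `set_msr_bit`; everything else remains intercepted by
// default.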
163
164#[derive(Debug, Error)]
165#[error("unknown exit {0:#x?}")]
166struct UnknownVmxExit(VmxExit);
167
168#[derive(Debug, Error)]
169#[error("bad guest state on VP.ENTER")]
170struct VmxBadGuestState;
171
172#[derive(Debug, Error)]
173#[error("failed to run")]
174struct TdxRunVpError(#[source] hcl::ioctl::Error);
175
176#[derive(Debug)]
177struct TdxExit<'a>(&'a tdx_tdg_vp_enter_exit_info);
178
179impl TdxExit<'_> {
180    fn code(&self) -> TdxVpEnterRaxResult {
181        self.0.rax.into()
182    }
183    fn qualification(&self) -> u64 {
184        self.0.rcx
185    }
186    fn gla(&self) -> Option<u64> {
187        // Only valid for EPT exits.
188        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
189            Some(self.0.rdx)
190        } else {
191            None
192        }
193    }
194    fn gpa(&self) -> Option<u64> {
195        // Only valid for EPT exits.
196        if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
197            Some(self.0.r8)
198        } else {
199            None
200        }
201    }
202    fn _exit_interruption_info(&self) -> InterruptionInformation {
203        (self.0.r9 as u32).into()
204    }
205    fn _exit_interruption_error_code(&self) -> u32 {
206        (self.0.r9 >> 32) as u32
207    }
208    fn idt_vectoring_info(&self) -> InterruptionInformation {
209        (self.0.r10 as u32).into()
210    }
211    fn idt_vectoring_error_code(&self) -> u32 {
212        (self.0.r10 >> 32) as u32
213    }
214    fn instr_info(&self) -> TdxInstructionInfo {
215        self.0.r11.into()
216    }
217    fn cs(&self) -> SegmentRegister {
218        SegmentRegister {
219            selector: self.0.rsi as u16,
220            base: self.0.rdi,
221            limit: (self.0.rsi >> 32) as u32,
222            attributes: (self.0.rsi >> 16) as u16,
223        }
224    }
225    fn cpl(&self) -> u8 {
226        self.0.r12 as u8 & 3
227    }
228}
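
// A minimal sketch (hypothetical values) of the bit layout that `TdxExit::cs`
// above assumes for RSI: selector in bits 0..16, attributes in bits 16..32,
// and limit in bits 32..64.
#[cfg(test)]
mod tdx_exit_layout_sketch {
    #[test]
    fn unpack_cs_fields_from_rsi() {
        // limit = 0xffff_ffff, attributes = 0x209b, selector = 0x0008
        let rsi: u64 = (0xffff_ffffu64 << 32) | (0x209b << 16) | 0x0008;
        assert_eq!(rsi as u16, 0x0008);
        assert_eq!((rsi >> 16) as u16, 0x209b);
        assert_eq!((rsi >> 32) as u32, 0xffff_ffff);
    }
}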
229
230/// Registers that can be virtual and shadowed.
231#[derive(Debug, Inspect)]
232enum ShadowedRegister {
233    Cr0,
234    Cr4,
235}
236
237impl ShadowedRegister {
238    fn name(&self) -> &'static str {
239        match self {
240            Self::Cr0 => "cr0",
241            Self::Cr4 => "cr4",
242        }
243    }
244
245    fn physical_vmcs_field(&self) -> VmcsField {
246        match self {
247            Self::Cr0 => VmcsField::VMX_VMCS_GUEST_CR0,
248            Self::Cr4 => VmcsField::VMX_VMCS_GUEST_CR4,
249        }
250    }
251
252    fn shadow_vmcs_field(&self) -> VmcsField {
253        match self {
254            Self::Cr0 => VmcsField::VMX_VMCS_CR0_READ_SHADOW,
255            Self::Cr4 => VmcsField::VMX_VMCS_CR4_READ_SHADOW,
256        }
257    }
258
259    fn guest_owned_mask(&self) -> u64 {
260        // Control register bits that are guest owned by default. A bit is guest
261        // owned when the physical register bit is always set to the virtual
262        // register bit (subject to validation of the virtual register).
263        match self {
264            Self::Cr0 => {
265                X64_CR0_ET
266                    | x86defs::X64_CR0_MP
267                    | x86defs::X64_CR0_EM
268                    | x86defs::X64_CR0_TS
269                    | x86defs::X64_CR0_WP
270                    | x86defs::X64_CR0_AM
271                    | X64_CR0_PE
272                    | X64_CR0_PG
273            }
274            Self::Cr4 => {
275                x86defs::X64_CR4_VME
276                    | x86defs::X64_CR4_PVI
277                    | x86defs::X64_CR4_TSD
278                    | x86defs::X64_CR4_DE
279                    | x86defs::X64_CR4_PSE
280                    | x86defs::X64_CR4_PAE
281                    | x86defs::X64_CR4_PGE
282                    | x86defs::X64_CR4_PCE
283                    | x86defs::X64_CR4_FXSR
284                    | x86defs::X64_CR4_XMMEXCPT
285                    | X64_CR4_UMIP
286                    | x86defs::X64_CR4_LA57
287                    | x86defs::X64_CR4_RWFSGS
288                    | x86defs::X64_CR4_PCIDE
289                    | x86defs::X64_CR4_OSXSAVE
290                    | x86defs::X64_CR4_SMEP
291                    | x86defs::X64_CR4_SMAP
292                    | x86defs::X64_CR4_CET
293            }
294        }
295    }
296}
297
298/// A virtual register that is shadowed by the virtstack.
299///
300/// Some bits are owned by the guest while others are owned by the virtstack,
301/// due to TDX requirements.
302#[derive(Inspect)]
303struct VirtualRegister {
304    /// The register being shadowed.
305    register: ShadowedRegister,
306    /// The VTL this register is shadowed for.
307    vtl: GuestVtl,
308    /// The value the guest sees.
309    shadow_value: u64,
310    /// Additional constraints on bits.
311    allowed_bits: u64,
312}
313
314impl VirtualRegister {
315    fn new(reg: ShadowedRegister, vtl: GuestVtl, initial_value: u64, allowed_bits: u64) -> Self {
316        Self {
317            register: reg,
318            vtl,
319            shadow_value: initial_value,
320            allowed_bits,
321        }
322    }
323
324    /// Write a new value to the virtual register. This updates host owned bits
325    /// in the shadowed value, and updates guest owned bits in the physical
326    /// register in the vmcs.
327    fn write<'a>(
328        &mut self,
329        value: u64,
330        runner: &mut ProcessorRunner<'a, Tdx<'a>>,
331    ) -> Result<(), vp_state::Error> {
332        tracing::trace!(?self.register, value, "write virtual register");
333
334        if value & !self.allowed_bits != 0 {
335            return Err(vp_state::Error::InvalidValue(
336                value,
337                self.register.name(),
338                "disallowed bit set",
339            ));
340        }
341
342        // If guest owned bits of the physical register have changed, then update
343        // the guest owned bits of the physical field.
344        let old_physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
345
346        tracing::trace!(old_physical_reg, "old_physical_reg");
347
348        let guest_owned_mask = self.register.guest_owned_mask();
349        if (old_physical_reg ^ value) & guest_owned_mask != 0 {
350            let new_physical_reg =
351                (old_physical_reg & !guest_owned_mask) | (value & guest_owned_mask);
352
353            tracing::trace!(new_physical_reg, "new_physical_reg");
354
355            runner.write_vmcs64(
356                self.vtl,
357                self.register.physical_vmcs_field(),
358                !0,
359                new_physical_reg,
360            );
361        }
362
363        self.shadow_value = value;
364        runner.write_vmcs64(self.vtl, self.register.shadow_vmcs_field(), !0, value);
365        Ok(())
366    }
367
368    fn read<'a>(&self, runner: &ProcessorRunner<'a, Tdx<'a>>) -> u64 {
369        let physical_reg = runner.read_vmcs64(self.vtl, self.register.physical_vmcs_field());
370
371        // Get the bits owned by the host from the shadow and the bits owned by the
372        // guest from the physical value.
373        let guest_owned_mask = self.register.guest_owned_mask();
374        (self.shadow_value & !guest_owned_mask) | (physical_reg & guest_owned_mask)
375    }
376}
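
// A minimal sketch (hypothetical test, not part of any real code path) of the
// shadow/physical combination performed by `VirtualRegister::read`, using
// plain values in place of the VMCS accessors.
#[cfg(test)]
mod virtual_register_sketch {
    use super::ShadowedRegister;
    use x86defs::X64_CR0_NE;
    use x86defs::X64_CR0_PG;

    #[test]
    fn combine_shadow_and_physical_cr0() {
        let mask = ShadowedRegister::Cr0.guest_owned_mask();
        // CR0.NE is host owned, so it comes from the shadow; CR0.PG is guest
        // owned, so it comes from the physical register.
        let shadow = X64_CR0_NE;
        let physical = X64_CR0_PG;
        let combined = (shadow & !mask) | (physical & mask);
        assert_eq!(combined, X64_CR0_NE | X64_CR0_PG);
    }
}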
377
378/// Interface for managing lower VTL timer deadlines via TDX L2-VM TSC Deadline
379/// Timer capability.
380///
381/// This allows VTL2 to set an execution deadline for lower VTLs, in absolute
382/// virtual TSC units. If the lower VTL is running when the deadline time
383/// arrives, it exits to VTL2 with exit reason `VmxExitBasic::TIMER_EXPIRED`.
384/// If the TSC deadline is already in the past on entry into the lower VTL (i.e., the
385/// deadline value is lower than the current virtual TSC value), the lower VTL
386/// immediately exits back to VTL2 with the same exit reason.
387///
388/// The TSC deadline is set using `TDG.VP.WR` for `TDVPS.TSC_DEADLINE[L2-VM Index]`.
389/// The actual `TDG.VP.WR` call to set the deadline is made by the `mshv_vtl` driver
390/// before entering the lower VTL.
391struct TdxTscDeadlineService {
392    // Fixed-point scale factor to convert 100ns to TSC units.
393    tsc_scale_100ns: u128,
394}
395
396impl TdxTscDeadlineService {
397    /// Convert hypervisor reference time (in 100ns) to TSC units.
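    ///
    /// A hedged illustration (hypothetical scale value): with a 10 MHz TSC,
    /// `tsc_scale_100ns` is `(10_000_000u128 << 64) / 10_000_000 = 1 << 64`,
    /// so each 100ns reference unit maps to exactly one TSC tick.
    ///
    /// ```ignore
    /// let svc = TdxTscDeadlineService { tsc_scale_100ns: 1u128 << 64 };
    /// assert_eq!(svc.ref_time_to_tsc(100), 100);
    /// ```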
398    fn ref_time_to_tsc(&self, ref_time: u64) -> u64 {
399        // Use fixed-point multiplication to calculate:
400        // tsc_ticks = time_100ns * tsc_frequency / 10_000_000
401        ((ref_time as u128 * self.tsc_scale_100ns) >> 64) as u64
402    }
403
404    /// Returns true if `ref_time` is before `ref_time_last`.
405    ///
406    /// Note that this uses wrapping arithmetic to handle 64-bit timestamp wraparound,
407    /// so the relation is not transitive: if `a` is before `b`, and `b` is before `c`,
408    /// `a` may still appear after `c` when they are too far apart in the circular space.
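    ///
    /// A short illustration of the wrapping semantics:
    ///
    /// ```ignore
    /// // `u64::MAX` is "before" 5 because the counter is taken to have
    /// // wrapped, even though it is numerically larger.
    /// assert!(TdxTscDeadlineService::is_before(u64::MAX, 5));
    /// assert!(!TdxTscDeadlineService::is_before(5, u64::MAX));
    /// ```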
409    fn is_before(ref_time: u64, ref_time_last: u64) -> bool {
410        let delta = ref_time.wrapping_sub(ref_time_last);
411        (delta as i64) < 0
412    }
413}
414
415impl hardware_cvm::HardwareIsolatedGuestTimer<TdxBacked> for TdxTscDeadlineService {
416    fn is_hardware_virtualized(&self) -> bool {
417        true
418    }
419
420    /// Update the virtual timer deadline in the processor's context shared with the kernel.
421    /// This deadline will be set by `mshv_vtl` using
422    /// `TDG.VP.WR(TDVPS.TSC_DEADLINE[L2-VM Index])` before entering the lower VTL.
423    fn update_deadline(
424        &self,
425        vp: &mut UhProcessor<'_, TdxBacked>,
426        ref_time_now: u64,
427        ref_time_next: u64,
428    ) {
429        let vp_state = vp
430            .backing
431            .tsc_deadline_state
432            .as_mut()
433            .expect("TdxTscDeadlineService requires tsc_deadline_state");
434
435        // Update needed only if no deadline is set or the new time is earlier.
436        if vp_state
437            .deadline_100ns
438            .is_none_or(|last| Self::is_before(ref_time_next, last))
439        {
440            // Record the new reference time.
441            vp_state.deadline_100ns = Some(ref_time_next);
442
443            let state = vp.runner.tdx_l2_tsc_deadline_state_mut();
444            if vp_state
445                .last_deadline_100ns
446                .is_none_or(|last| last != ref_time_next)
447            {
448                let ref_time_from_now = ref_time_next.saturating_sub(ref_time_now);
449                let tsc_delta = self.ref_time_to_tsc(ref_time_from_now);
450                let deadline = safe_intrinsics::rdtsc().wrapping_add(tsc_delta);
451
452                state.deadline = deadline;
453                state.update_deadline = 1;
454
455                tracing::trace!(
456                    ref_time_from_now,
457                    tsc_delta,
458                    deadline,
459                    "updating deadline for TDX L2-VM TSC deadline timer"
460                );
461            } else {
462                state.update_deadline = 0;
463            }
464        }
465    }
466
467    /// Clears the virtual timer deadline in the processor context.
468    fn clear_deadline(&self, vp: &mut UhProcessor<'_, TdxBacked>) {
469        let vp_state = vp
470            .backing
471            .tsc_deadline_state
472            .as_mut()
473            .expect("TdxTscDeadlineService requires tsc_deadline_state");
474
475        vp_state.deadline_100ns = None;
476
477        let state = vp.runner.tdx_l2_tsc_deadline_state_mut();
478        state.update_deadline = 0;
479    }
480
481    /// Synchronize armed deadline state in the processor context.
482    fn sync_deadline_state(&self, vp: &mut UhProcessor<'_, TdxBacked>) {
483        let vp_state = vp
484            .backing
485            .tsc_deadline_state
486            .as_mut()
487            .expect("TdxTscDeadlineService requires tsc_deadline_state");
488
489        vp_state.last_deadline_100ns = vp_state.deadline_100ns;
490    }
491}
492
493/// Per-VP state for TDX L2-VM TSC deadline timer.
494#[derive(Inspect, Default)]
495struct TdxTscDeadline {
496    /// Next deadline to be armed (in 100ns units).
497    #[inspect(hex)]
498    deadline_100ns: Option<u64>,
499    /// Deadline (in 100ns units) armed by `mshv_vtl` driver during previous entry
500    /// into lower VTL.
501    #[inspect(hex)]
502    last_deadline_100ns: Option<u64>,
503}
504
505/// Backing for TDX partitions.
506#[derive(InspectMut)]
507pub struct TdxBacked {
508    #[inspect(mut)]
509    vtls: VtlArray<TdxVtl, 2>,
510
511    untrusted_synic: Option<ProcessorSynic>,
512    #[inspect(hex, iter_by_index)]
513    eoi_exit_bitmap: [u64; 4],
514
515    /// A mapped page used for issuing INVGLA hypercalls.
516    #[inspect(skip)]
517    flush_page: user_driver::memory::MemoryBlock,
518
519    #[inspect(flatten)]
520    cvm: UhCvmVpState,
521
522    /// Per-processor state for [`TdxTscDeadlineService`].
523    #[inspect(flatten)]
524    tsc_deadline_state: Option<TdxTscDeadline>,
525}
526
527#[derive(InspectMut)]
528struct TdxVtl {
529    /// The EFER value for this VP.
530    efer: u64,
531    /// Virtual cr0.
532    cr0: VirtualRegister,
533    /// Virtual cr4.
534    cr4: VirtualRegister,
535
536    // CSTAR doesn't exist on TDX, but Windows likes to verify that values are sticky.
537    msr_cstar: u64,
538
539    tpr_threshold: u8,
540    #[inspect(skip)]
541    processor_controls: ProcessorControls,
542    #[inspect(skip)]
543    interruption_information: InterruptionInformation,
544    exception_error_code: u32,
545    interruption_set: bool,
546
547    #[inspect(mut)]
548    private_regs: TdxPrivateRegs,
549
550    /// TDX-only TLB flush state.
551    flush_state: TdxFlushState,
552
553    enter_stats: EnterStats,
554    exit_stats: ExitStats,
555}
556
557#[derive(Default)]
558pub struct TdxEmulationCache {
559    segs: [Option<SegmentRegister>; 6],
560    cr0: Option<u64>,
561}
562
563#[derive(Inspect, Default)]
564struct EnterStats {
565    success: Counter,
566    host_routed_async: Counter,
567    l2_exit_pending_intr: Counter,
568    pending_intr: Counter,
569    host_routed_td_vmcall: Counter,
570}
571
572#[derive(Inspect, Default)]
573struct ExitStats {
574    io: Counter,
575    msr_read: Counter,
576    msr_write: Counter,
577    ept_violation: Counter,
578    cpuid: Counter,
579    cr_access: Counter,
580    xsetbv: Counter,
581    tpr_below_threshold: Counter,
582    interrupt_window: Counter,
583    nmi_window: Counter,
584    vmcall: Counter,
585    smi_intr: Counter,
586    wbinvd: Counter,
587    hw_interrupt: Counter,
588    tdcall: Counter,
589    hlt: Counter,
590    pause: Counter,
591    needs_interrupt_reinject: Counter,
592    exception: Counter,
593    descriptor_table: Counter,
594    timer_expired: Counter,
595}
596
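/// Indices of the per-VP direct overlay pages allocated from the shared pool.
/// Used to index the overlay PFN array in `init`; `Count` is the number of
/// pages reported by `shared_pages_required_per_cpu`.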
597enum UhDirectOverlay {
598    Sipp,
599    Sifp,
600    Count,
601}
602
603impl HardwareIsolatedBacking for TdxBacked {
604    fn cvm_state(&self) -> &UhCvmVpState {
605        &self.cvm
606    }
607
608    fn cvm_state_mut(&mut self) -> &mut UhCvmVpState {
609        &mut self.cvm
610    }
611
612    fn cvm_partition_state(shared: &Self::Shared) -> &UhCvmPartitionState {
613        &shared.cvm
614    }
615
616    fn switch_vtl(this: &mut UhProcessor<'_, Self>, _source_vtl: GuestVtl, target_vtl: GuestVtl) {
617        // The GPs, Fxsave, and CR2 are saved in the shared kernel state. No copying needed.
618        // Debug registers and XFEM are shared architecturally. No copying needed.
619
620        this.backing.cvm_state_mut().exit_vtl = target_vtl;
621    }
622
623    fn translation_registers(
624        &self,
625        this: &UhProcessor<'_, Self>,
626        vtl: GuestVtl,
627    ) -> TranslationRegisters {
628        let cr0 = this.backing.vtls[vtl].cr0.read(&this.runner);
629        let cr4 = this.backing.vtls[vtl].cr4.read(&this.runner);
630        let efer = this.backing.vtls[vtl].efer;
631        let cr3 = this.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
632        let ss = this.read_segment(vtl, TdxSegmentReg::Ss).into();
633        let rflags = this.backing.vtls[vtl].private_regs.rflags;
634
635        TranslationRegisters {
636            cr0,
637            cr4,
638            efer,
639            cr3,
640            ss,
641            rflags,
642            encryption_mode: this.partition.caps.vtom.map_or(
643                virt_support_x86emu::translate::EncryptionMode::None,
644                virt_support_x86emu::translate::EncryptionMode::Vtom,
645            ),
646        }
647    }
648
649    fn tlb_flush_lock_access<'a>(
650        vp_index: Option<VpIndex>,
651        partition: &'a UhPartitionInner,
652        shared: &'a Self::Shared,
653    ) -> impl TlbFlushLockAccess + 'a {
654        TdxTlbLockFlushAccess {
655            vp_index,
656            partition,
657            shared,
658        }
659    }
660
661    fn pending_event_vector(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> Option<u8> {
662        let event_inject = this.backing.vtls[vtl].interruption_information;
663        if event_inject.valid() {
664            Some(event_inject.vector())
665        } else {
666            None
667        }
668    }
669
670    fn set_pending_exception(
671        this: &mut UhProcessor<'_, Self>,
672        vtl: GuestVtl,
673        event: HvX64PendingExceptionEvent,
674    ) {
675        let new_intr = InterruptionInformation::new()
676            .with_valid(true)
677            .with_deliver_error_code(event.deliver_error_code())
678            .with_vector(event.vector().try_into().unwrap())
679            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
680
681        this.backing.vtls[vtl].interruption_information = new_intr;
682        this.backing.vtls[vtl].exception_error_code = event.error_code();
683    }
684
685    fn cr0(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
686        this.read_cr0(vtl)
687    }
688
689    fn cr4(this: &UhProcessor<'_, Self>, vtl: GuestVtl) -> u64 {
690        this.read_cr4(vtl)
691    }
692
693    fn intercept_message_state(
694        this: &UhProcessor<'_, Self>,
695        vtl: GuestVtl,
696        include_optional_state: bool,
697    ) -> super::InterceptMessageState {
698        let exit = TdxExit(this.runner.tdx_vp_enter_exit_info());
699        let backing_vtl = &this.backing.vtls[vtl];
700        let shared_gps = this.runner.tdx_enter_guest_gps();
701
702        super::InterceptMessageState {
703            instruction_length_and_cr8: exit.instr_info().length() as u8,
704            cpl: exit.cpl(),
705            efer_lma: backing_vtl.efer & X64_EFER_LMA != 0,
706            cs: exit.cs().into(),
707            rip: backing_vtl.private_regs.rip,
708            rflags: backing_vtl.private_regs.rflags,
709            rax: shared_gps[TdxGp::RAX],
710            rdx: shared_gps[TdxGp::RDX],
711            optional: if include_optional_state {
712                Some(super::InterceptMessageOptionalState {
713                    ds: this.read_segment(vtl, TdxSegmentReg::Ds).into(),
714                    es: this.read_segment(vtl, TdxSegmentReg::Es).into(),
715                })
716            } else {
717                None
718            },
719            rcx: shared_gps[TdxGp::RCX],
720            rsi: shared_gps[TdxGp::RSI],
721            rdi: shared_gps[TdxGp::RDI],
722        }
723    }
724
725    fn cr_intercept_registration(
726        this: &mut UhProcessor<'_, Self>,
727        intercept_control: hvdef::HvRegisterCrInterceptControl,
728    ) {
729        // Today we only support intercepting VTL 0 on behalf of VTL 1.
730        let vtl = GuestVtl::Vtl0;
731        let intercept_masks = &this
732            .backing
733            .cvm_state()
734            .vtl1
735            .as_ref()
736            .unwrap()
737            .reg_intercept;
738
739        // Update CR0 and CR4 intercept masks in the VMCS.
740        this.runner.write_vmcs64(
741            vtl,
742            VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
743            !0,
744            this.shared.cr_guest_host_mask(ShadowedRegister::Cr0)
745                | if intercept_control.cr0_write() {
746                    intercept_masks.cr0_mask
747                } else {
748                    0
749                },
750        );
751        this.runner.write_vmcs64(
752            vtl,
753            VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
754            !0,
755            this.shared.cr_guest_host_mask(ShadowedRegister::Cr4)
756                | if intercept_control.cr4_write() {
757                    intercept_masks.cr4_mask
758                } else {
759                    0
760                },
761        );
762
763        // Update descriptor table intercepts.
764        let intercept_tables = intercept_control.gdtr_write()
765            | intercept_control.idtr_write()
766            | intercept_control.ldtr_write()
767            | intercept_control.tr_write();
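        // The third argument to write_vmcs32 is used throughout this file as a
        // bit mask: only the masked bits of the field are updated to the given
        // value, which is why a single-bit mask is passed here.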
768        this.runner.write_vmcs32(
769            vtl,
770            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
771            SecondaryProcessorControls::new()
772                .with_descriptor_table_exiting(true)
773                .into_bits(),
774            SecondaryProcessorControls::new()
775                .with_descriptor_table_exiting(intercept_tables)
776                .into_bits(),
777        );
778
779        // Update MSR intercepts. We only need to update those that are allowed
780        // to be passed through, as the default otherwise is to always intercept.
781        // See [`MSR_ALLOWED_READ_WRITE`].
782        this.runner.set_msr_bit(
783            vtl,
784            x86defs::X86X_MSR_S_CET,
785            true,
786            intercept_control.msr_scet_write(),
787        );
788        this.runner.set_msr_bit(
789            vtl,
790            x86defs::X86X_MSR_PL0_SSP,
791            true,
792            intercept_control.msr_pls_ssp_write(),
793        );
794        this.runner.set_msr_bit(
795            vtl,
796            x86defs::X86X_MSR_PL1_SSP,
797            true,
798            intercept_control.msr_pls_ssp_write(),
799        );
800        this.runner.set_msr_bit(
801            vtl,
802            x86defs::X86X_MSR_PL2_SSP,
803            true,
804            intercept_control.msr_pls_ssp_write(),
805        );
806        this.runner.set_msr_bit(
807            vtl,
808            x86defs::X86X_MSR_PL3_SSP,
809            true,
810            intercept_control.msr_pls_ssp_write(),
811        );
812        this.runner.set_msr_bit(
813            vtl,
814            x86defs::X86X_MSR_INTERRUPT_SSP_TABLE_ADDR,
815            true,
816            intercept_control.msr_pls_ssp_write(),
817        );
818    }
819
820    fn is_interrupt_pending(
821        this: &mut UhProcessor<'_, Self>,
822        vtl: GuestVtl,
823        check_rflags: bool,
824        dev: &impl CpuIo,
825    ) -> bool {
826        let backing_vtl = &this.backing.vtls[vtl];
827        if backing_vtl.interruption_information.valid()
828            && backing_vtl.interruption_information.interruption_type() == INTERRUPT_TYPE_NMI
829        {
830            return true;
831        }
832
833        let (vector, ppr) = if this.backing.cvm.lapics[vtl].lapic.is_offloaded() {
834            let vector = backing_vtl.private_regs.rvi;
835            let ppr = std::cmp::max(
836                backing_vtl.private_regs.svi.into(),
837                this.runner.tdx_apic_page(vtl).tpr.value,
838            );
839            (vector, ppr)
840        } else {
841            let lapic = &mut this.backing.cvm.lapics[vtl].lapic;
842            let vector = lapic.next_irr().unwrap_or(0);
843            let ppr = lapic
844                .access(&mut TdxApicClient {
845                    partition: this.partition,
846                    apic_page: this.runner.tdx_apic_page_mut(vtl),
847                    dev,
848                    vmtime: &this.vmtime,
849                    vtl,
850                })
851                .get_ppr();
852            (vector, ppr)
853        };
854        let vector_priority = (vector as u32) >> 4;
855        let ppr_priority = ppr >> 4;
856
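        // E.g., a pending vector of 0x51 has priority 5 and can only be
        // delivered if the processor's priority (ppr >> 4) is 4 or lower.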
857        if vector_priority <= ppr_priority {
858            return false;
859        }
860
861        if check_rflags && !RFlags::from_bits(backing_vtl.private_regs.rflags).interrupt_enable() {
862            return false;
863        }
864
865        let interruptibility: Interruptibility = this
866            .runner
867            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
868            .into();
869
870        if interruptibility.blocked_by_sti() || interruptibility.blocked_by_movss() {
871            return false;
872        }
873
874        true
875    }
876
877    fn untrusted_synic_mut(&mut self) -> Option<&mut ProcessorSynic> {
878        self.untrusted_synic.as_mut()
879    }
880
881    fn update_deadline(this: &mut UhProcessor<'_, Self>, ref_time_now: u64, next_ref_time: u64) {
882        this.shared
883            .guest_timer
884            .update_deadline(this, ref_time_now, next_ref_time);
885    }
886
887    fn clear_deadline(this: &mut UhProcessor<'_, Self>) {
888        this.shared.guest_timer.clear_deadline(this);
889    }
890}
891
892/// Partition-wide shared data for TDX VPs.
893#[derive(Inspect)]
894pub struct TdxBackedShared {
895    #[inspect(flatten)]
896    pub(crate) cvm: UhCvmPartitionState,
897    /// The synic state used for untrusted SINTs, that is, the SINTs for which
898    /// the guest thinks it is interacting directly with the untrusted
899    /// hypervisor via an architecture-specific interface.
900    pub(crate) untrusted_synic: Option<GlobalSynic>,
901    flush_state: VtlArray<TdxPartitionFlushState, 2>,
902    #[inspect(iter_by_index)]
903    active_vtl: Vec<AtomicU8>,
904    /// CR4 bits that the guest is allowed to set to 1.
905    cr4_allowed_bits: u64,
906    /// Accessor for managing lower VTL timer deadlines.
907    #[inspect(skip)]
908    guest_timer: Box<dyn hardware_cvm::HardwareIsolatedGuestTimer<TdxBacked>>,
909}
910
911impl TdxBackedShared {
912    pub(crate) fn new(
913        partition_params: &UhPartitionNewParams<'_>,
914        params: BackingSharedParams<'_>,
915    ) -> Result<Self, crate::Error> {
916        // Create a second synic to fully manage the untrusted SINTs
917        // here. At time of writing, the hypervisor does not support
918        // sharing the untrusted SINTs with the TDX L1. Even if it did,
919        // performance would be poor for cases where the L1 implements
920        // high-performance devices.
921        let untrusted_synic = (partition_params.handle_synic && !partition_params.hide_isolation)
922            .then(|| GlobalSynic::new(partition_params.topology.vp_count()));
923
924        // TODO TDX: Consider just using MSR kernel module instead of explicit ioctl.
925        let cr4_fixed1 = params.hcl.read_vmx_cr4_fixed1();
926        let cr4_allowed_bits =
927            (ShadowedRegister::Cr4.guest_owned_mask() | X64_CR4_MCE) & cr4_fixed1;
928
929        let cvm = params.cvm_state.unwrap();
930
931        // Configure timer interface for lower VTLs.
932        let guest_timer: Box<dyn hardware_cvm::HardwareIsolatedGuestTimer<TdxBacked>> =
933            match params.lower_vtl_timer_virt_available {
934                true => {
935                    // Use TDX L2-VM TSC deadline timer service. Calculate scale factor
936                    // for fixed-point conversion from 100ns to TSC units.
937                    let tsc_frequency = get_tsc_frequency(IsolationType::Tdx).unwrap();
938                    const NUM_100NS_IN_SEC: u128 = 10_000_000;
939                    let tsc_scale_100ns = ((tsc_frequency as u128) << 64) / NUM_100NS_IN_SEC;
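                    // With a hypothetical 2.0 GHz TSC, for example, this
                    // yields tsc_scale_100ns == 200 << 64, i.e. 200 TSC ticks
                    // per 100ns reference unit.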
940
941                    tracing::info!(CVM_ALLOWED, "enabling TDX L2-VM TSC deadline timer service");
942
943                    Box::new(TdxTscDeadlineService { tsc_scale_100ns })
944                }
945                false => {
946                    // Fall back to [`VmTime`] interface.
947                    Box::new(hardware_cvm::VmTimeGuestTimer)
948                }
949            };
950
951        Ok(Self {
952            untrusted_synic,
953            flush_state: VtlArray::from_fn(|_| TdxPartitionFlushState::new()),
954            cvm,
955            // VPs start in VTL 2.
956            active_vtl: std::iter::repeat_n(2, partition_params.topology.vp_count() as usize)
957                .map(AtomicU8::new)
958                .collect(),
959            cr4_allowed_bits,
960            guest_timer,
961        })
962    }
963
964    /// Get the default guest host mask for the specified register.
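    ///
    /// Bits set in the returned mask are host owned: guest reads of those bits
    /// come from the read shadow, and guest writes that change them from the
    /// shadow value cause a VM exit rather than updating the physical register
    /// directly.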
965    fn cr_guest_host_mask(&self, reg: ShadowedRegister) -> u64 {
966        match reg {
967            ShadowedRegister::Cr0 => {
968                !ShadowedRegister::Cr0.guest_owned_mask() | X64_CR0_PE | X64_CR0_PG
969            }
970            ShadowedRegister::Cr4 => {
971                !(ShadowedRegister::Cr4.guest_owned_mask() & self.cr4_allowed_bits)
972            }
973        }
974    }
975}
976
977impl TdxBacked {
978    /// Gets the number of pages that will be allocated from the shared page pool
979    /// for each CPU.
980    pub fn shared_pages_required_per_cpu() -> u64 {
981        UhDirectOverlay::Count as u64
982    }
983}
984
985// The memory used to back the untrusted synic is not guest-visible, but rather
986// is allocated from our shared pool. Therefore it does not need to go through
987// the normal memory protections path.
988struct UntrustedSynicVtlProts<'a>(&'a GuestMemory);
989
990impl hv1_emulator::VtlProtectAccess for UntrustedSynicVtlProts<'_> {
991    fn check_modify_and_lock_overlay_page(
992        &mut self,
993        gpn: u64,
994        _check_perms: hvdef::HvMapGpaFlags,
995        _new_perms: Option<hvdef::HvMapGpaFlags>,
996    ) -> Result<guestmem::LockedPages, HvError> {
997        self.0
998            .lock_gpns(false, &[gpn])
999            .map_err(|_| HvError::OperationFailed)
1000    }
1001
1002    fn unlock_overlay_page(&mut self, _gpn: u64) -> Result<(), HvError> {
1003        Ok(())
1004    }
1005}
1006
1007#[expect(private_interfaces)]
1008impl BackingPrivate for TdxBacked {
1009    type HclBacking<'tdx> = Tdx<'tdx>;
1010    type Shared = TdxBackedShared;
1011    type EmulationCache = TdxEmulationCache;
1012
1013    fn shared(shared: &BackingShared) -> &Self::Shared {
1014        let BackingShared::Tdx(shared) = shared else {
1015            unreachable!()
1016        };
1017        shared
1018    }
1019
1020    fn new(
1021        params: super::BackingParams<'_, '_, Self>,
1022        shared: &TdxBackedShared,
1023    ) -> Result<Self, crate::Error> {
1024        // TODO TDX: ssp is for shadow stack
1025        // TODO TDX: direct overlay like snp?
1026        // TODO TDX: lapic / APIC setup?
1027        // TODO TDX: see ValInitializeVplc
1028        // TODO TDX: XCR_XFMEM setup?
1029
1030        // Turn on MBEC for just VTL 0.
1031        params.runner.write_vmcs32(
1032            GuestVtl::Vtl0,
1033            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
1034            SecondaryProcessorControls::new()
1035                .with_mode_based_execute_control(true)
1036                .into(),
1037            SecondaryProcessorControls::new()
1038                .with_mode_based_execute_control(true)
1039                .into(),
1040        );
1041
1042        let controls = TdxL2Ctls::new()
1043            // Configure L2 controls to permit shared memory.
1044            .with_enable_shared_ept(!shared.cvm.hide_isolation)
1045            // If the synic is to be managed by the hypervisor, then enable TDVMCALLs.
1046            .with_enable_tdvmcall(shared.untrusted_synic.is_none() && !shared.cvm.hide_isolation);
1047
1048        params
1049            .runner
1050            .set_l2_ctls(GuestVtl::Vtl0, controls)
1051            .map_err(crate::Error::FailedToSetL2Ctls)?;
1052
1053        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
1054            // Set guest/host masks for CR0 and CR4. These enable shadowing these
1055            // registers since TDX requires certain bits to be set at all times.
1056            let initial_cr0 = params
1057                .runner
1058                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
1059            assert_eq!(initial_cr0, X64_CR0_PE | X64_CR0_NE);
1060
1061            // N.B. CR0.PE and CR0.PG are guest owned, but we still intercept when they
1062            // change, both for caching purposes and to ensure EFER is managed
1063            // properly when the execution state needs to change.
1064            params.runner.write_vmcs64(
1065                vtl,
1066                VmcsField::VMX_VMCS_CR0_READ_SHADOW,
1067                !0,
1068                X64_CR0_PE | X64_CR0_NE,
1069            );
1070            params.runner.write_vmcs64(
1071                vtl,
1072                VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK,
1073                !0,
1074                shared.cr_guest_host_mask(ShadowedRegister::Cr0),
1075            );
1076
1077            let initial_cr4 = params
1078                .runner
1079                .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
1080            assert_eq!(initial_cr4, X64_CR4_MCE | X64_CR4_VMXE);
1081
1082            params
1083                .runner
1084                .write_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW, !0, 0);
1085            params.runner.write_vmcs64(
1086                vtl,
1087                VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK,
1088                !0,
1089                shared.cr_guest_host_mask(ShadowedRegister::Cr4),
1090            );
1091
1092            // Configure the MSR bitmap for this VP. Since the default MSR bitmap
1093            // intercepts everything, only the MSRs that we want to allow to pass
1094            // through need to be configured here.
1095            for msr in MSR_ALLOWED_READ {
1096                params.runner.set_msr_bit(vtl, *msr, false, false);
1097            }
1098            for msr in MSR_ALLOWED_READ_WRITE {
1099                params.runner.set_msr_bit(vtl, *msr, false, false);
1100                params.runner.set_msr_bit(vtl, *msr, true, false);
1101            }
1102
1103            // Set the exception bitmap.
1104            if params.partition.intercept_debug_exceptions {
1105                if cfg!(feature = "gdb") {
1106                    let initial_exception_bitmap = params
1107                        .runner
1108                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
1109
1110                    let exception_bitmap =
1111                        initial_exception_bitmap | (1 << x86defs::Exception::DEBUG.0);
1112
1113                    params.runner.write_vmcs32(
1114                        vtl,
1115                        VmcsField::VMX_VMCS_EXCEPTION_BITMAP,
1116                        !0,
1117                        exception_bitmap,
1118                    );
1119                } else {
1120                    return Err(super::Error::InvalidDebugConfiguration);
1121                }
1122            }
1123        }
1124
1125        let flush_page = shared
1126            .cvm
1127            .private_dma_client
1128            .allocate_dma_buffer(HV_PAGE_SIZE as usize)
1129            .map_err(crate::Error::AllocateTlbFlushPage)?;
1130
1131        let untrusted_synic = shared
1132            .untrusted_synic
1133            .as_ref()
1134            .map(|synic| synic.add_vp(params.vp_info.base.vp_index));
1135
1136        Ok(Self {
1137            vtls: VtlArray::from_fn(|vtl| {
1138                let vtl: GuestVtl = vtl.try_into().unwrap();
1139                TdxVtl {
1140                    efer: params
1141                        .runner
1142                        .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER),
1143                    cr0: VirtualRegister::new(
1144                        ShadowedRegister::Cr0,
1145                        vtl,
1146                        params
1147                            .runner
1148                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0),
1149                        !0,
1150                    ),
1151                    cr4: VirtualRegister::new(
1152                        ShadowedRegister::Cr4,
1153                        vtl,
1154                        params
1155                            .runner
1156                            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4),
1157                        shared.cr4_allowed_bits,
1158                    ),
1159                    msr_cstar: 0,
1160                    tpr_threshold: 0,
1161                    processor_controls: params
1162                        .runner
1163                        .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS)
1164                        .into(),
1165                    interruption_information: Default::default(),
1166                    exception_error_code: 0,
1167                    interruption_set: false,
1168                    flush_state: TdxFlushState::new(),
1169                    private_regs: TdxPrivateRegs::new(vtl),
1170                    enter_stats: Default::default(),
1171                    exit_stats: Default::default(),
1172                }
1173            }),
1174            untrusted_synic,
1175            eoi_exit_bitmap: [0; 4],
1176            flush_page,
1177            cvm: UhCvmVpState::new(
1178                &shared.cvm,
1179                params.partition,
1180                params.vp_info,
1181                UhDirectOverlay::Count as usize,
1182            )?,
1183            tsc_deadline_state: shared
1184                .guest_timer
1185                .is_hardware_virtualized()
1186                .then(TdxTscDeadline::default),
1187        })
1188    }
1189
1190    type StateAccess<'p, 'a>
1191        = UhVpStateAccess<'a, 'p, Self>
1192    where
1193        Self: 'a + 'p,
1194        'p: 'a;
1195
1196    fn access_vp_state<'a, 'p>(
1197        this: &'a mut UhProcessor<'p, Self>,
1198        vtl: GuestVtl,
1199    ) -> Self::StateAccess<'p, 'a> {
1200        UhVpStateAccess::new(this, vtl)
1201    }
1202
1203    fn init(this: &mut UhProcessor<'_, Self>) {
1204        // Configure the synic direct overlays.
1205        // So far, only VTL 0 is using these (for VMBus).
1206        let pfns = &this.backing.cvm.direct_overlay_handle.pfns();
1207        let reg = |gpn| {
1208            u64::from(
1209                HvSynicSimpSiefp::new()
1210                    .with_base_gpn(gpn)
1211                    .with_enabled(true),
1212            )
1213        };
1214
1215        let values: &[(HvX64RegisterName, u64); 2] = &[
1216            (
1217                HvX64RegisterName::Sifp,
1218                reg(pfns[UhDirectOverlay::Sifp as usize]),
1219            ),
1220            (
1221                HvX64RegisterName::Sipp,
1222                reg(pfns[UhDirectOverlay::Sipp as usize]),
1223            ),
1224        ];
1225
1226        let reg_count = if let Some(synic) = &mut this.backing.untrusted_synic {
1227            let prot_access = &mut UntrustedSynicVtlProts(&this.partition.gm[GuestVtl::Vtl0]);
1228
1229            synic
1230                .set_simp(reg(pfns[UhDirectOverlay::Sipp as usize]), prot_access)
1231                .unwrap();
1232            synic
1233                .set_siefp(reg(pfns[UhDirectOverlay::Sifp as usize]), prot_access)
1234                .unwrap();
1235            // Set the SIEFP in the hypervisor so that the hypervisor can
1236            // directly signal synic events. Don't set the SIMP, since the
1237            // message page is owned by the paravisor.
1238            1
1239        } else {
1240            2
1241        };
1242
1243        this.runner
1244            .set_vp_registers_hvcall(Vtl::Vtl0, &values[..reg_count])
1245            .expect("set_vp_registers hypercall for direct overlays should succeed");
1246
1247        // Enable APIC offload by default for VTL 0.
1248        this.set_apic_offload(GuestVtl::Vtl0, true);
1249        this.backing.cvm.lapics[GuestVtl::Vtl0]
1250            .lapic
1251            .enable_offload();
1252
1253        // But disable it for VTL 1.
1254        this.set_apic_offload(GuestVtl::Vtl1, false);
1255
1256        // Initialize registers to the reset state, since this may be different
1257        // than what's on the VMCS and is certainly different than what's in the
1258        // VP enter and private register state (which was mostly zero
1259        // initialized).
1260        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
1261            let registers = Registers::at_reset(&this.partition.caps, &this.inner.vp_info);
1262
1263            let mut state = this.access_state(vtl.into());
1264            state
1265                .set_registers(&registers)
1266                .expect("Resetting to architectural state should succeed");
1267
1268            state.commit().expect("committing state should succeed");
1269        }
1270
1271        // FX regs and XMM registers are zero-initialized by the kernel. Set
1272        // them to the arch default.
1273        *this.runner.fx_state_mut() =
1274            vp::Xsave::at_reset(&this.partition.caps, &this.inner.vp_info).fxsave();
1275    }
1276
1277    async fn run_vp(
1278        this: &mut UhProcessor<'_, Self>,
1279        dev: &impl CpuIo,
1280        _stop: &mut virt::StopVp<'_>,
1281    ) -> Result<(), VpHaltReason> {
1282        this.run_vp_tdx(dev).await
1283    }
1284
1285    fn poll_apic(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl, scan_irr: bool) {
1286        if !this.try_poll_apic(vtl, scan_irr) {
1287            tracing::info!(CVM_ALLOWED, "disabling APIC offload due to auto EOI");
1288            let page = this.runner.tdx_apic_page_mut(vtl);
1289            let (irr, isr) = pull_apic_offload(page);
1290
1291            this.backing.cvm.lapics[vtl]
1292                .lapic
1293                .disable_offload(&irr, &isr);
1294            this.set_apic_offload(vtl, false);
1295            this.try_poll_apic(vtl, false);
1296        }
1297    }
1298
1299    fn request_extint_readiness(_this: &mut UhProcessor<'_, Self>) {
1300        unreachable!("extint managed through software apic")
1301    }
1302
1303    fn request_untrusted_sint_readiness(this: &mut UhProcessor<'_, Self>, sints: u16) {
1304        if let Some(synic) = &mut this.backing.untrusted_synic {
1305            synic.request_sint_readiness(sints);
1306        } else {
1307            tracelimit::error_ratelimited!(CVM_ALLOWED, "untrusted synic is not configured");
1308        }
1309    }
1310
1311    fn hv(&self, vtl: GuestVtl) -> Option<&ProcessorVtlHv> {
1312        Some(&self.cvm.hv[vtl])
1313    }
1314
1315    fn hv_mut(&mut self, vtl: GuestVtl) -> Option<&mut ProcessorVtlHv> {
1316        Some(&mut self.cvm.hv[vtl])
1317    }
1318
1319    fn handle_vp_start_enable_vtl_wake(this: &mut UhProcessor<'_, Self>, vtl: GuestVtl) {
1320        this.hcvm_handle_vp_start_enable_vtl(vtl)
1321    }
1322
1323    fn vtl1_inspectable(this: &UhProcessor<'_, Self>) -> bool {
1324        this.hcvm_vtl1_inspectable()
1325    }
1326
1327    fn process_interrupts(
1328        this: &mut UhProcessor<'_, Self>,
1329        scan_irr: VtlArray<bool, 2>,
1330        first_scan_irr: &mut bool,
1331        dev: &impl CpuIo,
1332    ) -> bool {
1333        this.cvm_process_interrupts(scan_irr, first_scan_irr, dev)
1334    }
1335}
1336
1337impl UhProcessor<'_, TdxBacked> {
1338    /// Returns `false` if the APIC offload needs to be disabled and the
1339    /// poll retried.
1340    fn try_poll_apic(&mut self, vtl: GuestVtl, scan_irr: bool) -> bool {
1341        let mut scan = TdxApicScanner {
1342            processor_controls: self.backing.vtls[vtl]
1343                .processor_controls
1344                .with_nmi_window_exiting(false)
1345                .with_interrupt_window_exiting(false),
1346            vp: self,
1347            tpr_threshold: 0,
1348        };
1349
1350        // TODO TDX: filter proxy IRRs by setting the `proxy_irr_blocked` field of the run page
1351        hardware_cvm::apic::poll_apic_core(&mut scan, vtl, scan_irr);
1352
1353        let TdxApicScanner {
1354            vp: _,
1355            processor_controls: new_processor_controls,
1356            tpr_threshold: new_tpr_threshold,
1357        } = scan;
1358
1359        // Interrupts are ignored while waiting for SIPI.
1360        if self.backing.cvm.lapics[vtl].activity != MpState::WaitForSipi
1361            && self.backing.vtls[vtl].tpr_threshold != new_tpr_threshold
1362        {
1363            tracing::trace!(new_tpr_threshold, ?vtl, "setting tpr threshold");
1364            self.runner.write_vmcs32(
1365                vtl,
1366                VmcsField::VMX_VMCS_TPR_THRESHOLD,
1367                !0,
1368                new_tpr_threshold.into(),
1369            );
1370            self.backing.vtls[vtl].tpr_threshold = new_tpr_threshold;
1371        }
1372
1373        if self.backing.vtls[vtl].processor_controls != new_processor_controls {
1374            tracing::trace!(?new_processor_controls, ?vtl, "requesting window change");
1375            self.runner.write_vmcs32(
1376                vtl,
1377                VmcsField::VMX_VMCS_PROCESSOR_CONTROLS,
1378                !0,
1379                new_processor_controls.into(),
1380            );
1381            self.backing.vtls[vtl].processor_controls = new_processor_controls;
1382        }
1383
1384        // Offloading and proxying are only done with VTL 0 today.
1385        if vtl == GuestVtl::Vtl0 {
1386            let mut update_rvi = false;
1387            let r: Result<(), OffloadNotSupported> = self.backing.cvm.lapics[vtl]
1388                .lapic
1389                .push_to_offload(|irr, isr, tmr| {
1390                    let apic_page = self.runner.tdx_apic_page_mut(vtl);
1391
1392                    for (((irr, page_irr), isr), page_isr) in irr
1393                        .iter()
1394                        .zip(&mut apic_page.irr)
1395                        .zip(isr)
1396                        .zip(&mut apic_page.isr)
1397                    {
1398                        page_irr.value |= *irr;
1399                        page_isr.value |= *isr;
1400                    }
1401
1402                    // Update SVI and RVI.
1403                    let svi = top_vector(&apic_page.isr);
1404                    self.backing.vtls[vtl].private_regs.svi = svi;
1405                    update_rvi = true;
1406
1407                    // Ensure the EOI exit bitmap is up to date.
1408                    let fields = [
1409                        VmcsField::VMX_VMCS_EOI_EXIT_0,
1410                        VmcsField::VMX_VMCS_EOI_EXIT_1,
1411                        VmcsField::VMX_VMCS_EOI_EXIT_2,
1412                        VmcsField::VMX_VMCS_EOI_EXIT_3,
1413                    ];
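                    // The TMR arrives as eight 32-bit words; each pair of
                    // words is packed low-word-first into one 64-bit EOI-exit
                    // field below, and mirrored into the kernel's copy.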
1414                    for ((&field, eoi_exit), (i, tmr)) in fields
1415                        .iter()
1416                        .zip(&mut self.backing.eoi_exit_bitmap)
1417                        .zip(tmr.chunks_exact(2).enumerate())
1418                    {
1419                        let tmr = tmr[0] as u64 | ((tmr[1] as u64) << 32);
1420                        if *eoi_exit != tmr {
1421                            self.runner.write_vmcs64(vtl, field, !0, tmr);
1422                            *eoi_exit = tmr;
1423                            // The kernel driver supports some common APIC functionality (ICR writes,
1424                            // interrupt injection). When the kernel driver handles an interrupt, it
1425                            // must know if that interrupt was previously level-triggered. Otherwise,
1426                            // the EOI will not be treated as level-triggered. We keep a copy
1427                            // of the tmr in the kernel so it knows when this scenario occurs.
1428                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2] = tmr as u32;
1429                            self.runner.proxy_irr_exit_mut_vtl0()[i * 2 + 1] = (tmr >> 32) as u32;
1430                        }
1431                    }
1432                });
1433
1434            if let Err(OffloadNotSupported) = r {
1435                // APIC needs offloading to be disabled to support auto-EOI. The caller
1436                // will disable offload and try again.
1437                return false;
1438            }
1439
1440            if update_rvi {
1441                let page = self.runner.tdx_apic_page_mut(vtl);
1442                let rvi = top_vector(&page.irr);
1443                self.backing.vtls[vtl].private_regs.rvi = rvi;
1444            }
1445        }
1446
1447        // If there is a pending interrupt, clear the halted and idle state.
1448        if (self.backing.cvm.lapics[vtl].activity != MpState::Running)
1449            && self.backing.cvm.lapics[vtl].lapic.is_offloaded()
1450            && self.backing.vtls[vtl].private_regs.rvi != 0
1451        {
1452            // To model a non-virtualized processor, we should only do this if
1453            // TPR and IF and interrupt shadow allow. However, fetching the
1454            // interrupt shadow state is expensive (tdcall). This shouldn't
1455            // matter much, because real guests don't issue hlt while in
1456            // interrupt shadow or with interrupts disabled or with a non-zero
1457            // TPR.
1458            //
1459            // Note that the processor will not actually inject the interrupt
1460            // until conditions hold. So, unless the guest fails to loop around
1461            // and hlt again (which we already treat as a guest bug, since
1462            // Hyper-V in general does not guarantee hlt will stick until an
1463            // interrupt is pending), at worst this will just burn some CPU.
1464            self.backing.cvm.lapics[vtl].activity = MpState::Running;
1465        }
1466
1467        true
1468    }
1469
1470    fn access_apic_without_offload<R>(
1471        &mut self,
1472        vtl: GuestVtl,
1473        f: impl FnOnce(&mut Self) -> R,
1474    ) -> R {
1475        let offloaded = self.backing.cvm.lapics[vtl].lapic.is_offloaded();
1476        if offloaded {
1477            let (irr, isr) = pull_apic_offload(self.runner.tdx_apic_page_mut(vtl));
1478            self.backing.cvm.lapics[vtl]
1479                .lapic
1480                .disable_offload(&irr, &isr);
1481        }
1482        let r = f(self);
1483        if offloaded {
1484            self.backing.cvm.lapics[vtl].lapic.enable_offload();
1485        }
1486        r
1487    }
1488
1489    fn set_apic_offload(&mut self, vtl: GuestVtl, offload: bool) {
1490        // Update the APIC portion of the MSR bitmap.
1491        let offload_bitmap = if offload {
1492            (1 << x86defs::apic::ApicRegister::TPR.0)
1493                | (1 << x86defs::apic::ApicRegister::EOI.0)
1494                | (1 << x86defs::apic::ApicRegister::SELF_IPI.0)
1495        } else {
1496            0
1497        };
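        // A set bit in the MSR bitmap intercepts the access, so the bits for
        // the offloaded registers (TPR, EOI, SELF_IPI) are cleared to give the
        // guest direct access to those x2APIC MSRs while offload is enabled.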
1498        // Once for read and once for write.
1499        for offset in [0, 0x100] {
1500            self.runner
1501                .write_msr_bitmap(vtl, offset + X2APIC_MSR_BASE / 64, !0, !offload_bitmap);
1502        }
1503
1504        // Update virtual-interrupt delivery.
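        // Only the virtual-interrupt-delivery bit is included in the mask, so
        // this write toggles just that control to track whether offload is on.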
1505        self.runner.write_vmcs32(
1506            vtl,
1507            VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS,
1508            SecondaryProcessorControls::new()
1509                .with_virtual_interrupt_delivery(true)
1510                .into(),
1511            SecondaryProcessorControls::new()
1512                .with_virtual_interrupt_delivery(offload)
1513                .into(),
1514        );
1515
1516        // Clear any pending external interrupt when enabling the APIC offload.
1517        if offload
1518            && self.backing.vtls[vtl]
1519                .interruption_information
1520                .interruption_type()
1521                == INTERRUPT_TYPE_EXTERNAL
1522        {
1523            self.backing.vtls[vtl]
1524                .interruption_information
1525                .set_valid(false);
1526        }
1527    }
1528}
1529
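/// Adapter used by the common hardware CVM APIC scan logic; it accumulates
/// interrupt/NMI window requests in `processor_controls` and the TPR
/// threshold required for pending-but-masked interrupts.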
1530struct TdxApicScanner<'a, 'b> {
1531    vp: &'a mut UhProcessor<'b, TdxBacked>,
1532    processor_controls: ProcessorControls,
1533    tpr_threshold: u8,
1534}
1535
1536impl<'b> hardware_cvm::apic::ApicBacking<'b, TdxBacked> for TdxApicScanner<'_, 'b> {
1537    fn vp(&mut self) -> &mut UhProcessor<'b, TdxBacked> {
1538        self.vp
1539    }
1540
1541    fn handle_interrupt(&mut self, vtl: GuestVtl, vector: u8) {
1542        // Exit idle when an interrupt is received, regardless of IF
1543        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1544            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1545        }
1546        // If there is a higher-priority pending event of some kind, then
1547        // just request an exit after it has resolved, after which we will
1548        // try again.
1549        if self.vp.backing.vtls[vtl].interruption_information.valid()
1550            && self.vp.backing.vtls[vtl]
1551                .interruption_information
1552                .interruption_type()
1553                != INTERRUPT_TYPE_EXTERNAL
1554        {
1555            self.processor_controls.set_interrupt_window_exiting(true);
1556            return;
1557        }
1558
1559        // Ensure the interrupt is not blocked by RFLAGS.IF or interrupt shadow.
1560        let interruptibility: Interruptibility = self
1561            .vp
1562            .runner
1563            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1564            .into();
1565
1566        let rflags = RFlags::from(self.vp.backing.vtls[vtl].private_regs.rflags);
1567        if !rflags.interrupt_enable()
1568            || interruptibility.blocked_by_sti()
1569            || interruptibility.blocked_by_movss()
1570        {
1571            self.processor_controls.set_interrupt_window_exiting(true);
1572            return;
1573        }
1574
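        // The TPR masks vectors whose priority class is at or below its own.
        // If this vector is masked, record the threshold so a
        // TPR_BELOW_THRESHOLD exit fires once the guest lowers TPR and the
        // APIC can be reevaluated.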
1575        let priority = vector >> 4;
1576        let apic = self.vp.runner.tdx_apic_page(vtl);
1577        if (apic.tpr.value as u8 >> 4) >= priority {
1578            self.tpr_threshold = priority;
1579            return;
1580        }
1581
1582        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1583            .with_valid(true)
1584            .with_vector(vector)
1585            .with_interruption_type(INTERRUPT_TYPE_EXTERNAL);
1586
1587        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1588    }
1589
1590    fn handle_nmi(&mut self, vtl: GuestVtl) {
1591        // Exit idle when an interrupt is received, regardless of IF
1592        // TODO: Investigate lifting more activity management into poll_apic_core
1593        if self.vp.backing.cvm.lapics[vtl].activity == MpState::Idle {
1594            self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1595        }
1596        // If there is a higher-priority pending event of some kind, then
1597        // just request an exit after it has resolved, after which we will
1598        // try again.
1599        if self.vp.backing.vtls[vtl].interruption_information.valid()
1600            && self.vp.backing.vtls[vtl]
1601                .interruption_information
1602                .interruption_type()
1603                != INTERRUPT_TYPE_EXTERNAL
1604        {
1605            self.processor_controls.set_nmi_window_exiting(true);
1606            return;
1607        }
1608
1609        let interruptibility: Interruptibility = self
1610            .vp
1611            .runner
1612            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
1613            .into();
1614
1615        if interruptibility.blocked_by_nmi()
1616            || interruptibility.blocked_by_sti()
1617            || interruptibility.blocked_by_movss()
1618        {
1619            self.processor_controls.set_nmi_window_exiting(true);
1620            return;
1621        }
1622
1623        self.vp.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
1624            .with_valid(true)
1625            .with_vector(2)
1626            .with_interruption_type(INTERRUPT_TYPE_NMI);
1627
1628        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1629    }
1630
1631    fn handle_sipi(&mut self, vtl: GuestVtl, cs: SegmentRegister) {
1632        self.vp.write_segment(vtl, TdxSegmentReg::Cs, cs).unwrap();
1633        self.vp.backing.vtls[vtl].private_regs.rip = 0;
1634        self.vp.backing.cvm.lapics[vtl].activity = MpState::Running;
1635    }
1636}
1637
1638impl UhProcessor<'_, TdxBacked> {
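    /// Runs the VP: injects any pending event for the VTL being entered,
    /// issues VP.ENTER, and processes the result, dispatching VMX exits to
    /// `handle_vmx_exit`.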
1639    async fn run_vp_tdx(&mut self, dev: &impl CpuIo) -> Result<(), VpHaltReason> {
1640        let next_vtl = self.backing.cvm.exit_vtl;
1641
1642        if self.backing.vtls[next_vtl].interruption_information.valid() {
1643            tracing::trace!(
1644                vector = self.backing.vtls[next_vtl]
1645                    .interruption_information
1646                    .vector(),
1647                vp_index = self.vp_index().index(),
1648                ?next_vtl,
1649                "injecting interrupt"
1650            );
1651
1652            self.runner.write_vmcs32(
1653                next_vtl,
1654                VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO,
1655                !0,
1656                self.backing.vtls[next_vtl].interruption_information.into(),
1657            );
1658            if self.backing.vtls[next_vtl]
1659                .interruption_information
1660                .deliver_error_code()
1661            {
1662                self.runner.write_vmcs32(
1663                    next_vtl,
1664                    VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE,
1665                    !0,
1666                    self.backing.vtls[next_vtl].exception_error_code,
1667                );
1668            }
1669            self.backing.vtls[next_vtl].interruption_set = true;
1670        } else if self.backing.vtls[next_vtl].interruption_set {
1671            self.runner
1672                .write_vmcs32(next_vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO, !0, 0);
1673            self.backing.vtls[next_vtl].interruption_set = false;
1674        }
1675
1676        // We're about to return to a lower VTL, so set active_vtl for other VPs,
1677        // do any pending flushes, unlock our TLB locks, and wait for any others
1678        // we're supposed to.
1679
1680        // active_vtl needs SeqCst ordering here in order to correctly synchronize
1681        // access with the TLB address flush list. We need to ensure that, when
1682        // other VPs are adding entries to the list, they always observe the
1683        // correct lower active VTL. Otherwise they might choose to not send this
1684        // VP a wake, leading to a stall, until this VP happens to exit to VTL 2 again.
1685        //
1686        // This does technically leave open a small window for potential spurious
1687        // wakes, but that's preferable, and will cause no problems besides a
1688        // small amount of wasted time.
1689        self.shared.active_vtl[self.vp_index().index() as usize]
1690            .store(next_vtl as u8, Ordering::SeqCst);
1691
1692        self.do_tlb_flush(next_vtl);
1693        self.unlock_tlb_lock(Vtl::Vtl2);
1694        let tlb_halt = self.should_halt_for_tlb_unlock(next_vtl);
1695
1696        // If we are halted in the kernel due to hlt or idle and we receive an interrupt,
1697        // we'd like to unhalt, inject the interrupt, and resume vtl0 without returning to
1698        // user mode. To enable this, the kernel must know why we are halted.
1699        let activity = self.backing.cvm.lapics[next_vtl].activity;
1700        let kernel_known_state =
1701            matches!(activity, MpState::Running | MpState::Halted | MpState::Idle);
1702        let halted_other = tlb_halt || !kernel_known_state;
1703
1704        self.runner
1705            .set_halted(activity != MpState::Running || tlb_halt);
1706
1707        // Turn on kernel interrupt handling if possible. This will cause the
1708        // kernel to handle some exits internally, without returning to user
1709        // mode, to improve performance.
1710        //
1711        // Do not do this if there is a pending interruption, since we need to
1712        // run code on the next exit to clear it. If we miss this opportunity,
1713        // we will probably double-inject the interruption, wreaking havoc.
1714        //
1715        // Also do not do this if there is a pending TLB flush, since we need to
1716        // run code on the next exit to clear it. If we miss this opportunity,
1717        // we could double-inject the TLB flush unnecessarily.
1718        let offload_enabled = self.backing.cvm.lapics[next_vtl].lapic.can_offload_irr()
1719            && !self.backing.vtls[next_vtl].interruption_information.valid()
1720            && self.backing.vtls[next_vtl]
1721                .private_regs
1722                .vp_entry_flags
1723                .invd_translations()
1724                == 0;
1725        let x2apic_enabled = self.backing.cvm.lapics[next_vtl].lapic.x2apic_enabled();
1726
1727        let offload_flags = hcl_intr_offload_flags::new()
1728            .with_offload_intr_inject(offload_enabled)
1729            .with_offload_x2apic(offload_enabled && x2apic_enabled)
1730            .with_halted_other(halted_other)
1731            .with_halted_hlt(activity == MpState::Halted)
1732            .with_halted_idle(activity == MpState::Idle);
1733
1734        *self.runner.offload_flags_mut() = offload_flags;
1735
1736        self.runner
1737            .write_private_regs(&self.backing.vtls[next_vtl].private_regs);
1738
1739        let has_intercept = self
1740            .runner
1741            .run()
1742            .map_err(|e| dev.fatal_error(TdxRunVpError(e).into()))?;
1743
1744        // TLB flushes can only target lower VTLs, so it is fine to use a relaxed
1745        // ordering here. The worst that can happen is some spurious wakes, due
1746        // to another VP observing that this VP is still in a lower VTL.
1747        self.shared.active_vtl[self.vp_index().index() as usize].store(2, Ordering::Relaxed);
1748
1749        let entered_from_vtl = next_vtl;
1750        self.runner
1751            .read_private_regs(&mut self.backing.vtls[entered_from_vtl].private_regs);
1752
1753        // Synchronize timer deadline state
1754        self.shared.guest_timer.sync_deadline_state(self);
1755
1756        // Kernel offload may have set or cleared the halt/idle states
1757        if offload_enabled && kernel_known_state {
1758            let offload_flags = self.runner.offload_flags_mut();
1759
1760            self.backing.cvm.lapics[entered_from_vtl].activity =
1761                match (offload_flags.halted_hlt(), offload_flags.halted_idle()) {
1762                    (false, false) => MpState::Running,
1763                    (true, false) => MpState::Halted,
1764                    (false, true) => MpState::Idle,
1765                    (true, true) => {
1766                        tracelimit::warn_ratelimited!(
1767                            CVM_ALLOWED,
1768                            "Kernel indicates VP is both halted and idle!"
1769                        );
1770                        activity
1771                    }
1772                };
1773        }
1774
1775        if !has_intercept {
1776            return Ok(());
1777        }
1778
1779        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1780
1781        // Result codes of PENDING_INTERRUPT and above indicate the L2 was never entered.
1782        if exit_info.code().tdx_exit() >= TdCallResultCode::PENDING_INTERRUPT {
1783            self.backing.vtls[entered_from_vtl]
1784                .enter_stats
1785                .pending_intr
1786                .increment();
1787            return Ok(());
1788        }
1789
1790        // Since the L2 was entered we can clear any TLB flush requests
1791        self.backing.vtls[entered_from_vtl]
1792            .private_regs
1793            .vp_entry_flags
1794            .set_invd_translations(0);
1795
1796        // The L2 was entered, so process the exit.
1797        let stat = match exit_info.code().tdx_exit() {
1798            TdCallResultCode::SUCCESS => {
1799                &mut self.backing.vtls[entered_from_vtl].enter_stats.success
1800            }
1801            TdCallResultCode::L2_EXIT_HOST_ROUTED_ASYNC => {
1802                &mut self.backing.vtls[entered_from_vtl]
1803                    .enter_stats
1804                    .host_routed_async
1805            }
1806            TdCallResultCode::L2_EXIT_PENDING_INTERRUPT => {
1807                &mut self.backing.vtls[entered_from_vtl]
1808                    .enter_stats
1809                    .l2_exit_pending_intr
1810            }
1811            TdCallResultCode::L2_EXIT_HOST_ROUTED_TDVMCALL => {
1812                // This is expected, and means that the hypervisor completed a
1813                // TD.VMCALL from the L2 and has requested to resume the L2 to
1814                // the L1.
1815                //
1816                // There is nothing to do here.
1817                assert_eq!(
1818                    exit_info.code().vmx_exit(),
1819                    VmxExit::new().with_basic_reason(VmxExitBasic::TDCALL)
1820                );
1821                &mut self.backing.vtls[entered_from_vtl]
1822                    .enter_stats
1823                    .host_routed_td_vmcall
1824            }
1825            _ => panic!("unexpected tdx exit code {:?}", exit_info.code()),
1826        };
1827
1828        stat.increment();
1829        self.handle_vmx_exit(dev, entered_from_vtl).await?;
1830        Ok(())
1831    }
1832
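    /// Handles a VMX exit taken while running `intercepted_vtl`.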
1833    async fn handle_vmx_exit(
1834        &mut self,
1835        dev: &impl CpuIo,
1836        intercepted_vtl: GuestVtl,
1837    ) -> Result<(), VpHaltReason> {
1838        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
1839
1840        // First, check that the VM entry was even successful.
1841        let vmx_exit = exit_info.code().vmx_exit();
1842        if vmx_exit.vm_enter_failed() {
1843            return Err(self.handle_vm_enter_failed(dev, intercepted_vtl, vmx_exit));
1844        }
1845
1846        let next_interruption = exit_info.idt_vectoring_info();
1847
1848        // Acknowledge the APIC interrupt/NMI if it was delivered.
1849        if self.backing.vtls[intercepted_vtl]
1850            .interruption_information
1851            .valid()
1852            && (!next_interruption.valid()
1853                || self.backing.vtls[intercepted_vtl]
1854                    .interruption_information
1855                    .interruption_type()
1856                    != next_interruption.interruption_type())
1857        {
1858            match self.backing.vtls[intercepted_vtl]
1859                .interruption_information
1860                .interruption_type()
1861            {
1862                INTERRUPT_TYPE_EXTERNAL
1863                    if !self.backing.cvm.lapics[intercepted_vtl]
1864                        .lapic
1865                        .is_offloaded() =>
1866                {
1867                    // This must be a pending APIC interrupt. Acknowledge it.
1868                    tracing::trace!(
1869                        vector = self.backing.vtls[intercepted_vtl]
1870                            .interruption_information
1871                            .vector(),
1872                        "acknowledging interrupt"
1873                    );
1874                    self.backing.cvm.lapics[intercepted_vtl]
1875                        .lapic
1876                        .acknowledge_interrupt(
1877                            self.backing.vtls[intercepted_vtl]
1878                                .interruption_information
1879                                .vector(),
1880                        );
1881                }
1882                INTERRUPT_TYPE_NMI => {
1883                    // This must be a pending NMI.
1884                    tracing::debug!("acknowledging NMI");
1885                    self.backing.cvm.lapics[intercepted_vtl].nmi_pending = false;
1886                }
1887                _ => {}
1888            }
1889        }
1890
1891        if self.backing.cvm.lapics[intercepted_vtl]
1892            .lapic
1893            .is_offloaded()
1894        {
1895            // It's possible with vAPIC that we take an exit in the window where
1896            // hardware has moved a bit from IRR to ISR, but has not injected
1897            // the interrupt into the guest. In this case, we need to track that
1898            // we must inject the interrupt before we return to the guest,
1899            // otherwise the interrupt will be lost and the guest left in a bad
1900            // state.
1901            //
1902            // TODO TDX: Unclear what kind of exits these would be, but they
1903            // should be spurious EPT exits. Can we validate or assert that
1904            // somehow? If we were to somehow call some other path which would
1905            // set interruption_information before we inject this one, we would
1906            // lose this interrupt.
1907            if next_interruption.valid() {
1908                tracing::debug!(
1909                    ?next_interruption,
1910                    vp_index = self.vp_index().index(),
1911                    "exit requires reinjecting interrupt"
1912                );
1913                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1914                self.backing.vtls[intercepted_vtl].exception_error_code =
1915                    exit_info.idt_vectoring_error_code();
1916                self.backing.vtls[intercepted_vtl]
1917                    .exit_stats
1918                    .needs_interrupt_reinject
1919                    .increment();
1920            } else {
1921                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1922            }
1923        } else {
1924            // Ignore (and later recalculate) the next interruption if it is an
1925            // external interrupt or NMI, since it may change if the APIC state
1926            // changes.
1927            if next_interruption.valid()
1928                && !matches!(
1929                    next_interruption.interruption_type(),
1930                    INTERRUPT_TYPE_EXTERNAL | INTERRUPT_TYPE_NMI
1931                )
1932            {
1933                self.backing.vtls[intercepted_vtl].interruption_information = next_interruption;
1934                self.backing.vtls[intercepted_vtl].exception_error_code =
1935                    exit_info.idt_vectoring_error_code();
1936            } else {
1937                self.backing.vtls[intercepted_vtl].interruption_information = Default::default();
1938            }
1939        }
1940
1941        let mut breakpoint_debug_exception = false;
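        // Dispatch on the basic VMX exit reason. Each arm returns the per-VTL
        // counter to increment once the exit has been handled.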
1942        let stat = match vmx_exit.basic_reason() {
1943            VmxExitBasic::IO_INSTRUCTION => {
1944                let io_qual = ExitQualificationIo::from(exit_info.qualification() as u32);
1945
1946                let len = match io_qual.access_size() {
1947                    IO_SIZE_8_BIT => 1,
1948                    IO_SIZE_16_BIT => 2,
1949                    IO_SIZE_32_BIT => 4,
1950                    _ => panic!(
1951                        "tdx module returned invalid io instr size {}",
1952                        io_qual.access_size()
1953                    ),
1954                };
1955
1956                let port_access_protected = self.cvm_try_protect_io_port_access(
1957                    intercepted_vtl,
1958                    io_qual.port(),
1959                    io_qual.is_in(),
1960                    len,
1961                    io_qual.is_string(),
1962                    io_qual.rep_prefix(),
1963                );
1964
1965                if !port_access_protected {
1966                    if io_qual.is_string() || io_qual.rep_prefix() {
1967                        // TODO GUEST VSM: consider changing the emulation path
1968                        // to also check for io port installation, mainly for
1969                        // handling rep instructions.
1970
1971                        self.emulate(
1972                            dev,
1973                            self.backing.vtls[intercepted_vtl]
1974                                .interruption_information
1975                                .valid(),
1976                            intercepted_vtl,
1977                            TdxEmulationCache::default(),
1978                        )
1979                        .await?;
1980                    } else {
1981                        let mut rax = self.runner.tdx_enter_guest_gps()[TdxGp::RAX];
1982                        emulate_io(
1983                            self.inner.vp_info.base.vp_index,
1984                            !io_qual.is_in(),
1985                            io_qual.port(),
1986                            &mut rax,
1987                            len,
1988                            dev,
1989                        )
1990                        .await;
1991                        self.runner.tdx_enter_guest_gps_mut()[TdxGp::RAX] = rax;
1992
1993                        self.advance_to_next_instruction(intercepted_vtl);
1994                    }
1995                }
1996
1997                &mut self.backing.vtls[intercepted_vtl].exit_stats.io
1998            }
1999            VmxExitBasic::MSR_READ => {
2000                let msr = self.runner.tdx_enter_guest_gps()[TdxGp::RCX] as u32;
2001
2002                let result = self.backing.cvm.lapics[intercepted_vtl]
2003                    .lapic
2004                    .access(&mut TdxApicClient {
2005                        partition: self.partition,
2006                        vmtime: &self.vmtime,
2007                        apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
2008                        dev,
2009                        vtl: intercepted_vtl,
2010                    })
2011                    .msr_read(msr)
2012                    .or_else_if_unknown(|| self.read_msr_cvm(msr, intercepted_vtl))
2013                    .or_else_if_unknown(|| self.read_msr_tdx(msr, intercepted_vtl));
2014
2015                let value = match result {
2016                    Ok(v) => Some(v),
2017                    Err(MsrError::Unknown) => {
2018                        tracelimit::warn_ratelimited!(CVM_ALLOWED, msr, "unknown tdx vm msr read");
2019                        Some(0)
2020                    }
2021                    Err(MsrError::InvalidAccess) => None,
2022                };
2023
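                // RDMSR returns the value in EDX:EAX; an invalid access
                // injects #GP instead.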
2024                let inject_gp = if let Some(value) = value {
2025                    let gps = self.runner.tdx_enter_guest_gps_mut();
2026                    gps[TdxGp::RAX] = (value as u32).into();
2027                    gps[TdxGp::RDX] = ((value >> 32) as u32).into();
2028                    false
2029                } else {
2030                    true
2031                };
2032
2033                if inject_gp {
2034                    self.inject_gpf(intercepted_vtl);
2035                } else {
2036                    self.advance_to_next_instruction(intercepted_vtl);
2037                }
2038                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_read
2039            }
2040            VmxExitBasic::MSR_WRITE => {
2041                let gps = self.runner.tdx_enter_guest_gps();
2042                let msr = gps[TdxGp::RCX] as u32;
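                // WRMSR supplies the 64-bit value in EDX:EAX.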
2043                let value =
2044                    (gps[TdxGp::RAX] as u32 as u64) | ((gps[TdxGp::RDX] as u32 as u64) << 32);
2045
2046                if !self.cvm_try_protect_msr_write(intercepted_vtl, msr) {
2047                    let result = self.backing.cvm.lapics[intercepted_vtl]
2048                        .lapic
2049                        .access(&mut TdxApicClient {
2050                            partition: self.partition,
2051                            vmtime: &self.vmtime,
2052                            apic_page: self.runner.tdx_apic_page_mut(intercepted_vtl),
2053                            dev,
2054                            vtl: intercepted_vtl,
2055                        })
2056                        .msr_write(msr, value)
2057                        .or_else_if_unknown(|| self.write_msr_cvm(msr, value, intercepted_vtl))
2058                        .or_else_if_unknown(|| self.write_msr_tdx(msr, value, intercepted_vtl))
2059                        .or_else_if_unknown(|| {
2060                            // Sanity check
2061                            if MSR_ALLOWED_READ_WRITE.contains(&msr) {
2062                                unreachable!("intercepted a write to MSR {msr}, configured for passthrough by default, that wasn't registered for intercepts by a higher VTL");
2063                            }
2064                            Err(MsrError::Unknown)
2065                        });
2066
2067                    let inject_gp = match result {
2068                        Ok(()) => false,
2069                        Err(MsrError::Unknown) => {
2070                            tracelimit::warn_ratelimited!(
2071                                CVM_ALLOWED,
2072                                msr,
2073                                "unknown tdx vm msr write"
2074                            );
2075                            tracelimit::warn_ratelimited!(
2076                                CVM_CONFIDENTIAL,
2077                                value,
2078                                "unknown tdx vm msr write"
2079                            );
2080                            false
2081                        }
2082                        Err(MsrError::InvalidAccess) => true,
2083                    };
2084
2085                    if inject_gp {
2086                        self.inject_gpf(intercepted_vtl);
2087                    } else {
2088                        self.advance_to_next_instruction(intercepted_vtl);
2089                    }
2090                }
2091                &mut self.backing.vtls[intercepted_vtl].exit_stats.msr_write
2092            }
2093            VmxExitBasic::CPUID => {
2094                let gps = self.runner.tdx_enter_guest_gps();
2095                let leaf = gps[TdxGp::RAX] as u32;
2096                let subleaf = gps[TdxGp::RCX] as u32;
2097                let [eax, ebx, ecx, edx] = self.cvm_cpuid_result(intercepted_vtl, leaf, subleaf);
2098                let gps = self.runner.tdx_enter_guest_gps_mut();
2099                gps[TdxGp::RAX] = eax.into();
2100                gps[TdxGp::RBX] = ebx.into();
2101                gps[TdxGp::RCX] = ecx.into();
2102                gps[TdxGp::RDX] = edx.into();
2103                self.advance_to_next_instruction(intercepted_vtl);
2104                &mut self.backing.vtls[intercepted_vtl].exit_stats.cpuid
2105            }
2106            VmxExitBasic::VMCALL_INSTRUCTION => {
2107                if exit_info.cpl() != 0 {
2108                    self.inject_gpf(intercepted_vtl);
2109                } else {
2110                    let is_64bit = self.long_mode(intercepted_vtl);
2111                    let guest_memory = &self.partition.gm[intercepted_vtl];
2112                    let handler = UhHypercallHandler {
2113                        trusted: !self.cvm_partition().hide_isolation,
2114                        vp: &mut *self,
2115                        bus: dev,
2116                        intercepted_vtl,
2117                    };
2118
2119                    UhHypercallHandler::TDX_DISPATCHER.dispatch(
2120                        guest_memory,
2121                        hv1_hypercall::X64RegisterIo::new(handler, is_64bit),
2122                    );
2123                }
2124                &mut self.backing.vtls[intercepted_vtl].exit_stats.vmcall
2125            }
2126            VmxExitBasic::HLT_INSTRUCTION => {
2127                self.backing.cvm.lapics[intercepted_vtl].activity = MpState::Halted;
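                // The STI shadow covers only the instruction after STI (this
                // hlt), so any recorded blocking-by-STI state is stale once
                // the hlt has been emulated; clear it.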
2128                self.clear_interrupt_shadow(intercepted_vtl);
2129                self.advance_to_next_instruction(intercepted_vtl);
2130                &mut self.backing.vtls[intercepted_vtl].exit_stats.hlt
2131            }
2132            VmxExitBasic::CR_ACCESS => {
2133                let qual = CrAccessQualification::from(exit_info.qualification());
2134                let cr;
2135                let value;
2136                match qual.access_type() {
2137                    CR_ACCESS_TYPE_MOV_TO_CR => {
2138                        cr = qual.cr();
2139                        value = self.runner.tdx_enter_guest_gps()[qual.gp_register() as usize];
2140                    }
2141                    CR_ACCESS_TYPE_LMSW => {
2142                        cr = 0;
2143                        let cr0 = self.backing.vtls[intercepted_vtl].cr0.read(&self.runner);
2144                        // LMSW updates the low four bits only.
2145                        value = (qual.lmsw_source_data() as u64 & 0xf) | (cr0 & !0xf);
2146                    }
2147                    access_type => unreachable!("not registered for cr access type {access_type}"),
2148                }
2149
2150                let cr = match cr {
2151                    0 => HvX64RegisterName::Cr0,
2152                    4 => HvX64RegisterName::Cr4,
2153                    _ => unreachable!("not registered for cr{cr} accesses"),
2154                };
2155
2156                if !self.cvm_try_protect_secure_register_write(intercepted_vtl, cr, value) {
2157                    let r = match cr {
2158                        HvX64RegisterName::Cr0 => self.backing.vtls[intercepted_vtl]
2159                            .cr0
2160                            .write(value, &mut self.runner),
2161                        HvX64RegisterName::Cr4 => self.backing.vtls[intercepted_vtl]
2162                            .cr4
2163                            .write(value, &mut self.runner),
2164                        _ => unreachable!(),
2165                    };
2166                    if r.is_ok() {
2167                        self.update_execution_mode(intercepted_vtl);
2168                        self.advance_to_next_instruction(intercepted_vtl);
2169                    } else {
2170                        tracelimit::warn_ratelimited!(
2171                            CVM_ALLOWED,
2172                            ?cr,
2173                            value,
2174                            "failed to write cr"
2175                        );
2176                        self.inject_gpf(intercepted_vtl);
2177                    }
2178                }
2179                &mut self.backing.vtls[intercepted_vtl].exit_stats.cr_access
2180            }
2181            VmxExitBasic::XSETBV => {
2182                let gps = self.runner.tdx_enter_guest_gps();
2183                if let Some(value) =
2184                    hardware_cvm::validate_xsetbv_exit(hardware_cvm::XsetbvExitInput {
2185                        rax: gps[TdxGp::RAX],
2186                        rcx: gps[TdxGp::RCX],
2187                        rdx: gps[TdxGp::RDX],
2188                        cr4: self.backing.vtls[intercepted_vtl].cr4.read(&self.runner),
2189                        cpl: exit_info.cpl(),
2190                    })
2191                {
2192                    if !self.cvm_try_protect_secure_register_write(
2193                        intercepted_vtl,
2194                        HvX64RegisterName::Xfem,
2195                        value,
2196                    ) {
2197                        self.runner
2198                            .set_vp_register(intercepted_vtl, HvX64RegisterName::Xfem, value.into())
2199                            .unwrap();
2200                        self.advance_to_next_instruction(intercepted_vtl);
2201                    }
2202                } else {
2203                    self.inject_gpf(intercepted_vtl);
2204                }
2205                &mut self.backing.vtls[intercepted_vtl].exit_stats.xsetbv
2206            }
2207            VmxExitBasic::WBINVD_INSTRUCTION => {
2208                // Ask the kernel to flush the cache before issuing VP.ENTER.
2209                let no_invalidate = exit_info.qualification() != 0;
2210                if no_invalidate {
2211                    self.runner.tdx_vp_state_flags_mut().set_wbnoinvd(true);
2212                } else {
2213                    self.runner.tdx_vp_state_flags_mut().set_wbinvd(true);
2214                }
2215
2216                self.advance_to_next_instruction(intercepted_vtl);
2217                &mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
2218            }
2219            VmxExitBasic::EPT_VIOLATION => {
2220                let gpa = exit_info.gpa().expect("is EPT exit");
2221                let ept_info = VmxEptExitQualification::from(exit_info.qualification());
2222                // If this was an EPT violation while handling an iret, and
2223                // that iret cleared the NMI blocking state, restore it.
2224                if !next_interruption.valid() && ept_info.nmi_unmasking_due_to_iret() {
2225                    let mask = Interruptibility::new().with_blocked_by_nmi(true);
2226                    let value = Interruptibility::new().with_blocked_by_nmi(true);
2227                    let old_interruptibility: Interruptibility = self
2228                        .runner
2229                        .write_vmcs32(
2230                            intercepted_vtl,
2231                            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2232                            mask.into(),
2233                            value.into(),
2234                        )
2235                        .into();
2236                    assert!(!old_interruptibility.blocked_by_nmi());
2237                } else {
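                    // Bit 1 of the EPT access mask indicates a write access.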
2238                    let is_write = ept_info.access_mask() & 0b10 != 0;
2239                    if self.check_mem_fault(intercepted_vtl, gpa, is_write, ept_info) {
2240                        self.emulate(
2241                            dev,
2242                            self.backing.vtls[intercepted_vtl]
2243                                .interruption_information
2244                                .valid(),
2245                            intercepted_vtl,
2246                            TdxEmulationCache::default(),
2247                        )
2248                        .await?;
2249                    }
2250                }
2251
2252                &mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
2253            }
2254            VmxExitBasic::TPR_BELOW_THRESHOLD => {
2255                // Loop around to reevaluate the APIC.
2256                &mut self.backing.vtls[intercepted_vtl]
2257                    .exit_stats
2258                    .tpr_below_threshold
2259            }
2260            VmxExitBasic::INTERRUPT_WINDOW => {
2261                // Loop around to reevaluate the APIC.
2262                &mut self.backing.vtls[intercepted_vtl]
2263                    .exit_stats
2264                    .interrupt_window
2265            }
2266            VmxExitBasic::NMI_WINDOW => {
2267                // Loop around to reevaluate pending NMIs.
2268                &mut self.backing.vtls[intercepted_vtl].exit_stats.nmi_window
2269            }
2270            VmxExitBasic::HW_INTERRUPT => {
2271                if cfg!(feature = "gdb") {
2272                    // Check if the interrupt was triggered by a hardware breakpoint.
2273                    let debug_regs = self
2274                        .access_state(intercepted_vtl.into())
2275                        .debug_regs()
2276                        .expect("register query should not fail");
2277                    // The lowest four bits of DR6 indicate which of the
2278                    // four breakpoints triggered.
2279                    breakpoint_debug_exception = debug_regs.dr6.trailing_zeros() < 4;
2280                }
2281                &mut self.backing.vtls[intercepted_vtl].exit_stats.hw_interrupt
2282            }
2283            VmxExitBasic::SMI_INTR => &mut self.backing.vtls[intercepted_vtl].exit_stats.smi_intr,
2284            VmxExitBasic::PAUSE_INSTRUCTION => {
2285                &mut self.backing.vtls[intercepted_vtl].exit_stats.pause
2286            }
2287            VmxExitBasic::TDCALL => {
2288                // If the proxy synic is local, then the host did not get this
2289                // instruction, and we need to handle it.
2290                if self.backing.untrusted_synic.is_some() {
2291                    assert_eq!(intercepted_vtl, GuestVtl::Vtl0);
2292                    self.handle_tdvmcall(dev, intercepted_vtl);
2293                } else if self.cvm_partition().hide_isolation {
2294                    // TDCALL is not valid when hiding isolation. Inject a #UD.
2295                    self.backing.vtls[intercepted_vtl].interruption_information =
2296                        InterruptionInformation::new()
2297                            .with_valid(true)
2298                            .with_vector(x86defs::Exception::INVALID_OPCODE.0)
2299                            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2300                }
2301                &mut self.backing.vtls[intercepted_vtl].exit_stats.tdcall
2302            }
2303            VmxExitBasic::EXCEPTION => {
2304                tracing::trace!(
2305                    "Caught Exception: {:?}",
2306                    exit_info._exit_interruption_info()
2307                );
2308                if cfg!(feature = "gdb") {
2309                    breakpoint_debug_exception = true;
2310                }
2311                &mut self.backing.vtls[intercepted_vtl].exit_stats.exception
2312            }
2313            VmxExitBasic::TRIPLE_FAULT => {
2314                return Err(VpHaltReason::TripleFault {
2315                    vtl: intercepted_vtl.into(),
2316                });
2317            }
2318            VmxExitBasic::GDTR_OR_IDTR => {
2319                let info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
2320                tracing::trace!("Intercepted GDT or IDT instruction: {:?}", info);
2321                let reg = match info.instruction() {
2322                    GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Lidt => {
2323                        HvX64RegisterName::Idtr
2324                    }
2325                    GdtrOrIdtrInstruction::Sgdt | GdtrOrIdtrInstruction::Lgdt => {
2326                        HvX64RegisterName::Gdtr
2327                    }
2328                };
2329                // We only support forwarding intercepts for descriptor table loads today.
2330                if (info.instruction().is_load()
2331                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2332                    || !info.instruction().is_load()
2333                {
2334                    self.emulate_gdtr_or_idtr(intercepted_vtl, dev).await?;
2335                }
2336                &mut self.backing.vtls[intercepted_vtl]
2337                    .exit_stats
2338                    .descriptor_table
2339            }
2340            VmxExitBasic::LDTR_OR_TR => {
2341                let info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
2342                tracing::trace!("Intercepted LDT or TR instruction: {:?}", info);
2343                let reg = match info.instruction() {
2344                    LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Lldt => {
2345                        HvX64RegisterName::Ldtr
2346                    }
2347                    LdtrOrTrInstruction::Str | LdtrOrTrInstruction::Ltr => HvX64RegisterName::Tr,
2348                };
2349                // We only support forwarding intercepts for descriptor table loads today.
2350                if (info.instruction().is_load()
2351                    && !self.cvm_try_protect_secure_register_write(intercepted_vtl, reg, 0))
2352                    || !info.instruction().is_load()
2353                {
2354                    self.emulate_ldtr_or_tr(intercepted_vtl, dev).await?;
2355                }
2356                &mut self.backing.vtls[intercepted_vtl]
2357                    .exit_stats
2358                    .descriptor_table
2359            }
2360            VmxExitBasic::TIMER_EXPIRED => {
2361                // Loop around to reevaluate pending interrupts.
2362                &mut self.backing.vtls[intercepted_vtl].exit_stats.timer_expired
2363            }
2364            _ => {
2365                return Err(dev.fatal_error(UnknownVmxExit(exit_info.code().vmx_exit()).into()));
2366            }
2367        };
2368        stat.increment();
2369
2370        // Breakpoint exceptions may return a non-fatal error.
2371        // We dispatch here to correctly increment the counter.
2372        if cfg!(feature = "gdb") && breakpoint_debug_exception {
2373            self.handle_debug_exception(dev, intercepted_vtl)?;
2374        }
2375
2376        Ok(())
2377    }
2378
2379    /// Trace processor state for debugging purposes.
2380    fn trace_processor_state(&self, vtl: GuestVtl) {
2381        let raw_exit = self.runner.tdx_vp_enter_exit_info();
2382        tracing::error!(CVM_CONFIDENTIAL, ?raw_exit, "raw tdx vp enter exit info");
2383
2384        let gprs = self.runner.tdx_enter_guest_gps();
2385        tracing::error!(CVM_CONFIDENTIAL, ?gprs, "guest gpr list");
2386
2387        let TdxPrivateRegs {
2388            rflags,
2389            rip,
2390            rsp,
2391            ssp,
2392            rvi,
2393            svi,
2394            msr_kernel_gs_base,
2395            msr_star,
2396            msr_lstar,
2397            msr_sfmask,
2398            msr_xss,
2399            msr_tsc_aux,
2400            vp_entry_flags,
2401        } = self.backing.vtls[vtl].private_regs;
2402        tracing::error!(
2403            CVM_CONFIDENTIAL,
2404            rflags,
2405            rip,
2406            rsp,
2407            ssp,
2408            rvi,
2409            svi,
2410            msr_kernel_gs_base,
2411            msr_star,
2412            msr_lstar,
2413            msr_sfmask,
2414            msr_xss,
2415            msr_tsc_aux,
2416            ?vp_entry_flags,
2417            "private registers"
2418        );
2419
2420        let physical_cr0 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR0);
2421        let shadow_cr0 = self
2422            .runner
2423            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_READ_SHADOW);
2424        let cr0_guest_host_mask: u64 = self
2425            .runner
2426            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR0_GUEST_HOST_MASK);
2427        tracing::error!(
2428            CVM_CONFIDENTIAL,
2429            physical_cr0,
2430            shadow_cr0,
2431            cr0_guest_host_mask,
2432            "cr0 values"
2433        );
2434
2435        let physical_cr4 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR4);
2436        let shadow_cr4 = self
2437            .runner
2438            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_READ_SHADOW);
2439        let cr4_guest_host_mask = self
2440            .runner
2441            .read_vmcs64(vtl, VmcsField::VMX_VMCS_CR4_GUEST_HOST_MASK);
2442        tracing::error!(
2443            CVM_CONFIDENTIAL,
2444            physical_cr4,
2445            shadow_cr4,
2446            cr4_guest_host_mask,
2447            "cr4 values"
2448        );
2449
2450        let cr3 = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_CR3);
2451        tracing::error!(CVM_CONFIDENTIAL, cr3, "cr3");
2452
2453        let cached_efer = self.backing.vtls[vtl].efer;
2454        let vmcs_efer = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER);
2455        let entry_controls = self
2456            .runner
2457            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_CONTROLS);
2458        tracing::error!(CVM_CONFIDENTIAL, cached_efer, vmcs_efer, "efer");
2459        tracing::error!(CVM_CONFIDENTIAL, entry_controls, "entry controls");
2460
2461        let cs = self.read_segment(vtl, TdxSegmentReg::Cs);
2462        let ds = self.read_segment(vtl, TdxSegmentReg::Ds);
2463        let es = self.read_segment(vtl, TdxSegmentReg::Es);
2464        let fs = self.read_segment(vtl, TdxSegmentReg::Fs);
2465        let gs = self.read_segment(vtl, TdxSegmentReg::Gs);
2466        let ss = self.read_segment(vtl, TdxSegmentReg::Ss);
2467        let tr = self.read_segment(vtl, TdxSegmentReg::Tr);
2468        let ldtr = self.read_segment(vtl, TdxSegmentReg::Ldtr);
2469
2470        tracing::error!(
2471            CVM_CONFIDENTIAL,
2472            ?cs,
2473            ?ds,
2474            ?es,
2475            ?fs,
2476            ?gs,
2477            ?ss,
2478            ?tr,
2479            ?ldtr,
2480            "segment values"
2481        );
2482
2483        let exception_bitmap = self
2484            .runner
2485            .read_vmcs32(vtl, VmcsField::VMX_VMCS_EXCEPTION_BITMAP);
2486        tracing::error!(CVM_CONFIDENTIAL, exception_bitmap, "exception bitmap");
2487
2488        let cached_processor_controls = self.backing.vtls[vtl].processor_controls;
2489        let vmcs_processor_controls = ProcessorControls::from(
2490            self.runner
2491                .read_vmcs32(vtl, VmcsField::VMX_VMCS_PROCESSOR_CONTROLS),
2492        );
2493        let vmcs_secondary_processor_controls = SecondaryProcessorControls::from(
2494            self.runner
2495                .read_vmcs32(vtl, VmcsField::VMX_VMCS_SECONDARY_PROCESSOR_CONTROLS),
2496        );
2497        tracing::error!(
2498            CVM_CONFIDENTIAL,
2499            ?cached_processor_controls,
2500            ?vmcs_processor_controls,
2501            ?vmcs_secondary_processor_controls,
2502            "processor controls"
2503        );
2504
2505        if cached_processor_controls != vmcs_processor_controls {
2506            tracing::error!(CVM_ALLOWED, "BUGBUG: processor controls mismatch");
2507        }
2508
2509        let cached_tpr_threshold = self.backing.vtls[vtl].tpr_threshold;
2510        let vmcs_tpr_threshold = self
2511            .runner
2512            .read_vmcs32(vtl, VmcsField::VMX_VMCS_TPR_THRESHOLD);
2513        tracing::error!(
2514            CVM_CONFIDENTIAL,
2515            cached_tpr_threshold,
2516            vmcs_tpr_threshold,
2517            "tpr threshold"
2518        );
2519
2520        let cached_eoi_exit_bitmap = self.backing.eoi_exit_bitmap;
2521        let vmcs_eoi_exit_bitmap = {
2522            let fields = [
2523                VmcsField::VMX_VMCS_EOI_EXIT_0,
2524                VmcsField::VMX_VMCS_EOI_EXIT_1,
2525                VmcsField::VMX_VMCS_EOI_EXIT_2,
2526                VmcsField::VMX_VMCS_EOI_EXIT_3,
2527            ];
2528            fields
2529                .iter()
2530                .map(|field| self.runner.read_vmcs64(vtl, *field))
2531                .collect::<Vec<_>>()
2532        };
2533        tracing::error!(
2534            CVM_CONFIDENTIAL,
2535            ?cached_eoi_exit_bitmap,
2536            ?vmcs_eoi_exit_bitmap,
2537            "eoi exit bitmap"
2538        );
2539
2540        let cached_interrupt_information = self.backing.vtls[vtl].interruption_information;
2541        let cached_interruption_set = self.backing.vtls[vtl].interruption_set;
2542        let vmcs_interrupt_information = self
2543            .runner
2544            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_INTERRUPT_INFO);
2545        let vmcs_entry_exception_code = self
2546            .runner
2547            .read_vmcs32(vtl, VmcsField::VMX_VMCS_ENTRY_EXCEPTION_ERROR_CODE);
2548        tracing::error!(
2549            CVM_CONFIDENTIAL,
2550            ?cached_interrupt_information,
2551            cached_interruption_set,
2552            vmcs_interrupt_information,
2553            vmcs_entry_exception_code,
2554            "interrupt information"
2555        );
2556
2557        let guest_interruptibility = self
2558            .runner
2559            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY);
2560        tracing::error!(
2561            CVM_CONFIDENTIAL,
2562            guest_interruptibility,
2563            "guest interruptibility"
2564        );
2565
2566        let vmcs_sysenter_cs = self
2567            .runner
2568            .read_vmcs32(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR);
2569        let vmcs_sysenter_esp = self
2570            .runner
2571            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
2572        let vmcs_sysenter_eip = self
2573            .runner
2574            .read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
2575        tracing::error!(
2576            CVM_CONFIDENTIAL,
2577            vmcs_sysenter_cs,
2578            vmcs_sysenter_esp,
2579            vmcs_sysenter_eip,
2580            "sysenter values"
2581        );
2582
2583        let vmcs_pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2584        tracing::error!(CVM_CONFIDENTIAL, vmcs_pat, "guest PAT");
2585    }
2586
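    /// Handles a failed VM entry, logging processor state for debugging before
    /// returning a fatal halt reason.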
2587    fn handle_vm_enter_failed(
2588        &self,
2589        dev: &impl CpuIo,
2590        vtl: GuestVtl,
2591        vmx_exit: VmxExit,
2592    ) -> VpHaltReason {
2593        assert!(vmx_exit.vm_enter_failed());
2594        match vmx_exit.basic_reason() {
2595            VmxExitBasic::BAD_GUEST_STATE => {
2596                // Log system register state for debugging why we were
2597                // unable to enter the guest. This is a VMM bug.
2598                tracing::error!(CVM_ALLOWED, "VP.ENTER failed with bad guest state");
2599                self.trace_processor_state(vtl);
2600
2601                dev.fatal_error(VmxBadGuestState.into())
2602            }
2603            _ => dev.fatal_error(UnknownVmxExit(vmx_exit).into()),
2604        }
2605    }
2606
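    /// Advances RIP past the intercepted instruction using the instruction
    /// length reported in the exit information.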
2607    fn advance_to_next_instruction(&mut self, vtl: GuestVtl) {
2608        let instr_info = TdxExit(self.runner.tdx_vp_enter_exit_info()).instr_info();
2609        let rip = &mut self.backing.vtls[vtl].private_regs.rip;
2610        *rip = rip.wrapping_add(instr_info.length().into());
2611    }
2612
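    /// Clears the blocking-by-STI interrupt shadow in the guest
    /// interruptibility state.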
2613    fn clear_interrupt_shadow(&mut self, vtl: GuestVtl) {
2614        let mask = Interruptibility::new().with_blocked_by_sti(true);
2615        let value = Interruptibility::new().with_blocked_by_sti(false);
2616        self.runner.write_vmcs32(
2617            vtl,
2618            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
2619            mask.into(),
2620            value.into(),
2621        );
2622    }
2623
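    /// Queues a general protection fault (#GP, error code 0) for injection on
    /// the next entry to `vtl`.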
2624    fn inject_gpf(&mut self, vtl: GuestVtl) {
2625        self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2626            .with_valid(true)
2627            .with_vector(x86defs::Exception::GENERAL_PROTECTION_FAULT.0)
2628            .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
2629            .with_deliver_error_code(true);
2630        self.backing.vtls[vtl].exception_error_code = 0;
2631    }
2632
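    /// Handles a TDVMCALL issued by the guest: either an architectural VMCALL
    /// subfunction (MSR read/write) when R10 is zero, or a Hyper-V hypercall
    /// dispatched through the TDCALL ABI.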
2633    fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
2634        let regs = self.runner.tdx_enter_guest_gps();
2635        if regs[TdxGp::R10] == 0 {
2636            // Architectural VMCALL.
2637            let result = match VmxExitBasic(regs[TdxGp::R11] as u16) {
2638                VmxExitBasic::MSR_WRITE => {
2639                    let msr = regs[TdxGp::R12] as u32;
2640                    let value = regs[TdxGp::R13];
2641                    match self.write_tdvmcall_msr(msr, value, intercepted_vtl) {
2642                        Ok(()) => {
2643                            tracing::debug!(msr, value, "tdvmcall msr write");
2644                            TdVmCallR10Result::SUCCESS
2645                        }
2646                        Err(err) => {
2647                            tracelimit::warn_ratelimited!(
2648                                CVM_ALLOWED,
2649                                msr,
2650                                ?err,
2651                                "failed tdvmcall msr write"
2652                            );
2653                            tracelimit::warn_ratelimited!(
2654                                CVM_CONFIDENTIAL,
2655                                value,
2656                                "failed tdvmcall msr write"
2657                            );
2658                            TdVmCallR10Result::OPERAND_INVALID
2659                        }
2660                    }
2661                }
2662                VmxExitBasic::MSR_READ => {
2663                    let msr = regs[TdxGp::R12] as u32;
2664                    match self.read_tdvmcall_msr(msr, intercepted_vtl) {
2665                        Ok(value) => {
2666                            tracing::debug!(msr, value, "tdvmcall msr read");
2667                            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = value;
2668                            TdVmCallR10Result::SUCCESS
2669                        }
2670                        Err(err) => {
2671                            tracelimit::warn_ratelimited!(
2672                                CVM_ALLOWED,
2673                                msr,
2674                                ?err,
2675                                "failed tdvmcall msr read"
2676                            );
2677                            TdVmCallR10Result::OPERAND_INVALID
2678                        }
2679                    }
2680                }
2681                subfunction => {
2682                    tracelimit::warn_ratelimited!(
2683                        CVM_ALLOWED,
2684                        ?subfunction,
2685                        "architectural vmcall not supported"
2686                    );
2687                    TdVmCallR10Result::OPERAND_INVALID
2688                }
2689            };
2690            self.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = result.0;
2691            self.backing.vtls[intercepted_vtl].private_regs.rip = self.backing.vtls
2692                [intercepted_vtl]
2693                .private_regs
2694                .rip
2695                .wrapping_add(4);
2696        } else {
2697            // This hypercall is normally handled by the hypervisor, so the gpas
2698            // given by the guest should all be shared. The hypervisor allows
2699            // gpas to be set with or without the shared gpa boundary bit, which
2700            // untrusted_dma_memory correctly models. Note that some Linux
2701            // guests will issue hypercalls without the boundary bit set,
2702            // whereas UEFI will issue with the bit set.
2703            let guest_memory = &self.shared.cvm.shared_memory;
2704            let handler = UhHypercallHandler {
2705                vp: &mut *self,
2706                bus: dev,
2707                trusted: false,
2708                intercepted_vtl,
2709            };
2710
2711            UhHypercallHandler::TDCALL_DISPATCHER.dispatch(guest_memory, TdHypercall(handler));
2712        }
2713    }
2714
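    /// Reads a synthetic MSR on behalf of a TDVMCALL, using the VTL's Hyper-V
    /// emulator state or the untrusted synic as appropriate.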
2715    fn read_tdvmcall_msr(&mut self, msr: u32, intercepted_vtl: GuestVtl) -> Result<u64, MsrError> {
2716        match msr {
2717            msr @ (hvdef::HV_X64_MSR_GUEST_OS_ID | hvdef::HV_X64_MSR_VP_INDEX) => {
2718                self.backing.cvm.hv[intercepted_vtl].msr_read(msr)
2719            }
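            // An untrusted synic must be present if we are handling this
            // TDVMCALL, so the unwrap is fine.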
2720            _ => self
2721                .backing
2722                .untrusted_synic
2723                .as_mut()
2724                .unwrap()
2725                .read_nontimer_msr(msr),
2726        }
2727    }
2728
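    /// Writes an MSR for a TDVMCALL MSR write: the guest OS ID goes to the
    /// VTL's hypervisor emulator, everything else to the untrusted synic,
    /// with SINT writes also propagated to the hypervisor.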
2729    fn write_tdvmcall_msr(
2730        &mut self,
2731        msr: u32,
2732        value: u64,
2733        intercepted_vtl: GuestVtl,
2734    ) -> Result<(), MsrError> {
2735        match msr {
2736            hvdef::HV_X64_MSR_GUEST_OS_ID => {
2737                self.backing.cvm.hv[intercepted_vtl].msr_write_guest_os_id(value)
2738            }
2739            _ => {
2740                // If we get here, we must have an untrusted synic; otherwise we
2741                // wouldn't be handling the TDVMCALL that ends up here, so the
2742                // unwrap is fine.
2743                self.backing
2744                    .untrusted_synic
2745                    .as_mut()
2746                    .unwrap()
2747                    .write_nontimer_msr(
2748                        msr,
2749                        value,
2750                        &mut UntrustedSynicVtlProts(&self.partition.gm[GuestVtl::Vtl0]),
2751                    )?;
2752                // Propagate sint MSR writes to the hypervisor as well
2753                // so that the hypervisor can directly inject events.
2754                if matches!(msr, hvdef::HV_X64_MSR_SINT0..=hvdef::HV_X64_MSR_SINT15) {
2755                    if let Err(err) = self.runner.set_vp_register(
2756                        intercepted_vtl,
2757                        HvX64RegisterName(
2758                            HvX64RegisterName::Sint0.0 + (msr - hvdef::HV_X64_MSR_SINT0),
2759                        ),
2760                        value.into(),
2761                    ) {
2762                        tracelimit::warn_ratelimited!(
2763                            CVM_ALLOWED,
2764                            error = &err as &dyn std::error::Error,
2765                            "failed to set sint register"
2766                        );
2767                    }
2768                }
2769            }
2770        }
2771
2772        Ok(())
2773    }
2774
2775    fn read_msr_tdx(&mut self, msr: u32, vtl: GuestVtl) -> Result<u64, MsrError> {
2776        // TODO TDX: port remaining tdx and common values
2777        //
2778        // TODO TDX: consider if this can be shared with SnpBacked's
2779        // implementation. For the most part other than Intel/TDX specific
2780        // registers, MSR handling should be the same.
2781
2782        match msr {
2783            // TODO TDX: LIFTED FROM WHP
2784            x86defs::X86X_IA32_MSR_PLATFORM_ID => {
2785                // Windows requires accessing this to boot. WHP
2786                // used to pass this through to the hardware,
2787                // but this regressed. Zero seems to work fine
2788                // for Windows.
2789                //
2790                // TODO: Pass through the host value if it can
2791                //       be retrieved.
2792                Ok(0)
2793            }
2794
2795            x86defs::X86X_MSR_MTRR_CAP => {
2796                // Advertise no MTRR capabilities except support for the
2797                // write-combining memory type (bit 10 of IA32_MTRRCAP).
2798                Ok(0x400)
2799            }
2800            x86defs::X86X_MSR_MTRR_DEF_TYPE => {
2801                // The MTRR registers are advertised via CPUID even though no
2802                // actual ranges are supported, so a guest may still access this
2803                // MSR. Implement it as read-as-zero/write-ignore.
2804                Ok(0)
2805            }
2806            x86defs::X86X_MSR_CSTAR => Ok(self.backing.vtls[vtl].msr_cstar),
2807            x86defs::X86X_MSR_MCG_CAP => Ok(0),
2808            x86defs::X86X_MSR_MCG_STATUS => Ok(0),
2809            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => Ok(0xFFFFFFFF),
2810            x86defs::X86X_MSR_XSS => Ok(self.backing.vtls[vtl].private_regs.msr_xss),
2811            x86defs::X86X_IA32_MSR_MISC_ENABLE => Ok(hv1_emulator::x86::MISC_ENABLE.into()),
2812            x86defs::X86X_IA32_MSR_FEATURE_CONTROL => Ok(VMX_FEATURE_CONTROL_LOCKED),
2813            x86defs::X86X_MSR_CR_PAT => {
2814                let pat = self.runner.read_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT);
2815                Ok(pat)
2816            }
2817
2818            // The following MSRs are unconditionally read by Linux guests.
2819            // They are not virtualized and are unsupported for L2-VMs.
2820            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2821            | x86defs::X86X_MSR_PLATFORM_INFO
2822            | x86defs::X86X_MSR_PPIN_CTL
2823            | x86defs::X86X_IA32_MSR_SMI_COUNT
2824            | x86defs::X86X_MSR_UMWAIT_CONTROL
2825            | x86defs::X86X_AMD_MSR_DE_CFG
2826            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2827            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2828            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2829            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => Ok(0),
2830
2831            hvdef::HV_X64_MSR_GUEST_IDLE => {
2832                self.backing.cvm.lapics[vtl].activity = MpState::Idle;
2833                self.clear_interrupt_shadow(vtl);
2834                Ok(0)
2835            }
2836            X86X_MSR_EFER => Ok(self.backing.vtls[vtl].efer),
2837
2838            _ => Err(MsrError::Unknown),
2839        }
2840    }
2841
2842    fn write_msr_tdx(&mut self, msr: u32, value: u64, vtl: GuestVtl) -> Result<(), MsrError> {
2843        let state = &mut self.backing.vtls[vtl].private_regs;
2844
2845        match msr {
2846            X86X_MSR_EFER => {
2847                self.write_efer(vtl, value)
2848                    .map_err(|_| MsrError::InvalidAccess)?;
2849                self.update_execution_mode(vtl);
2850            }
2851            x86defs::X86X_MSR_STAR => state.msr_star = value,
2852            x86defs::X86X_MSR_CSTAR => self.backing.vtls[vtl].msr_cstar = value,
2853            x86defs::X86X_MSR_LSTAR => state.msr_lstar = value,
2854            x86defs::X86X_MSR_SFMASK => state.msr_sfmask = value,
2855            x86defs::X86X_MSR_TSC_AUX => state.msr_tsc_aux = value,
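            // The SYSENTER MSRs are kept in VMCS guest-state fields rather
            // than in the private register state.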
2856            x86defs::X86X_MSR_SYSENTER_CS => {
2857                self.runner.write_vmcs32(
2858                    vtl,
2859                    VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
2860                    !0,
2861                    value as u32,
2862                );
2863            }
2864            x86defs::X86X_MSR_SYSENTER_EIP => {
2865                self.runner.write_vmcs64(
2866                    vtl,
2867                    VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
2868                    !0,
2869                    value,
2870                );
2871            }
2872            x86defs::X86X_MSR_SYSENTER_ESP => {
2873                self.runner.write_vmcs64(
2874                    vtl,
2875                    VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
2876                    !0,
2877                    value,
2878                );
2879            }
2880            x86defs::X86X_MSR_XSS => state.msr_xss = value,
2881            x86defs::X86X_MSR_MC_UPDATE_PATCH_LEVEL => {
2882                // Writing zero on Intel platforms is allowed and ignored.
2883                if value != 0 {
2884                    return Err(MsrError::InvalidAccess);
2885                }
2886            }
2887            x86defs::X86X_IA32_MSR_MISC_ENABLE => {}
2888            x86defs::X86X_MSR_CR_PAT => {
2889                self.runner
2890                    .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value);
2891            }
2892
2893            x86defs::X86X_MSR_MCG_STATUS => {
2894                // Writes are swallowed, except for reserved-bit violations.
2895                if x86defs::X86xMcgStatusRegister::from(value).reserved0() != 0 {
2896                    return Err(MsrError::InvalidAccess);
2897                }
2898            }
2899
2900            // Ignore writes to this MSR
2901            x86defs::X86X_MSR_MTRR_DEF_TYPE => {}
2902
2903            // The following MSRs are sometimes written by Windows guests.
2904            // They are not virtualized and are unsupported for L2-VMs.
2905            x86defs::X86X_MSR_BIOS_UPDT_TRIG => {}
2906
2907            // The following MSRs are unconditionally written by Linux guests.
2908            // They are not virtualized and are unsupported for L2-VMs.
2909            x86defs::X86X_MSR_MISC_FEATURE_ENABLES
2910            | x86defs::X86X_MSR_PLATFORM_INFO
2911            | x86defs::X86X_MSR_PPIN_CTL
2912            | x86defs::X86X_IA32_MSR_SMI_COUNT
2913            | x86defs::X86X_MSR_UMWAIT_CONTROL
2914            | x86defs::X86X_AMD_MSR_DE_CFG
2915            | x86defs::X86X_IA32_MSR_RAPL_POWER_UNIT
2916            | x86defs::X86X_IA32_MSR_PKG_ENERGY_STATUS
2917            | x86defs::X86X_IA32_MSR_DRAM_ENERGY_STATUS
2918            | x86defs::X86X_IA32_MSR_PP0_ENERGY_STATUS => {}
2919
2920            _ => return Err(MsrError::Unknown),
2921        }
2922
2923        Ok(())
2924    }
2925
2926    fn write_segment(
2927        &mut self,
2928        vtl: GuestVtl,
2929        seg: TdxSegmentReg,
2930        reg: SegmentRegister,
2931    ) -> Result<(), vp_state::Error> {
2932        // Write the selector, base, and limit.
2933        self.runner
2934            .write_vmcs16(vtl, seg.selector(), !0, reg.selector);
2935        self.runner.write_vmcs64(vtl, seg.base(), !0, reg.base);
2936        self.runner.write_vmcs32(vtl, seg.limit(), !0, reg.limit);
2937
2938        // Mark segment not valid if its attributes indicate not present.
2939        let mut attributes = x86defs::vmx::VmxSegmentAttributes::from(reg.attributes as u32);
2940        attributes.set_null(!attributes.present());
2941
2942        self.runner
2943            .write_vmcs32(vtl, seg.attributes(), !0, attributes.into());
2944
2945        Ok(())
2946    }
2947
2948    fn read_segment(&self, vtl: GuestVtl, seg: TdxSegmentReg) -> SegmentRegister {
2949        let selector = self.runner.read_vmcs16(vtl, seg.selector());
2950        let base = self.runner.read_vmcs64(vtl, seg.base());
2951        let limit = self.runner.read_vmcs32(vtl, seg.limit());
2952        let attributes = self.runner.read_vmcs32(vtl, seg.attributes());
2953
2954        SegmentRegister {
2955            selector,
2956            base,
2957            limit,
2958            attributes: attributes as u16,
2959        }
2960    }
2961
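    /// Returns whether the given VTL is running in long mode, i.e. both
    /// CR0.PE and EFER.LMA are set.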
2962    fn long_mode(&self, vtl: GuestVtl) -> bool {
2963        let backing = &self.backing.vtls[vtl];
2964        backing.cr0.read(&self.runner) & X64_CR0_PE != 0 && backing.efer & X64_EFER_LMA != 0
2965    }
2966}
2967
2968impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
2969    fn vp_index(&self) -> VpIndex {
2970        self.vp.vp_index()
2971    }
2972
2973    fn flush(&mut self) {
2974        // no cached registers are modifiable by the emulator for TDX
2975    }
2976
2977    fn vendor(&self) -> x86defs::cpuid::Vendor {
2978        self.vp.partition.caps.vendor
2979    }
2980
2981    fn gp(&mut self, reg: Gp) -> u64 {
2982        self.vp.runner.tdx_enter_guest_gps()[reg as usize]
2983    }
2984
2985    fn set_gp(&mut self, reg: Gp, v: u64) {
2986        self.vp.runner.tdx_enter_guest_gps_mut()[reg as usize] = v;
2987    }
2988
2989    fn xmm(&mut self, index: usize) -> u128 {
2990        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[index])
2991    }
2992
2993    fn set_xmm(&mut self, index: usize, v: u128) {
2994        self.vp.runner.fx_state_mut().xmm[index] = v.to_ne_bytes();
2995    }
2996
2997    fn rip(&mut self) -> u64 {
2998        self.vp.backing.vtls[self.vtl].private_regs.rip
2999    }
3000
3001    fn set_rip(&mut self, v: u64) {
3002        self.vp.backing.vtls[self.vtl].private_regs.rip = v;
3003    }
3004
3005    fn segment(&mut self, index: Segment) -> x86defs::SegmentRegister {
3006        let tdx_segment_index = match index {
3007            Segment::CS => TdxSegmentReg::Cs,
3008            Segment::ES => TdxSegmentReg::Es,
3009            Segment::SS => TdxSegmentReg::Ss,
3010            Segment::DS => TdxSegmentReg::Ds,
3011            Segment::FS => TdxSegmentReg::Fs,
3012            Segment::GS => TdxSegmentReg::Gs,
3013        };
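        // CS is available directly from the TDX exit info; the other segments
        // are read from the VMCS. Either way, cache the result.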
3014        let reg = match tdx_segment_index {
3015            TdxSegmentReg::Cs => self.cache.segs[index as usize]
3016                .get_or_insert_with(|| TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).cs()),
3017            _ => self.cache.segs[index as usize]
3018                .get_or_insert_with(|| self.vp.read_segment(self.vtl, tdx_segment_index)),
3019        };
3020        (*reg).into()
3021    }
3022
3023    fn efer(&mut self) -> u64 {
3024        self.vp.backing.vtls[self.vtl].efer
3025    }
3026
3027    fn cr0(&mut self) -> u64 {
3028        let reg = self
3029            .cache
3030            .cr0
3031            .get_or_insert_with(|| self.vp.backing.vtls[self.vtl].cr0.read(&self.vp.runner));
3032        *reg
3033    }
3034
3035    fn rflags(&mut self) -> RFlags {
3036        self.vp.backing.vtls[self.vtl].private_regs.rflags.into()
3037    }
3038
3039    fn set_rflags(&mut self, v: RFlags) {
3040        self.vp.backing.vtls[self.vtl].private_regs.rflags = v.into();
3041    }
3042
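    // TDX does not provide instruction bytes on these exits, so return an
    // empty slice; the emulator will fetch the instruction from guest memory.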
3043    fn instruction_bytes(&self) -> &[u8] {
3044        &[]
3045    }
3046
3047    fn physical_address(&self) -> Option<u64> {
3048        TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
3049    }
3050
3051    fn initial_gva_translation(
3052        &mut self,
3053    ) -> Option<virt_support_x86emu::emulate::InitialTranslation> {
3054        let exit_info = TdxExit(self.vp.runner.tdx_vp_enter_exit_info());
3055        let ept_info = VmxEptExitQualification::from(exit_info.qualification());
3056
3057        if exit_info.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION
3058            && ept_info.gva_valid()
3059        {
3060            Some(virt_support_x86emu::emulate::InitialTranslation {
3061                gva: exit_info.gla().expect("already validated EPT exit"),
3062                gpa: exit_info.gpa().expect("already validated EPT exit"),
3063                translate_mode: match ept_info.access_mask() {
3064                    0x1 => TranslateMode::Read,
3065                    // As defined in "Table 28-7. Exit Qualification for EPT
3066                    // Violations" in the Intel SDM, the processor may set both
3067                    // the read and write bits in certain conditions:
3068                    //
3069                    // If accessed and dirty flags for EPT are enabled,
3070                    // processor accesses to guest paging-structure entries are
3071                    // treated as writes with regard to EPT violations (see
3072                    // Section 29.3.3.2). If such an access causes an EPT
3073                    // violation, the processor sets both bit 0 and bit 1 of the
3074                    // exit qualification.
3075                    //
3076                    // Treat both 0x2 and 0x3 as writes.
3077                    0x2 | 0x3 => TranslateMode::Write,
3078                    0x4 => TranslateMode::Execute,
3079                    _ => panic!("unexpected ept access mask 0x{:x}", ept_info.access_mask()),
3080                },
3081            })
3082        } else {
3083            None
3084        }
3085    }
3086
3087    fn interruption_pending(&self) -> bool {
3088        self.interruption_pending
3089    }
3090
3091    fn check_vtl_access(
3092        &mut self,
3093        _gpa: u64,
3094        _mode: TranslateMode,
3095    ) -> Result<(), virt_support_x86emu::emulate::EmuCheckVtlAccessError> {
3096        // Nothing to do here, the guest memory object will handle the check.
3097        Ok(())
3098    }
3099
3100    fn translate_gva(
3101        &mut self,
3102        gva: u64,
3103        mode: TranslateMode,
3104    ) -> Result<
3105        virt_support_x86emu::emulate::EmuTranslateResult,
3106        virt_support_x86emu::emulate::EmuTranslateError,
3107    > {
3108        emulate_translate_gva(self, gva, mode)
3109    }
3110
3111    fn inject_pending_event(&mut self, event_info: hvdef::HvX64PendingEvent) {
3112        assert!(event_info.reg_0.event_pending());
3113        assert_eq!(
3114            event_info.reg_0.event_type(),
3115            hvdef::HV_X64_PENDING_EVENT_EXCEPTION
3116        );
3117        assert!(!self.interruption_pending);
3118
3119        // There's no interruption pending, so just inject the exception
3120        // directly without checking for double fault.
3121        TdxBacked::set_pending_exception(
3122            self.vp,
3123            self.vtl,
3124            HvX64PendingExceptionEvent::from(event_info.reg_0.into_bits()),
3125        );
3126    }
3127
3128    fn is_gpa_mapped(&self, gpa: u64, write: bool) -> bool {
3129        // Ignore the VTOM address bit when checking, since memory is mirrored
3130        // across the VTOM.
3131        let vtom = self.vp.partition.caps.vtom.unwrap_or(0);
3132        debug_assert!(vtom == 0 || vtom.is_power_of_two());
3133        self.vp.partition.is_gpa_mapped(gpa & !vtom, write)
3134    }
3135
3136    fn lapic_base_address(&self) -> Option<u64> {
3137        self.vp.backing.cvm.lapics[self.vtl].lapic.base_address()
3138    }
3139
3140    fn lapic_read(&mut self, address: u64, data: &mut [u8]) {
3141        self.vp.backing.cvm.lapics[self.vtl]
3142            .lapic
3143            .access(&mut TdxApicClient {
3144                partition: self.vp.partition,
3145                dev: self.devices,
3146                vmtime: &self.vp.vmtime,
3147                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
3148                vtl: self.vtl,
3149            })
3150            .mmio_read(address, data);
3151    }
3152
3153    fn lapic_write(&mut self, address: u64, data: &[u8]) {
3154        self.vp.backing.cvm.lapics[self.vtl]
3155            .lapic
3156            .access(&mut TdxApicClient {
3157                partition: self.vp.partition,
3158                dev: self.devices,
3159                vmtime: &self.vp.vmtime,
3160                apic_page: self.vp.runner.tdx_apic_page_mut(self.vtl),
3161                vtl: self.vtl,
3162            })
3163            .mmio_write(address, data);
3164    }
3165
3166    fn monitor_support(&self) -> Option<&dyn EmulatorMonitorSupport> {
3167        Some(self)
3168    }
3169}
3170
3171#[derive(Debug)]
3172enum TdxSegmentReg {
3173    Es,
3174    Cs,
3175    Ss,
3176    Ds,
3177    Fs,
3178    Gs,
3179    Ldtr,
3180    Tr,
3181}
3182
3183impl TdxSegmentReg {
3184    /// The selector vmcs field code.
3185    fn selector(&self) -> VmcsField {
3186        match self {
3187            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_SELECTOR,
3188            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_SELECTOR,
3189            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_SELECTOR,
3190            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_SELECTOR,
3191            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_SELECTOR,
3192            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_SELECTOR,
3193            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_SELECTOR,
3194            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_SELECTOR,
3195        }
3196    }
3197
3198    /// The base vmcs field code.
3199    fn base(&self) -> VmcsField {
3200        match self {
3201            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_BASE,
3202            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_BASE,
3203            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_BASE,
3204            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_BASE,
3205            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_BASE,
3206            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_BASE,
3207            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_BASE,
3208            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_BASE,
3209        }
3210    }
3211
3212    /// The limit vmcs field code.
3213    fn limit(&self) -> VmcsField {
3214        match self {
3215            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_LIMIT,
3216            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_LIMIT,
3217            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_LIMIT,
3218            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_LIMIT,
3219            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_LIMIT,
3220            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_LIMIT,
3221            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_LIMIT,
3222            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_LIMIT,
3223        }
3224    }
3225
3226    /// The attributes vmcs field code.
3227    fn attributes(&self) -> VmcsField {
3228        match self {
3229            Self::Es => VmcsField::VMX_VMCS_GUEST_ES_AR,
3230            Self::Cs => VmcsField::VMX_VMCS_GUEST_CS_AR,
3231            Self::Ss => VmcsField::VMX_VMCS_GUEST_SS_AR,
3232            Self::Ds => VmcsField::VMX_VMCS_GUEST_DS_AR,
3233            Self::Fs => VmcsField::VMX_VMCS_GUEST_FS_AR,
3234            Self::Gs => VmcsField::VMX_VMCS_GUEST_GS_AR,
3235            Self::Ldtr => VmcsField::VMX_VMCS_GUEST_LDTR_AR,
3236            Self::Tr => VmcsField::VMX_VMCS_GUEST_TR_AR,
3237        }
3238    }
3239}
3240
3241#[derive(Debug)]
3242enum TdxTableReg {
3243    Idtr,
3244    Gdtr,
3245}
3246
3247impl TdxTableReg {
3248    fn base_code(&self) -> VmcsField {
3249        match self {
3250            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_BASE,
3251            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_BASE,
3252        }
3253    }
3254
3255    fn limit_code(&self) -> VmcsField {
3256        match self {
3257            Self::Idtr => VmcsField::VMX_VMCS_GUEST_IDTR_LIMIT,
3258            Self::Gdtr => VmcsField::VMX_VMCS_GUEST_GDTR_LIMIT,
3259        }
3260    }
3261}
3262
3263impl UhProcessor<'_, TdxBacked> {
3264    /// Handle a write to EFER, which requires special handling on TDX due to
3265    /// required bits and state updates.
3266    ///
3267    /// Note that a caller must also call [`Self::update_execution_mode`] after
3268    /// updating EFER.
3269    fn write_efer(&mut self, vtl: GuestVtl, efer: u64) -> Result<(), vp_state::Error> {
3270        if efer & (X64_EFER_SVME | X64_EFER_FFXSR) != 0 {
3271            return Err(vp_state::Error::InvalidValue(
3272                efer,
3273                "EFER",
3274                "SVME or FFXSR set",
3275            ));
3276        }
3277
3278        // EFER.NXE must be 1.
3279        if efer & X64_EFER_NXE == 0 {
3280            return Err(vp_state::Error::InvalidValue(efer, "EFER", "NXE not set"));
3281        }
3282
3283        // Update the local value of EFER and the VMCS.
3284        if self.backing.vtls[vtl].efer != efer {
3285            self.backing.vtls[vtl].efer = efer;
3286            self.runner
3287                .write_vmcs64(vtl, VmcsField::VMX_VMCS_GUEST_EFER, !0, efer);
3288        }
3289
3290        Ok(())
3291    }
3292
3293    /// Read CR0 that includes guest shadowed bits. This is the value the guest
3294    /// sees.
3295    fn read_cr0(&self, vtl: GuestVtl) -> u64 {
3296        self.backing.vtls[vtl].cr0.read(&self.runner)
3297    }
3298
3299    /// Write to the guest CR0.
3300    fn write_cr0(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3301        self.backing.vtls[vtl]
3302            .cr0
3303            .write(value | X64_CR0_ET, &mut self.runner)
3304    }
3305
3306    fn read_cr4(&self, vtl: GuestVtl) -> u64 {
3307        self.backing.vtls[vtl].cr4.read(&self.runner)
3308    }
3309
3310    fn write_cr4(&mut self, vtl: GuestVtl, value: u64) -> Result<(), vp_state::Error> {
3311        self.backing.vtls[vtl].cr4.write(value, &mut self.runner)
3312    }
3313
3314    fn write_table_register(&mut self, vtl: GuestVtl, table: TdxTableReg, reg: TableRegister) {
3315        self.runner
3316            .write_vmcs64(vtl, table.base_code(), !0, reg.base);
3317        self.runner
3318            .write_vmcs32(vtl, table.limit_code(), !0, reg.limit.into());
3319    }
3320
3321    fn read_table_register(&self, vtl: GuestVtl, table: TdxTableReg) -> TableRegister {
3322        let base = self.runner.read_vmcs64(vtl, table.base_code());
3323        let limit = self.runner.read_vmcs32(vtl, table.limit_code());
3324
3325        TableRegister {
3326            base,
3327            limit: limit as u16,
3328        }
3329    }
3330
3331    /// Update execution mode when CR0 or EFER is changed.
3332    fn update_execution_mode(&mut self, vtl: GuestVtl) {
3333        let lme = self.backing.vtls[vtl].efer & X64_EFER_LME == X64_EFER_LME;
3334        let pg = self.read_cr0(vtl) & X64_CR0_PG == X64_CR0_PG;
3335        let efer_lma = self.backing.vtls[vtl].efer & X64_EFER_LMA == X64_EFER_LMA;
3336        let lma = lme && pg;
3337
3338        if lma != efer_lma {
3339            // Flip only the LMA bit.
3340            let new_efer = self.backing.vtls[vtl].efer ^ X64_EFER_LMA;
3341            self.write_efer(vtl, new_efer)
3342                .expect("EFER was valid before, it should still be valid");
3343        }
3344
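        // Keep the VM-entry control's long mode guest bit in sync with
        // EFER.LMA.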
3345        self.runner.write_vmcs32(
3346            vtl,
3347            VmcsField::VMX_VMCS_ENTRY_CONTROLS,
3348            VMX_ENTRY_CONTROL_LONG_MODE_GUEST,
3349            if lma {
3350                VMX_ENTRY_CONTROL_LONG_MODE_GUEST
3351            } else {
3352                0
3353            },
3354        );
3355    }
3356
3357    async fn emulate_gdtr_or_idtr(
3358        &mut self,
3359        vtl: GuestVtl,
3360        dev: &impl CpuIo,
3361    ) -> Result<(), VpHaltReason> {
3362        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3363        assert_eq!(
3364            exit_info.code().vmx_exit().basic_reason(),
3365            VmxExitBasic::GDTR_OR_IDTR
3366        );
3367        let instr_info = GdtrOrIdtrInstructionInfo::from(exit_info.instr_info().info());
3368
3369        // Inject #GP if a load instruction is executed outside of kernel mode,
3370        // or if a store instruction is blocked by UMIP.
3371        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3372            || (!instr_info.instruction().is_load()
3373                && exit_info.cpl() > 0
3374                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3375        {
3376            self.inject_gpf(vtl);
3377            return Ok(());
3378        }
3379
3380        let (gva, segment) = self.compute_gva_for_table_access_emulation(
3381            exit_info.qualification(),
3382            (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3383            (!instr_info.index_register_invalid()).then_some(instr_info.index_register()),
3384            instr_info.scaling(),
3385            instr_info.address_size(),
3386            instr_info.segment_register(),
3387        );
3388
3389        let gm = &self.partition.gm[vtl];
3390        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
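        // The descriptor table memory operand is a 16-bit limit followed by a
        // 32-bit base in legacy mode or a 64-bit base in long mode.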
3391        let len = 2 + if self.long_mode(vtl) { 8 } else { 4 };
3392        let mut buf = [0u8; 10];
3393
3394        match instr_info.instruction() {
3395            GdtrOrIdtrInstruction::Sidt | GdtrOrIdtrInstruction::Sgdt => {
3396                let table = self.read_table_register(
3397                    vtl,
3398                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Sidt) {
3399                        TdxTableReg::Idtr
3400                    } else {
3401                        TdxTableReg::Gdtr
3402                    },
3403                );
3404                buf[..2].copy_from_slice(&table.limit.to_le_bytes());
3405                buf[2..].copy_from_slice(&table.base.to_le_bytes());
3406                let mut emulation_state = UhEmulationState {
3407                    vp: &mut *self,
3408                    interruption_pending,
3409                    devices: dev,
3410                    vtl,
3411                    cache: TdxEmulationCache::default(),
3412                };
3413                emulate_insn_memory_op(
3414                    &mut emulation_state,
3415                    gm,
3416                    dev,
3417                    gva,
3418                    segment,
3419                    x86emu::AlignmentMode::Unaligned,
3420                    EmulatedMemoryOperation::Write(&buf[..len]),
3421                )
3422                .await?;
3423            }
3424
3425            GdtrOrIdtrInstruction::Lgdt | GdtrOrIdtrInstruction::Lidt => {
3426                let mut emulation_state = UhEmulationState {
3427                    vp: &mut *self,
3428                    interruption_pending,
3429                    devices: dev,
3430                    vtl,
3431                    cache: TdxEmulationCache::default(),
3432                };
3433                emulate_insn_memory_op(
3434                    &mut emulation_state,
3435                    gm,
3436                    dev,
3437                    gva,
3438                    segment,
3439                    x86emu::AlignmentMode::Unaligned,
3440                    EmulatedMemoryOperation::Read(&mut buf[..len]),
3441                )
3442                .await?;
3443                let table = TableRegister {
3444                    limit: u16::from_le_bytes(buf[..2].try_into().unwrap()),
3445                    base: u64::from_le_bytes(buf[2..len].try_into().unwrap()),
3446                };
3447                self.write_table_register(
3448                    vtl,
3449                    if matches!(instr_info.instruction(), GdtrOrIdtrInstruction::Lidt) {
3450                        TdxTableReg::Idtr
3451                    } else {
3452                        TdxTableReg::Gdtr
3453                    },
3454                    table,
3455                );
3456            }
3457        }
3458
3459        self.advance_to_next_instruction(vtl);
3460        Ok(())
3461    }
3462
3463    async fn emulate_ldtr_or_tr(
3464        &mut self,
3465        vtl: GuestVtl,
3466        dev: &impl CpuIo,
3467    ) -> Result<(), VpHaltReason> {
3468        let exit_info = TdxExit(self.runner.tdx_vp_enter_exit_info());
3469        assert_eq!(
3470            exit_info.code().vmx_exit().basic_reason(),
3471            VmxExitBasic::LDTR_OR_TR
3472        );
3473        let instr_info = LdtrOrTrInstructionInfo::from(exit_info.instr_info().info());
3474
3475        // Inject #GP if a load instruction is executed outside of kernel mode,
3476        // or if a store instruction is blocked by UMIP.
3477        if (instr_info.instruction().is_load() && exit_info.cpl() != 0)
3478            || (!instr_info.instruction().is_load()
3479                && exit_info.cpl() > 0
3480                && self.read_cr4(vtl) & X64_CR4_UMIP != 0)
3481        {
3482            self.inject_gpf(vtl);
3483            return Ok(());
3484        }
3485
3486        let gm = &self.partition.gm[vtl];
3487        let interruption_pending = self.backing.vtls[vtl].interruption_information.valid();
3488
3489        match instr_info.instruction() {
3490            LdtrOrTrInstruction::Sldt | LdtrOrTrInstruction::Str => {
3491                let value = self.runner.read_vmcs16(
3492                    vtl,
3493                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Sldt) {
3494                        TdxSegmentReg::Ldtr
3495                    } else {
3496                        TdxSegmentReg::Tr
3497                    }
3498                    .selector(),
3499                );
3500
3501                if instr_info.memory_or_register() {
3502                    let gps = self.runner.tdx_enter_guest_gps_mut();
3503                    gps[instr_info.register_1() as usize] = value.into();
3504                } else {
3505                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3506                        exit_info.qualification(),
3507                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3508                        (!instr_info.index_register_invalid())
3509                            .then_some(instr_info.index_register()),
3510                        instr_info.scaling(),
3511                        instr_info.address_size(),
3512                        instr_info.segment_register(),
3513                    );
3514                    let mut emulation_state = UhEmulationState {
3515                        vp: &mut *self,
3516                        interruption_pending,
3517                        devices: dev,
3518                        vtl,
3519                        cache: TdxEmulationCache::default(),
3520                    };
3521                    emulate_insn_memory_op(
3522                        &mut emulation_state,
3523                        gm,
3524                        dev,
3525                        gva,
3526                        segment,
3527                        x86emu::AlignmentMode::Standard,
3528                        EmulatedMemoryOperation::Write(&value.to_le_bytes()),
3529                    )
3530                    .await?;
3531                }
3532            }
3533
3534            LdtrOrTrInstruction::Lldt | LdtrOrTrInstruction::Ltr => {
3535                let value = if instr_info.memory_or_register() {
3536                    let gps = self.runner.tdx_enter_guest_gps();
3537                    gps[instr_info.register_1() as usize] as u16
3538                } else {
3539                    let (gva, segment) = self.compute_gva_for_table_access_emulation(
3540                        exit_info.qualification(),
3541                        (!instr_info.base_register_invalid()).then_some(instr_info.base_register()),
3542                        (!instr_info.index_register_invalid())
3543                            .then_some(instr_info.index_register()),
3544                        instr_info.scaling(),
3545                        instr_info.address_size(),
3546                        instr_info.segment_register(),
3547                    );
3548                    let mut emulation_state = UhEmulationState {
3549                        vp: &mut *self,
3550                        interruption_pending,
3551                        devices: dev,
3552                        vtl,
3553                        cache: TdxEmulationCache::default(),
3554                    };
3555                    let mut buf = [0u8; 2];
3556                    emulate_insn_memory_op(
3557                        &mut emulation_state,
3558                        gm,
3559                        dev,
3560                        gva,
3561                        segment,
3562                        x86emu::AlignmentMode::Standard,
3563                        EmulatedMemoryOperation::Read(&mut buf),
3564                    )
3565                    .await?;
3566                    u16::from_le_bytes(buf)
3567                };
3568                self.runner.write_vmcs16(
3569                    vtl,
3570                    if matches!(instr_info.instruction(), LdtrOrTrInstruction::Lldt) {
3571                        TdxSegmentReg::Ldtr
3572                    } else {
3573                        TdxSegmentReg::Tr
3574                    }
3575                    .selector(),
3576                    !0,
3577                    value,
3578                );
3579            }
3580        }
3581
3582        self.advance_to_next_instruction(vtl);
3583        Ok(())
3584    }
3585
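    /// Computes the guest virtual address and segment for a descriptor table
    /// or LDTR/TR access: the displacement (from the exit qualification) plus
    /// the optional base and scaled index registers, masked to the
    /// instruction's address size.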
3586    fn compute_gva_for_table_access_emulation(
3587        &self,
3588        qualification: u64,
3589        base_reg: Option<u8>,
3590        index_reg: Option<u8>,
3591        scaling: u8,
3592        address_size: u8,
3593        segment_register: u8,
3594    ) -> (u64, Segment) {
3595        let gps = self.runner.tdx_enter_guest_gps();
3596
3597        // Displacement is stored in the qualification field for these instructions.
3598        let mut gva = qualification;
3599        if let Some(base_register) = base_reg {
3600            gva = gva.wrapping_add(gps[base_register as usize]);
3601        }
3602        if let Some(index_register) = index_reg {
3603            gva = gva.wrapping_add(gps[index_register as usize] << scaling);
3604        }
3605        match address_size {
3606            // 16-bit address size
3607            0 => gva &= 0xFFFF,
3608            // 32-bit address size
3609            1 => gva &= 0xFFFFFFFF,
3610            // 64-bit address size
3611            2 => {}
3612            _ => unreachable!(),
3613        }
3614
3615        let segment = match segment_register {
3616            0 => Segment::ES,
3617            1 => Segment::CS,
3618            2 => Segment::SS,
3619            3 => Segment::DS,
3620            4 => Segment::FS,
3621            5 => Segment::GS,
3622            _ => unreachable!(),
3623        };
3624
3625        (gva, segment)
3626    }
3627}
3628
3629struct TdxApicClient<'a, T> {
3630    partition: &'a UhPartitionInner,
3631    apic_page: &'a mut ApicPage,
3632    dev: &'a T,
3633    vmtime: &'a VmTimeAccess,
3634    vtl: GuestVtl,
3635}
3636
3637impl<T: CpuIo> ApicClient for TdxApicClient<'_, T> {
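    // CR8 mirrors the upper four bits of the APIC TPR.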
3638    fn cr8(&mut self) -> u32 {
3639        self.apic_page.tpr.value >> 4
3640    }
3641
3642    fn set_cr8(&mut self, value: u32) {
3643        self.apic_page.tpr.value = value << 4;
3644    }
3645
3646    fn set_apic_base(&mut self, _value: u64) {
3647        // No-op: the APIC base is stored in the APIC itself.
3648    }
3649
3650    fn wake(&mut self, vp_index: VpIndex) {
3651        self.partition.vps[vp_index.index() as usize].wake(self.vtl, WakeReason::INTCON);
3652    }
3653
3654    fn eoi(&mut self, vector: u8) {
3655        self.dev.handle_eoi(vector.into())
3656    }
3657
3658    fn now(&mut self) -> vmcore::vmtime::VmTime {
3659        self.vmtime.now()
3660    }
3661
3662    fn pull_offload(&mut self) -> ([u32; 8], [u32; 8]) {
3663        pull_apic_offload(self.apic_page)
3664    }
3665}
3666
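/// Takes (and clears) the IRR and ISR bits from the offloaded APIC page,
/// returning them to the caller.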
3667fn pull_apic_offload(page: &mut ApicPage) -> ([u32; 8], [u32; 8]) {
3668    let mut irr = [0; 8];
3669    let mut isr = [0; 8];
3670    for (((irr, page_irr), isr), page_isr) in irr
3671        .iter_mut()
3672        .zip(page.irr.iter_mut())
3673        .zip(isr.iter_mut())
3674        .zip(page.isr.iter_mut())
3675    {
3676        *irr = std::mem::take(&mut page_irr.value);
3677        *isr = std::mem::take(&mut page_isr.value);
3678    }
3679    (irr, isr)
3680}
3681
3682impl<T> hv1_hypercall::X64RegisterState for UhHypercallHandler<'_, '_, T, TdxBacked> {
3683    fn rip(&mut self) -> u64 {
3684        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip
3685    }
3686
3687    fn set_rip(&mut self, rip: u64) {
3688        self.vp.backing.vtls[self.intercepted_vtl].private_regs.rip = rip;
3689    }
3690
3691    fn gp(&mut self, n: hv1_hypercall::X64HypercallRegister) -> u64 {
3692        let gps = self.vp.runner.tdx_enter_guest_gps();
3693        match n {
3694            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX],
3695            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX],
3696            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX],
3697            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX],
3698            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI],
3699            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI],
3700            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8],
3701        }
3702    }
3703
3704    fn set_gp(&mut self, n: hv1_hypercall::X64HypercallRegister, value: u64) {
3705        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3706        match n {
3707            hv1_hypercall::X64HypercallRegister::Rax => gps[TdxGp::RAX] = value,
3708            hv1_hypercall::X64HypercallRegister::Rcx => gps[TdxGp::RCX] = value,
3709            hv1_hypercall::X64HypercallRegister::Rdx => gps[TdxGp::RDX] = value,
3710            hv1_hypercall::X64HypercallRegister::Rbx => gps[TdxGp::RBX] = value,
3711            hv1_hypercall::X64HypercallRegister::Rsi => gps[TdxGp::RSI] = value,
3712            hv1_hypercall::X64HypercallRegister::Rdi => gps[TdxGp::RDI] = value,
3713            hv1_hypercall::X64HypercallRegister::R8 => gps[TdxGp::R8] = value,
3714        }
3715    }
3716
3717    // TODO: cleanup xmm to not use same as mshv
3718    fn xmm(&mut self, n: usize) -> u128 {
3719        u128::from_ne_bytes(self.vp.runner.fx_state().xmm[n])
3720    }
3721
3722    fn set_xmm(&mut self, n: usize, value: u128) {
3723        self.vp.runner.fx_state_mut().xmm[n] = value.to_ne_bytes();
3724    }
3725}
3726
3727impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
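    /// Hypercalls dispatched on the regular hypercall path, as opposed to the
    /// tdg.vp.vmcall path handled by [`Self::TDCALL_DISPATCHER`].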
3728    const TDX_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3729        Self,
3730        [
3731            hv1_hypercall::HvModifySparseGpaPageHostVisibility,
3732            hv1_hypercall::HvQuerySparseGpaPageHostVisibility,
3733            hv1_hypercall::HvX64StartVirtualProcessor,
3734            hv1_hypercall::HvGetVpIndexFromApicId,
3735            hv1_hypercall::HvRetargetDeviceInterrupt,
3736            hv1_hypercall::HvFlushVirtualAddressList,
3737            hv1_hypercall::HvFlushVirtualAddressListEx,
3738            hv1_hypercall::HvFlushVirtualAddressSpace,
3739            hv1_hypercall::HvFlushVirtualAddressSpaceEx,
3740            hv1_hypercall::HvPostMessage,
3741            hv1_hypercall::HvSignalEvent,
3742            hv1_hypercall::HvExtQueryCapabilities,
3743            hv1_hypercall::HvGetVpRegisters,
3744            hv1_hypercall::HvSetVpRegisters,
3745            hv1_hypercall::HvEnablePartitionVtl,
3746            hv1_hypercall::HvX64EnableVpVtl,
3747            hv1_hypercall::HvVtlCall,
3748            hv1_hypercall::HvVtlReturn,
3749            hv1_hypercall::HvModifyVtlProtectionMask,
3750            hv1_hypercall::HvX64TranslateVirtualAddress,
3751            hv1_hypercall::HvSendSyntheticClusterIpi,
3752            hv1_hypercall::HvSendSyntheticClusterIpiEx,
3753            hv1_hypercall::HvInstallIntercept,
3754            hv1_hypercall::HvAssertVirtualInterrupt,
3755        ]
3756    );
3757
3758    /// Hypercalls that come through a tdg.vp.vmcall tdcall instruction.
3759    ///
3760    /// This is just to handle the proxy synic.
3761    const TDCALL_DISPATCHER: hv1_hypercall::Dispatcher<Self> = hv1_hypercall::dispatcher!(
3762        Self,
3763        [hv1_hypercall::HvPostMessage, hv1_hypercall::HvSignalEvent],
3764    );
3765}
3766
3767impl AccessVpState for UhVpStateAccess<'_, '_, TdxBacked> {
3768    type Error = vp_state::Error;
3769
3770    fn caps(&self) -> &virt::x86::X86PartitionCapabilities {
3771        &self.vp.partition.caps
3772    }
3773
3774    fn commit(&mut self) -> Result<(), Self::Error> {
3775        Ok(())
3776    }
3777
3778    fn registers(&mut self) -> Result<Registers, Self::Error> {
3779        let gps = self.vp.runner.tdx_enter_guest_gps();
3780
3781        let cs = self.vp.read_segment(self.vtl, TdxSegmentReg::Cs);
3782        let ds = self.vp.read_segment(self.vtl, TdxSegmentReg::Ds);
3783        let es = self.vp.read_segment(self.vtl, TdxSegmentReg::Es);
3784        let fs = self.vp.read_segment(self.vtl, TdxSegmentReg::Fs);
3785        let gs = self.vp.read_segment(self.vtl, TdxSegmentReg::Gs);
3786        let ss = self.vp.read_segment(self.vtl, TdxSegmentReg::Ss);
3787        let tr = self.vp.read_segment(self.vtl, TdxSegmentReg::Tr);
3788        let ldtr = self.vp.read_segment(self.vtl, TdxSegmentReg::Ldtr);
3789
3790        let gdtr = self.vp.read_table_register(self.vtl, TdxTableReg::Gdtr);
3791        let idtr = self.vp.read_table_register(self.vtl, TdxTableReg::Idtr);
3792
3793        let cr0 = self.vp.read_cr0(self.vtl);
3794        let cr2 = self.vp.runner.cr2();
3795        let cr3 = self
3796            .vp
3797            .runner
3798            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3);
3799        let cr4 = self.vp.read_cr4(self.vtl);
3800
3801        let cr8 = self.vp.runner.tdx_apic_page(self.vtl).tpr.value >> 4;
3802
3803        let efer = self.vp.backing.vtls[self.vtl].efer;
3804
3805        Ok(Registers {
3806            rax: gps[TdxGp::RAX],
3807            rcx: gps[TdxGp::RCX],
3808            rdx: gps[TdxGp::RDX],
3809            rbx: gps[TdxGp::RBX],
3810            rsp: self.vp.backing.vtls[self.vtl].private_regs.rsp,
3811            rbp: gps[TdxGp::RBP],
3812            rsi: gps[TdxGp::RSI],
3813            rdi: gps[TdxGp::RDI],
3814            r8: gps[TdxGp::R8],
3815            r9: gps[TdxGp::R9],
3816            r10: gps[TdxGp::R10],
3817            r11: gps[TdxGp::R11],
3818            r12: gps[TdxGp::R12],
3819            r13: gps[TdxGp::R13],
3820            r14: gps[TdxGp::R14],
3821            r15: gps[TdxGp::R15],
3822            rip: self.vp.backing.vtls[self.vtl].private_regs.rip,
3823            rflags: self.vp.backing.vtls[self.vtl].private_regs.rflags,
3824            cs,
3825            ds,
3826            es,
3827            fs,
3828            gs,
3829            ss,
3830            tr,
3831            ldtr,
3832            gdtr,
3833            idtr,
3834            cr0,
3835            cr2,
3836            cr3,
3837            cr4,
3838            cr8: cr8.into(),
3839            efer,
3840        })
3841    }
3842
3843    fn set_registers(&mut self, value: &Registers) -> Result<(), Self::Error> {
3844        let Registers {
3845            rax,
3846            rcx,
3847            rdx,
3848            rbx,
3849            rsp,
3850            rbp,
3851            rsi,
3852            rdi,
3853            r8,
3854            r9,
3855            r10,
3856            r11,
3857            r12,
3858            r13,
3859            r14,
3860            r15,
3861            rip,
3862            rflags,
3863            cs,
3864            ds,
3865            es,
3866            fs,
3867            gs,
3868            ss,
3869            tr,
3870            ldtr,
3871            gdtr,
3872            idtr,
3873            cr0,
3874            cr2,
3875            cr3,
3876            cr4,
3877            cr8,
3878            efer,
3879        } = value;
3880
3881        let gps = self.vp.runner.tdx_enter_guest_gps_mut();
3882        gps[TdxGp::RAX] = *rax;
3883        gps[TdxGp::RCX] = *rcx;
3884        gps[TdxGp::RDX] = *rdx;
3885        gps[TdxGp::RBX] = *rbx;
3886        self.vp.backing.vtls[self.vtl].private_regs.rsp = *rsp;
3887        gps[TdxGp::RBP] = *rbp;
3888        gps[TdxGp::RSI] = *rsi;
3889        gps[TdxGp::RDI] = *rdi;
3890        gps[TdxGp::R8] = *r8;
3891        gps[TdxGp::R9] = *r9;
3892        gps[TdxGp::R10] = *r10;
3893        gps[TdxGp::R11] = *r11;
3894        gps[TdxGp::R12] = *r12;
3895        gps[TdxGp::R13] = *r13;
3896        gps[TdxGp::R14] = *r14;
3897        gps[TdxGp::R15] = *r15;
3898        self.vp.backing.vtls[self.vtl].private_regs.rip = *rip;
3899        // BUGBUG: rflags set also updates interrupts in hcl
3900        self.vp.backing.vtls[self.vtl].private_regs.rflags = *rflags;
3901
3902        // Set segment registers
3903        self.vp.write_segment(self.vtl, TdxSegmentReg::Cs, *cs)?;
3904        self.vp.write_segment(self.vtl, TdxSegmentReg::Ds, *ds)?;
3905        self.vp.write_segment(self.vtl, TdxSegmentReg::Es, *es)?;
3906        self.vp.write_segment(self.vtl, TdxSegmentReg::Fs, *fs)?;
3907        self.vp.write_segment(self.vtl, TdxSegmentReg::Gs, *gs)?;
3908        self.vp.write_segment(self.vtl, TdxSegmentReg::Ss, *ss)?;
3909        self.vp.write_segment(self.vtl, TdxSegmentReg::Tr, *tr)?;
3910        self.vp
3911            .write_segment(self.vtl, TdxSegmentReg::Ldtr, *ldtr)?;
3912
3913        // Set table registers
3914        self.vp
3915            .write_table_register(self.vtl, TdxTableReg::Gdtr, *gdtr);
3916        self.vp
3917            .write_table_register(self.vtl, TdxTableReg::Idtr, *idtr);
3918
3919        self.vp.write_cr0(self.vtl, *cr0)?;
3920
3921        // CR2 is shared with the kernel, so set it in the VP run page, where it
3922        // will be applied before lower VTL entry.
3923        self.vp.runner.set_cr2(*cr2);
3924
3925        self.vp
3926            .runner
3927            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_CR3, !0, *cr3);
3928
3929        self.vp.write_cr4(self.vtl, *cr4)?;
3930
3931        self.vp.runner.tdx_apic_page_mut(self.vtl).tpr.value = (*cr8 << 4) as u32;
3932
3933        self.vp.write_efer(self.vtl, *efer)?;
3934
3935        // Execution mode must be updated after setting EFER and CR0.
3936        self.vp.update_execution_mode(self.vtl);
3937
3938        Ok(())
3939    }
3940
3941    fn activity(&mut self) -> Result<vp::Activity, Self::Error> {
3942        let lapic = &self.vp.backing.cvm.lapics[self.vtl];
3943        let interruptibility: Interruptibility = self
3944            .vp
3945            .runner
3946            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY)
3947            .into();
3948        Ok(vp::Activity {
3949            mp_state: lapic.activity,
3950            nmi_pending: lapic.nmi_pending,
3951            nmi_masked: interruptibility.blocked_by_nmi(),
3952            interrupt_shadow: interruptibility.blocked_by_sti()
3953                || interruptibility.blocked_by_movss(),
3954            pending_event: None,        // TODO TDX
3955            pending_interruption: None, // TODO TDX
3956        })
3957    }
3958
3959    fn set_activity(&mut self, value: &vp::Activity) -> Result<(), Self::Error> {
3960        let &vp::Activity {
3961            mp_state,
3962            nmi_pending,
3963            nmi_masked,
3964            interrupt_shadow,
3965            pending_event: _,        // TODO TDX
3966            pending_interruption: _, // TODO TDX
3967        } = value;
3968        self.vp.backing.cvm.lapics[self.vtl].activity = mp_state;
3969        self.vp.backing.cvm.lapics[self.vtl].nmi_pending = nmi_pending;
3970        let interruptibility = Interruptibility::new()
3971            .with_blocked_by_movss(interrupt_shadow)
3972            .with_blocked_by_nmi(nmi_masked);
3973        self.vp.runner.write_vmcs32(
3974            self.vtl,
3975            VmcsField::VMX_VMCS_GUEST_INTERRUPTIBILITY,
3976            !0,
3977            interruptibility.into(),
3978        );
3979        Ok(())
3980    }
3981
3982    fn xsave(&mut self) -> Result<vp::Xsave, Self::Error> {
3983        // TODO: needed?
3984        Err(vp_state::Error::Unimplemented("xsave"))
3985    }
3986
3987    fn set_xsave(&mut self, _value: &vp::Xsave) -> Result<(), Self::Error> {
3988        // TODO: needed?
3989        Err(vp_state::Error::Unimplemented("xsave"))
3990    }
3991
3992    fn apic(&mut self) -> Result<vp::Apic, Self::Error> {
3993        self.vp.access_apic_without_offload(self.vtl, |vp| {
3994            Ok(vp.backing.cvm.lapics[self.vtl].lapic.save())
3995        })
3996    }
3997
3998    fn set_apic(&mut self, value: &vp::Apic) -> Result<(), Self::Error> {
3999        self.vp.access_apic_without_offload(self.vtl, |vp| {
4000            vp.backing.cvm.lapics[self.vtl]
4001                .lapic
4002                .restore(value)
4003                .map_err(vp_state::Error::InvalidApicBase)?;
4004
4005            Ok(())
4006        })
4007    }
4008
4009    fn xcr(&mut self) -> Result<vp::Xcr0, Self::Error> {
4010        Ok(vp::Xcr0 {
4011            value: self
4012                .vp
4013                .runner
4014                .get_vp_register(self.vtl, HvX64RegisterName::Xfem)
4015                .unwrap()
4016                .as_u64(),
4017        })
4018    }
4019
4020    fn set_xcr(&mut self, _value: &vp::Xcr0) -> Result<(), Self::Error> {
4021        Err(vp_state::Error::Unimplemented("xcr"))
4022    }
4023
4024    fn xss(&mut self) -> Result<vp::Xss, Self::Error> {
4025        Ok(vp::Xss {
4026            value: self.vp.backing.vtls[self.vtl].private_regs.msr_xss,
4027        })
4028    }
4029
4030    fn set_xss(&mut self, value: &vp::Xss) -> Result<(), Self::Error> {
4031        self.vp.backing.vtls[self.vtl].private_regs.msr_xss = value.value;
4032        Ok(())
4033    }
4034
4035    fn mtrrs(&mut self) -> Result<vp::Mtrrs, Self::Error> {
4036        Ok(vp::Mtrrs {
4037            msr_mtrr_def_type: 0, // TODO TDX: MTRRs
4038            fixed: [0; 11],       // TODO TDX: MTRRs
4039            variable: [0; 16],    // TODO TDX: MTRRs
4040        })
4041    }
4042
4043    fn set_mtrrs(&mut self, _value: &vp::Mtrrs) -> Result<(), Self::Error> {
4044        // TODO TDX: MTRRs
4045        Ok(())
4046    }
4047
4048    fn pat(&mut self) -> Result<vp::Pat, Self::Error> {
4049        let msr_cr_pat = self
4050            .vp
4051            .runner
4052            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT);
4053        Ok(vp::Pat { value: msr_cr_pat })
4054    }
4055
4056    fn set_pat(&mut self, value: &vp::Pat) -> Result<(), Self::Error> {
4057        self.vp
4058            .runner
4059            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_PAT, !0, value.value);
4060        Ok(())
4061    }
4062
4063    fn virtual_msrs(&mut self) -> Result<vp::VirtualMsrs, Self::Error> {
4064        let state = &self.vp.backing.vtls[self.vtl].private_regs;
4065
4066        let sysenter_cs = self
4067            .vp
4068            .runner
4069            .read_vmcs32(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR)
4070            .into();
4071        let sysenter_eip = self
4072            .vp
4073            .runner
4074            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR);
4075        let sysenter_esp = self
4076            .vp
4077            .runner
4078            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR);
4079
4080        Ok(vp::VirtualMsrs {
4081            kernel_gs_base: state.msr_kernel_gs_base,
4082            sysenter_cs,
4083            sysenter_eip,
4084            sysenter_esp,
4085            star: state.msr_star,
4086            lstar: state.msr_lstar,
4087            cstar: self.vp.backing.vtls[self.vtl].msr_cstar,
4088            sfmask: state.msr_sfmask,
4089        })
4090    }
4091
4092    fn set_virtual_msrs(&mut self, value: &vp::VirtualMsrs) -> Result<(), Self::Error> {
4093        let &vp::VirtualMsrs {
4094            kernel_gs_base,
4095            sysenter_cs,
4096            sysenter_eip,
4097            sysenter_esp,
4098            star,
4099            lstar,
4100            cstar,
4101            sfmask,
4102        } = value;
4103
4104        let state = &mut self.vp.backing.vtls[self.vtl].private_regs;
4105        state.msr_kernel_gs_base = kernel_gs_base;
4106        state.msr_star = star;
4107        state.msr_lstar = lstar;
4108        state.msr_sfmask = sfmask;
4109
4110        self.vp.runner.write_vmcs32(
4111            self.vtl,
4112            VmcsField::VMX_VMCS_GUEST_SYSENTER_CS_MSR,
4113            !0,
4114            sysenter_cs as u32,
4115        );
4116        self.vp.runner.write_vmcs64(
4117            self.vtl,
4118            VmcsField::VMX_VMCS_GUEST_SYSENTER_EIP_MSR,
4119            !0,
4120            sysenter_eip,
4121        );
4122        self.vp.runner.write_vmcs64(
4123            self.vtl,
4124            VmcsField::VMX_VMCS_GUEST_SYSENTER_ESP_MSR,
4125            !0,
4126            sysenter_esp,
4127        );
4128
4129        self.vp.backing.vtls[self.vtl].msr_cstar = cstar;
4130
4131        Ok(())
4132    }

    fn debug_regs(&mut self) -> Result<vp::DebugRegisters, Self::Error> {
        let mut values = [0u64.into(); 5];
        self.vp
            .runner
            .get_vp_registers(
                self.vtl,
                &[
                    HvX64RegisterName::Dr0,
                    HvX64RegisterName::Dr1,
                    HvX64RegisterName::Dr2,
                    HvX64RegisterName::Dr3,
                    HvX64RegisterName::Dr6,
                ],
                &mut values,
            )
            .map_err(vp_state::Error::GetRegisters)?;

        // DR7 is a VMCS guest-state field, so read it directly from the VMCS
        // rather than through the register interface.
        let dr7 = self
            .vp
            .runner
            .read_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7);

        Ok(vp::DebugRegisters {
            dr0: values[0].as_u64(),
            dr1: values[1].as_u64(),
            dr2: values[2].as_u64(),
            dr3: values[3].as_u64(),
            dr6: values[4].as_u64(),
            dr7,
        })
    }

    fn set_debug_regs(&mut self, value: &vp::DebugRegisters) -> Result<(), Self::Error> {
        let &vp::DebugRegisters {
            dr0,
            dr1,
            dr2,
            dr3,
            dr6,
            dr7,
        } = value;
        self.vp
            .runner
            .set_vp_registers(
                self.vtl,
                [
                    (HvX64RegisterName::Dr0, dr0),
                    (HvX64RegisterName::Dr1, dr1),
                    (HvX64RegisterName::Dr2, dr2),
                    (HvX64RegisterName::Dr3, dr3),
                    (HvX64RegisterName::Dr6, dr6),
                ],
            )
            .map_err(vp_state::Error::SetRegisters)?;

        // As above, DR7 lives in the VMCS guest-state area.
        self.vp
            .runner
            .write_vmcs64(self.vtl, VmcsField::VMX_VMCS_GUEST_DR7, !0, dr7);

        Ok(())
    }

    fn tsc(&mut self) -> Result<vp::Tsc, Self::Error> {
        Err(vp_state::Error::Unimplemented("tsc"))
    }

    fn set_tsc(&mut self, _value: &vp::Tsc) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("tsc"))
    }

    fn tsc_aux(&mut self) -> Result<vp::TscAux, Self::Error> {
        Ok(vp::TscAux {
            value: self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux,
        })
    }

    fn set_tsc_aux(&mut self, value: &vp::TscAux) -> Result<(), Self::Error> {
        self.vp.backing.vtls[self.vtl].private_regs.msr_tsc_aux = value.value;
        Ok(())
    }

    fn cet(&mut self) -> Result<vp::Cet, Self::Error> {
        Err(vp_state::Error::Unimplemented("cet"))
    }

    fn set_cet(&mut self, _value: &vp::Cet) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("cet"))
    }

    fn cet_ss(&mut self) -> Result<vp::CetSs, Self::Error> {
        Err(vp_state::Error::Unimplemented("cet_ss"))
    }

    fn set_cet_ss(&mut self, _value: &vp::CetSs) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("cet_ss"))
    }

    fn synic_msrs(&mut self) -> Result<vp::SyntheticMsrs, Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_msrs"))
    }

    fn set_synic_msrs(&mut self, _value: &vp::SyntheticMsrs) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_msrs"))
    }

    fn synic_message_page(&mut self) -> Result<vp::SynicMessagePage, Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_message_page"))
    }

    fn set_synic_message_page(&mut self, _value: &vp::SynicMessagePage) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_message_page"))
    }

    fn synic_event_flags_page(&mut self) -> Result<vp::SynicEventFlagsPage, Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
    }

    fn set_synic_event_flags_page(
        &mut self,
        _value: &vp::SynicEventFlagsPage,
    ) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_event_flags_page"))
    }

    fn synic_message_queues(&mut self) -> Result<vp::SynicMessageQueues, Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_message_queues"))
    }

    fn set_synic_message_queues(
        &mut self,
        _value: &vp::SynicMessageQueues,
    ) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_message_queues"))
    }

    fn synic_timers(&mut self) -> Result<vp::SynicTimers, Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_timers"))
    }

    fn set_synic_timers(&mut self, _value: &vp::SynicTimers) -> Result<(), Self::Error> {
        Err(vp_state::Error::Unimplemented("synic_timers"))
    }
}

/// Compute the index of the highest vector set in IRR/ISR, or 0
/// if no vector is set. (Vectors 0-15 are invalid so this is not
/// ambiguous.)
fn top_vector(reg: &[ApicRegister; 8]) -> u8 {
    reg.iter()
        .enumerate()
        .rev()
        .find_map(|(i, r)| {
            (r.value != 0).then(|| (i as u32 * 32 + (31 - r.value.leading_zeros())) as u8)
        })
        .unwrap_or(0)
}
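
// Illustrative, hedged sketch: restates the `top_vector` scan over plain `u32`
// words so the index arithmetic is easy to verify without constructing
// `ApicRegister` values. The helper and module below exist only for this
// example.
#[cfg(test)]
mod top_vector_sketch {
    /// Mirror of `top_vector` operating on raw 32-bit IRR/ISR words.
    fn top_vector_raw(words: &[u32; 8]) -> u8 {
        words
            .iter()
            .enumerate()
            .rev()
            .find_map(|(i, &w)| {
                (w != 0).then(|| (i as u32 * 32 + (31 - w.leading_zeros())) as u8)
            })
            .unwrap_or(0)
    }

    #[test]
    fn highest_set_vector_wins() {
        let mut words = [0u32; 8];
        // Vector 0x31 (49) lives in word 1, bit 17: 49 = 1 * 32 + 17.
        words[1] = 1 << 17;
        // Vector 0x85 (133) lives in word 4, bit 5: 133 = 4 * 32 + 5.
        words[4] = 1 << 5;
        assert_eq!(top_vector_raw(&words), 0x85);
        // With nothing pending, the scan reports vector 0.
        assert_eq!(top_vector_raw(&[0; 8]), 0);
    }
}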

/// Newtype adapter implementing [`HypercallIo`] over the TDX guest GP
/// registers used for TDVMCALL-based hypercalls: R10 carries the hypercall
/// control (and is cleared on completion), R11 the hypercall result, RDX and
/// R8 the input/output GPAs (or the first fast register pair), and XMM0-XMM5
/// the remaining fast register pairs.
struct TdHypercall<'a, 'b, T>(UhHypercallHandler<'a, 'b, T, TdxBacked>);

impl<'a, 'b, T> AsHandler<UhHypercallHandler<'a, 'b, T, TdxBacked>> for TdHypercall<'a, 'b, T> {
    fn as_handler(&mut self) -> &mut UhHypercallHandler<'a, 'b, T, TdxBacked> {
        &mut self.0
    }
}

impl<T> HypercallIo for TdHypercall<'_, '_, T> {
    fn advance_ip(&mut self) {
        // Clear the hypercall control in R10 and advance past the 4-byte
        // TDCALL instruction that issued the hypercall.
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = 0;
        self.0.vp.backing.vtls[self.0.intercepted_vtl]
            .private_regs
            .rip = self.0.vp.backing.vtls[self.0.intercepted_vtl]
            .private_regs
            .rip
            .wrapping_add(4);
    }

    fn retry(&mut self, control: u64) {
        // Leave the instruction pointer alone and restore the control value in
        // R10 so the guest reissues the hypercall, reporting a timeout result.
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R10] = control;
        self.set_result(hvdef::hypercall::HypercallOutput::from(HvError::Timeout).into());
    }

    fn control(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R10]
    }

    fn input_gpa(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::RDX]
    }

    fn output_gpa(&mut self) -> u64 {
        self.0.vp.runner.tdx_enter_guest_gps()[TdxGp::R8]
    }

    fn fast_register_pair_count(&mut self) -> usize {
        // One pair in RDX/R8 plus six pairs in XMM0-XMM5.
        7
    }

    fn extended_fast_hypercalls_ok(&mut self) -> bool {
        false
    }

    fn fast_input(&mut self, buf: &mut [[u64; 2]], _output_register_pairs: usize) -> usize {
        self.fast_regs(0, buf);
        buf.len()
    }

    fn fast_output(&mut self, _starting_pair_index: usize, buf: &[[u64; 2]]) {
        assert!(buf.is_empty());
    }

    fn vtl_input(&mut self) -> u64 {
        unreachable!()
    }

    fn set_result(&mut self, n: u64) {
        self.0.vp.runner.tdx_enter_guest_gps_mut()[TdxGp::R11] = n;
    }

    fn fast_regs(&mut self, starting_pair_index: usize, buf: &mut [[u64; 2]]) {
        let regs = self.0.vp.runner.tdx_enter_guest_gps();
        let fx_state = self.0.vp.runner.fx_state();
        // Pair 0 is carried in RDX/R8; pair k > 0 comes from XMM[k - 1], split
        // into its low and high 64-bit halves.
        for (i, [low, high]) in buf.iter_mut().enumerate() {
            let index = i + starting_pair_index;
            if index == 0 {
                *low = regs[TdxGp::RDX];
                *high = regs[TdxGp::R8];
            } else {
                let value = u128::from_ne_bytes(fx_state.xmm[index - 1]);
                *low = value as u64;
                *high = (value >> 64) as u64;
            }
        }
    }
}
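
// Illustrative sketch of the fast-hypercall register-pair layout used by
// `fast_regs` above, expressed over plain arrays instead of the real TDX
// enter-guest GP state and FX state; the helper below is hypothetical and
// exists only for this example.
#[cfg(test)]
mod fast_regs_sketch {
    /// Pair 0 is carried in (RDX, R8); pair k > 0 is XMM[k - 1] split into its
    /// low and high 64-bit halves.
    fn fast_pair(rdx: u64, r8: u64, xmm: &[[u8; 16]; 6], index: usize) -> [u64; 2] {
        if index == 0 {
            [rdx, r8]
        } else {
            let value = u128::from_ne_bytes(xmm[index - 1]);
            [value as u64, (value >> 64) as u64]
        }
    }

    #[test]
    fn pairs_map_to_expected_sources() {
        let mut xmm = [[0u8; 16]; 6];
        // XMM0 supplies pair 1; store a recognizable 128-bit value in it.
        xmm[0] = ((0xdead_beef_u128 << 64) | 0x1234_5678).to_ne_bytes();
        assert_eq!(fast_pair(1, 2, &xmm, 0), [1, 2]);
        assert_eq!(fast_pair(1, 2, &xmm, 1), [0x1234_5678, 0xdead_beef]);
    }
}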

impl<T> hv1_hypercall::VtlSwitchOps for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn advance_ip(&mut self) {
        let long_mode = self.vp.long_mode(self.intercepted_vtl);
        let mut io = hv1_hypercall::X64RegisterIo::new(self, long_mode);
        io.advance_ip();
    }

    fn inject_invalid_opcode_fault(&mut self) {
        self.vp.backing.vtls[self.intercepted_vtl].interruption_information =
            InterruptionInformation::new()
                .with_valid(true)
                .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION)
                .with_vector(x86defs::Exception::INVALID_OPCODE.0);
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressList for UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn flush_virtual_address_list(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        hv1_hypercall::FlushVirtualAddressListEx::flush_virtual_address_list_ex(
            self,
            processor_set,
            flags,
            gva_ranges,
        )
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressListEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_list_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
        gva_ranges: &[HvGvaRange],
    ) -> HvRepResult {
        self.hcvm_validate_flush_inputs(processor_set, flags, true)
            .map_err(|e| (e, 0))?;

        let vtl = self.intercepted_vtl;
        let flush_state = &self.vp.shared.flush_state[vtl];

        // If we fail to add ranges to the list for any reason then promote this request to a flush entire.
        if let Err(()) = Self::add_ranges_to_tlb_flush_list(
            flush_state,
            gva_ranges,
            flags.use_extended_range_format(),
        ) {
            if flags.non_global_mappings_only() {
                flush_state
                    .flush_entire_non_global_counter
                    .fetch_add(1, Ordering::Relaxed);
            } else {
                flush_state
                    .flush_entire_counter
                    .fetch_add(1, Ordering::Relaxed);
            }
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpace
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        hv1_hypercall::FlushVirtualAddressSpaceEx::flush_virtual_address_space_ex(
            self,
            processor_set,
            flags,
        )
    }
}

impl<T: CpuIo> hv1_hypercall::FlushVirtualAddressSpaceEx
    for UhHypercallHandler<'_, '_, T, TdxBacked>
{
    fn flush_virtual_address_space_ex(
        &mut self,
        processor_set: ProcessorSet<'_>,
        flags: HvFlushFlags,
    ) -> hvdef::HvResult<()> {
        self.hcvm_validate_flush_inputs(processor_set, flags, false)?;
        let vtl = self.intercepted_vtl;

        let flush_state = &self.vp.shared.flush_state[vtl];

        // Set flush entire.
        if flags.non_global_mappings_only() {
            flush_state
                .flush_entire_non_global_counter
                .fetch_add(1, Ordering::Relaxed);
        } else {
            flush_state
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }

        // Send flush IPIs to the specified VPs.
        TdxTlbLockFlushAccess {
            vp_index: Some(self.vp.vp_index()),
            partition: self.vp.partition,
            shared: self.vp.shared,
        }
        .wake_processors_for_tlb_flush(vtl, (!flags.all_processors()).then_some(processor_set));

        // Mark that this VP needs to wait for all TLB locks to be released before returning.
        self.vp.set_wait_for_tlb_locks(vtl);

        Ok(())
    }
}

impl<T: CpuIo> UhHypercallHandler<'_, '_, T, TdxBacked> {
    fn add_ranges_to_tlb_flush_list(
        flush_state: &TdxPartitionFlushState,
        gva_ranges: &[HvGvaRange],
        use_extended_range_format: bool,
    ) -> Result<(), ()> {
        // If there are more GVAs than the list can hold, there's no point in filling the list.
        if gva_ranges.len() > FLUSH_GVA_LIST_SIZE {
            return Err(());
        }

        if use_extended_range_format
            && gva_ranges
                .iter()
                .any(|range| range.as_extended().large_page())
        {
            // TDX does not provide a way to flush large page ranges, so this
            // request must be promoted to a flush entire.
            return Err(());
        }

        flush_state
            .gva_list
            .write()
            .extend(gva_ranges.iter().copied());

        Ok(())
    }
}

impl TdxTlbLockFlushAccess<'_> {
    fn wake_processors_for_tlb_flush(
        &mut self,
        target_vtl: GuestVtl,
        processor_set: Option<ProcessorSet<'_>>,
    ) {
        match processor_set {
            Some(processors) => {
                self.wake_processors_for_tlb_flush_inner(target_vtl, processors);
            }
            None => self.wake_processors_for_tlb_flush_inner(
                target_vtl,
                0..(self.partition.vps.len() as u32),
            ),
        }
    }

    fn wake_processors_for_tlb_flush_inner(
        &mut self,
        target_vtl: GuestVtl,
        processors: impl IntoIterator<Item = u32>,
    ) {
        // Use SeqCst ordering to ensure that we are observing the most
        // up-to-date value from other VPs. Otherwise we might not send a
        // wake to a VP in a lower VTL, which could cause TLB lock holders
        // to be stuck waiting until the target_vp happens to switch into
        // VTL 2.
        // We use a single fence to avoid having to take a SeqCst load
        // for each VP.
        std::sync::atomic::fence(Ordering::SeqCst);
        self.partition.hcl.kick_cpus(
            processors.into_iter().filter(|&vp| {
                self.shared.active_vtl[vp as usize].load(Ordering::Relaxed) == target_vtl as u8
            }),
            true,
            true,
        );
    }
}

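/// Access to the TDX TLB flush machinery on behalf of an optional VP: bumps
/// the per-VTL flush counters in the shared state, wakes VPs currently running
/// in the targeted VTL, and, when a VP index is present, marks that VP as
/// needing to wait for TLB locks to be released.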
struct TdxTlbLockFlushAccess<'a> {
    vp_index: Option<VpIndex>,
    partition: &'a UhPartitionInner,
    shared: &'a TdxBackedShared,
}

impl TlbFlushLockAccess for TdxTlbLockFlushAccess<'_> {
    fn flush(&mut self, vtl: GuestVtl) {
        self.shared.flush_state[vtl]
            .flush_entire_counter
            .fetch_add(1, Ordering::Relaxed);

        self.wake_processors_for_tlb_flush(vtl, None);
        self.set_wait_for_tlb_locks(vtl);
    }

    fn flush_entire(&mut self) {
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.shared.flush_state[vtl]
                .flush_entire_counter
                .fetch_add(1, Ordering::Relaxed);
        }
        for vtl in [GuestVtl::Vtl0, GuestVtl::Vtl1] {
            self.wake_processors_for_tlb_flush(vtl, None);
            self.set_wait_for_tlb_locks(vtl);
        }
    }

    fn set_wait_for_tlb_locks(&mut self, vtl: GuestVtl) {
        if let Some(vp_index) = self.vp_index {
            hardware_cvm::tlb_lock::TlbLockAccess {
                vp_index,
                cvm_partition: &self.shared.cvm,
            }
            .set_wait_for_tlb_locks(vtl);
        }
    }
}

mod save_restore {
    use super::TdxBacked;
    use super::UhProcessor;
    use vmcore::save_restore::RestoreError;
    use vmcore::save_restore::SaveError;
    use vmcore::save_restore::SaveRestore;
    use vmcore::save_restore::SavedStateNotSupported;

    impl SaveRestore for UhProcessor<'_, TdxBacked> {
        type SavedState = SavedStateNotSupported;

        fn save(&mut self) -> Result<Self::SavedState, SaveError> {
            Err(SaveError::NotSupported)
        }

        fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
            match state {}
        }
    }
}