// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! VFIO-backed PCI device assignment for OpenVMM.
//!
//! This crate implements a `ChipsetDevice` that proxies PCI config space
//! and BAR MMIO accesses to a physical device opened via Linux VFIO. The device
//! appears as a standard PCIe endpoint to the guest. MSI-X table and PBA
//! accesses are intercepted and handled by a software emulator; all other BAR
//! MMIO regions are mapped directly into guest GPA space via a `MemoryMapper`,
//! allowing the guest to access device registers without VM exits. A
//! `MemoryMapper` is required for VFIO device assignment; mapping failures
//! during device initialization are fatal.
15#![cfg(target_os = "linux")]
16
17pub mod manager;
18pub mod resolver;
19
20use anyhow::Context as _;
21use chipset_device::ChipsetDevice;
22use chipset_device::io::IoResult;
23use chipset_device::mmio::MmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use guestmem::MappableGuestMemory;
26use guestmem::MemoryMapper;
27use inspect::Inspect;
28use inspect::InspectMut;
29use memory_range::MemoryRange;
30use pci_core::bar_mapping::BarMappings;
31use pci_core::capabilities::PciCapability;
32use pci_core::capabilities::msix::MsixEmulator;
33use pci_core::msi::MsiTarget;
34use pci_core::spec::cfg_space;
35use pci_core::spec::cfg_space::HeaderType00;
36use std::ops::Range;
37use std::os::unix::fs::FileExt;
38use std::sync::Arc;
39use virt::irqfd::IrqFd;
40use vmcore::device_state::ChangeDeviceState;
41use vmcore::save_restore::RestoreError;
42use vmcore::save_restore::SaveError;
43use vmcore::save_restore::SaveRestore;
44use vmcore::save_restore::SavedStateNotSupported;
45use vmcore::vm_task::VmTaskDriverSource;
46
/// VFIO BAR region information (offset and size within the device fd).
///
/// Captured once at init from `vfio_device.region_info()` and used to
/// compute pread/pwrite offsets and mmap offsets for BAR accesses.
#[derive(Debug, Clone, Copy, Inspect)]
struct VfioBarInfo {
    /// Offset within the VFIO device fd where this BAR region starts.
    #[inspect(hex)]
    pub vfio_offset: u64,
    /// Size of the BAR region in bytes.
    #[inspect(hex)]
    pub size: u64,
}
57
/// A direct mapping of a VFIO BAR sub-region into guest GPA space.
///
/// Created during device initialization for each mmappable sub-region
/// of a BAR. The `memory` handle is mapped/unmapped from guest GPA space
/// as the guest enables/disables MMIO via the PCI Command register.
#[derive(Inspect)]
struct BarDirectMap {
    /// Guest memory mapping handle backed by the VFIO device fd.
    #[inspect(skip)]
    memory: Box<dyn MappableGuestMemory>,
    /// BAR index this sub-region belongs to.
    bar_index: u8,
    /// The memory range within the BAR.
    bar_range: MemoryRange,
    /// The guest GPA range this sub-region is currently mapped at, or
    /// `None` if it is not currently mapped into guest GPA space.
    mapping: Option<MemoryRange>,
}
75
/// MSI-X emulation state, discovered from the physical device's capabilities.
#[derive(Inspect)]
struct MsixEmulationState {
    /// Software MSI-X table emulator (handles table entries, PBA,
    /// enable/disable state transitions, and irqfd route management).
    #[inspect(skip)]
    emulator: MsixEmulator,
    /// MSI-X PCI capability handler (shared state with emulator; used to
    /// forward config space writes so the emulator tracks enable/disable).
    #[inspect(skip)]
    capability: Box<dyn PciCapability>,
    /// Offset of the MSI-X capability in PCI config space.
    #[inspect(hex)]
    cap_offset: u16,
    /// Number of MSI-X vectors.
    vector_count: u16,
    /// BAR index containing the MSI-X table.
    table_bar: u8,
    /// Byte range of the MSI-X table within `table_bar`.
    #[inspect(with = r#"|x| format!("{:#x}-{:#x}", x.start, x.end)"#)]
    table_range: Range<u64>,
    /// BAR index containing the PBA.
    pba_bar: u8,
    /// Byte range of the PBA within `pba_bar`.
    #[inspect(with = r#"|x| format!("{:#x}-{:#x}", x.start, x.end)"#)]
    pba_range: Range<u64>,
    /// Whether MSI-X is currently enabled by the guest.
    enabled: bool,
}
103
/// A PCI device backed by a VFIO device file.
///
/// Config space reads/writes are proxied to the physical device via the VFIO
/// config region. BARs are cached locally so the guest can probe sizes without
/// hitting hardware on every access. MSI-X table and PBA MMIO accesses are
/// intercepted and handled by a software emulator; all other BAR MMIO is
/// proxied to the physical device via pread/pwrite on the VFIO device fd.
#[derive(InspectMut)]
pub(crate) struct VfioAssignedPciDevice {
    /// The PCI address string (e.g., "0000:01:00.0") for diagnostics.
    #[inspect(display)]
    pci_id: String,

    /// The VFIO device, used for config space, BAR MMIO, and MSI-X mapping.
    #[inspect(skip)]
    vfio_device: vfio_sys::Device,

    /// irqfd routing interface for registering eventfds with the hypervisor.
    #[inspect(skip)]
    irqfd: Arc<dyn IrqFd>,

    /// Offset into the VFIO device fd where the PCI config region starts.
    #[inspect(hex)]
    config_offset: u64,

    /// Size of the config space region.
    #[inspect(hex)]
    config_size: u64,

    /// BAR masks as read from the physical device (write 0xFFFFFFFF, read back).
    #[inspect(iter_by_index, hex)]
    bar_masks: [u32; 6],

    /// Current BAR values as seen by the guest (initialized to flags only;
    /// the guest programs the address bits).
    #[inspect(iter_by_index, hex)]
    bars: [u32; 6],

    /// Low bits of each BAR that encode type/prefetch flags.
    #[inspect(iter_by_index, hex)]
    bar_flags: [u32; 6],

    /// Current MMIO-enabled state (from PCI Command register bit 1).
    mmio_enabled: bool,

    /// Decoded BAR mappings when MMIO is enabled.
    active_bars: BarMappings,

    /// Chipset MMIO region controls per BAR — used to register/unregister
    /// the device's BAR address ranges with the chipset so MMIO accesses
    /// are routed to this device.
    #[inspect(skip)]
    bar_mmio_controls: [Option<Box<dyn chipset_device::mmio::ControlMmioIntercept>>; 6],

    /// VFIO region info per BAR for MMIO proxying via pread/pwrite.
    #[inspect(iter_by_index)]
    bar_regions: [Option<VfioBarInfo>; 6],

    /// MSI-X emulation state (None if device has no MSI-X capability).
    msix: Option<MsixEmulationState>,

    /// Whether the device supports VFIO_DEVICE_RESET (cached from device info
    /// flags at init).
    supports_reset: bool,

    /// Direct guest GPA mappings for BAR sub-regions that support mmap.
    /// When MMIO is enabled, these are mapped to guest GPA space for direct
    /// access without VM exits. When MMIO is disabled, they are unmapped.
    #[inspect(iter_by_index)]
    bar_direct_maps: Vec<BarDirectMap>,

    /// VFIO container/group binding. Keeps the container and group fds alive
    /// and notifies the container manager on drop.
    binding: manager::VfioDeviceBinding,
}
178
179impl VfioAssignedPciDevice {
    /// Create a new VFIO assigned PCI device.
    ///
    /// Reads BAR flags from config space and derives BAR masks from the VFIO
    /// region sizes (avoiding the write-all-ones probe cycle). Discovers MSI-X
    /// capability if present and creates a software emulator for it.
    ///
    /// Fails if the device cannot be opened, a BAR is PIO or has an invalid
    /// layout, or any direct BAR mapping cannot be created/mapped.
    pub async fn new(
        binding: manager::VfioDeviceBinding,
        pci_id: String,
        driver_source: &VmTaskDriverSource,
        register_mmio: &mut (dyn chipset_device::mmio::RegisterMmioIntercept + Send),
        msi_target: &MsiTarget,
        irqfd: Arc<dyn IrqFd>,
        memory_mapper: &dyn MemoryMapper,
    ) -> anyhow::Result<Self> {
        let driver = driver_source.simple();
        let retry = vfio_sys::VfioRetry::new(&driver, &pci_id);
        // Opening the device is retried when the error chain contains
        // ENODEV (see the predicate below); other errors fail immediately.
        let is_enodev = |e: &anyhow::Error| {
            e.chain().any(|cause| {
                cause
                    .downcast_ref::<nix::errno::Errno>()
                    .is_some_and(|e| *e == nix::errno::Errno::ENODEV)
            })
        };
        let vfio_device = retry
            .retry(
                || binding.group().open_device(&pci_id),
                &is_enodev,
                "open_device",
            )
            .await
            .with_context(|| format!("failed to open VFIO device {pci_id}"))?;

        let config_info = vfio_device
            .region_info(vfio_bindings::bindings::vfio::VFIO_PCI_CONFIG_REGION_INDEX)
            .context("failed to get VFIO config region info")?;

        let config_offset = config_info.offset;
        let config_size = config_info.size;

        // Read BAR values and derive masks from VFIO region sizes.
        // This avoids the standard write-all-ones probe cycle — VFIO already
        // knows the BAR sizes from the host kernel.
        let mut bar_masks = [0u32; 6];
        let mut bar_flags = [0u32; 6];

        let mut bars = [0u32; 6];
        for (i, bar) in bars.iter_mut().enumerate() {
            *bar = read_config_u32(
                vfio_device.as_ref(),
                config_offset,
                config_size,
                HeaderType00::BAR0.0 + (i as u16) * 4,
            )?;
        }

        let mut bar_regions = [None; 6];
        let mut bar_mmio_controls = [(); 6].map(|_| None);
        let mut bar_mmap_areas: [Vec<_>; 6] = Default::default();
        let mut processed = 0;
        // Walk BAR slots 0..6. A 64-bit BAR consumes two slots, so
        // `processed` advances by two in that case (see below).
        while processed < 6 {
            let i = processed;
            processed += 1;
            let Ok(info) = vfio_device.region_info(i as u32) else {
                continue;
            };
            if info.size == 0 {
                continue;
            }

            let flags = bars[i] & 0xf;
            bar_flags[i] = flags;
            let encoded = cfg_space::BarEncodingBits::from(flags);
            if encoded.use_pio() {
                anyhow::bail!("PIO BARs are not supported");
            }
            let is_64bit = encoded.type_64_bit();
            if is_64bit && i == 5 {
                anyhow::bail!("64-bit BAR at index 5 is invalid");
            }

            if !info.size.is_power_of_two() {
                anyhow::bail!("BAR size is not a power of two: {:#x}", info.size);
            }

            // Derive the mask from the VFIO region size. For a BAR of size N
            // (power of 2), the mask is ~(N - 1). Set the type_64_bit flag
            // so that BarMappings::parse correctly merges 64-bit BAR pairs.
            let mask64 = !(info.size - 1);
            bar_masks[i] = (mask64 as u32) | flags;
            if is_64bit {
                bar_masks[i + 1] = (mask64 >> 32) as u32;
                processed += 1;
            }

            bar_regions[i] = Some(VfioBarInfo {
                vfio_offset: info.offset,
                size: info.size,
            });

            bar_mmio_controls[i] = Some(register_mmio.new_io_region(&format!("bar{i}"), info.size));
            bar_mmap_areas[i] = vfio_device
                .region_mmap_areas(i as u32)
                .with_context(|| format!("failed to query VFIO mmap areas for BAR {i}"))?;
        }

        // Discover MSI-X capability from physical device config space.
        // This must happen BEFORE creating direct BAR mappings so we can
        // exclude the MSI-X table/PBA regions.
        let msix = discover_msix(vfio_device.as_ref(), config_offset, config_size, msi_target);

        // Cache whether the device supports VFIO_DEVICE_RESET so we can skip
        // the ioctl on every VM reset for devices that don't support it.
        let supports_reset = vfio_device
            .info()
            .map(|info| info.flags.reset())
            .unwrap_or(false);

        // If the device has MSI-X, remove the table and PBA regions from
        // the mmap areas so they remain trap-and-emulate.
        if let Some(msix) = &msix {
            subtract_msix_regions(&mut bar_mmap_areas, msix);
        }

        // Create direct BAR mappings for mmappable regions. Each
        // mmappable sub-region gets a guest memory mapping backed by the
        // VFIO device fd. These are mapped into guest GPA space when the
        // guest enables MMIO, allowing direct hardware access without VM
        // exits. Non-mmappable regions (e.g. MSI-X table/PBA) remain
        // trap-and-emulate.
        let mut bar_direct_maps = Vec::new();
        for (i, areas) in bar_mmap_areas.iter().enumerate() {
            let Some(region) = &bar_regions[i] else {
                continue;
            };
            for &area in areas {
                let name = format!("vfio-{pci_id}-bar{i}-{area}");
                let (memory, mapped_region) = memory_mapper
                    .new_region(area.len() as usize, name)
                    .with_context(|| {
                    format!("failed to create BAR {i} direct mapping region for {pci_id}")
                })?;
                mapped_region
                    .map(
                        0,
                        &vfio_device,
                        region.vfio_offset + area.start(),
                        area.len() as usize,
                        true,
                    )
                    .with_context(|| {
                        format!(
                            "failed to map VFIO BAR {i} region at offset {:#x}",
                            area.start()
                        )
                    })?;
                bar_direct_maps.push(BarDirectMap {
                    memory,
                    bar_index: i as u8,
                    bar_range: area,
                    mapping: None,
                });
            }
        }

        tracing::info!(
            pci_id = pci_id.as_str(),
            ?bar_masks,
            has_msix = msix.is_some(),
            supports_reset,
            "VFIO assigned PCI device initialized"
        );

        Ok(Self {
            pci_id,
            vfio_device,
            irqfd,
            config_offset,
            config_size,
            bar_masks,
            bars: bar_flags, // Ignore the current BAR values--we don't care what the device thinks the BARs are.
            bar_flags,
            mmio_enabled: false,
            active_bars: BarMappings::default(),
            bar_mmio_controls,
            bar_regions,
            msix,
            supports_reset,
            bar_direct_maps,
            binding,
        })
    }
371
372    fn read_phys_config(&self, offset: u16) -> u32 {
373        match read_config_u32(
374            self.vfio_device.as_ref(),
375            self.config_offset,
376            self.config_size,
377            offset,
378        ) {
379            Ok(value) => value,
380            Err(e) => {
381                tracelimit::warn_ratelimited!(
382                    offset,
383                    error = ?e,
384                    "VFIO config space read failed"
385                );
386                !0
387            }
388        }
389    }
390
391    fn write_phys_config(&self, offset: u16, value: u32) {
392        if let Err(e) = write_config_u32(
393            self.vfio_device.as_ref(),
394            self.config_offset,
395            self.config_size,
396            offset,
397            value,
398        ) {
399            tracelimit::warn_ratelimited!(
400                offset,
401                error = ?e,
402                "VFIO config space write failed"
403            );
404        }
405    }
406
407    /// Map a BAR + offset to an MsixEmulator offset, if the access falls
408    /// within the MSI-X table or PBA region.
409    fn msix_emulator_offset(&self, bar: u8, offset: u64) -> Option<u64> {
410        let msix = self.msix.as_ref()?;
411
412        // Check MSI-X table region.
413        if bar == msix.table_bar && msix.table_range.contains(&offset) {
414            // Emulator table starts at offset 0.
415            return Some(offset - msix.table_range.start);
416        }
417
418        // Check PBA region.
419        if bar == msix.pba_bar && msix.pba_range.contains(&offset) {
420            // In the emulator, PBA starts right after the table.
421            let emu_pba_start = msix.table_range.end - msix.table_range.start;
422            return Some(emu_pba_start + (offset - msix.pba_range.start));
423        }
424
425        None
426    }
427
    /// Set up irqfd-backed MSI-X interrupt delivery when the guest enables MSI-X.
    ///
    /// Tells the emulator to create irqfd routes and passes the resulting
    /// events to VFIO so the physical device signals them on interrupt.
    ///
    /// Fails if the vector count exceeds the VFIO per-call limit or if
    /// `map_msix` fails.
    fn msix_enable(&mut self) -> anyhow::Result<()> {
        let msix = self.msix.as_mut().expect("msix must be present");
        let count = msix.vector_count;

        // VFIO map_msix has a hard limit of 256 eventfds per call.
        anyhow::ensure!(
            count <= 256,
            "MSI-X vector count ({count}) exceeds VFIO limit of 256"
        );

        // Borrow `vfio_device` as its own local so the closure captures
        // just that field rather than all of `self` (which is already
        // mutably borrowed through `msix`).
        let vfio_device = &self.vfio_device;
        msix.emulator.enable_irqfd(self.irqfd.as_ref(), |events| {
            vfio_device
                .map_msix(0, events)
                .context("VFIO map_msix failed")
        })?;

        tracing::info!(
            count,
            pci_id = self.pci_id.as_str(),
            "MSI-X enabled: mapped vectors to irqfd routes"
        );
        Ok(())
    }
456
457    /// Tear down VFIO MSI-X eventfd mapping when the guest disables MSI-X.
458    fn msix_disable(&mut self) {
459        let msix = self.msix.as_mut().expect("msix must be present");
460        let count = msix.vector_count;
461
462        if let Err(e) = self.vfio_device.unmap_msix(0, count as u32) {
463            tracing::warn!(
464                error = ?e,
465                pci_id = self.pci_id.as_str(),
466                "VFIO unmap_msix failed"
467            );
468        }
469
470        msix.emulator.disable_irqfd();
471        tracing::info!(
472            pci_id = self.pci_id.as_str(),
473            "MSI-X disabled: unmapped vectors"
474        );
475    }
476
477    /// Re-evaluate BAR mappings against the current BAR register values.
478    ///
479    /// Diffs the old and new decoded addresses and only unmaps/remaps BARs
480    /// whose address actually changed. When MMIO is disabled, all BARs are
481    /// treated as unmapped so the diff naturally tears everything down.
482    fn update_bar_mappings(&mut self) {
483        let new_bars = if self.mmio_enabled {
484            BarMappings::parse(&self.bars, &self.bar_masks)
485        } else {
486            BarMappings::default()
487        };
488
489        // For each BAR that had a mapping, check if its address changed.
490        // Unmap any that moved or disappeared.
491        for old in self.active_bars.iter() {
492            let new_addr = new_bars.get(old.index);
493            if new_addr == Some(old.base_address) {
494                continue;
495            }
496            // Address changed or BAR disappeared — tear down old mapping.
497            if let Some(control) = self.bar_mmio_controls[old.index as usize].as_mut() {
498                control.unmap();
499            }
500            for dm in &mut self.bar_direct_maps {
501                if dm.bar_index == old.index {
502                    dm.memory.unmap_from_guest();
503                    dm.mapping = None;
504                }
505            }
506        }
507
508        // For each BAR in the new set, map any that are new or moved.
509        for new in new_bars.iter() {
510            let old_addr = self.active_bars.get(new.index);
511            if old_addr == Some(new.base_address) {
512                continue;
513            }
514            // New or moved — set up mapping.
515            self.bar_mmio_controls[new.index as usize]
516                .as_mut()
517                .expect("BAR MMIO control must be present")
518                .map(new.base_address);
519
520            for dm in &mut self.bar_direct_maps {
521                if dm.bar_index == new.index {
522                    let gpa = new.base_address + dm.bar_range.start();
523                    match dm.memory.map_to_guest(gpa, true) {
524                        Ok(()) => {
525                            dm.mapping = Some(MemoryRange::new(gpa..gpa + dm.bar_range.len()));
526                        }
527                        Err(e) => {
528                            tracelimit::error_ratelimited!(
529                                bar = dm.bar_index,
530                                gpa,
531                                error = ?e,
532                                pci_id = self.pci_id.as_str(),
533                                "failed to direct-map BAR region to guest"
534                            );
535                        }
536                    }
537                }
538            }
539        }
540
541        self.active_bars = new_bars;
542    }
543}
544
545fn read_config_u32(
546    file: &std::fs::File,
547    config_offset: u64,
548    config_size: u64,
549    offset: u16,
550) -> anyhow::Result<u32> {
551    if (offset as u64) + 4 > config_size {
552        anyhow::bail!("config read offset {offset:#x} out of range");
553    }
554    let mut buf = [0u8; 4];
555    let n = file
556        .read_at(&mut buf, config_offset + offset as u64)
557        .with_context(|| format!("failed to read config at offset {offset:#x}"))?;
558    anyhow::ensure!(
559        n == 4,
560        "short config read at offset {offset:#x}: got {n} bytes"
561    );
562    // VFIO config space reads return host-endian bytes on x86. Using
563    // native endian is correct on LE platforms (x86, aarch64).
564    Ok(u32::from_ne_bytes(buf))
565}
566
567fn write_config_u32(
568    file: &std::fs::File,
569    config_offset: u64,
570    config_size: u64,
571    offset: u16,
572    value: u32,
573) -> anyhow::Result<()> {
574    if (offset as u64) + 4 > config_size {
575        anyhow::bail!("config write offset {offset:#x} out of range");
576    }
577    let n = file.write_at(&value.to_ne_bytes(), config_offset + offset as u64)?;
578    anyhow::ensure!(
579        n == 4,
580        "short config write at offset {offset:#x}: wrote {n} bytes"
581    );
582    Ok(())
583}
584
/// Remove MSI-X table and PBA regions from the mmap areas for the
/// corresponding BARs. This ensures those regions are NOT direct-mapped
/// and remain trap-and-emulate so the software MSI-X emulator can
/// intercept accesses.
///
/// Each mmap area that overlaps with the MSI-X table or PBA is split
/// into up to two non-overlapping areas (before and after the excluded
/// region). The exclusion zone is expanded to page boundaries since
/// the resulting areas must be page-aligned for mmap.
fn subtract_msix_regions(bar_mmap_areas: &mut [Vec<MemoryRange>; 6], msix: &MsixEmulationState) {
    let page_size = page_size();

    for (i, area) in bar_mmap_areas.iter_mut().enumerate() {
        let i = i as u8;
        // Skip BARs that have no mmap areas or that host neither the
        // table nor the PBA.
        if area.is_empty() || (msix.table_bar != i && msix.pba_bar != i) {
            continue;
        }
        // Put the areas in order before the subtraction below.
        area.sort();
        // Subtract the (page-aligned) table exclusion zone, then the
        // (page-aligned) PBA exclusion zone, each only if it lives in
        // this BAR.
        *area = memory_range::subtract_ranges(
            memory_range::subtract_ranges(
                area.iter().copied(),
                if msix.table_bar == i {
                    Some(MemoryRange::bounding_aligned(
                        msix.table_range.clone(),
                        page_size,
                    ))
                } else {
                    None
                },
            ),
            if msix.pba_bar == i {
                Some(MemoryRange::bounding_aligned(
                    msix.pba_range.clone(),
                    page_size,
                ))
            } else {
                None
            },
        )
        .collect();
    }
}
627
/// Host page size, used to expand MSI-X exclusion zones to mmap-able
/// page boundaries.
fn page_size() -> u64 {
    vfio_sys::host_page_size()
}
631
/// Walk the PCI capabilities list to find an MSI-X capability. If found,
/// create an [`MsixEmulator`] and return the discovery info.
///
/// Returns `None` if the device has no MSI-X capability, if a config
/// space read fails during the walk, or if the capability list appears
/// malformed (too many entries).
fn discover_msix(
    device_file: &std::fs::File,
    config_offset: u64,
    config_size: u64,
    msi_target: &MsiTarget,
) -> Option<MsixEmulationState> {
    // Read the Capabilities Pointer. Bottom 2 bits are reserved per PCI spec §6.7.
    let cap_ptr_dword = read_config_u32(
        device_file,
        config_offset,
        config_size,
        HeaderType00::RESERVED_CAP_PTR.0,
    )
    .ok()?;
    let mut cap_ptr = (cap_ptr_dword & 0xFC) as u16; // mask off reserved bits [1:0]
    let mut iterations = 0usize;

    while cap_ptr != 0 {
        // Guard against malformed capability lists (cycles or excessive length).
        // PCI config space is 256 bytes; capabilities are at least 4 bytes each.
        const MAX_CAPS: usize = 48;
        if iterations >= MAX_CAPS {
            tracing::warn!("PCI capability list exceeded {MAX_CAPS} entries, aborting walk");
            return None;
        }
        iterations += 1;

        let header = read_config_u32(device_file, config_offset, config_size, cap_ptr).ok()?;
        let cap_id = (header & 0xFF) as u8;
        // Next pointer is bits [15:8]; its low 2 bits are reserved, so
        // mask them off like the initial pointer above.
        let next_ptr = ((header >> 8) & 0xFC) as u16;

        if cap_id == pci_core::spec::caps::CapabilityId::MSIX.0 {
            // Message Control is in the upper 16 bits of the first DWORD.
            let msg_ctrl = (header >> 16) as u16;
            // Table Size field (bits [10:0]) is N-1 encoded.
            let table_count = (msg_ctrl & 0x7FF) + 1;

            // Table Offset/BIR (second DWORD of the capability).
            let table_dword =
                read_config_u32(device_file, config_offset, config_size, cap_ptr + 4).ok()?;
            let table_bir = (table_dword & 0x7) as u8;
            let table_offset = table_dword & !0x7;

            // PBA Offset/BIR (third DWORD of the capability).
            let pba_dword =
                read_config_u32(device_file, config_offset, config_size, cap_ptr + 8).ok()?;
            let pba_bir = (pba_dword & 0x7) as u8;
            let pba_offset = pba_dword & !0x7;

            let table_size = table_count as u64 * 16; // MSI-X entry size
            // PBA: one bit per vector, rounded up to QWORD boundary.
            let pba_size = (table_count as u64).div_ceil(64) * 8;

            let (emulator, msix_cap) = MsixEmulator::new(table_bir, table_count, msi_target);

            tracing::info!(
                table_count,
                table_bir,
                table_offset,
                pba_bir,
                pba_offset,
                cap_offset = cap_ptr,
                "discovered MSI-X capability"
            );

            return Some(MsixEmulationState {
                emulator,
                capability: Box::new(msix_cap),
                cap_offset: cap_ptr,
                vector_count: table_count,
                table_bar: table_bir,
                table_range: table_offset as u64..table_offset as u64 + table_size,
                pba_bar: pba_bir,
                pba_range: pba_offset as u64..pba_offset as u64 + pba_size,
                enabled: false,
            });
        }

        cap_ptr = next_ptr;
    }

    None
}
716
717/// Read from the MSI-X emulator at the given offset, handling sub-DWORD
718/// accesses by aligning to u32 boundaries.
719fn read_msix_emulator(emulator: &MsixEmulator, offset: u64, data: &mut [u8]) {
720    let aligned = offset & !3;
721    let shift = (offset & 3) as usize;
722    let val = emulator.read_u32(aligned);
723    let bytes = val.to_le_bytes();
724    let first_chunk = data.len().min(4 - shift);
725    data[..first_chunk].copy_from_slice(&bytes[shift..shift + first_chunk]);
726
727    // Handle reads that span a u32 boundary.
728    if first_chunk < data.len() {
729        let next_val = emulator.read_u32(aligned + 4);
730        let next_bytes = next_val.to_le_bytes();
731        let remaining = data.len() - first_chunk;
732        data[first_chunk..first_chunk + remaining].copy_from_slice(&next_bytes[..remaining]);
733    }
734}
735
736/// Write to the MSI-X emulator at the given offset, handling sub-DWORD
737/// accesses via read-modify-write.
738fn write_msix_emulator(emulator: &mut MsixEmulator, offset: u64, data: &[u8]) {
739    let aligned = offset & !3;
740    let shift = (offset & 3) as usize;
741    let first_chunk = data.len().min(4 - shift);
742
743    if first_chunk == 4 && shift == 0 {
744        // Fast path: aligned u32 write.
745        let val = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
746        emulator.write_u32(aligned, val);
747    } else {
748        // Read-modify-write for sub-DWORD access.
749        let mut current = emulator.read_u32(aligned).to_le_bytes();
750        current[shift..shift + first_chunk].copy_from_slice(&data[..first_chunk]);
751        emulator.write_u32(aligned, u32::from_le_bytes(current));
752    }
753
754    // Handle writes that span a u32 boundary.
755    if first_chunk < data.len() {
756        let remaining = data.len() - first_chunk;
757        let mut next = emulator.read_u32(aligned + 4).to_le_bytes();
758        next[..remaining].copy_from_slice(&data[first_chunk..]);
759        emulator.write_u32(aligned + 4, u32::from_le_bytes(next));
760    }
761}
762
impl ChangeDeviceState for VfioAssignedPciDevice {
    fn start(&mut self) {}

    async fn stop(&mut self) {}

    /// Reset device state to power-on defaults: tear down MSI-X routes,
    /// disable MMIO decode (unmapping all BARs), reset cached BAR values,
    /// and issue a VFIO device reset when supported.
    async fn reset(&mut self) {
        // Tear down MSI-X irqfd routes before resetting state.
        if self.msix.as_ref().is_some_and(|m| m.enabled) {
            self.msix_disable();
        }

        self.mmio_enabled = false;
        self.update_bar_mappings();

        // Destructure to ensure every field is explicitly considered for reset.
        let Self {
            pci_id,
            vfio_device,
            irqfd: _,         // handle — no reset needed
            config_offset: _, // immutable device geometry
            config_size: _,   // immutable device geometry
            bar_masks: _,     // immutable device geometry
            bars,
            bar_flags,
            mmio_enabled: _,      // handled above
            active_bars: _,       // handled by update_bar_mappings()
            bar_mmio_controls: _, // handled by update_bar_mappings()
            bar_direct_maps: _,   // handled by update_bar_mappings()
            bar_regions: _,       // immutable device geometry
            msix,
            supports_reset,
            binding: _, // lifetime handle — no reset needed
        } = self;

        // Reset emulated MSI-X table and capability to power-on defaults
        // (all vectors masked, address/data zeroed). The capability and
        // emulator share state via Arc<Mutex>.
        if let Some(msix) = msix {
            msix.enabled = false;
            msix.capability.reset();
        }

        // Reset cached BAR addresses to power-on defaults (flags only, no
        // address bits). The guest will re-probe and re-program BARs.
        *bars = *bar_flags;

        // Reset the physical device via VFIO so it starts in a clean state.
        if *supports_reset {
            if let Err(err) = vfio_device.reset() {
                tracing::warn!(
                    pci_id = pci_id.as_str(),
                    error = err.as_ref() as &dyn std::error::Error,
                    "failed to reset VFIO device"
                );
            }
        }
    }
}
821
822impl ChipsetDevice for VfioAssignedPciDevice {
823    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
824        Some(self)
825    }
826
827    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
828        Some(self)
829    }
830}
831
impl PciConfigSpace for VfioAssignedPciDevice {
    /// Reads a config-space DWORD: BARs come from the local cache, the
    /// MSI-X control DWORD is merged from hardware and the emulator, and
    /// everything else is proxied to the physical device.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        *value = match HeaderType00(offset) {
            // BAR registers: return locally cached values.
            HeaderType00::BAR0
            | HeaderType00::BAR1
            | HeaderType00::BAR2
            | HeaderType00::BAR3
            | HeaderType00::BAR4
            | HeaderType00::BAR5 => {
                let i = (offset - HeaderType00::BAR0.0) as usize / 4;
                self.bars[i]
            }
            // MSI-X capability first DWORD: merge hardware ID/NextPtr (low
            // 16 bits) with emulator's Message Control (high 16 bits). The
            // emulator tracks the enable/function-mask bits; the hardware
            // provides the correct capability ID and Next Pointer so the
            // capability chain remains intact.
            offset if self.msix.as_ref().is_some_and(|m| offset.0 == m.cap_offset) => {
                // unwrap is safe: the guard above established msix is Some.
                let msix = self.msix.as_ref().unwrap();
                let hw = self.read_phys_config(offset.0);
                let emu = msix.capability.read_u32(0);
                // Low 16 bits from hardware (cap ID + next ptr),
                // high 16 bits from emulator (message control).
                (hw & 0xFFFF) | (emu & 0xFFFF0000)
            }
            // Everything else: read from physical device.
            _ => self.read_phys_config(offset),
        };

        IoResult::Ok
    }

    /// Writes a config-space DWORD, intercepting the command register
    /// (MMIO enable tracking), BARs (cached locally), and the MSI-X
    /// control DWORD (software emulator); all other writes pass through
    /// to the physical device.
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        match HeaderType00(offset) {
            // Command register: track MMIO enable/disable.
            HeaderType00::STATUS_COMMAND => {
                let command = cfg_space::Command::from_bits(value as u16);
                let new_mmio_enabled = command.mmio_enabled();

                if new_mmio_enabled != self.mmio_enabled {
                    self.mmio_enabled = new_mmio_enabled;
                    self.update_bar_mappings();
                    tracing::debug!(
                        pci_id = self.pci_id.as_str(),
                        enabled = new_mmio_enabled,
                        "MMIO state changed by guest"
                    );
                }

                // Still forward the full command register to hardware.
                self.write_phys_config(offset, value);
            }
            // BAR registers: mask and cache locally. If MMIO is active,
            // re-evaluate mappings so the device responds at the new address
            // immediately (matching real hardware behavior).
            HeaderType00::BAR0
            | HeaderType00::BAR1
            | HeaderType00::BAR2
            | HeaderType00::BAR3
            | HeaderType00::BAR4
            | HeaderType00::BAR5 => {
                let i = (offset - HeaderType00::BAR0.0) as usize / 4;
                // Size mask keeps only relocatable address bits; flags
                // (memory type, prefetchable, etc.) are reapplied verbatim.
                self.bars[i] = (value & self.bar_masks[i]) | self.bar_flags[i];

                if self.mmio_enabled {
                    self.update_bar_mappings();
                }
            }
            // All other registers: pass through to physical device.
            _ => {
                // Intercept MSI-X capability writes to track enable/disable
                // state in the software emulator. Do NOT forward the MSI-X
                // control register to hardware via write_phys_config — VFIO
                // manages the hardware MSI-X enable bit internally via
                // VFIO_DEVICE_SET_IRQS. Writing it again through config space
                // causes VFIO to tear down and re-setup MSI-X, losing the
                // eventfd associations.
                if let Some(msix) = &mut self.msix {
                    if offset == msix.cap_offset {
                        // Bit 31 of the DWORD is bit 15 (MSI-X Enable) of
                        // the Message Control word at cap offset + 2.
                        let new_enabled = value & 0x8000_0000 != 0;
                        let was_enabled = msix.enabled;

                        if new_enabled && !was_enabled {
                            // Install irqfd routes BEFORE writing the
                            // capability, so that when the capability
                            // processes the enable transition it can call
                            // set_msi() on the already-installed routes.
                            match self.msix_enable() {
                                Ok(()) => {
                                    // Re-borrow: msix_enable() took &mut self,
                                    // ending the earlier `msix` borrow.
                                    let msix = self.msix.as_mut().unwrap();
                                    msix.capability.write_u32(0, value);
                                    msix.enabled = true;
                                }
                                Err(e) => {
                                    // On failure neither the capability nor
                                    // `enabled` is updated, so a later guest
                                    // write can retry the enable transition.
                                    tracing::error!(
                                        error = ?e,
                                        pci_id = self.pci_id.as_str(),
                                        "failed to enable MSI-X"
                                    );
                                }
                            }
                        } else if was_enabled && !new_enabled {
                            // Write capability first to disable vectors,
                            // then tear down VFIO mapping.
                            msix.capability.write_u32(0, value);
                            self.msix_disable();
                            self.msix.as_mut().unwrap().enabled = false;
                        } else {
                            // No enable/disable transition — just forward.
                            msix.capability.write_u32(0, value);
                        }
                        // Skip write_phys_config for MSI-X control register.
                        return IoResult::Ok;
                    }
                }
                self.write_phys_config(offset, value);
            }
        }

        IoResult::Ok
    }
}
954
955impl MmioIntercept for VfioAssignedPciDevice {
956    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
957        if let Some((bar, offset)) = self.active_bars.find(addr) {
958            // Check if this access falls in the MSI-X table or PBA.
959            if let Some(emu_offset) = self.msix_emulator_offset(bar, offset) {
960                let msix = self.msix.as_ref().expect("msix must be present");
961                read_msix_emulator(&msix.emulator, emu_offset, data);
962                return IoResult::Ok;
963            }
964
965            // Proxy to physical device BAR via pread.
966            if let Some(region) = &self.bar_regions[bar as usize] {
967                if offset + data.len() as u64 <= region.size {
968                    match self
969                        .vfio_device
970                        .as_ref()
971                        .read_at(data, region.vfio_offset + offset)
972                    {
973                        Ok(n) if n == data.len() => return IoResult::Ok,
974                        Ok(n) => {
975                            tracelimit::warn_ratelimited!(
976                                bar,
977                                offset,
978                                expected = data.len(),
979                                actual = n,
980                                "VFIO BAR short read"
981                            );
982                        }
983                        Err(_) => {}
984                    }
985                }
986                tracelimit::warn_ratelimited!(
987                    bar,
988                    offset,
989                    len = data.len(),
990                    pci_id = self.pci_id.as_str(),
991                    "VFIO BAR read failed or out of range"
992                );
993            }
994        }
995        data.fill(!0);
996        IoResult::Ok
997    }
998
999    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
1000        if let Some((bar, offset)) = self.active_bars.find(addr) {
1001            // Check if this access falls in the MSI-X table or PBA.
1002            if let Some(emu_offset) = self.msix_emulator_offset(bar, offset) {
1003                let msix = self.msix.as_mut().expect("msix must be present");
1004                write_msix_emulator(&mut msix.emulator, emu_offset, data);
1005                return IoResult::Ok;
1006            }
1007
1008            // Proxy to physical device BAR via pwrite.
1009            if let Some(region) = &self.bar_regions[bar as usize] {
1010                if offset + data.len() as u64 <= region.size {
1011                    match self
1012                        .vfio_device
1013                        .as_ref()
1014                        .write_at(data, region.vfio_offset + offset)
1015                    {
1016                        Ok(n) if n == data.len() => return IoResult::Ok,
1017                        Ok(n) => {
1018                            tracelimit::warn_ratelimited!(
1019                                bar,
1020                                offset,
1021                                expected = data.len(),
1022                                actual = n,
1023                                pci_id = self.pci_id.as_str(),
1024                                "VFIO BAR short write"
1025                            );
1026                        }
1027                        Err(e) => {
1028                            tracelimit::warn_ratelimited!(
1029                                bar,
1030                                offset,
1031                                error = ?e,
1032                                pci_id = self.pci_id.as_str(),
1033                                "VFIO BAR write failed"
1034                            );
1035                        }
1036                    }
1037                    return IoResult::Ok;
1038                }
1039                tracelimit::warn_ratelimited!(
1040                    bar,
1041                    offset,
1042                    len = data.len(),
1043                    pci_id = self.pci_id.as_str(),
1044                    "VFIO BAR write out of range"
1045                );
1046            }
1047        }
1048        IoResult::Ok
1049    }
1050}
1051
impl SaveRestore for VfioAssignedPciDevice {
    // Save/restore is not yet supported for VFIO-assigned devices.
    type SavedState = SavedStateNotSupported;

    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        // TODO
        Err(SaveError::NotSupported)
    }

    fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
        // `SavedStateNotSupported` is uninhabited, so this can never be
        // called with a value; the empty match proves that statically.
        match state {}
    }
}