vfio_sys/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4#![expect(missing_docs)]
5#![cfg(unix)]
6// UNSAFETY: Manual memory management with mmap and vfio ioctls.
7#![expect(unsafe_code)]
8
9use anyhow::Context;
10use bitfield_struct::bitfield;
11use libc::c_void;
12use pal_async::driver::Driver;
13use pal_async::timer::PolledTimer;
14use std::ffi::CString;
15use std::fs;
16use std::fs::File;
17use std::io::BufRead;
18use std::io::BufReader;
19use std::os::unix::prelude::*;
20use std::path::Path;
21use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_ACTION_TRIGGER;
22use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_EVENTFD;
23use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_NONE;
24use vfio_bindings::bindings::vfio::VFIO_PCI_MSIX_IRQ_INDEX;
25use vfio_bindings::bindings::vfio::vfio_device_info;
26use vfio_bindings::bindings::vfio::vfio_group_status;
27use vfio_bindings::bindings::vfio::vfio_irq_info;
28use vfio_bindings::bindings::vfio::vfio_irq_set;
29use vfio_bindings::bindings::vfio::vfio_region_info;
30
31mod ioctl {
32    use nix::request_code_none;
33    use std::os::raw::c_char;
34    use std::os::raw::c_int;
35    use vfio_bindings::bindings::vfio::VFIO_BASE;
36    use vfio_bindings::bindings::vfio::VFIO_TYPE;
37    use vfio_bindings::bindings::vfio::vfio_device_info;
38    use vfio_bindings::bindings::vfio::vfio_group_status;
39    use vfio_bindings::bindings::vfio::vfio_irq_info;
40    use vfio_bindings::bindings::vfio::vfio_irq_set;
41    use vfio_bindings::bindings::vfio::vfio_region_info;
42
43    const VFIO_PRIVATE_BASE: u32 = 200;
44
45    nix::ioctl_write_int_bad!(vfio_set_iommu, request_code_none!(VFIO_TYPE, VFIO_BASE + 2));
46    nix::ioctl_read_bad!(
47        vfio_group_get_status,
48        request_code_none!(VFIO_TYPE, VFIO_BASE + 3),
49        vfio_group_status
50    );
51    nix::ioctl_write_ptr_bad!(
52        vfio_group_set_container,
53        request_code_none!(VFIO_TYPE, VFIO_BASE + 4),
54        c_int
55    );
56    nix::ioctl_write_ptr_bad!(
57        vfio_group_get_device_fd,
58        request_code_none!(VFIO_TYPE, VFIO_BASE + 6),
59        c_char
60    );
61    nix::ioctl_read_bad!(
62        vfio_device_get_info,
63        request_code_none!(VFIO_TYPE, VFIO_BASE + 7),
64        vfio_device_info
65    );
66    nix::ioctl_readwrite_bad!(
67        vfio_device_get_region_info,
68        request_code_none!(VFIO_TYPE, VFIO_BASE + 8),
69        vfio_region_info
70    );
71    nix::ioctl_readwrite_bad!(
72        vfio_device_get_irq_info,
73        request_code_none!(VFIO_TYPE, VFIO_BASE + 9),
74        vfio_irq_info
75    );
76    nix::ioctl_write_ptr_bad!(
77        vfio_device_set_irqs,
78        request_code_none!(VFIO_TYPE, VFIO_BASE + 10),
79        vfio_irq_set
80    );
81    nix::ioctl_write_ptr_bad!(
82        vfio_group_set_keep_alive,
83        request_code_none!(VFIO_TYPE, VFIO_PRIVATE_BASE),
84        c_char
85    );
86}
87
88pub struct Container {
89    file: File,
90}
91
92impl Container {
93    pub fn new() -> anyhow::Result<Self> {
94        let file = fs::OpenOptions::new()
95            .read(true)
96            .write(true)
97            .open("/dev/vfio/vfio")
98            .context("failed to open /dev/vfio/vfio")?;
99
100        Ok(Self { file })
101    }
102
103    pub fn set_iommu(&self, iommu: IommuType) -> anyhow::Result<()> {
104        // SAFETY: The file descriptor is valid.
105        unsafe {
106            ioctl::vfio_set_iommu(self.file.as_raw_fd(), iommu as i32)
107                .context("failed to set iommu")?;
108        }
109        Ok(())
110    }
111}
112
113#[repr(u32)]
114pub enum IommuType {
115    NoIommu = vfio_bindings::bindings::vfio::VFIO_NOIOMMU_IOMMU,
116}
117
118pub struct Group {
119    file: File,
120}
121
122impl Group {
123    pub fn open(group: u64) -> anyhow::Result<Self> {
124        Self::open_path(format!("/dev/vfio/{group}").as_ref())
125    }
126
127    pub fn open_noiommu(group: u64) -> anyhow::Result<Self> {
128        Self::open_path(format!("/dev/vfio/noiommu-{group}").as_ref())
129    }
130
131    fn open_path(group: &Path) -> anyhow::Result<Self> {
132        let file = fs::OpenOptions::new()
133            .read(true)
134            .write(true)
135            .open(group)
136            .with_context(|| format!("failed to open group {}", group.display()))?;
137
138        Ok(Self { file })
139    }
140
141    pub fn find_group_for_device(device_sysfs_path: &Path) -> anyhow::Result<u64> {
142        let group = device_sysfs_path.join("iommu_group");
143        let group = fs::read_link(group).context("failed to read iommu group")?;
144        let group: u64 = group
145            .file_name()
146            .and_then(|s| s.to_str())
147            .context("invalid group link")?
148            .parse()
149            .context("failed to parse iommu group")?;
150
151        Ok(group)
152    }
153
154    pub async fn open_device(
155        &self,
156        device_id: &str,
157        driver: &(impl ?Sized + Driver),
158    ) -> anyhow::Result<Device> {
159        let id = CString::new(device_id)?;
160        // SAFETY: The file descriptor is valid and the string is null-terminated.
161        let file = unsafe {
162            let fd = ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr());
163            // There is a small race window in the 6.1 kernel between when the
164            // vfio device is visible to userspace, and when it is added to its
165            // internal list. Try one more time on ENODEV failure after a brief
166            // sleep.
167            let fd = match fd {
168                Err(nix::errno::Errno::ENODEV) => {
169                    tracing::warn!(pci_id = device_id, "Retrying vfio open_device after delay");
170                    PolledTimer::new(driver)
171                        .sleep(std::time::Duration::from_millis(250))
172                        .await;
173                    ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr())
174                }
175                _ => fd,
176            };
177            let fd = fd.with_context(|| format!("failed to get device fd for {device_id}"))?;
178            File::from_raw_fd(fd)
179        };
180
181        Ok(Device { file })
182    }
183
184    pub fn set_container(&self, container: &Container) -> anyhow::Result<()> {
185        // SAFETY: The file descriptors are valid.
186        unsafe {
187            ioctl::vfio_group_set_container(self.file.as_raw_fd(), &container.file.as_raw_fd())
188                .context("failed to set container")?;
189        }
190        Ok(())
191    }
192
193    pub fn status(&self) -> anyhow::Result<GroupStatus> {
194        let mut status = vfio_group_status {
195            argsz: size_of::<vfio_group_status>() as u32,
196            flags: 0,
197        };
198        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
199        unsafe {
200            ioctl::vfio_group_get_status(self.file.as_raw_fd(), &mut status)
201                .context("failed to get group status")?;
202        };
203        Ok(GroupStatus::from(status.flags))
204    }
205
206    /// Skip VFIO device reset when kernel is reloaded during servicing.
207    /// This feature is non-upstream version of our kernel and will be
208    /// eventually replaced with iommufd.
209    pub async fn set_keep_alive(
210        &self,
211        device_id: &str,
212        driver: &(impl ?Sized + Driver),
213    ) -> anyhow::Result<()> {
214        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
215        unsafe {
216            let id = CString::new(device_id)?;
217            let r = ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr());
218            match r {
219                Ok(_) => Ok(()),
220                Err(nix::errno::Errno::ENODEV) => {
221                    // There is a small race window in the kernel between when the
222                    // vfio device is visible to userspace, and when it is added to its
223                    // internal list. Try one more time on ENODEV failure after a brief
224                    // sleep.
225                    tracing::warn!(
226                        pci_id = device_id,
227                        "vfio keepalive got ENODEV, retrying after delay"
228                    );
229                    PolledTimer::new(driver)
230                        .sleep(std::time::Duration::from_millis(250))
231                        .await;
232                    ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr())
233                        .with_context(|| {
234                            format!("failed to set keep-alive after delay for {device_id}")
235                        })
236                        .map(|_| ())
237                }
238                Err(_) => r
239                    .with_context(|| format!("failed to set keep-alive for {device_id}"))
240                    .map(|_| ()),
241            }
242        }
243    }
244}
245
246#[bitfield(u32)]
247pub struct GroupStatus {
248    pub viable: bool,
249    pub container_set: bool,
250
251    #[bits(30)]
252    _reserved: u32,
253}
254
/// Handle to an open VFIO device, as returned by [`Group::open_device`].
pub struct Device {
    /// The device file; device ioctls and `mmap` calls are issued against its
    /// descriptor.
    file: File,
}
258
259#[derive(Debug)]
260pub struct DeviceInfo {
261    pub flags: DeviceFlags,
262    pub num_regions: u32,
263    pub num_irqs: u32,
264}
265
266#[bitfield(u32)]
267pub struct DeviceFlags {
268    reset: bool,
269    pci: bool,
270    platform: bool,
271    amba: bool,
272    ccw: bool,
273    ap: bool,
274
275    #[bits(26)]
276    _reserved: u32,
277}
278
279#[derive(Debug)]
280pub struct RegionInfo {
281    pub flags: RegionFlags,
282    pub size: u64,
283    pub offset: u64,
284}
285
286#[bitfield(u32)]
287pub struct RegionFlags {
288    read: bool,
289    write: bool,
290    mmap: bool,
291    caps: bool,
292
293    #[bits(28)]
294    _reserved: u32,
295}
296
297#[derive(Debug)]
298pub struct IrqInfo {
299    pub flags: IrqFlags,
300    pub count: u32,
301}
302
303#[bitfield(u32)]
304pub struct IrqFlags {
305    eventfd: bool,
306    maskable: bool,
307    automasked: bool,
308    pub noresize: bool,
309
310    #[bits(28)]
311    _reserved: u32,
312}
313
314impl Device {
315    pub fn info(&self) -> anyhow::Result<DeviceInfo> {
316        let mut info = vfio_device_info {
317            argsz: size_of::<vfio_device_info>() as u32,
318            flags: 0,
319            num_regions: 0,
320            num_irqs: 0,
321        };
322        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
323        unsafe {
324            ioctl::vfio_device_get_info(self.file.as_raw_fd(), &mut info)
325                .context("failed to get device info")?;
326        }
327        Ok(DeviceInfo {
328            flags: DeviceFlags::from(info.flags),
329            num_regions: info.num_regions,
330            num_irqs: info.num_irqs,
331        })
332    }
333
334    pub fn region_info(&self, index: u32) -> anyhow::Result<RegionInfo> {
335        let mut info = vfio_region_info {
336            argsz: size_of::<vfio_region_info>() as u32,
337            index,
338            flags: 0,
339            cap_offset: 0,
340            size: 0,
341            offset: 0,
342        };
343        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
344        unsafe {
345            ioctl::vfio_device_get_region_info(self.file.as_raw_fd(), &mut info)
346                .context("failed to get region info")?;
347        };
348        Ok(RegionInfo {
349            flags: RegionFlags::from(info.flags),
350            size: info.size,
351            offset: info.offset,
352        })
353    }
354
355    pub fn irq_info(&self, index: u32) -> anyhow::Result<IrqInfo> {
356        let mut info = vfio_irq_info {
357            argsz: size_of::<vfio_irq_info>() as u32,
358            index,
359            flags: 0,
360            count: 0,
361        };
362        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
363        unsafe {
364            ioctl::vfio_device_get_irq_info(self.file.as_raw_fd(), &mut info)
365                .context("failed to get irq info")?;
366        }
367        Ok(IrqInfo {
368            flags: IrqFlags::from(info.flags),
369            count: info.count,
370        })
371    }
372
373    pub fn map(&self, offset: u64, len: usize, write: bool) -> anyhow::Result<MappedRegion> {
374        let mut prot = libc::PROT_READ;
375        if write {
376            prot |= libc::PROT_WRITE;
377        }
378        // SAFETY: The file descriptor is valid and no address is being passed.
379        // The result is being validated.
380        let addr = unsafe {
381            libc::mmap(
382                std::ptr::null_mut(),
383                len,
384                prot,
385                libc::MAP_SHARED,
386                self.file.as_raw_fd(),
387                offset as i64,
388            )
389        };
390        if addr == libc::MAP_FAILED {
391            return Err(std::io::Error::last_os_error()).context("failed to map region");
392        }
393        Ok(MappedRegion { addr, len })
394    }
395
396    pub fn map_msix<I>(&self, start: u32, eventfd: I) -> anyhow::Result<()>
397    where
398        I: IntoIterator,
399        I::Item: AsFd,
400    {
401        #[repr(C)]
402        struct VfioIrqSetWithArray {
403            header: vfio_irq_set,
404            fd: [i32; 256],
405        }
406        let mut param = VfioIrqSetWithArray {
407            header: vfio_irq_set {
408                argsz: size_of::<VfioIrqSetWithArray>() as u32,
409                flags: VFIO_IRQ_SET_ACTION_TRIGGER,
410                index: VFIO_PCI_MSIX_IRQ_INDEX,
411                start,
412                count: 0,
413                // data is a zero-sized array, the real data is fd.
414                data: Default::default(),
415            },
416            fd: [-1; 256],
417        };
418
419        for (x, y) in eventfd.into_iter().zip(&mut param.fd) {
420            *y = x.as_fd().as_raw_fd();
421            param.header.count += 1;
422        }
423
424        if param.header.count == 0 {
425            param.header.flags |= VFIO_IRQ_SET_DATA_NONE;
426        } else {
427            param.header.flags |= VFIO_IRQ_SET_DATA_EVENTFD;
428        }
429
430        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
431        unsafe {
432            ioctl::vfio_device_set_irqs(self.file.as_raw_fd(), &param.header)
433                .context("failed to set msi-x trigger")?;
434        }
435        Ok(())
436    }
437
438    /// Disable (unmap) a contiguous range of previously mapped MSI-X vectors.
439    ///
440    /// This issues VFIO_DEVICE_SET_IRQS with ACTION_TRIGGER + DATA_NONE and a
441    /// non-zero count, which per VFIO semantics removes the eventfd bindings
442    /// for the specified range starting at `start`.
443    pub fn unmap_msix(&self, start: u32, count: u32) -> anyhow::Result<()> {
444        if count == 0 {
445            return Ok(());
446        }
447
448        let header = vfio_irq_set {
449            argsz: size_of::<vfio_irq_set>() as u32,
450            flags: VFIO_IRQ_SET_ACTION_TRIGGER | VFIO_IRQ_SET_DATA_NONE,
451            index: VFIO_PCI_MSIX_IRQ_INDEX,
452            start,
453            count,
454            data: Default::default(),
455        };
456
457        // SAFETY: The file descriptor is valid; header constructed per VFIO spec.
458        unsafe {
459            ioctl::vfio_device_set_irqs(self.file.as_raw_fd(), &header)
460                .context("failed to unmap msix vectors")?;
461        }
462        Ok(())
463    }
464}
465
466impl AsRef<File> for Device {
467    fn as_ref(&self) -> &File {
468        &self.file
469    }
470}
471
472impl AsFd for Device {
473    fn as_fd(&self) -> BorrowedFd<'_> {
474        self.file.as_fd()
475    }
476}
477
478/// Find the Linux irq number for the MSI-X `index` of the PCI device `pci_id`.
479pub fn find_msix_irq(pci_id: &str, index: u32) -> anyhow::Result<u32> {
480    let buffered = BufReader::new(File::open("/proc/interrupts")?);
481
482    let id = format!("vfio-msix[{}]({})", index, pci_id);
483    let match_str = buffered
484        .lines()
485        .map_while(Result::ok)
486        .find(|line| line.contains(&id))
487        .with_context(|| format!("cannot find interrupt {id} in /proc/interrupts"))?;
488
489    // irq format is: <irq#:> cpu# <irq name>
490    let irq = match_str.trim_start().split(':').next().unwrap();
491    let irq: u32 = irq
492        .parse()
493        .with_context(|| format!("unexpected irq format {}. Expecting 'irq#:'", irq))?;
494
495    Ok(irq)
496}
497
498pub fn print_relevant_params() {
499    #[derive(Debug)]
500    struct Param {
501        _name: &'static str,
502        _value: Option<String>,
503    }
504
505    let vfio_params = [
506        "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode",
507        "/sys/module/driver/parameters/async_probe",
508    ]
509    .iter()
510    .map(|path| Param {
511        _name: path,
512        _value: fs::read_to_string(path).ok().map(|s| s.trim().to_string()),
513    })
514    .collect::<Vec<_>>();
515
516    tracing::debug!(
517        vfio_params = ?vfio_params,
518        "Relevant VFIO module parameters"
519    );
520}
521
522pub struct MappedRegion {
523    addr: *mut c_void,
524    len: usize,
525}
526
527// SAFETY: The result of an mmap is safe to share amongst threads.
528unsafe impl Send for MappedRegion {}
529// SAFETY: The result of an mmap is safe to share amongst threads.
530unsafe impl Sync for MappedRegion {}
531
532impl MappedRegion {
533    pub fn as_ptr(&self) -> *mut c_void {
534        self.addr
535    }
536
537    pub fn len(&self) -> usize {
538        self.len
539    }
540
541    pub fn read_u32(&self, offset: usize) -> u32 {
542        assert_eq!(offset % 4, 0);
543        assert!(offset.saturating_add(4) <= self.len);
544        // SAFETY: We have validated that the offset is inside the region.
545        unsafe { std::ptr::read_volatile(self.addr.byte_add(offset).cast()) }
546    }
547
548    pub fn read_u64(&self, offset: usize) -> u64 {
549        assert_eq!(offset % 8, 0);
550        assert!(offset.saturating_add(8) <= self.len);
551        // SAFETY: We have validated that the offset is inside the region.
552        unsafe { std::ptr::read_volatile(self.addr.byte_add(offset).cast()) }
553    }
554
555    pub fn write_u32(&self, offset: usize, data: u32) {
556        assert_eq!(offset % 4, 0);
557        assert!(offset.saturating_add(4) <= self.len);
558        // SAFETY: We have validated that the offset is inside the region.
559        unsafe {
560            std::ptr::write_volatile(self.addr.byte_add(offset).cast(), data);
561        }
562    }
563
564    pub fn write_u64(&self, offset: usize, data: u64) {
565        assert_eq!(offset % 8, 0);
566        assert!(offset.saturating_add(8) <= self.len);
567        // SAFETY: We have validated that the offset is inside the region.
568        unsafe {
569            std::ptr::write_volatile(self.addr.byte_add(offset).cast(), data);
570        }
571    }
572}
573
574impl Drop for MappedRegion {
575    fn drop(&mut self) {
576        // SAFETY: The address and length are a valid mmap result.
577        unsafe {
578            libc::munmap(self.addr, self.len);
579        }
580    }
581}