vfio_sys/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4#![expect(missing_docs)]
5#![cfg(unix)]
6// UNSAFETY: Manual memory management with mmap and vfio ioctls.
7#![expect(unsafe_code)]
8
9use anyhow::Context;
10use bitfield_struct::bitfield;
11use libc::c_void;
12use std::ffi::CString;
13use std::fs;
14use std::fs::File;
15use std::io::BufRead;
16use std::io::BufReader;
17use std::os::unix::prelude::*;
18use std::path::Path;
19use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_ACTION_TRIGGER;
20use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_EVENTFD;
21use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_NONE;
22use vfio_bindings::bindings::vfio::VFIO_PCI_MSIX_IRQ_INDEX;
23use vfio_bindings::bindings::vfio::vfio_device_info;
24use vfio_bindings::bindings::vfio::vfio_group_status;
25use vfio_bindings::bindings::vfio::vfio_irq_info;
26use vfio_bindings::bindings::vfio::vfio_irq_set;
27use vfio_bindings::bindings::vfio::vfio_region_info;
28
/// Raw VFIO ioctl wrappers generated with the `nix` ioctl macros.
///
/// Every request code here is built with `request_code_none!` from
/// `VFIO_TYPE` and a sequence number, so the `_bad` macro variants (which take
/// a pre-computed request code) are used throughout. Each generated function
/// is `unsafe` and takes the raw file descriptor as its first argument.
mod ioctl {
    use nix::request_code_none;
    use std::os::raw::c_char;
    use std::os::raw::c_int;
    use vfio_bindings::bindings::vfio::VFIO_BASE;
    use vfio_bindings::bindings::vfio::VFIO_TYPE;
    use vfio_bindings::bindings::vfio::vfio_device_info;
    use vfio_bindings::bindings::vfio::vfio_group_status;
    use vfio_bindings::bindings::vfio::vfio_irq_info;
    use vfio_bindings::bindings::vfio::vfio_irq_set;
    use vfio_bindings::bindings::vfio::vfio_region_info;

    // Sequence number used for the keep-alive ioctl declared at the bottom of
    // this module. Unlike the others it is not relative to `VFIO_BASE`.
    const VFIO_PRIVATE_BASE: u32 = 200;

    // Takes the IOMMU type as a plain integer argument.
    nix::ioctl_write_int_bad!(vfio_set_iommu, request_code_none!(VFIO_TYPE, VFIO_BASE + 2));
    // Fills in a caller-provided `vfio_group_status`.
    nix::ioctl_read_bad!(
        vfio_group_get_status,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 3),
        vfio_group_status
    );
    // Takes a pointer to the container's file descriptor.
    nix::ioctl_write_ptr_bad!(
        vfio_group_set_container,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 4),
        c_int
    );
    // Takes a pointer to a NUL-terminated device name; the ioctl's return
    // value is used by the caller as the new device file descriptor.
    nix::ioctl_write_ptr_bad!(
        vfio_group_get_device_fd,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 6),
        c_char
    );
    // Fills in a caller-provided `vfio_device_info`.
    nix::ioctl_read_bad!(
        vfio_device_get_info,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 7),
        vfio_device_info
    );
    // Read/write: the caller sets `index`, the kernel fills in the rest.
    nix::ioctl_readwrite_bad!(
        vfio_device_get_region_info,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 8),
        vfio_region_info
    );
    // Read/write: the caller sets `index`, the kernel fills in the rest.
    nix::ioctl_readwrite_bad!(
        vfio_device_get_irq_info,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 9),
        vfio_irq_info
    );
    // Takes a pointer to a `vfio_irq_set` header; callers place any payload
    // (e.g. eventfds) directly after the header in memory.
    nix::ioctl_write_ptr_bad!(
        vfio_device_set_irqs,
        request_code_none!(VFIO_TYPE, VFIO_BASE + 10),
        vfio_irq_set
    );
    // Takes a pointer to a NUL-terminated device name. Used by
    // `Group::set_keep_alive`, which documents this as a non-upstream feature.
    nix::ioctl_write_ptr_bad!(
        vfio_group_set_keep_alive,
        request_code_none!(VFIO_TYPE, VFIO_PRIVATE_BASE),
        c_char
    );
}
85
/// A handle to the VFIO container device node (`/dev/vfio/vfio`).
pub struct Container {
    // Open handle to the container node; closed when the `Container` is dropped.
    file: File,
}
89
90impl Container {
91    pub fn new() -> anyhow::Result<Self> {
92        let file = fs::OpenOptions::new()
93            .read(true)
94            .write(true)
95            .open("/dev/vfio/vfio")
96            .context("failed to open /dev/vfio/vfio")?;
97
98        Ok(Self { file })
99    }
100
101    pub fn set_iommu(&self, iommu: IommuType) -> anyhow::Result<()> {
102        // SAFETY: The file descriptor is valid.
103        unsafe {
104            ioctl::vfio_set_iommu(self.file.as_raw_fd(), iommu as i32)
105                .context("failed to set iommu")?;
106        }
107        Ok(())
108    }
109}
110
/// IOMMU types that can be passed to [`Container::set_iommu`].
///
/// The discriminants are the raw values from the VFIO bindings.
#[repr(u32)]
pub enum IommuType {
    /// No-IOMMU mode (`VFIO_NOIOMMU_IOMMU`).
    NoIommu = vfio_bindings::bindings::vfio::VFIO_NOIOMMU_IOMMU,
}
115
/// A handle to a VFIO group device node under `/dev/vfio/`.
pub struct Group {
    // Open handle to the group node; closed when the `Group` is dropped.
    file: File,
}
119
120impl Group {
121    pub fn open(group: u64) -> anyhow::Result<Self> {
122        Self::open_path(format!("/dev/vfio/{group}").as_ref())
123    }
124
125    pub fn open_noiommu(group: u64) -> anyhow::Result<Self> {
126        Self::open_path(format!("/dev/vfio/noiommu-{group}").as_ref())
127    }
128
129    fn open_path(group: &Path) -> anyhow::Result<Self> {
130        let file = fs::OpenOptions::new()
131            .read(true)
132            .write(true)
133            .open(group)
134            .with_context(|| format!("failed to open group {}", group.display()))?;
135
136        Ok(Self { file })
137    }
138
139    pub fn find_group_for_device(device_sysfs_path: &Path) -> anyhow::Result<u64> {
140        let group = device_sysfs_path.join("iommu_group");
141        let group = fs::read_link(group).context("failed to read iommu group")?;
142        let group: u64 = group
143            .file_name()
144            .and_then(|s| s.to_str())
145            .context("invalid group link")?
146            .parse()
147            .context("failed to parse iommu group")?;
148
149        Ok(group)
150    }
151
152    pub fn open_device(&self, device_id: &str) -> anyhow::Result<Device> {
153        let id = CString::new(device_id)?;
154        // SAFETY: The file descriptor is valid and the string is null-terminated.
155        let file = unsafe {
156            let fd = ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr());
157            // There is a small race window in the 6.1 kernel between when the
158            // vfio device is visible to userspace, and when it is added to its
159            // internal list. Try one more time on ENODEV failure after a brief
160            // sleep.
161            let fd = match fd {
162                Err(nix::errno::Errno::ENODEV) => {
163                    std::thread::sleep(std::time::Duration::from_millis(250));
164                    tracing::warn!("Retrying vfio open_device after delay");
165                    ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr())
166                }
167                _ => fd,
168            };
169            let fd = fd.with_context(|| format!("failed to get device fd for {device_id}"))?;
170            File::from_raw_fd(fd)
171        };
172
173        Ok(Device { file })
174    }
175
176    pub fn set_container(&self, container: &Container) -> anyhow::Result<()> {
177        // SAFETY: The file descriptors are valid.
178        unsafe {
179            ioctl::vfio_group_set_container(self.file.as_raw_fd(), &container.file.as_raw_fd())
180                .context("failed to set container")?;
181        }
182        Ok(())
183    }
184
185    pub fn status(&self) -> anyhow::Result<GroupStatus> {
186        let mut status = vfio_group_status {
187            argsz: size_of::<vfio_group_status>() as u32,
188            flags: 0,
189        };
190        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
191        unsafe {
192            ioctl::vfio_group_get_status(self.file.as_raw_fd(), &mut status)
193                .context("failed to get group status")?;
194        };
195        Ok(GroupStatus::from(status.flags))
196    }
197
198    /// Skip VFIO device reset when kernel is reloaded during servicing.
199    /// This feature is non-upstream version of our kernel and will be
200    /// eventually replaced with iommufd.
201    pub fn set_keep_alive(&self, device_id: &str) -> anyhow::Result<()> {
202        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
203        unsafe {
204            let id = CString::new(device_id.to_owned())?;
205            ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr())
206                .context("failed to set keep-alive")?;
207        }
208        Ok(())
209    }
210}
211
/// Group status flags, decoded from the raw `flags` word returned by
/// [`Group::status`].
#[bitfield(u32)]
pub struct GroupStatus {
    /// Bit 0 (`VFIO_GROUP_FLAGS_VIABLE` in the kernel UAPI).
    pub viable: bool,
    /// Bit 1 (`VFIO_GROUP_FLAGS_CONTAINER_SET` in the kernel UAPI).
    pub container_set: bool,

    // The remaining 30 bits are not interpreted.
    #[bits(30)]
    _reserved: u32,
}
220
/// A handle to a VFIO device, obtained from [`Group::open_device`].
pub struct Device {
    // Device file descriptor returned by the group; closed on drop.
    file: File,
}
224
/// Device information returned by [`Device::info`].
#[derive(Debug)]
pub struct DeviceInfo {
    /// Decoded device flags.
    pub flags: DeviceFlags,
    /// Number of regions reported by the kernel.
    pub num_regions: u32,
    /// Number of IRQ indexes reported by the kernel.
    pub num_irqs: u32,
}
231
/// Device flags, decoded from the raw `flags` word of `vfio_device_info`.
///
/// Fields are listed from the least significant bit upwards; the names
/// follow the kernel's `VFIO_DEVICE_FLAGS_*` constants.
#[bitfield(u32)]
pub struct DeviceFlags {
    /// Bit 0 (`VFIO_DEVICE_FLAGS_RESET`).
    reset: bool,
    /// Bit 1 (`VFIO_DEVICE_FLAGS_PCI`).
    pci: bool,
    /// Bit 2 (`VFIO_DEVICE_FLAGS_PLATFORM`).
    platform: bool,
    /// Bit 3 (`VFIO_DEVICE_FLAGS_AMBA`).
    amba: bool,
    /// Bit 4 (`VFIO_DEVICE_FLAGS_CCW`).
    ccw: bool,
    /// Bit 5 (`VFIO_DEVICE_FLAGS_AP`).
    ap: bool,

    // The remaining 26 bits are not interpreted.
    #[bits(26)]
    _reserved: u32,
}
244
/// Region information returned by [`Device::region_info`].
#[derive(Debug)]
pub struct RegionInfo {
    /// Decoded region flags.
    pub flags: RegionFlags,
    /// Region size as reported by the kernel.
    pub size: u64,
    /// Region offset as reported by the kernel.
    // NOTE(review): presumably the offset into the device fd to pass to
    // `Device::map` — confirm against callers.
    pub offset: u64,
}
251
/// Region flags, decoded from the raw `flags` word of `vfio_region_info`.
///
/// Fields are listed from the least significant bit upwards; the names
/// follow the kernel's `VFIO_REGION_INFO_FLAG_*` constants.
#[bitfield(u32)]
pub struct RegionFlags {
    /// Bit 0 (`VFIO_REGION_INFO_FLAG_READ`).
    read: bool,
    /// Bit 1 (`VFIO_REGION_INFO_FLAG_WRITE`).
    write: bool,
    /// Bit 2 (`VFIO_REGION_INFO_FLAG_MMAP`).
    mmap: bool,
    /// Bit 3 (`VFIO_REGION_INFO_FLAG_CAPS`).
    caps: bool,

    // The remaining 28 bits are not interpreted.
    #[bits(28)]
    _reserved: u32,
}
262
/// IRQ information returned by [`Device::irq_info`].
#[derive(Debug)]
pub struct IrqInfo {
    /// Decoded IRQ flags.
    pub flags: IrqFlags,
    /// Interrupt count reported by the kernel for the queried index.
    pub count: u32,
}
268
/// IRQ flags, decoded from the raw `flags` word of `vfio_irq_info`.
///
/// Fields are listed from the least significant bit upwards; the names
/// follow the kernel's `VFIO_IRQ_INFO_*` constants.
#[bitfield(u32)]
pub struct IrqFlags {
    /// Bit 0 (`VFIO_IRQ_INFO_EVENTFD`).
    eventfd: bool,
    /// Bit 1 (`VFIO_IRQ_INFO_MASKABLE`).
    maskable: bool,
    /// Bit 2 (`VFIO_IRQ_INFO_AUTOMASKED`).
    automasked: bool,
    /// Bit 3 (`VFIO_IRQ_INFO_NORESIZE`).
    pub noresize: bool,

    // The remaining 28 bits are not interpreted.
    #[bits(28)]
    _reserved: u32,
}
279
280impl Device {
281    pub fn info(&self) -> anyhow::Result<DeviceInfo> {
282        let mut info = vfio_device_info {
283            argsz: size_of::<vfio_device_info>() as u32,
284            flags: 0,
285            num_regions: 0,
286            num_irqs: 0,
287        };
288        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
289        unsafe {
290            ioctl::vfio_device_get_info(self.file.as_raw_fd(), &mut info)
291                .context("failed to get device info")?;
292        }
293        Ok(DeviceInfo {
294            flags: DeviceFlags::from(info.flags),
295            num_regions: info.num_regions,
296            num_irqs: info.num_irqs,
297        })
298    }
299
300    pub fn region_info(&self, index: u32) -> anyhow::Result<RegionInfo> {
301        let mut info = vfio_region_info {
302            argsz: size_of::<vfio_region_info>() as u32,
303            index,
304            flags: 0,
305            cap_offset: 0,
306            size: 0,
307            offset: 0,
308        };
309        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
310        unsafe {
311            ioctl::vfio_device_get_region_info(self.file.as_raw_fd(), &mut info)
312                .context("failed to get region info")?;
313        };
314        Ok(RegionInfo {
315            flags: RegionFlags::from(info.flags),
316            size: info.size,
317            offset: info.offset,
318        })
319    }
320
321    pub fn irq_info(&self, index: u32) -> anyhow::Result<IrqInfo> {
322        let mut info = vfio_irq_info {
323            argsz: size_of::<vfio_irq_info>() as u32,
324            index,
325            flags: 0,
326            count: 0,
327        };
328        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
329        unsafe {
330            ioctl::vfio_device_get_irq_info(self.file.as_raw_fd(), &mut info)
331                .context("failed to get irq info")?;
332        }
333        Ok(IrqInfo {
334            flags: IrqFlags::from(info.flags),
335            count: info.count,
336        })
337    }
338
339    pub fn map(&self, offset: u64, len: usize, write: bool) -> anyhow::Result<MappedRegion> {
340        let mut prot = libc::PROT_READ;
341        if write {
342            prot |= libc::PROT_WRITE;
343        }
344        // SAFETY: The file descriptor is valid and no address is being passed.
345        // The result is being validated.
346        let addr = unsafe {
347            libc::mmap(
348                std::ptr::null_mut(),
349                len,
350                prot,
351                libc::MAP_SHARED,
352                self.file.as_raw_fd(),
353                offset as i64,
354            )
355        };
356        if addr == libc::MAP_FAILED {
357            return Err(std::io::Error::last_os_error()).context("failed to map region");
358        }
359        Ok(MappedRegion { addr, len })
360    }
361
362    pub fn map_msix<I>(&self, start: u32, eventfd: I) -> anyhow::Result<()>
363    where
364        I: IntoIterator,
365        I::Item: AsFd,
366    {
367        #[repr(C)]
368        struct VfioIrqSetWithArray {
369            header: vfio_irq_set,
370            fd: [i32; 256],
371        }
372        let mut param = VfioIrqSetWithArray {
373            header: vfio_irq_set {
374                argsz: size_of::<VfioIrqSetWithArray>() as u32,
375                flags: VFIO_IRQ_SET_ACTION_TRIGGER,
376                index: VFIO_PCI_MSIX_IRQ_INDEX,
377                start,
378                count: 0,
379                // data is a zero-sized array, the real data is fd.
380                data: Default::default(),
381            },
382            fd: [-1; 256],
383        };
384
385        for (x, y) in eventfd.into_iter().zip(&mut param.fd) {
386            *y = x.as_fd().as_raw_fd();
387            param.header.count += 1;
388        }
389
390        if param.header.count == 0 {
391            param.header.flags |= VFIO_IRQ_SET_DATA_NONE;
392        } else {
393            param.header.flags |= VFIO_IRQ_SET_DATA_EVENTFD;
394        }
395
396        // SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
397        unsafe {
398            ioctl::vfio_device_set_irqs(self.file.as_raw_fd(), &param.header)
399                .context("failed to set msi-x trigger")?;
400        }
401        Ok(())
402    }
403}
404
/// Gives borrowed access to the device's underlying [`File`].
impl AsRef<File> for Device {
    fn as_ref(&self) -> &File {
        &self.file
    }
}
410
/// Lends out the device's file descriptor without transferring ownership.
impl AsFd for Device {
    fn as_fd(&self) -> BorrowedFd<'_> {
        self.file.as_fd()
    }
}
416
417/// Find the Linux irq number for the MSI-X `index` of the PCI device `pci_id`.
418pub fn find_msix_irq(pci_id: &str, index: u32) -> anyhow::Result<u32> {
419    let buffered = BufReader::new(File::open("/proc/interrupts")?);
420
421    let id = format!("vfio-msix[{}]({})", index, pci_id);
422    let match_str = buffered
423        .lines()
424        .map_while(Result::ok)
425        .find(|line| line.contains(&id))
426        .with_context(|| format!("cannot find interrupt {id} in /proc/interrupts"))?;
427
428    // irq format is: <irq#:> cpu# <irq name>
429    let irq = match_str.trim_start().split(':').next().unwrap();
430    let irq: u32 = irq
431        .parse()
432        .with_context(|| format!("unexpected irq format {}. Expecting 'irq#:'", irq))?;
433
434    Ok(irq)
435}
436
/// A memory mapping described by its base address and its length in bytes.
///
/// The accessors perform volatile, naturally aligned loads and stores at a
/// byte offset from the base address.
pub struct MappedRegion {
    // Base address of the mapping.
    addr: *mut c_void,
    // Length of the mapping in bytes.
    len: usize,
}

// SAFETY: The result of an mmap is safe to share amongst threads.
unsafe impl Send for MappedRegion {}
// SAFETY: The result of an mmap is safe to share amongst threads.
unsafe impl Sync for MappedRegion {}

impl MappedRegion {
    /// Returns the base address of the mapping.
    pub fn as_ptr(&self) -> *mut c_void {
        self.addr
    }

    /// Returns the length of the mapping in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Performs a volatile 32-bit load at byte `offset`.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is not a multiple of 4 or if the access would
    /// extend past the end of the mapping.
    pub fn read_u32(&self, offset: usize) -> u32 {
        assert_eq!(offset % 4, 0);
        assert!(offset.saturating_add(4) <= self.len);
        // SAFETY: We have validated that the offset is inside the region.
        unsafe { self.addr.byte_add(offset).cast::<u32>().read_volatile() }
    }

    /// Performs a volatile 64-bit load at byte `offset`.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is not a multiple of 8 or if the access would
    /// extend past the end of the mapping.
    pub fn read_u64(&self, offset: usize) -> u64 {
        assert_eq!(offset % 8, 0);
        assert!(offset.saturating_add(8) <= self.len);
        // SAFETY: We have validated that the offset is inside the region.
        unsafe { self.addr.byte_add(offset).cast::<u64>().read_volatile() }
    }

    /// Performs a volatile 32-bit store of `data` at byte `offset`.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is not a multiple of 4 or if the access would
    /// extend past the end of the mapping.
    pub fn write_u32(&self, offset: usize, data: u32) {
        assert_eq!(offset % 4, 0);
        assert!(offset.saturating_add(4) <= self.len);
        // SAFETY: We have validated that the offset is inside the region.
        unsafe { self.addr.byte_add(offset).cast::<u32>().write_volatile(data) }
    }

    /// Performs a volatile 64-bit store of `data` at byte `offset`.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is not a multiple of 8 or if the access would
    /// extend past the end of the mapping.
    pub fn write_u64(&self, offset: usize, data: u64) {
        assert_eq!(offset % 8, 0);
        assert!(offset.saturating_add(8) <= self.len);
        // SAFETY: We have validated that the offset is inside the region.
        unsafe { self.addr.byte_add(offset).cast::<u64>().write_volatile(data) }
    }
}
488
489impl Drop for MappedRegion {
490    fn drop(&mut self) {
491        // SAFETY: The address and length are a valid mmap result.
492        unsafe {
493            libc::munmap(self.addr, self.len);
494        }
495    }
496}