// disk_blockdevice/lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4#![expect(missing_docs)]
5#![cfg(target_os = "linux")]
6
7//! Implements the [`DiskIo`] trait for virtual disks backed by a raw block
8//! device.
9
10// UNSAFETY: Issuing IOs and calling ioctls.
11#![expect(unsafe_code)]
12
13mod ioctl;
14mod nvme;
15
16use anyhow::Context;
17use async_trait::async_trait;
18use blocking::unblock;
19use disk_backend::DiskError;
20use disk_backend::DiskIo;
21use disk_backend::UnmapBehavior;
22use disk_backend::pr::PersistentReservation;
23use disk_backend::pr::ReservationCapabilities;
24use disk_backend::pr::ReservationReport;
25use disk_backend::pr::ReservationType;
26use disk_backend::resolve::ResolveDiskParameters;
27use disk_backend::resolve::ResolvedDisk;
28use fs_err::PathExt;
29use guestmem::MemoryRead;
30use guestmem::MemoryWrite;
31use inspect::Inspect;
32use io_uring::opcode;
33use io_uring::types;
34use io_uring::types::RwFlags;
35use mesh::MeshPayload;
36use nvme::check_nvme_status;
37use nvme_spec::nvm;
38use pal::unix::affinity;
39use pal_uring::Initiate;
40use pal_uring::IoInitiator;
41use scsi_buffers::BounceBufferTracker;
42use scsi_buffers::RequestBuffers;
43use std::fmt::Debug;
44use std::fs;
45use std::os::unix::io::AsRawFd;
46use std::os::unix::prelude::FileTypeExt;
47use std::os::unix::prelude::MetadataExt;
48use std::path::Path;
49use std::path::PathBuf;
50use std::str::FromStr;
51use std::sync::Arc;
52use std::sync::atomic::AtomicU64;
53use std::sync::atomic::Ordering;
54use thiserror::Error;
55use uevent::CallbackHandle;
56use uevent::UeventListener;
57use vm_resource::AsyncResolveResource;
58use vm_resource::ResourceId;
59use vm_resource::ResourceResolver;
60use vm_resource::kind::DiskHandleKind;
61
/// Resolver that turns an [`OpenBlockDeviceConfig`] into a [`BlockDevice`]-backed disk.
pub struct BlockDeviceResolver {
    /// IO uring used to issue asynchronous IOs for resolved disks.
    uring: Arc<dyn Initiate>,
    /// Optional listener used to observe block device resize uevents.
    uevent_listener: Option<Arc<UeventListener>>,
    /// Shared accounting for bounce buffer memory across disks.
    bounce_buffer_tracker: Arc<BounceBufferTracker>,
    /// When true, all IOs are double buffered even if already aligned.
    always_bounce: bool,
}
68
impl BlockDeviceResolver {
    /// Creates a new resolver.
    ///
    /// # Arguments
    /// * `uring` - The IO uring to issue IOs with for resolved disks.
    /// * `uevent_listener` - Optional uevent listener used to detect disk resizes.
    /// * `bounce_buffer_tracker` - Shared bounce buffer accounting.
    /// * `always_bounce` - Whether to bounce buffer every IO, even aligned ones.
    pub fn new(
        uring: Arc<dyn Initiate>,
        uevent_listener: Option<Arc<UeventListener>>,
        bounce_buffer_tracker: Arc<BounceBufferTracker>,
        always_bounce: bool,
    ) -> Self {
        Self {
            uring,
            uevent_listener,
            bounce_buffer_tracker,
            always_bounce,
        }
    }
}
84
/// Configuration for a disk backed by an already-opened file or block device.
#[derive(MeshPayload)]
pub struct OpenBlockDeviceConfig {
    /// The backing file or block device.
    // NOTE(review): presumably opened via [`open_file_for_block`] (O_DIRECT) —
    // callers should confirm alignment expectations hold for other open paths.
    pub file: fs::File,
}
89
impl ResourceId<DiskHandleKind> for OpenBlockDeviceConfig {
    // Resource identifier used to select this disk handle type at resolve time.
    const ID: &'static str = "block";
}
93
/// Errors returned while resolving an [`OpenBlockDeviceConfig`].
#[derive(Debug, Error)]
pub enum ResolveDiskError {
    /// Probing or opening the backing device failed.
    #[error("failed to create new device")]
    NewDevice(#[source] NewDeviceError),
    /// The constructed device failed disk validation.
    #[error("invalid disk")]
    InvalidDisk(#[source] disk_backend::InvalidDisk),
}
101
#[async_trait]
impl AsyncResolveResource<DiskHandleKind, OpenBlockDeviceConfig> for BlockDeviceResolver {
    type Output = ResolvedDisk;
    type Error = ResolveDiskError;

    /// Builds a [`BlockDevice`] from the opened file, wiring in this
    /// resolver's uring, uevent listener, and bounce buffer settings, then
    /// validates it into a [`ResolvedDisk`].
    async fn resolve(
        &self,
        _resolver: &ResourceResolver,
        rsrc: OpenBlockDeviceConfig,
        input: ResolveDiskParameters<'_>,
    ) -> Result<Self::Output, Self::Error> {
        let disk = BlockDevice::new(
            rsrc.file,
            input.read_only,
            self.uring.clone(),
            self.uevent_listener.as_deref(),
            self.bounce_buffer_tracker.clone(),
            self.always_bounce,
        )
        .await
        .map_err(ResolveDiskError::NewDevice)?;
        ResolvedDisk::new(disk).map_err(ResolveDiskError::InvalidDisk)
    }
}
126
127/// Opens a file for use with [`BlockDevice`] or [`OpenBlockDeviceConfig`].
128pub fn open_file_for_block(path: &Path, read_only: bool) -> std::io::Result<fs::File> {
129    use std::os::unix::prelude::*;
130
131    tracing::debug!(?path, read_only, "open_file_for_block");
132    fs::OpenOptions::new()
133        .read(true)
134        .write(!read_only)
135        .custom_flags(libc::O_DIRECT)
136        .open(path)
137}
138
/// A storvsp disk backed by a raw block device.
#[derive(Inspect)]
#[inspect(extra = "BlockDevice::inspect_extra")]
pub struct BlockDevice {
    /// The backing file or block device.
    file: Arc<fs::File>,
    /// Logical sector size in bytes.
    sector_size: u32,
    /// Physical sector size in bytes; at least `sector_size`.
    physical_sector_size: u32,
    /// log2 of `sector_size`, for converting sectors to byte offsets.
    sector_shift: u32,
    /// Cached sector count; refreshed when a resize uevent is observed.
    sector_count: AtomicU64,
    /// Discard granularity, in sectors; zero when unmap is unsupported.
    optimal_unmap_sectors: u32,
    /// Whether the disk was opened read-only.
    read_only: bool,
    #[inspect(skip)]
    uring: Arc<dyn Initiate>,
    /// Backing device kind (file, NVMe namespace, or unknown block device).
    #[inspect(flatten)]
    device_type: DeviceType,
    /// Whether persistent reservations are supported (NVMe only).
    supports_pr: bool,
    /// Whether the device honors FUA (forced unit access) writes.
    supports_fua: bool,
    /// Keeps the uevent resize callback registered for this device's lifetime.
    #[inspect(skip)]
    _uevent_filter: Option<CallbackHandle>,
    /// Epoch bumped by resize uevents; compared against `resized_acked`.
    resize_epoch: Arc<ResizeEpoch>,
    /// Last resize epoch whose size change has been applied.
    resized_acked: AtomicU64,
    #[inspect(skip)]
    bounce_buffer_tracker: Arc<BounceBufferTracker>,
    /// When true, all IOs are bounce buffered even if aligned.
    always_bounce: bool,
}
164
/// Tracks resize notifications for a block device.
#[derive(Inspect, Debug, Default)]
#[inspect(transparent)]
struct ResizeEpoch {
    /// Incremented once per observed resize uevent.
    epoch: AtomicU64,
    /// Notified whenever `epoch` changes, waking `wait_resize` listeners.
    #[inspect(skip)]
    event: event_listener::Event,
}
172
/// The kind of backing device, detected at construction time.
#[derive(Debug, Copy, Clone, Inspect)]
#[inspect(tag = "device_type")]
enum DeviceType {
    /// A regular file.
    File {
        /// Sector count at open time; used to reject writes that would
        /// extend the file.
        sector_count: u64,
    },
    /// A block device that is not recognized as NVMe.
    UnknownBlock,
    /// An NVMe namespace.
    NVMe {
        /// NVMe namespace ID, read from sysfs.
        ns_id: u32,
        /// Reservation capabilities from the identify namespace data.
        rescap: nvm::ReservationCapabilities,
    },
}
185
impl BlockDevice {
    /// Adds device-type-specific fields to the inspect response.
    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
        match self.device_type {
            DeviceType::NVMe { .. } => {
                // Mutable inspect field: reading reports the current interrupt
                // coalescing settings, writing updates them.
                resp.field_mut_with("interrupt_aggregation", |new_value| {
                    self.inspect_interrupt_coalescing(new_value)
                });
            }
            DeviceType::UnknownBlock => {}
            DeviceType::File { .. } => {}
        }
    }

    /// Gets or sets NVMe interrupt coalescing through inspect.
    ///
    /// `new_value`, when present, must be formatted as
    /// `<aggregation_threshold> <aggregation_time>`. Returns the resulting
    /// settings in the same format, or `"not supported"` if the feature
    /// cannot be queried.
    fn inspect_interrupt_coalescing(&self, new_value: Option<&str>) -> anyhow::Result<String> {
        let coalescing = if let Some(new_value) = new_value {
            // Parse "<threshold> <time>"; any parse failure maps to the same
            // user-facing context error.
            let coalescing = (|| {
                let (threshold, time) = new_value.split_once(' ')?;
                Some(
                    nvme::InterruptCoalescing::new()
                        .with_aggregation_threshold(threshold.parse().ok()?)
                        .with_aggregation_time(time.parse().ok()?),
                )
            })()
            .context("expected `<aggregation_threshold> <aggregation_time>`")?;
            nvme::nvme_set_features_interrupt_coalescing(&self.file, coalescing)?;
            coalescing
        } else if let Ok(coalescing) = nvme::nvme_get_features_interrupt_coalescing(&self.file) {
            coalescing
        } else {
            return Ok("not supported".into());
        };
        Ok(format!(
            "{} {}",
            coalescing.aggregation_threshold(),
            coalescing.aggregation_time()
        ))
    }
}
224
/// Errors from [`BlockDevice::new`].
#[derive(Debug, Error)]
pub enum NewDeviceError {
    /// A block device ioctl or IO failed.
    #[error("block device ioctl error")]
    IoctlError(#[from] DiskError),
    /// Reading sysfs or identify data for the device failed.
    #[error("failed to read device metadata")]
    DeviceMetadata(#[source] anyhow::Error),
    /// The backing file is neither a regular file nor a block device.
    #[error("invalid file type, not a file or block device")]
    InvalidFileType,
    /// The disk size failed validation.
    #[error("invalid disk size {0:#x}")]
    InvalidDiskSize(u64),
}
237
238impl BlockDevice {
239    /// Constructs a new `BlockDevice` backed by the specified file.
240    ///
241    /// # Arguments
242    /// * `file` - The backing device opened for raw access.
243    /// * `read_only` - Indicates whether the device is opened for read-only access.
244    /// * `uring` - The IO uring to use for issuing IOs.
245    /// * `always_bounce` - Whether to always use bounce buffers for IOs, even for those that are aligned.
246    pub async fn new(
247        file: fs::File,
248        read_only: bool,
249        uring: Arc<dyn Initiate>,
250        uevent_listener: Option<&UeventListener>,
251        bounce_buffer_tracker: Arc<BounceBufferTracker>,
252        always_bounce: bool,
253    ) -> Result<BlockDevice, NewDeviceError> {
254        let initiator = uring.initiator();
255        assert!(initiator.probe(opcode::Read::CODE));
256        assert!(initiator.probe(opcode::Write::CODE));
257        assert!(initiator.probe(opcode::Readv::CODE));
258        assert!(initiator.probe(opcode::Writev::CODE));
259        assert!(initiator.probe(opcode::Fsync::CODE));
260
261        let metadata = file.metadata().map_err(DiskError::Io)?;
262
263        let mut uevent_filter = None;
264        let resize_epoch = Arc::new(ResizeEpoch::default());
265
266        let devmeta = if metadata.file_type().is_block_device() {
267            let rdev = metadata.rdev();
268            let (major, minor) = (libc::major(rdev), libc::minor(rdev));
269
270            // Register for resize events.
271            if let Some(uevent_listener) = uevent_listener {
272                let resize_epoch = resize_epoch.clone();
273                uevent_filter = Some(
274                    uevent_listener
275                        .add_block_resize_callback(major, minor, {
276                            move || {
277                                tracing::info!(major, minor, "disk resized");
278                                resize_epoch.epoch.fetch_add(1, Ordering::SeqCst);
279                                resize_epoch.event.notify(usize::MAX);
280                            }
281                        })
282                        .await,
283                );
284            }
285
286            DeviceMetadata::from_block_device(&file, major, minor)
287                .map_err(NewDeviceError::DeviceMetadata)?
288        } else if metadata.file_type().is_file() {
289            DeviceMetadata::from_file(&metadata).map_err(NewDeviceError::DeviceMetadata)?
290        } else {
291            return Err(NewDeviceError::InvalidFileType);
292        };
293
294        let sector_size = devmeta.logical_block_size;
295        let sector_shift = sector_size.trailing_zeros();
296        let physical_sector_size = devmeta.physical_block_size.max(sector_size);
297        let sector_count = devmeta.disk_size >> sector_shift;
298        let unmap_granularity = devmeta.discard_granularity >> sector_shift;
299        let file = Arc::new(file);
300        let device = BlockDevice {
301            file,
302            sector_size,
303            physical_sector_size,
304            sector_shift: sector_size.trailing_zeros(),
305            sector_count: sector_count.into(),
306            optimal_unmap_sectors: unmap_granularity,
307            read_only,
308            uring,
309            device_type: devmeta.device_type,
310            supports_pr: devmeta.supports_pr,
311            supports_fua: devmeta.fua,
312            _uevent_filter: uevent_filter,
313            resize_epoch,
314            resized_acked: 0.into(),
315            bounce_buffer_tracker,
316            always_bounce,
317        };
318
319        Ok(device)
320    }
321
322    fn initiator(&self) -> &IoInitiator {
323        self.uring.initiator()
324    }
325
326    fn handle_resize(&self) {
327        if let Err(err) = self.handle_resize_inner() {
328            tracing::error!(
329                error = &err as &dyn std::error::Error,
330                "failed to update disk size"
331            );
332        }
333    }
334
335    fn handle_resize_inner(&self) -> std::io::Result<()> {
336        let mut acked = self.resized_acked.load(Ordering::SeqCst);
337        loop {
338            let epoch = self.resize_epoch.epoch.load(Ordering::SeqCst);
339            if acked == epoch {
340                break Ok(());
341            }
342
343            let size_in_bytes = ioctl::query_block_device_size_in_bytes(&self.file)?;
344
345            let new_sector_count = size_in_bytes / self.sector_size as u64;
346            let original_sector_count = self.sector_count.load(Ordering::SeqCst);
347
348            tracing::debug!(original_sector_count, new_sector_count, "resize");
349            if original_sector_count != new_sector_count {
350                tracing::info!(
351                    original_sector_count,
352                    new_sector_count,
353                    "Disk size updating..."
354                );
355                self.sector_count.store(new_sector_count, Ordering::SeqCst);
356            }
357
358            acked = self
359                .resized_acked
360                .compare_exchange(acked, epoch, Ordering::SeqCst, Ordering::SeqCst)
361                .unwrap_or_else(|x| x);
362        }
363    }
364
365    fn map_io_error(&self, err: std::io::Error) -> DiskError {
366        if !matches!(self.device_type, DeviceType::File { .. }) {
367            match err.raw_os_error() {
368                Some(libc::EBADE) => return DiskError::ReservationConflict,
369                Some(libc::ENOSPC) => return DiskError::IllegalBlock,
370                _ => {}
371            }
372        }
373        DiskError::Io(err)
374    }
375}
376
/// Device geometry and capability data gathered at open time.
struct DeviceMetadata {
    /// Detected backing device kind.
    device_type: DeviceType,
    /// Total disk size in bytes.
    disk_size: u64,
    /// Logical block (sector) size in bytes.
    logical_block_size: u32,
    /// Physical block size in bytes.
    physical_block_size: u32,
    /// Discard granularity in bytes; zero when discard is unsupported.
    discard_granularity: u32,
    /// Whether persistent reservations are supported.
    supports_pr: bool,
    /// Whether the device honors FUA writes.
    fua: bool,
}
386
impl DeviceMetadata {
    /// Gathers metadata for a block device by reading its sysfs entries and,
    /// for NVMe devices, its identify data.
    fn from_block_device(file: &fs::File, major: u32, minor: u32) -> anyhow::Result<Self> {
        // Ensure the sysfs path exists.
        let devpath = PathBuf::from(format!("/sys/dev/block/{major}:{minor}"));
        devpath
            .fs_err_metadata()
            .context("could not find sysfs path for block device")?;

        let mut supports_pr = false;

        // Check for NVMe by looking for the namespace ID.
        let device_type = match fs_err::read_to_string(devpath.join("nsid")) {
            Ok(ns_id) => {
                let ns_id = ns_id
                    .trim()
                    .parse()
                    .context("failed to parse NVMe namespace ID")?;

                let rescap = nvme::nvme_identify_namespace_data(file, ns_id)?.rescap;
                let oncs = nvme::nvme_identify_controller_data(file)?.oncs;
                tracing::debug!(rescap = ?rescap, oncs = ?oncs, "get identify data");
                // PR requires both the controller's reservation support and a
                // non-zero namespace reservation capability field.
                supports_pr = oncs.reservations() && u8::from(rescap) != 0;
                Some(DeviceType::NVMe { ns_id, rescap })
            }
            // No "nsid" file means the device is not an NVMe namespace.
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => None,
            Err(err) => Err(err).context("failed to read NVMe namespace ID")?,
        };

        // Fall back to unknown.
        let device_type = device_type.unwrap_or(DeviceType::UnknownBlock);

        // Reads and parses one sysfs attribute under `devpath`, labeling
        // errors with `msg`.
        fn read_val<T: FromStr>(devpath: &Path, path: &str, msg: &str) -> anyhow::Result<T>
        where
            T::Err: 'static + std::error::Error + Send + Sync,
        {
            fs_err::read_to_string(devpath.join(path))
                .with_context(|| format!("failed to read {msg}"))?
                .trim()
                .parse()
                .with_context(|| format!("failed to parse {msg}"))
        }

        let logical_block_size = read_val(&devpath, "queue/logical_block_size", "sector size")?;
        let physical_block_size = read_val(
            &devpath,
            "queue/physical_block_size",
            "physical sector size",
        )?;

        // sys/dev/block/*/*/size shows the size in 512-byte
        // sectors irrespective of the block device
        let disk_size = read_val::<u64>(&devpath, "size", "disk size")? * 512;
        let discard_granularity =
            read_val(&devpath, "queue/discard_granularity", "discard granularity")?;

        let fua = read_val::<u8>(&devpath, "queue/fua", "fua")? != 0;

        Self {
            device_type,
            disk_size,
            logical_block_size,
            physical_block_size,
            discard_granularity,
            supports_pr,
            fua,
        }
        .validate()
    }

    /// Gathers metadata for a regular file backing, using a fixed 512-byte
    /// sector size.
    fn from_file(metadata: &fs::Metadata) -> anyhow::Result<Self> {
        let logical_block_size = 512;
        Self {
            device_type: DeviceType::File {
                sector_count: metadata.len() / logical_block_size as u64,
            },
            disk_size: metadata.size(),
            logical_block_size,
            // Use the filesystem's preferred IO size as the physical sector size.
            physical_block_size: metadata.blksize() as u32,
            // Regular files do not support discard through this backend.
            discard_granularity: 0,
            supports_pr: false,
            fua: false,
        }
        .validate()
    }

    /// Validates geometry invariants: power-of-two sector sizes (logical at
    /// least 512) and sizes/granularities that are sector multiples.
    fn validate(self) -> anyhow::Result<Self> {
        let Self {
            device_type: _,
            disk_size,
            logical_block_size,
            physical_block_size,
            discard_granularity,
            supports_pr: _,
            fua: _,
        } = self;
        if logical_block_size < 512 || !logical_block_size.is_power_of_two() {
            anyhow::bail!("invalid sector size {logical_block_size}");
        }
        if !physical_block_size.is_power_of_two() {
            anyhow::bail!("invalid physical sector size {physical_block_size}");
        }
        if disk_size % logical_block_size as u64 != 0 {
            anyhow::bail!("invalid disk size {disk_size:#x}");
        }
        if discard_granularity % logical_block_size != 0 {
            anyhow::bail!("invalid discard granularity {discard_granularity}");
        }
        Ok(self)
    }
}
497
impl DiskIo for BlockDevice {
    fn disk_type(&self) -> &str {
        "block_device"
    }

    fn sector_count(&self) -> u64 {
        // Opportunistically refresh the cached size if a resize notification
        // arrived since the last acknowledged epoch.
        if self.resize_epoch.epoch.load(Ordering::Relaxed)
            != self.resized_acked.load(Ordering::Relaxed)
        {
            self.handle_resize();
        }
        self.sector_count.load(Ordering::Relaxed)
    }

    fn sector_size(&self) -> u32 {
        self.sector_size
    }

    // This backend does not report a disk ID.
    fn disk_id(&self) -> Option<[u8; 16]> {
        None
    }

    fn physical_sector_size(&self) -> u32 {
        self.physical_sector_size
    }

    fn is_fua_respected(&self) -> bool {
        self.supports_fua
    }

    fn is_read_only(&self) -> bool {
        self.read_only
    }

    // Persistent reservations are only exposed when the device advertised
    // support at open time.
    fn pr(&self) -> Option<&dyn PersistentReservation> {
        if self.supports_pr { Some(self) } else { None }
    }

    async fn eject(&self) -> Result<(), DiskError> {
        let file = self.file.clone();
        // These ioctls block, so run them on the blocking thread pool.
        unblock(move || {
            // Unlock the door before ejecting.
            ioctl::lockdoor(&file, false)?;
            ioctl::eject(&file)
        })
        .await
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    async fn read_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
        let io_size = buffers.len();
        tracing::trace!(sector, io_size, "read_vectored");

        let mut bounce_buffer = None;
        let locked;
        // Unaligned buffers (or forced bouncing) require double buffering
        // through an aligned bounce buffer.
        let should_bounce = self.always_bounce || !buffers.is_aligned(self.sector_size() as usize);
        let io_vecs = if !should_bounce {
            locked = buffers.lock(true)?;
            locked.io_vecs()
        } else {
            tracing::trace!("double buffering IO");

            bounce_buffer
                .insert(
                    self.bounce_buffer_tracker
                        .acquire_bounce_buffers(buffers.len(), affinity::get_cpu_number() as usize)
                        .await,
                )
                .buffer
                .io_vecs()
        };

        // SAFETY: the buffers for the IO are this stack, and they will be
        // kept alive for the duration of the IO since we immediately call
        // await on the IO.
        let (r, _) = unsafe {
            self.initiator().issue_io((), |_| {
                opcode::Readv::new(
                    types::Fd(self.file.as_raw_fd()),
                    io_vecs.as_ptr().cast(),
                    io_vecs.len() as u32,
                )
                .offset((sector * self.sector_size() as u64) as _)
                .build()
            })
        }
        .await;

        let bytes_read = r.map_err(|err| self.map_io_error(err))?;
        tracing::trace!(bytes_read, "read_vectored");
        // A short read is treated as an out-of-range access.
        if bytes_read != io_size as i32 {
            return Err(DiskError::IllegalBlock);
        }

        // Copy bounced data back into the request buffers.
        if let Some(mut bounce_buffer) = bounce_buffer {
            buffers
                .writer()
                .write(bounce_buffer.buffer.as_mut_bytes())?;
        }
        Ok(())
    }

    async fn write_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        let io_size = buffers.len();
        tracing::trace!(sector, io_size, "write_vectored");

        // Ensure the write doesn't extend the file.
        if let DeviceType::File { sector_count } = self.device_type {
            if sector + (io_size as u64 >> self.sector_shift) > sector_count {
                return Err(DiskError::IllegalBlock);
            }
        }

        let mut bounce_buffer;
        let locked;
        // Unaligned buffers (or forced bouncing) require staging the data in
        // an aligned bounce buffer first.
        let should_bounce = self.always_bounce || !buffers.is_aligned(self.sector_size() as usize);
        let io_vecs = if !should_bounce {
            locked = buffers.lock(false)?;
            locked.io_vecs()
        } else {
            tracing::trace!("double buffering IO");
            bounce_buffer = self
                .bounce_buffer_tracker
                .acquire_bounce_buffers(buffers.len(), affinity::get_cpu_number() as usize)
                .await;
            buffers.reader().read(bounce_buffer.buffer.as_mut_bytes())?;
            bounce_buffer.buffer.io_vecs()
        };

        // Documented in Linux manual page: https://man7.org/linux/man-pages/man2/readv.2.html
        // It's only defined in linux_gnu but not in linux_musl. So we have to define it.
        const RWF_DSYNC: RwFlags = 0x00000002;

        // SAFETY: the buffers for the IO are this stack, and they will be
        // kept alive for the duration of the IO since we immediately call
        // await on the IO.
        let (r, _) = unsafe {
            self.initiator().issue_io((), |_| {
                opcode::Writev::new(
                    types::Fd(self.file.as_raw_fd()),
                    io_vecs.as_ptr().cast::<libc::iovec>(),
                    io_vecs.len() as _,
                )
                .offset((sector * self.sector_size() as u64) as _)
                // FUA writes are issued with per-IO data sync semantics.
                .rw_flags(if fua { RWF_DSYNC } else { 0 })
                .build()
            })
        }
        .await;

        let bytes_written = r.map_err(|err| self.map_io_error(err))?;
        tracing::trace!(bytes_written, "write_vectored");
        // A short write is treated as an out-of-range access.
        if bytes_written != io_size as i32 {
            return Err(DiskError::IllegalBlock);
        }

        Ok(())
    }

    async fn sync_cache(&self) -> Result<(), DiskError> {
        // SAFETY: No data buffers.
        unsafe {
            self.initiator()
                .issue_io((), |_| {
                    opcode::Fsync::new(types::Fd(self.file.as_raw_fd())).build()
                })
                .await
                .0
                .map_err(|err| self.map_io_error(err))?;
        }
        Ok(())
    }

    async fn wait_resize(&self, sector_count: u64) -> u64 {
        // Register the listener before re-checking the size to avoid missing
        // a notification between the check and the await.
        loop {
            let listen = self.resize_epoch.event.listen();
            let current = self.sector_count();
            if current != sector_count {
                break current;
            }
            listen.await;
        }
    }

    async fn unmap(
        &self,
        sector_offset: u64,
        sector_count: u64,
        _block_level_only: bool,
    ) -> Result<(), DiskError> {
        let file = self.file.clone();
        let file_offset = sector_offset << self.sector_shift;
        let length = sector_count << self.sector_shift;
        tracing::debug!(file = ?file, file_offset, length, "unmap_async");
        // The discard ioctl blocks, so run it off the async thread.
        match unblock(move || ioctl::discard(&file, file_offset, length)).await {
            Ok(()) => {}
            // Report out-of-range unmaps as IllegalBlock regardless of errno.
            Err(_) if sector_offset + sector_count > self.sector_count() => {
                return Err(DiskError::IllegalBlock);
            }
            Err(err) => return Err(self.map_io_error(err)),
        }
        Ok(())
    }

    fn unmap_behavior(&self) -> UnmapBehavior {
        // A zero granularity means the device reported no discard support.
        if self.optimal_unmap_sectors == 0 {
            UnmapBehavior::Ignored
        } else {
            UnmapBehavior::Unspecified
        }
    }

    fn optimal_unmap_sectors(&self) -> u32 {
        self.optimal_unmap_sectors
    }
}
723
// Persistent reservation support. Only reachable for NVMe devices, since
// `pr()` returns None unless `supports_pr` was set (NVMe only); the asserts
// below enforce that invariant. All ioctls block, so they run via `unblock`.
#[async_trait::async_trait]
impl PersistentReservation for BlockDevice {
    fn capabilities(&self) -> ReservationCapabilities {
        match &self.device_type {
            &DeviceType::NVMe { rescap, .. } => {
                nvme_common::from_nvme_reservation_capabilities(rescap)
            }
            // Unreachable: pr() only exposes this trait when supports_pr is
            // set, which happens only for NVMe devices.
            DeviceType::File { .. } | DeviceType::UnknownBlock => unreachable!(),
        }
    }

    async fn report(&self) -> Result<ReservationReport, DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        self.nvme_persistent_reservation_report()
            .await
            .map_err(|err| self.map_io_error(err))
    }

    async fn register(
        &self,
        current_key: Option<u64>,
        new_key: u64,
        ptpl: Option<bool>,
    ) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));

        // The Linux kernel interface to register does not allow ptpl to be
        // configured. We could manually issue an NVMe command, but this code
        // path is not really used anyway.
        if ptpl == Some(false) {
            tracing::warn!("ignoring guest request to disable persist through power loss");
        }

        let file = self.file.clone();
        unblock(move || {
            ioctl::pr_register(
                &file,
                // With no current key, register unconditionally by ignoring
                // the existing key.
                current_key.unwrap_or(0),
                new_key,
                if current_key.is_none() {
                    ioctl::PR_FL_IGNORE_KEY
                } else {
                    0
                },
            )
        })
        .await
        .and_then(check_nvme_status)
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    async fn reserve(&self, key: u64, reservation_type: ReservationType) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_reserve(&file, reservation_type, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    async fn release(&self, key: u64, reservation_type: ReservationType) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_release(&file, reservation_type, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    async fn clear(&self, key: u64) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || ioctl::pr_clear(&file, key))
            .await
            .and_then(check_nvme_status)
            .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }

    async fn preempt(
        &self,
        current_key: u64,
        preempt_key: u64,
        reservation_type: ReservationType,
        abort: bool,
    ) -> Result<(), DiskError> {
        assert!(matches!(self.device_type, DeviceType::NVMe { .. }));
        let file = self.file.clone();
        unblock(move || {
            ioctl::pr_preempt(&file, reservation_type, current_key, preempt_key, abort)
        })
        .await
        .and_then(check_nvme_status)
        .map_err(|err| self.map_io_error(err))?;
        Ok(())
    }
}
824
#[cfg(test)]
mod tests {
    use super::*;
    use futures::executor::block_on;
    use guestmem::GuestMemory;
    use hvdef::HV_PAGE_SIZE;
    use hvdef::HV_PAGE_SIZE_USIZE;
    use once_cell::sync::OnceCell;
    use pal_async::async_test;
    use pal_uring::IoUringPool;
    use scsi_buffers::OwnedRequestBuffers;

    /// Returns whether the running kernel is a version known to hang these
    /// tests.
    fn is_buggy_kernel() -> bool {
        // 5.13 kernels seem to have a bug with io_uring where tests hang.
        let output = String::from_utf8(
            std::process::Command::new("uname")
                .arg("-r")
                .output()
                .unwrap()
                .stdout,
        )
        .unwrap();

        output.contains("5.13")
    }

    /// Builds a [`BlockDevice`] over a 64 KiB temp file, sharing a single
    /// process-wide io_uring pool across tests.
    fn new_block_device() -> Result<BlockDevice, NewDeviceError> {
        // TODO: switch to std::sync::OnceLock once `get_or_try_init` is stable
        static POOL: OnceCell<Arc<IoInitiator>> = OnceCell::new();

        let initiator = POOL
            .get_or_try_init(|| {
                let pool = IoUringPool::new("test", 16)?;
                let initiator = pool.client().initiator().clone();
                // Drive the pool on a background thread for the rest of the
                // process lifetime.
                std::thread::spawn(|| pool.run());
                Ok(Arc::new(initiator))
            })
            .map_err(|err| NewDeviceError::IoctlError(DiskError::Io(err)))?;

        let bounce_buffer_tracker = Arc::new(BounceBufferTracker::new(
            2048,
            affinity::num_procs() as usize,
        ));

        let test_file = tempfile::tempfile().unwrap();
        test_file.set_len(1024 * 64).unwrap();
        block_on(BlockDevice::new(
            test_file.try_clone().unwrap(),
            false,
            initiator.clone(),
            None,
            bounce_buffer_tracker,
            false,
        ))
    }

    /// Creates a test device, or returns early (skipping the test) when the
    /// environment cannot run io_uring.
    macro_rules! get_block_device_or_skip {
        () => {
            match new_block_device() {
                Ok(pool) => {
                    if is_buggy_kernel() {
                        println!("Test case skipped (buggy kernel version)");
                        return;
                    }

                    pool
                }
                Err(NewDeviceError::IoctlError(DiskError::Io(err)))
                    if err.raw_os_error() == Some(libc::ENOSYS) =>
                {
                    println!("Test case skipped (no IO-Uring support)");
                    return;
                }
                Err(err) => panic!("{}", err),
            }
        };
    }

    /// Writes a page-aligned pattern, reads it back into different pages, and
    /// verifies the round trip.
    async fn run_async_disk_io(fua: bool) {
        let disk = get_block_device_or_skip!();

        let test_guest_mem = GuestMemory::allocate(0x8000);
        test_guest_mem
            .write_at(0, &(0..0x8000).map(|x| x as u8).collect::<Vec<_>>())
            .unwrap();

        // Write pages 3,2,1,0 so the on-disk order differs from memory order.
        let write_buffers = OwnedRequestBuffers::new(&[3, 2, 1, 0]);
        disk.write_vectored(&write_buffers.buffer(&test_guest_mem), 0, fua)
            .await
            .unwrap();

        if !fua {
            disk.sync_cache().await.unwrap();
        }

        let read_buffers = OwnedRequestBuffers::new(&[7, 6, 5, 4]);
        disk.read_vectored(&read_buffers.buffer(&test_guest_mem), 0)
            .await
            .unwrap();

        let mut source = vec![0u8; 4 * HV_PAGE_SIZE_USIZE];
        test_guest_mem.read_at(0, &mut source).unwrap();
        let mut target = vec![0u8; 4 * HV_PAGE_SIZE_USIZE];
        test_guest_mem
            .read_at(4 * HV_PAGE_SIZE, &mut target)
            .unwrap();
        assert_eq!(source, target);
    }

    #[async_test]
    async fn test_async_disk_io() {
        run_async_disk_io(false).await;
    }

    #[async_test]
    async fn test_async_disk_io_fua() {
        run_async_disk_io(true).await;
    }

    /// Same round trip as [`run_async_disk_io`], but with buffers offset by
    /// 512 bytes to exercise the bounce buffer path.
    async fn run_async_disk_io_unaligned(fua: bool) {
        let disk = get_block_device_or_skip!();

        let test_guest_mem = GuestMemory::allocate(0x8000);
        test_guest_mem
            .write_at(0, &(0..0x8000).map(|x| x as u8).collect::<Vec<_>>())
            .unwrap();

        let write_buffers =
            OwnedRequestBuffers::new_unaligned(&[0, 1, 2, 3], 512, 3 * HV_PAGE_SIZE_USIZE);

        disk.write_vectored(&write_buffers.buffer(&test_guest_mem), 0, fua)
            .await
            .unwrap();

        if !fua {
            disk.sync_cache().await.unwrap();
        }

        let read_buffers =
            OwnedRequestBuffers::new_unaligned(&[4, 5, 6, 7], 512, 3 * HV_PAGE_SIZE_USIZE);
        disk.read_vectored(&read_buffers.buffer(&test_guest_mem), 0)
            .await
            .unwrap();

        let mut source = vec![0u8; 3 * HV_PAGE_SIZE_USIZE];
        test_guest_mem.read_at(512, &mut source).unwrap();
        let mut target = vec![0u8; 3 * HV_PAGE_SIZE_USIZE];
        test_guest_mem
            .read_at(4 * HV_PAGE_SIZE + 512, &mut target)
            .unwrap();
        assert_eq!(source, target);
    }

    #[async_test]
    async fn test_async_disk_io_unaligned() {
        run_async_disk_io_unaligned(false).await;
    }

    #[async_test]
    async fn test_async_disk_io_unaligned_fua() {
        run_async_disk_io_unaligned(true).await;
    }

    /// Verifies that a write far past the end of the device reports
    /// [`DiskError::IllegalBlock`].
    #[async_test]
    async fn test_illegal_lba() {
        let disk = get_block_device_or_skip!();
        let gm = GuestMemory::allocate(512);
        match disk
            .write_vectored(
                &OwnedRequestBuffers::linear(0, 512, true).buffer(&gm),
                i64::MAX as u64 / 512,
                false,
            )
            .await
        {
            Err(DiskError::IllegalBlock) => {}
            r => panic!("unexpected result: {:?}", r),
        }
    }
}