disk_backend/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The shared disk backend abstraction for OpenVMM storage.
5//!
6//! This crate defines [`Disk`] and the [`DiskIo`] trait, the central
7//! interface between storage frontends (NVMe, SCSI/StorVSP, IDE) and disk
8//! backends (host files, block devices, remote blobs, and more).
9//!
10//! # Architecture
11//!
12//! Every disk backend implements [`DiskIo`]. Frontends don't interact with
13//! backends directly — they hold a [`Disk`], which wraps a type-erased
14//! backend (`DynDisk`, an adapter around [`DiskIo`] that normalizes return
15//! futures) behind an `Arc` for cheap, concurrent cloning. The `Disk`
16//! wrapper caches immutable metadata (sector size, physical sector size,
17//! disk ID, FUA support) at construction time and validates that sector
18//! sizes are powers of two and at least 512 bytes.
19//!
20//! # I/O model
21//!
22//! All I/O is **async** and uses **scatter-gather** buffers via
23//! [`RequestBuffers`]. Callers must pass
24//! buffers that are an integral number of sectors.
25//!
26//! The key operations are:
27//!
28//! - [`DiskIo::read_vectored`] / [`DiskIo::write_vectored`] — async
29//!   scatter-gather read and write. The `fua` parameter on writes requests
30//!   Force Unit Access (write-through to stable storage). Whether FUA is
31//!   actually respected depends on the backend — check
32//!   [`DiskIo::is_fua_respected`].
33//! - [`DiskIo::sync_cache`] — flush (equivalent to SCSI SYNCHRONIZE CACHE
34//!   or NVMe FLUSH).
35//! - [`DiskIo::unmap`] — trim / deallocate sectors. The
36//!   [`DiskIo::unmap_behavior`] method reports whether unmapped sectors
37//!   become zero, become indeterminate, or whether unmap is ignored
38//!   entirely.
39//! - [`DiskIo::eject`] — eject media (optical drives only). The default
40//!   returns [`DiskError::UnsupportedEject`]. Eject is a media state change
41//!   managed by the SCSI DVD layer, not by the backend.
42//! - [`DiskIo::wait_resize`] — block until the disk's sector count changes.
43//!   The default returns [`std::future::pending()`], meaning the backend
44//!   never signals a resize. Only backends that can detect runtime capacity
45//!   changes (e.g., `BlockDeviceDisk` via Linux uevent, `NvmeDisk` via AEN)
46//!   should override this. Decorators and layered disks delegate to the
47//!   inner backend.
48//!
49//! # Error model
50//!
51//! All I/O methods return [`DiskError`], which frontends translate into
52//! protocol-specific errors (NVMe status codes, SCSI sense keys). The
53//! variants cover out-of-range LBAs, I/O errors, medium errors with
54//! sub-classification, guest memory access failures, read-only violations,
55//! persistent reservation conflicts, and unsupported eject.
56//!
57//! # Available backends
58//!
59//! | Backend | Crate | Description |
60//! |---------|-------|-------------|
61//! | `FileDisk` | `disk_file` | Host file, cross-platform |
62//! | `Vhd1Disk` | `disk_vhd1` | VHD1 fixed format |
63//! | `VhdmpDisk` | `disk_vhdmp` | Windows vhdmp driver |
64//! | `BlobDisk` | `disk_blob` | Read-only HTTP / Azure Blob |
65//! | `BlockDeviceDisk` | `disk_blockdevice` | Linux block device (io_uring) |
66//! | `NvmeDisk` | `disk_nvme` | Physical NVMe (user-mode driver) |
67//! | `StripedDisk` | `disk_striped` | Striped across multiple disks |
68//! | `CryptDisk` | `disk_crypt` | XTS-AES-256 encryption wrapper |
69//! | `DelayDisk` | `disk_delay` | Injected I/O latency wrapper |
70//! | `DiskWithReservations` | `disk_prwrap` | In-memory PR emulation wrapper |
71//! | `LayeredDisk` | `disk_layered` | Layered disk with per-sector presence |
72
73#![forbid(unsafe_code)]
74
75pub mod pr;
76pub mod resolve;
77pub mod sync_wrapper;
78
79use guestmem::AccessError;
80use inspect::Inspect;
81use scsi_buffers::RequestBuffers;
82use stackfuture::StackFuture;
83use std::fmt::Debug;
84use std::future::Future;
85use std::future::ready;
86use std::pin::Pin;
87use std::sync::Arc;
88use thiserror::Error;
89
90/// A disk operation error.
91#[derive(Debug, Error)]
92pub enum DiskError {
93    /// The request failed due to a preempt and abort status.
94    #[error("aborted command")]
95    AbortDueToPreemptAndAbort,
96    /// The LBA was out of range.
97    #[error("illegal request")]
98    IllegalBlock,
99    /// The request failed due to invalid input.
100    #[error("invalid input")]
101    InvalidInput,
102    /// The request failed due to an unrecovered IO error.
103    #[error("io error")]
104    Io(#[source] std::io::Error),
105    /// The request failed due to a reportable medium error.
106    #[error("medium error")]
107    MediumError(#[source] std::io::Error, MediumErrorDetails),
108    /// The request failed due to a failure to access the specified buffers.
109    #[error("failed to access guest memory")]
110    MemoryAccess(#[from] AccessError),
111    /// The request failed because the disk is read-only.
112    #[error("attempt to write to read-only disk/range")]
113    ReadOnly,
114    /// The request failed due to a persistent reservation conflict.
115    #[error("reservation conflict")]
116    ReservationConflict,
117    /// The request failed because eject is not supported.
118    #[error("unsupported eject")]
119    UnsupportedEject,
120}
121
/// Sub-classification attached to a [`DiskError::MediumError`].
#[derive(Debug)]
pub enum MediumErrorDetails {
    /// An application tag check failed on the medium.
    ApplicationTagCheckFailed,
    /// A guard check failed on the medium.
    GuardCheckFailed,
    /// A reference tag check failed on the medium.
    ReferenceTagCheckFailed,
    /// A read from the medium failed and could not be recovered.
    UnrecoveredReadError,
    /// A write to the medium faulted.
    WriteFault,
}
136
137/// Disk metadata and IO operations.
138pub trait DiskIo: 'static + Send + Sync + Inspect {
139    /// Returns the disk type name as a string.
140    ///
141    /// This is used for diagnostic purposes.
142    fn disk_type(&self) -> &str;
143
144    /// Returns the current sector count.
145    ///
146    /// For some backing stores, this may change at runtime. If it does, then
147    /// the backing store must also implement [`DiskIo::wait_resize`].
148    fn sector_count(&self) -> u64;
149
150    /// Returns the logical sector size of the backing store.
151    ///
152    /// This must not change at runtime.
153    fn sector_size(&self) -> u32;
154
155    /// Optionally returns a 16-byte identifier for the disk, if there is a
156    /// natural one for this backing store.
157    ///
158    /// This may be exposed to the guest as a unique disk identifier.
159    /// This must not change at runtime.
160    fn disk_id(&self) -> Option<[u8; 16]>;
161
162    /// Returns the physical sector size of the backing store.
163    ///
164    /// This must not change at runtime.
165    fn physical_sector_size(&self) -> u32;
166
167    /// Returns true if the `fua` parameter to [`DiskIo::write_vectored`] is
168    /// respected by the backing store by ensuring that the IO is immediately
169    /// committed to disk.
170    fn is_fua_respected(&self) -> bool;
171
172    /// Returns true if the disk is read only.
173    fn is_read_only(&self) -> bool;
174
175    /// Unmap sectors from the layer.
176    fn unmap(
177        &self,
178        sector: u64,
179        count: u64,
180        block_level_only: bool,
181    ) -> impl Future<Output = Result<(), DiskError>> + Send;
182
183    /// Returns the behavior of the unmap operation.
184    ///
185    /// This tells callers what happens to the content of unmapped sectors:
186    ///
187    /// - [`UnmapBehavior::Zeroes`] — unmapped sectors read back as zero.
188    /// - [`UnmapBehavior::Unspecified`] — content may or may not change, and
189    ///   not necessarily to zero.
190    /// - [`UnmapBehavior::Ignored`] — unmap is a no-op; content is unchanged.
191    fn unmap_behavior(&self) -> UnmapBehavior;
192
193    /// Returns the optimal granularity for unmaps, in sectors.
194    fn optimal_unmap_sectors(&self) -> u32 {
195        1
196    }
197
198    /// Optionally returns a trait object to issue persistent reservation
199    /// requests.
200    fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
201        None
202    }
203
204    /// Issues an asynchronous eject media operation to the disk.
205    ///
206    /// The default implementation returns [`DiskError::UnsupportedEject`].
207    /// Eject is primarily a media state change managed by the SCSI DVD layer
208    /// (`SimpleScsiDvd`), not by disk backends. Backends generally do not
209    /// need to override this.
210    fn eject(&self) -> impl Future<Output = Result<(), DiskError>> + Send {
211        ready(Err(DiskError::UnsupportedEject))
212    }
213
214    /// Issues an asynchronous read-scatter operation to the disk.
215    ///
216    /// # Arguments
217    /// * `buffers` - An object representing the data buffers into which the disk data will be transferred.
218    /// * `sector` - The logical sector at which the read operation starts.
219    fn read_vectored(
220        &self,
221        buffers: &RequestBuffers<'_>,
222        sector: u64,
223    ) -> impl Future<Output = Result<(), DiskError>> + Send;
224
225    /// Issues an asynchronous write-gather operation to the disk.
226    /// # Arguments
227    /// * `buffers` - An object representing the data buffers containing the data to transfer to the disk.
228    /// * `sector` - The logical sector at which the write operation starts.
229    /// * `fua` - A flag indicates if FUA (force unit access) is requested.
230    fn write_vectored(
231        &self,
232        buffers: &RequestBuffers<'_>,
233        sector: u64,
234        fua: bool,
235    ) -> impl Future<Output = Result<(), DiskError>> + Send;
236
237    /// Issues an asynchronous flush operation to the disk.
238    fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send;
239
240    /// Waits for the disk sector count to change from the specified value.
241    ///
242    /// Returns the new sector count once [`DiskIo::sector_count`] would return
243    /// a value different from `sector_count`. Frontends use this to detect
244    /// runtime capacity changes and notify the guest (NVMe via AEN, SCSI via
245    /// UNIT_ATTENTION).
246    ///
247    /// The default implementation returns [`std::future::pending()`], meaning
248    /// the disk never signals a resize. Only backends that can detect runtime
249    /// capacity changes should override this — for example, `BlockDeviceDisk`
250    /// (via Linux uevent) and `NvmeDisk` (via NVMe AEN). Decorator wrappers
251    /// and `LayeredDisk` should delegate to the inner disk.
252    fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
253        let _ = sector_count;
254        std::future::pending()
255    }
256}
257
258/// An asynchronous block device.
259///
260/// This type is cheap to clone, for sharing the disk among multiple concurrent
261/// users.
262#[derive(Inspect, Clone)]
263#[inspect(extra = "Self::inspect_extra")]
264pub struct Disk(#[inspect(flatten)] Arc<DiskInner>);
265
266impl Disk {
267    fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
268        resp.field("disk_type", self.0.disk.disk_type())
269            .field("sector_count", self.0.disk.sector_count())
270            .field("supports_pr", self.0.disk.pr().is_some());
271    }
272}
273
274impl Debug for Disk {
275    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
276        f.debug_tuple("Disk").finish()
277    }
278}
279
280#[derive(Inspect)]
281#[inspect(bound = "T: DynDisk")]
282struct DiskInner<T: ?Sized = dyn DynDisk> {
283    sector_size: u32,
284    sector_shift: u32,
285    physical_sector_size: u32,
286    disk_id: Option<[u8; 16]>,
287    is_fua_respected: bool,
288    is_read_only: bool,
289    unmap_behavior: UnmapBehavior,
290    optimal_unmap_sectors: u32,
291    disk: T,
292}
293
294/// Errors that can occur when creating a `Disk`.
295#[derive(Debug, Error)]
296pub enum InvalidDisk {
297    /// The sector size is invalid.
298    #[error("invalid sector size: {0}")]
299    InvalidSectorSize(u32),
300    /// The physical sector size is invalid.
301    #[error("invalid physical sector size: {0}")]
302    InvalidPhysicalSectorSize(u32),
303}
304
305impl Disk {
306    /// Returns a new disk wrapping the given backing object.
307    pub fn new(disk: impl 'static + DiskIo) -> Result<Self, InvalidDisk> {
308        // Cache the metadata locally to validate it and so that it can be
309        // accessed without needing to go through the trait object. This is more
310        // efficient and ensures the backing disk does not change these values
311        // during the lifetime of the disk.
312        let sector_size = disk.sector_size();
313        if !sector_size.is_power_of_two() || sector_size < 512 {
314            return Err(InvalidDisk::InvalidSectorSize(sector_size));
315        }
316        let physical_sector_size = disk.physical_sector_size();
317        if !physical_sector_size.is_power_of_two() || physical_sector_size < sector_size {
318            return Err(InvalidDisk::InvalidPhysicalSectorSize(physical_sector_size));
319        }
320        Ok(Self(Arc::new(DiskInner {
321            sector_size,
322            sector_shift: sector_size.trailing_zeros(),
323            physical_sector_size,
324            disk_id: disk.disk_id(),
325            is_fua_respected: disk.is_fua_respected(),
326            is_read_only: disk.is_read_only(),
327            optimal_unmap_sectors: disk.optimal_unmap_sectors(),
328            unmap_behavior: disk.unmap_behavior(),
329            disk,
330        })))
331    }
332
333    /// Returns the current sector count.
334    ///
335    /// For some backing stores, this may change at runtime. Use
336    /// [`wait_resize`](Self::wait_resize) to detect this change.
337    pub fn sector_count(&self) -> u64 {
338        self.0.disk.sector_count()
339    }
340
341    /// Returns the logical sector size of the backing store.
342    pub fn sector_size(&self) -> u32 {
343        self.0.sector_size
344    }
345
346    /// Returns log2 of the logical sector size of the backing store.
347    pub fn sector_shift(&self) -> u32 {
348        self.0.sector_shift
349    }
350
351    /// Optionally returns a 16-byte identifier for the disk, if there is a
352    /// natural one for this backing store.
353    ///
354    /// This may be exposed to the guest as a unique disk identifier.
355    pub fn disk_id(&self) -> Option<[u8; 16]> {
356        self.0.disk_id
357    }
358
359    /// Returns the physical sector size of the backing store.
360    pub fn physical_sector_size(&self) -> u32 {
361        self.0.physical_sector_size
362    }
363
364    /// Returns true if the `fua` parameter to
365    /// [`write_vectored`](Self::write_vectored) is respected by the backing
366    /// store by ensuring that the IO is immediately committed to disk.
367    pub fn is_fua_respected(&self) -> bool {
368        self.0.is_fua_respected
369    }
370
371    /// Returns true if the disk is read only.
372    pub fn is_read_only(&self) -> bool {
373        self.0.is_read_only
374    }
375
376    /// Unmap sectors from the disk.
377    pub fn unmap(
378        &self,
379        sector: u64,
380        count: u64,
381        block_level_only: bool,
382    ) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
383        self.0.disk.unmap(sector, count, block_level_only)
384    }
385
386    /// Returns the behavior of the unmap operation.
387    pub fn unmap_behavior(&self) -> UnmapBehavior {
388        self.0.unmap_behavior
389    }
390
391    /// Returns the optimal granularity for unmaps, in sectors.
392    pub fn optimal_unmap_sectors(&self) -> u32 {
393        self.0.optimal_unmap_sectors
394    }
395
396    /// Optionally returns a trait object to issue persistent reservation
397    /// requests.
398    pub fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
399        self.0.disk.pr()
400    }
401
402    /// Issues an asynchronous eject media operation to the disk.
403    pub fn eject(&self) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
404        self.0.disk.eject()
405    }
406
407    /// Issues an asynchronous read-scatter operation to the disk.
408    ///
409    /// # Arguments
410    ///
411    /// * `buffers` - An object representing the data buffers into which the disk data will be transferred.
412    /// * `sector` - The logical sector at which the read operation starts.
413    pub fn read_vectored<'a>(
414        &'a self,
415        buffers: &'a RequestBuffers<'_>,
416        sector: u64,
417    ) -> impl use<'a> + Future<Output = Result<(), DiskError>> + Send {
418        self.0.disk.read_vectored(buffers, sector)
419    }
420
421    /// Issues an asynchronous write-gather operation to the disk.
422    ///
423    /// # Arguments
424    ///
425    /// * `buffers` - An object representing the data buffers containing the data to transfer to the disk.
426    /// * `sector` - The logical sector at which the write operation starts.
427    /// * `fua` - A flag indicates if FUA (force unit access) is requested.
428    ///
429    /// # Panics
430    ///
431    /// The caller must pass a buffer with an integer number of sectors.
432    pub fn write_vectored<'a>(
433        &'a self,
434        buffers: &'a RequestBuffers<'_>,
435        sector: u64,
436        fua: bool,
437    ) -> impl use<'a> + Future<Output = Result<(), DiskError>> + Send {
438        self.0.disk.write_vectored(buffers, sector, fua)
439    }
440
441    /// Issues an asynchronous flush operation to the disk.
442    pub fn sync_cache(&self) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
443        self.0.disk.sync_cache()
444    }
445
446    /// Waits for the disk sector count to change from the specified value.
447    pub fn wait_resize(&self, sector_count: u64) -> impl use<'_> + Future<Output = u64> {
448        self.0.disk.wait_resize(sector_count)
449    }
450}
451
452/// The behavior of the [`DiskIo::unmap`] operation.
453///
454/// This describes what happens to the content of unmapped sectors. Frontends
455/// use this to report the correct behavior to the guest (e.g., SCSI
456/// `LBPRZ` bit or NVMe DLFEAT field).
457#[derive(Clone, Copy, Debug, PartialEq, Eq, Inspect)]
458pub enum UnmapBehavior {
459    /// Unmap may or may not change the content, and not necessarily to zero.
460    /// The guest cannot assume anything about the content of unmapped sectors.
461    Unspecified,
462    /// Unmaps are guaranteed to be ignored — the content is unchanged.
463    /// The disk reports that unmap is not supported.
464    Ignored,
465    /// Unmap will deterministically zero the content. The guest can rely on
466    /// reading back zeroes from unmapped sectors.
467    Zeroes,
468}
469
/// Number of bytes reserved inline for a `DiskIo` future.
///
/// The value was picked empirically: run
/// `cargo test -p storvsp -- --no-capture` and use the required size reported
/// in the failure message.
const ASYNC_DISK_STACK_SIZE: usize = 1256;
475
476type IoFuture<'a> = StackFuture<'a, Result<(), DiskError>, { ASYNC_DISK_STACK_SIZE }>;
477
478trait DynDisk: Send + Sync + Inspect {
479    fn disk_type(&self) -> &str;
480    fn sector_count(&self) -> u64;
481
482    fn unmap(&self, sector_offset: u64, sector_count: u64, block_level_only: bool) -> IoFuture<'_>;
483
484    fn pr(&self) -> Option<&dyn pr::PersistentReservation>;
485    fn eject(&self) -> IoFuture<'_>;
486
487    fn read_vectored<'a>(&'a self, buffers: &'a RequestBuffers<'_>, sector: u64) -> IoFuture<'a>;
488
489    fn write_vectored<'a>(
490        &'a self,
491        buffers: &'a RequestBuffers<'_>,
492        sector: u64,
493        fua: bool,
494    ) -> IoFuture<'a>;
495
496    fn sync_cache(&self) -> IoFuture<'_>;
497
498    fn wait_resize<'a>(
499        &'a self,
500        sector_count: u64,
501    ) -> Pin<Box<dyn 'a + Send + Future<Output = u64>>> {
502        let _ = sector_count;
503        Box::pin(std::future::pending())
504    }
505}
506
507impl<T: DiskIo> DynDisk for T {
508    fn disk_type(&self) -> &str {
509        self.disk_type()
510    }
511
512    fn sector_count(&self) -> u64 {
513        self.sector_count()
514    }
515
516    fn unmap(
517        &self,
518        sector_offset: u64,
519        sector_count: u64,
520        block_level_only: bool,
521    ) -> StackFuture<'_, Result<(), DiskError>, { ASYNC_DISK_STACK_SIZE }> {
522        StackFuture::from_or_box(self.unmap(sector_offset, sector_count, block_level_only))
523    }
524
525    fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
526        self.pr()
527    }
528
529    fn eject(&self) -> IoFuture<'_> {
530        StackFuture::from_or_box(self.eject())
531    }
532
533    fn read_vectored<'a>(&'a self, buffers: &'a RequestBuffers<'_>, sector: u64) -> IoFuture<'a> {
534        StackFuture::from_or_box(self.read_vectored(buffers, sector))
535    }
536
537    fn write_vectored<'a>(
538        &'a self,
539        buffers: &'a RequestBuffers<'a>,
540        sector: u64,
541        fua: bool,
542    ) -> IoFuture<'a> {
543        StackFuture::from_or_box(self.write_vectored(buffers, sector, fua))
544    }
545
546    fn sync_cache(&self) -> IoFuture<'_> {
547        StackFuture::from_or_box(self.sync_cache())
548    }
549}