disk_backend/lib.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The shared disk backend abstraction for OpenVMM storage.
5//!
6//! This crate defines [`Disk`] and the [`DiskIo`] trait, the central
7//! interface between storage frontends (NVMe, SCSI/StorVSP, IDE) and disk
8//! backends (host files, block devices, remote blobs, and more).
9//!
10//! # Architecture
11//!
12//! Every disk backend implements [`DiskIo`]. Frontends don't interact with
13//! backends directly — they hold a [`Disk`], which wraps a type-erased
14//! backend (`DynDisk`, an adapter around [`DiskIo`] that normalizes return
15//! futures) behind an `Arc` for cheap, concurrent cloning. The `Disk`
16//! wrapper caches immutable metadata (sector size, physical sector size,
17//! disk ID, FUA support) at construction time and validates that sector
18//! sizes are powers of two and at least 512 bytes.
19//!
20//! # I/O model
21//!
22//! All I/O is **async** and uses **scatter-gather** buffers via
23//! [`RequestBuffers`]. Callers must pass
24//! buffers that are an integral number of sectors.
25//!
26//! The key operations are:
27//!
28//! - [`DiskIo::read_vectored`] / [`DiskIo::write_vectored`] — async
29//! scatter-gather read and write. The `fua` parameter on writes requests
30//! Force Unit Access (write-through to stable storage). Whether FUA is
31//! actually respected depends on the backend — check
32//! [`DiskIo::is_fua_respected`].
33//! - [`DiskIo::sync_cache`] — flush (equivalent to SCSI SYNCHRONIZE CACHE
34//! or NVMe FLUSH).
35//! - [`DiskIo::unmap`] — trim / deallocate sectors. The
36//! [`DiskIo::unmap_behavior`] method reports whether unmapped sectors
37//! become zero, become indeterminate, or whether unmap is ignored
38//! entirely.
39//! - [`DiskIo::eject`] — eject media (optical drives only). The default
40//! returns [`DiskError::UnsupportedEject`]. Eject is a media state change
41//! managed by the SCSI DVD layer, not by the backend.
42//! - [`DiskIo::wait_resize`] — block until the disk's sector count changes.
43//! The default returns [`std::future::pending()`], meaning the backend
44//! never signals a resize. Only backends that can detect runtime capacity
45//! changes (e.g., `BlockDeviceDisk` via Linux uevent, `NvmeDisk` via AEN)
46//! should override this. Decorators and layered disks delegate to the
47//! inner backend.
48//!
49//! # Error model
50//!
51//! All I/O methods return [`DiskError`], which frontends translate into
52//! protocol-specific errors (NVMe status codes, SCSI sense keys). The
53//! variants cover out-of-range LBAs, I/O errors, medium errors with
54//! sub-classification, guest memory access failures, read-only violations,
55//! persistent reservation conflicts, and unsupported eject.
56//!
57//! # Available backends
58//!
59//! | Backend | Crate | Description |
60//! |---------|-------|-------------|
61//! | `FileDisk` | `disk_file` | Host file, cross-platform |
62//! | `Vhd1Disk` | `disk_vhd1` | VHD1 fixed format |
63//! | `VhdmpDisk` | `disk_vhdmp` | Windows vhdmp driver |
64//! | `BlobDisk` | `disk_blob` | Read-only HTTP / Azure Blob |
65//! | `BlockDeviceDisk` | `disk_blockdevice` | Linux block device (io_uring) |
66//! | `NvmeDisk` | `disk_nvme` | Physical NVMe (user-mode driver) |
67//! | `StripedDisk` | `disk_striped` | Striped across multiple disks |
68//! | `CryptDisk` | `disk_crypt` | XTS-AES-256 encryption wrapper |
69//! | `DelayDisk` | `disk_delay` | Injected I/O latency wrapper |
70//! | `DiskWithReservations` | `disk_prwrap` | In-memory PR emulation wrapper |
71//! | `LayeredDisk` | `disk_layered` | Layered disk with per-sector presence |
72
73#![forbid(unsafe_code)]
74
75pub mod pr;
76pub mod resolve;
77pub mod sync_wrapper;
78
79use guestmem::AccessError;
80use inspect::Inspect;
81use scsi_buffers::RequestBuffers;
82use stackfuture::StackFuture;
83use std::fmt::Debug;
84use std::future::Future;
85use std::future::ready;
86use std::pin::Pin;
87use std::sync::Arc;
88use thiserror::Error;
89
90/// A disk operation error.
91#[derive(Debug, Error)]
92pub enum DiskError {
93 /// The request failed due to a preempt and abort status.
94 #[error("aborted command")]
95 AbortDueToPreemptAndAbort,
96 /// The LBA was out of range.
97 #[error("illegal request")]
98 IllegalBlock,
99 /// The request failed due to invalid input.
100 #[error("invalid input")]
101 InvalidInput,
102 /// The request failed due to an unrecovered IO error.
103 #[error("io error")]
104 Io(#[source] std::io::Error),
105 /// The request failed due to a reportable medium error.
106 #[error("medium error")]
107 MediumError(#[source] std::io::Error, MediumErrorDetails),
108 /// The request failed due to a failure to access the specified buffers.
109 #[error("failed to access guest memory")]
110 MemoryAccess(#[from] AccessError),
111 /// The request failed because the disk is read-only.
112 #[error("attempt to write to read-only disk/range")]
113 ReadOnly,
114 /// The request failed due to a persistent reservation conflict.
115 #[error("reservation conflict")]
116 ReservationConflict,
117 /// The request failed because eject is not supported.
118 #[error("unsupported eject")]
119 UnsupportedEject,
120}
121
/// Sub-classification of a medium failure, carried by
/// [`DiskError::MediumError`].
#[derive(Debug)]
pub enum MediumErrorDetails {
    /// The application tag check failed.
    ApplicationTagCheckFailed,
    /// The guard check failed.
    GuardCheckFailed,
    /// The reference tag check failed.
    ReferenceTagCheckFailed,
    /// A read error could not be recovered.
    UnrecoveredReadError,
    /// A write fault occurred.
    WriteFault,
}
136
137/// Disk metadata and IO operations.
138pub trait DiskIo: 'static + Send + Sync + Inspect {
139 /// Returns the disk type name as a string.
140 ///
141 /// This is used for diagnostic purposes.
142 fn disk_type(&self) -> &str;
143
144 /// Returns the current sector count.
145 ///
146 /// For some backing stores, this may change at runtime. If it does, then
147 /// the backing store must also implement [`DiskIo::wait_resize`].
148 fn sector_count(&self) -> u64;
149
150 /// Returns the logical sector size of the backing store.
151 ///
152 /// This must not change at runtime.
153 fn sector_size(&self) -> u32;
154
155 /// Optionally returns a 16-byte identifier for the disk, if there is a
156 /// natural one for this backing store.
157 ///
158 /// This may be exposed to the guest as a unique disk identifier.
159 /// This must not change at runtime.
160 fn disk_id(&self) -> Option<[u8; 16]>;
161
162 /// Returns the physical sector size of the backing store.
163 ///
164 /// This must not change at runtime.
165 fn physical_sector_size(&self) -> u32;
166
167 /// Returns true if the `fua` parameter to [`DiskIo::write_vectored`] is
168 /// respected by the backing store by ensuring that the IO is immediately
169 /// committed to disk.
170 fn is_fua_respected(&self) -> bool;
171
172 /// Returns true if the disk is read only.
173 fn is_read_only(&self) -> bool;
174
175 /// Unmap sectors from the layer.
176 fn unmap(
177 &self,
178 sector: u64,
179 count: u64,
180 block_level_only: bool,
181 ) -> impl Future<Output = Result<(), DiskError>> + Send;
182
183 /// Returns the behavior of the unmap operation.
184 ///
185 /// This tells callers what happens to the content of unmapped sectors:
186 ///
187 /// - [`UnmapBehavior::Zeroes`] — unmapped sectors read back as zero.
188 /// - [`UnmapBehavior::Unspecified`] — content may or may not change, and
189 /// not necessarily to zero.
190 /// - [`UnmapBehavior::Ignored`] — unmap is a no-op; content is unchanged.
191 fn unmap_behavior(&self) -> UnmapBehavior;
192
193 /// Returns the optimal granularity for unmaps, in sectors.
194 fn optimal_unmap_sectors(&self) -> u32 {
195 1
196 }
197
198 /// Optionally returns a trait object to issue persistent reservation
199 /// requests.
200 fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
201 None
202 }
203
204 /// Issues an asynchronous eject media operation to the disk.
205 ///
206 /// The default implementation returns [`DiskError::UnsupportedEject`].
207 /// Eject is primarily a media state change managed by the SCSI DVD layer
208 /// (`SimpleScsiDvd`), not by disk backends. Backends generally do not
209 /// need to override this.
210 fn eject(&self) -> impl Future<Output = Result<(), DiskError>> + Send {
211 ready(Err(DiskError::UnsupportedEject))
212 }
213
214 /// Issues an asynchronous read-scatter operation to the disk.
215 ///
216 /// # Arguments
217 /// * `buffers` - An object representing the data buffers into which the disk data will be transferred.
218 /// * `sector` - The logical sector at which the read operation starts.
219 fn read_vectored(
220 &self,
221 buffers: &RequestBuffers<'_>,
222 sector: u64,
223 ) -> impl Future<Output = Result<(), DiskError>> + Send;
224
225 /// Issues an asynchronous write-gather operation to the disk.
226 /// # Arguments
227 /// * `buffers` - An object representing the data buffers containing the data to transfer to the disk.
228 /// * `sector` - The logical sector at which the write operation starts.
229 /// * `fua` - A flag indicates if FUA (force unit access) is requested.
230 fn write_vectored(
231 &self,
232 buffers: &RequestBuffers<'_>,
233 sector: u64,
234 fua: bool,
235 ) -> impl Future<Output = Result<(), DiskError>> + Send;
236
237 /// Issues an asynchronous flush operation to the disk.
238 fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send;
239
240 /// Waits for the disk sector count to change from the specified value.
241 ///
242 /// Returns the new sector count once [`DiskIo::sector_count`] would return
243 /// a value different from `sector_count`. Frontends use this to detect
244 /// runtime capacity changes and notify the guest (NVMe via AEN, SCSI via
245 /// UNIT_ATTENTION).
246 ///
247 /// The default implementation returns [`std::future::pending()`], meaning
248 /// the disk never signals a resize. Only backends that can detect runtime
249 /// capacity changes should override this — for example, `BlockDeviceDisk`
250 /// (via Linux uevent) and `NvmeDisk` (via NVMe AEN). Decorator wrappers
251 /// and `LayeredDisk` should delegate to the inner disk.
252 fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
253 let _ = sector_count;
254 std::future::pending()
255 }
256}
257
258/// An asynchronous block device.
259///
260/// This type is cheap to clone, for sharing the disk among multiple concurrent
261/// users.
262#[derive(Inspect, Clone)]
263#[inspect(extra = "Self::inspect_extra")]
264pub struct Disk(#[inspect(flatten)] Arc<DiskInner>);
265
266impl Disk {
267 fn inspect_extra(&self, resp: &mut inspect::Response<'_>) {
268 resp.field("disk_type", self.0.disk.disk_type())
269 .field("sector_count", self.0.disk.sector_count())
270 .field("supports_pr", self.0.disk.pr().is_some());
271 }
272}
273
274impl Debug for Disk {
275 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
276 f.debug_tuple("Disk").finish()
277 }
278}
279
280#[derive(Inspect)]
281#[inspect(bound = "T: DynDisk")]
282struct DiskInner<T: ?Sized = dyn DynDisk> {
283 sector_size: u32,
284 sector_shift: u32,
285 physical_sector_size: u32,
286 disk_id: Option<[u8; 16]>,
287 is_fua_respected: bool,
288 is_read_only: bool,
289 unmap_behavior: UnmapBehavior,
290 optimal_unmap_sectors: u32,
291 disk: T,
292}
293
294/// Errors that can occur when creating a `Disk`.
295#[derive(Debug, Error)]
296pub enum InvalidDisk {
297 /// The sector size is invalid.
298 #[error("invalid sector size: {0}")]
299 InvalidSectorSize(u32),
300 /// The physical sector size is invalid.
301 #[error("invalid physical sector size: {0}")]
302 InvalidPhysicalSectorSize(u32),
303}
304
305impl Disk {
306 /// Returns a new disk wrapping the given backing object.
307 pub fn new(disk: impl 'static + DiskIo) -> Result<Self, InvalidDisk> {
308 // Cache the metadata locally to validate it and so that it can be
309 // accessed without needing to go through the trait object. This is more
310 // efficient and ensures the backing disk does not change these values
311 // during the lifetime of the disk.
312 let sector_size = disk.sector_size();
313 if !sector_size.is_power_of_two() || sector_size < 512 {
314 return Err(InvalidDisk::InvalidSectorSize(sector_size));
315 }
316 let physical_sector_size = disk.physical_sector_size();
317 if !physical_sector_size.is_power_of_two() || physical_sector_size < sector_size {
318 return Err(InvalidDisk::InvalidPhysicalSectorSize(physical_sector_size));
319 }
320 Ok(Self(Arc::new(DiskInner {
321 sector_size,
322 sector_shift: sector_size.trailing_zeros(),
323 physical_sector_size,
324 disk_id: disk.disk_id(),
325 is_fua_respected: disk.is_fua_respected(),
326 is_read_only: disk.is_read_only(),
327 optimal_unmap_sectors: disk.optimal_unmap_sectors(),
328 unmap_behavior: disk.unmap_behavior(),
329 disk,
330 })))
331 }
332
333 /// Returns the current sector count.
334 ///
335 /// For some backing stores, this may change at runtime. Use
336 /// [`wait_resize`](Self::wait_resize) to detect this change.
337 pub fn sector_count(&self) -> u64 {
338 self.0.disk.sector_count()
339 }
340
341 /// Returns the logical sector size of the backing store.
342 pub fn sector_size(&self) -> u32 {
343 self.0.sector_size
344 }
345
346 /// Returns log2 of the logical sector size of the backing store.
347 pub fn sector_shift(&self) -> u32 {
348 self.0.sector_shift
349 }
350
351 /// Optionally returns a 16-byte identifier for the disk, if there is a
352 /// natural one for this backing store.
353 ///
354 /// This may be exposed to the guest as a unique disk identifier.
355 pub fn disk_id(&self) -> Option<[u8; 16]> {
356 self.0.disk_id
357 }
358
359 /// Returns the physical sector size of the backing store.
360 pub fn physical_sector_size(&self) -> u32 {
361 self.0.physical_sector_size
362 }
363
364 /// Returns true if the `fua` parameter to
365 /// [`write_vectored`](Self::write_vectored) is respected by the backing
366 /// store by ensuring that the IO is immediately committed to disk.
367 pub fn is_fua_respected(&self) -> bool {
368 self.0.is_fua_respected
369 }
370
371 /// Returns true if the disk is read only.
372 pub fn is_read_only(&self) -> bool {
373 self.0.is_read_only
374 }
375
376 /// Unmap sectors from the disk.
377 pub fn unmap(
378 &self,
379 sector: u64,
380 count: u64,
381 block_level_only: bool,
382 ) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
383 self.0.disk.unmap(sector, count, block_level_only)
384 }
385
386 /// Returns the behavior of the unmap operation.
387 pub fn unmap_behavior(&self) -> UnmapBehavior {
388 self.0.unmap_behavior
389 }
390
391 /// Returns the optimal granularity for unmaps, in sectors.
392 pub fn optimal_unmap_sectors(&self) -> u32 {
393 self.0.optimal_unmap_sectors
394 }
395
396 /// Optionally returns a trait object to issue persistent reservation
397 /// requests.
398 pub fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
399 self.0.disk.pr()
400 }
401
402 /// Issues an asynchronous eject media operation to the disk.
403 pub fn eject(&self) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
404 self.0.disk.eject()
405 }
406
407 /// Issues an asynchronous read-scatter operation to the disk.
408 ///
409 /// # Arguments
410 ///
411 /// * `buffers` - An object representing the data buffers into which the disk data will be transferred.
412 /// * `sector` - The logical sector at which the read operation starts.
413 pub fn read_vectored<'a>(
414 &'a self,
415 buffers: &'a RequestBuffers<'_>,
416 sector: u64,
417 ) -> impl use<'a> + Future<Output = Result<(), DiskError>> + Send {
418 self.0.disk.read_vectored(buffers, sector)
419 }
420
421 /// Issues an asynchronous write-gather operation to the disk.
422 ///
423 /// # Arguments
424 ///
425 /// * `buffers` - An object representing the data buffers containing the data to transfer to the disk.
426 /// * `sector` - The logical sector at which the write operation starts.
427 /// * `fua` - A flag indicates if FUA (force unit access) is requested.
428 ///
429 /// # Panics
430 ///
431 /// The caller must pass a buffer with an integer number of sectors.
432 pub fn write_vectored<'a>(
433 &'a self,
434 buffers: &'a RequestBuffers<'_>,
435 sector: u64,
436 fua: bool,
437 ) -> impl use<'a> + Future<Output = Result<(), DiskError>> + Send {
438 self.0.disk.write_vectored(buffers, sector, fua)
439 }
440
441 /// Issues an asynchronous flush operation to the disk.
442 pub fn sync_cache(&self) -> impl use<'_> + Future<Output = Result<(), DiskError>> + Send {
443 self.0.disk.sync_cache()
444 }
445
446 /// Waits for the disk sector count to change from the specified value.
447 pub fn wait_resize(&self, sector_count: u64) -> impl use<'_> + Future<Output = u64> {
448 self.0.disk.wait_resize(sector_count)
449 }
450}
451
452/// The behavior of the [`DiskIo::unmap`] operation.
453///
454/// This describes what happens to the content of unmapped sectors. Frontends
455/// use this to report the correct behavior to the guest (e.g., SCSI
456/// `LBPRZ` bit or NVMe DLFEAT field).
457#[derive(Clone, Copy, Debug, PartialEq, Eq, Inspect)]
458pub enum UnmapBehavior {
459 /// Unmap may or may not change the content, and not necessarily to zero.
460 /// The guest cannot assume anything about the content of unmapped sectors.
461 Unspecified,
462 /// Unmaps are guaranteed to be ignored — the content is unchanged.
463 /// The disk reports that unmap is not supported.
464 Ignored,
465 /// Unmap will deterministically zero the content. The guest can rely on
466 /// reading back zeroes from unmapped sectors.
467 Zeroes,
468}
469
470/// The amount of space reserved for a DiskIo future
471///
472/// This was chosen by running `cargo test -p storvsp -- --no-capture` and looking at the required
473/// size that was given in the failure message
474const ASYNC_DISK_STACK_SIZE: usize = 1256;
475
476type IoFuture<'a> = StackFuture<'a, Result<(), DiskError>, { ASYNC_DISK_STACK_SIZE }>;
477
478trait DynDisk: Send + Sync + Inspect {
479 fn disk_type(&self) -> &str;
480 fn sector_count(&self) -> u64;
481
482 fn unmap(&self, sector_offset: u64, sector_count: u64, block_level_only: bool) -> IoFuture<'_>;
483
484 fn pr(&self) -> Option<&dyn pr::PersistentReservation>;
485 fn eject(&self) -> IoFuture<'_>;
486
487 fn read_vectored<'a>(&'a self, buffers: &'a RequestBuffers<'_>, sector: u64) -> IoFuture<'a>;
488
489 fn write_vectored<'a>(
490 &'a self,
491 buffers: &'a RequestBuffers<'_>,
492 sector: u64,
493 fua: bool,
494 ) -> IoFuture<'a>;
495
496 fn sync_cache(&self) -> IoFuture<'_>;
497
498 fn wait_resize<'a>(
499 &'a self,
500 sector_count: u64,
501 ) -> Pin<Box<dyn 'a + Send + Future<Output = u64>>> {
502 let _ = sector_count;
503 Box::pin(std::future::pending())
504 }
505}
506
507impl<T: DiskIo> DynDisk for T {
508 fn disk_type(&self) -> &str {
509 self.disk_type()
510 }
511
512 fn sector_count(&self) -> u64 {
513 self.sector_count()
514 }
515
516 fn unmap(
517 &self,
518 sector_offset: u64,
519 sector_count: u64,
520 block_level_only: bool,
521 ) -> StackFuture<'_, Result<(), DiskError>, { ASYNC_DISK_STACK_SIZE }> {
522 StackFuture::from_or_box(self.unmap(sector_offset, sector_count, block_level_only))
523 }
524
525 fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
526 self.pr()
527 }
528
529 fn eject(&self) -> IoFuture<'_> {
530 StackFuture::from_or_box(self.eject())
531 }
532
533 fn read_vectored<'a>(&'a self, buffers: &'a RequestBuffers<'_>, sector: u64) -> IoFuture<'a> {
534 StackFuture::from_or_box(self.read_vectored(buffers, sector))
535 }
536
537 fn write_vectored<'a>(
538 &'a self,
539 buffers: &'a RequestBuffers<'a>,
540 sector: u64,
541 fua: bool,
542 ) -> IoFuture<'a> {
543 StackFuture::from_or_box(self.write_vectored(buffers, sector, fua))
544 }
545
546 fn sync_cache(&self) -> IoFuture<'_> {
547 StackFuture::from_or_box(self.sync_cache())
548 }
549}