disk_layered/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! A layered disk implementation, [`LayeredDisk`].
//!
//! A layered disk is a disk composed of multiple layers. Each layer is a block
//! device made up of sectors, but with the added per-sector state of whether
//! the sector is present or not. When reading a sector, the layered disk will
//! read from the topmost layer that has the sector present. When writing, the
//! disk will write to the topmost layer.
//!
//! A layer can also have caching behavior. If a layer is configured to cache
//! reads, then sectors that are read from lower layers are written back to the
//! layer. If a layer is configured to write through, then writes are written to
//! the layer and the next layer. These can be useful to implement simple
//! persistent and non-persistent caches, primarily designed for lazily
//! populating local backing stores from remote sources.
//! Missing from this implementation are write-back caching and cache eviction,
//! which would be needed for caches that are smaller than the disk. These
//! require potentially complicated cache management policies and are probably
//! best implemented in a separate disk implementation.
//!
//! # Layer types
//!
//! Each layer implements [`LayerIo`], which is similar to [`DiskIo`]
//! but adds per-sector presence tracking via [`SectorMarker`]. Two concrete
//! layer implementations exist:
//!
//! - **`RamDiskLayer`** (`disklayer_ram`) — ephemeral, in-memory.
//! - **`SqliteDiskLayer`** (`disklayer_sqlite`) — persistent, file-backed
//!   (dev/test only).
//!
//! A full [`Disk`] can appear at the bottom of the stack
//! as a fully-present layer via `DiskLayer::from_disk`, which wraps it in
//! `DiskAsLayer` — a layer that marks all sectors as present on every read.
//!
//! # Construction and validation
//!
//! [`LayeredDisk::new`] validates the layer stack at construction time:
//!
//! - All layers must have matching sector sizes.
//! - Write-through layers must be contiguous from the top.
//! - The last layer must not be write-through.
//! - Layers used as read caches must support [`WriteNoOverwrite`].
//! - If the disk is writable, all layers in the write path must be writable.
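//!
//! # Example
//!
//! A minimal sketch (error handling elided) of stacking an ephemeral
//! read-cache layer over an existing disk; `ram_layer` stands in for any
//! [`LayerAttach`] implementation, such as the `disklayer_ram` crate's
//! `RamDiskLayer`:
//!
//! ```ignore
//! use disk_layered::{DiskLayer, LayerConfiguration, LayeredDisk};
//!
//! async fn layered(ram_layer: impl disk_layered::LayerAttach, base: disk_backend::Disk) {
//!     let layers = vec![
//!         // Top layer: receives all writes and caches reads from below.
//!         LayerConfiguration {
//!             layer: DiskLayer::new(ram_layer),
//!             write_through: false,
//!             read_cache: true,
//!         },
//!         // Bottom layer: a fully-present view of an existing disk.
//!         LayerConfiguration {
//!             layer: DiskLayer::from_disk(base),
//!             write_through: false,
//!             read_cache: false,
//!         },
//!     ];
//!     let disk = LayeredDisk::new(false, layers).await.unwrap();
//!     let _ = disk;
//! }
//! ```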

#![forbid(unsafe_code)]

mod bitmap;
pub mod resolve;
pub mod resolver;

pub use bitmap::SectorMarker;

use bitmap::Bitmap;
use disk_backend::Disk;
use disk_backend::DiskError;
use disk_backend::DiskIo;
use disk_backend::UnmapBehavior;
use guestmem::GuestMemory;
use guestmem::MemoryWrite;
use inspect::Inspect;
use scsi_buffers::OwnedRequestBuffers;
use scsi_buffers::RequestBuffers;
use std::convert::Infallible;
use std::future::Future;
use std::pin::Pin;
use thiserror::Error;

/// A disk composed of multiple layers.
#[derive(Inspect)]
pub struct LayeredDisk {
    #[inspect(iter_by_index)]
    layers: Vec<Layer>,
    read_only: bool,
    is_fua_respected: bool,
    sector_shift: u32,
    disk_id: Option<[u8; 16]>,
    physical_sector_size: u32,
    unmap_behavior: UnmapBehavior,
    optimal_unmap_sectors: u32,
}

#[derive(Inspect)]
struct Layer {
    backing: Box<dyn DynLayerIo>,
    visible_sector_count: u64,
    read_cache: bool,
    write_through: bool,
}

/// A single layer which can be attached to a [`LayeredDisk`].
pub struct DiskLayer(Box<dyn DynLayerAttach>);

impl DiskLayer {
    /// Creates a new layer from a backing store.
    pub fn new<T: LayerAttach>(backing: T) -> Self {
        Self(Box::new(backing))
    }

    /// Creates a layer from a disk. The resulting layer is always fully
    /// present.
    pub fn from_disk(disk: Disk) -> Self {
        Self::new(DiskAsLayer(disk))
    }
}

/// Metadata of a particular layer, collected from various [`LayerIo`] APIs.
#[derive(Clone)]
#[expect(missing_docs)] // self-explanatory names
pub struct DiskLayerMetadata {
    pub disk_id: Option<[u8; 16]>,
    pub sector_size: u32,
    pub sector_count: u64,
    pub physical_sector_size: u32,
    pub unmap_behavior: UnmapBehavior,
    pub optimal_unmap_sectors: u32,
    pub read_only: bool,
    pub can_read_cache: bool,
    pub is_fua_respected: bool,
}

// DEVNOTE: this is a transient object, used solely in LayeredDisk::new.
struct AttachedDiskLayer {
    backing: Box<dyn DynLayerIo>,
    meta: DiskLayerMetadata,
}

/// An error returned when creating a [`DiskLayer`].
#[derive(Debug, Error)]
pub enum InvalidLayer {
    /// Failed to attach the layer.
    #[error("failed to attach layer")]
    AttachFailed(#[source] anyhow::Error),
    /// Read caching was requested but is not supported.
    #[error("read caching was requested but is not supported")]
    ReadCacheNotSupported,
    /// The sector size is invalid.
    #[error("sector size {0} is invalid")]
    InvalidSectorSize(u32),
    /// The sector sizes of the layers do not match.
    #[error("mismatched sector size {found}, expected {expected}")]
    MismatchedSectorSize {
        /// The expected sector size.
        expected: u32,
        /// The sector size found in the layer.
        found: u32,
    },
    /// A write-through layer is preceded by a layer that is not write-through, or
    /// the last layer is write-through.
    #[error("nothing to write through")]
    UselessWriteThrough,
    /// Writing to the layered disk would require this layer to be writable.
    #[error("read only layer in a writable disk")]
    ReadOnly,
}

/// An error returned when creating a [`LayeredDisk`].
#[derive(Debug, Error)]
pub enum InvalidLayeredDisk {
    /// No layers were configured.
    #[error("no layers were configured")]
    NoLayers,
    /// An error occurred in a layer.
    #[error("invalid layer {0}")]
    Layer(usize, #[source] InvalidLayer),
}

/// A configuration for a layer in a [`LayeredDisk`].
pub struct LayerConfiguration<L = DiskLayer> {
    /// The backing store for the layer.
    pub layer: L,
    /// Writes are written both to this layer and the next one.
    pub write_through: bool,
    /// Reads that miss this layer are written back to this layer.
    pub read_cache: bool,
}

impl LayeredDisk {
    /// Creates a new layered disk from a list of layers.
    ///
    /// The layers must be ordered from top to bottom, with the top layer being
    /// the first in the list.
    pub async fn new(
        read_only: bool,
        layers: Vec<LayerConfiguration>,
    ) -> Result<Self, InvalidLayeredDisk> {
        if layers.is_empty() {
            return Err(InvalidLayeredDisk::NoLayers);
        }

        let mut attached_layers: Vec<LayerConfiguration<AttachedDiskLayer>> = {
            let mut attached_layers = Vec::new();

            // layers are attached to one another from the bottom-up, hence the need
            // to iterate in reverse.
            let mut lower_layer_metadata = None;
            for (
                i,
                LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                },
            ) in layers.into_iter().enumerate().rev()
            {
                let layer_error = |e| InvalidLayeredDisk::Layer(i, e);

                let layer = layer
                    .0
                    .attach(lower_layer_metadata.take())
                    .await
                    .map_err(|e| layer_error(InvalidLayer::AttachFailed(e)))?;

                let layer_meta = layer.meta.clone();

                attached_layers.push(LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                });

                // perform some layer validation prior to attaching subsequent layers
                if read_cache && !layer_meta.can_read_cache {
                    return Err(layer_error(InvalidLayer::ReadCacheNotSupported));
                }
                if !layer_meta.sector_size.is_power_of_two() {
                    return Err(layer_error(InvalidLayer::InvalidSectorSize(
                        layer_meta.sector_size,
                    )));
                }
                if layer_meta.sector_size != attached_layers[0].layer.meta.sector_size {
                    // FUTURE: consider supporting different sector sizes, within reason.
                    return Err(layer_error(InvalidLayer::MismatchedSectorSize {
                        expected: attached_layers[0].layer.meta.sector_size,
                        found: layer_meta.sector_size,
                    }));
                }

                lower_layer_metadata = Some(layer_meta);
            }

            attached_layers.reverse();
            attached_layers
        };

        // perform top-down validation of the layer-stack, collecting various
        // common properties of the stack along the way.
        let mut last_write_through = true;
        let mut is_fua_respected = true;
        let mut optimal_unmap_sectors = 1;
        let mut unmap_must_zero = false;
        let mut disk_id = None;
        let mut unmap_behavior = UnmapBehavior::Zeroes;
        for (
            i,
            &LayerConfiguration {
                ref layer,
                write_through,
                read_cache: _,
            },
        ) in attached_layers.iter().enumerate()
        {
            let layer_error = |e| InvalidLayeredDisk::Layer(i, e);

            if last_write_through {
                if layer.meta.read_only && !read_only {
                    return Err(layer_error(InvalidLayer::ReadOnly));
                }
                is_fua_respected &= layer.meta.is_fua_respected;
                // Merge the unmap behavior. If any affected layer ignores
                // unmap, then force the whole disk to. If all affected layers
                // zero the sectors, then report that the disk zeroes sectors.
                //
                // If there is at least one write-through layer, then unmap only
                // works if the unmap operation will produce the same result in
                // all the layers that are being written to. Otherwise, the
                // guest could see inconsistent disk contents when the write
                // through layer is removed.
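                //
                // For example, if a write-through layer ignores unmap while
                // the layer below zeroes on unmap, the merged behavior must
                // be `Ignored` so the two layers never diverge; only if every
                // affected layer zeroes can the disk report `Zeroes`.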
                unmap_must_zero |= write_through;
                unmap_behavior = match (unmap_behavior, layer.meta.unmap_behavior) {
                    (UnmapBehavior::Zeroes, UnmapBehavior::Zeroes) => UnmapBehavior::Zeroes,
                    _ if unmap_must_zero => UnmapBehavior::Ignored,
                    (UnmapBehavior::Ignored, _) => UnmapBehavior::Ignored,
                    (_, UnmapBehavior::Ignored) => UnmapBehavior::Ignored,
                    _ => UnmapBehavior::Unspecified,
                };
                optimal_unmap_sectors = optimal_unmap_sectors.max(layer.meta.optimal_unmap_sectors);
            } else if write_through {
                // The write-through layers must all come first.
                return Err(layer_error(InvalidLayer::UselessWriteThrough));
            }
            last_write_through = write_through;
            if disk_id.is_none() {
                disk_id = layer.meta.disk_id;
            }
        }

        if last_write_through {
            return Err(InvalidLayeredDisk::Layer(
                attached_layers.len() - 1,
                InvalidLayer::UselessWriteThrough,
            ));
        }

        let sector_size = attached_layers[0].layer.meta.sector_size;
        let physical_sector_size = attached_layers[0].layer.meta.physical_sector_size;

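        // Compute each layer's visible sector count: the minimum of its own
        // sector count and the sector counts of all layers above it. Sectors
        // beyond this count are logically zero to readers of the stack.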
        let mut last_sector_count = None;
        let sector_counts_rev = attached_layers
            .iter_mut()
            .rev()
            .map(|config| *last_sector_count.insert(config.layer.backing.sector_count()))
            .collect::<Vec<_>>();

        let mut visible_sector_count = !0;
        let layers = attached_layers
            .into_iter()
            .zip(sector_counts_rev.into_iter().rev())
            .map(|(config, sector_count)| {
                let LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                } = config;
                visible_sector_count = sector_count.min(visible_sector_count);
                Layer {
                    backing: layer.backing,
                    visible_sector_count,
                    read_cache,
                    write_through,
                }
            })
            .collect::<Vec<_>>();

        Ok(Self {
            is_fua_respected,
            read_only,
            sector_shift: sector_size.trailing_zeros(),
            disk_id,
            physical_sector_size,
            unmap_behavior,
            optimal_unmap_sectors,
            layers,
        })
    }
}

trait DynLayerIo: Send + Sync + Inspect {
    fn sector_count(&self) -> u64;

    fn read<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        bitmap: SectorMarker<'a>,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>>;

    fn write<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        fua: bool,
        no_overwrite: bool,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>>;

    fn sync_cache(&self) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>>;

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>>;

    fn wait_resize(&self, sector_count: u64) -> Pin<Box<dyn '_ + Future<Output = u64> + Send>>;
}

impl<T: LayerIo> DynLayerIo for T {
    fn sector_count(&self) -> u64 {
        self.sector_count()
    }

    fn read<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        bitmap: SectorMarker<'a>,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(async move { self.read(buffers, sector, bitmap).await })
    }

    fn write<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        fua: bool,
        no_overwrite: bool,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(async move {
            if no_overwrite {
                self.write_no_overwrite()
                    .unwrap()
                    .write_no_overwrite(buffers, sector)
                    .await
            } else {
                self.write(buffers, sector, fua).await
            }
        })
    }

    fn sync_cache(&self) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(self.sync_cache())
    }

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(self.unmap(sector, count, block_level_only, next_is_zero))
    }

    fn wait_resize(&self, sector_count: u64) -> Pin<Box<dyn '_ + Future<Output = u64> + Send>> {
        Box::pin(self.wait_resize(sector_count))
    }
}

trait DynLayerAttach: Send + Sync {
    fn attach(
        self: Box<Self>,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Pin<Box<dyn Future<Output = anyhow::Result<AttachedDiskLayer>> + Send>>;
}

impl<T: LayerAttach> DynLayerAttach for T {
    fn attach(
        self: Box<Self>,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Pin<Box<dyn Future<Output = anyhow::Result<AttachedDiskLayer>> + Send>> {
        Box::pin(async move {
            Ok({
                let backing = (*self)
                    .attach(lower_layer_metadata)
                    .await
                    .map_err(|e| anyhow::anyhow!(e.into()))?;
                let can_read_cache = backing.write_no_overwrite().is_some();
                AttachedDiskLayer {
                    meta: DiskLayerMetadata {
                        sector_count: LayerIo::sector_count(&backing),
                        disk_id: backing.disk_id(),
                        is_fua_respected: backing.is_fua_respected(),
                        sector_size: backing.sector_size(),
                        physical_sector_size: backing.physical_sector_size(),
                        unmap_behavior: backing.unmap_behavior(),
                        optimal_unmap_sectors: backing.optimal_unmap_sectors(),
                        read_only: backing.is_logically_read_only(),
                        can_read_cache,
                    },
                    backing: Box::new(backing),
                }
            })
        })
    }
}

/// Transition a layer from an unattached type-state into an attached
/// type-state, capable of performing [`LayerIo`].
///
/// Layers which do not require a type-state transition on-attach (e.g., those
/// which are pre-initialized with a fixed set of metadata) can simply implement
/// `LayerIo` directly, and leverage the blanket impl of `impl<T: LayerIo>
/// LayerAttach for T`, which simply returns `Self` during the state transition.
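///
/// A sketch of a custom attach implementation that sizes the layer to match
/// the layer below it (`MyLayerBuilder` and `MyLayer` are hypothetical):
///
/// ```ignore
/// impl LayerAttach for MyLayerBuilder {
///     type Error = std::convert::Infallible;
///     type Layer = MyLayer;
///
///     async fn attach(
///         self,
///         lower_layer_metadata: Option<DiskLayerMetadata>,
///     ) -> Result<MyLayer, Self::Error> {
///         // Inherit the sector count from the layer below, if any;
///         // otherwise fall back to this builder's configured size.
///         let sector_count = lower_layer_metadata
///             .map_or(self.sector_count, |m| m.sector_count);
///         Ok(MyLayer::new(sector_count))
///     }
/// }
/// ```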
pub trait LayerAttach: 'static + Send + Sync {
    /// Error returned on attach failure.
    type Error: Into<Box<dyn std::error::Error + Send + Sync + 'static>>;
    /// Object implementing [`LayerIo`] after being attached.
    type Layer: LayerIo;

    /// Invoked when the layer is being attached to a layer stack.
    ///
    /// If the layer is being attached on top of an existing layer,
    /// `lower_layer_metadata` can be used to initialize and/or reconfigure the
    /// layer using the properties of the layer it is being stacked on top of.
    fn attach(
        self,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> impl Future<Output = Result<Self::Layer, Self::Error>> + Send;
}

impl<T: LayerIo> LayerAttach for T {
    type Error = Infallible;
    type Layer = Self;
    async fn attach(
        self,
        _lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Result<Self, Infallible> {
        Ok(self)
    }
}

/// Metadata and IO for disk layers.
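///
/// A minimal sketch of a hypothetical layer that is fully present and reads
/// as zeros (`ZeroLayer` is illustrative, not part of this crate; methods
/// with default implementations are omitted):
///
/// ```ignore
/// #[derive(Inspect)]
/// #[inspect(skip)]
/// struct ZeroLayer {
///     sector_count: u64,
/// }
///
/// impl LayerIo for ZeroLayer {
///     fn layer_type(&self) -> &str {
///         "zero"
///     }
///     fn sector_count(&self) -> u64 {
///         self.sector_count
///     }
///     fn sector_size(&self) -> u32 {
///         512
///     }
///     fn disk_id(&self) -> Option<[u8; 16]> {
///         None
///     }
///     fn physical_sector_size(&self) -> u32 {
///         512
///     }
///     fn is_fua_respected(&self) -> bool {
///         true
///     }
///     fn is_logically_read_only(&self) -> bool {
///         true
///     }
///     async fn sync_cache(&self) -> Result<(), DiskError> {
///         Ok(())
///     }
///     async fn read(
///         &self,
///         buffers: &RequestBuffers<'_>,
///         _sector: u64,
///         mut marker: SectorMarker<'_>,
///     ) -> Result<(), DiskError> {
///         // Zero the buffer and mark every sector present so that no
///         // lower layer is consulted.
///         buffers.writer().zero(buffers.len())?;
///         marker.set_all();
///         Ok(())
///     }
///     async fn write(
///         &self,
///         _buffers: &RequestBuffers<'_>,
///         _sector: u64,
///         _fua: bool,
///     ) -> Result<(), DiskError> {
///         // This layer is logically read-only, so the layered disk will
///         // not route writes to it.
///         unreachable!()
///     }
///     async fn unmap(
///         &self,
///         _sector: u64,
///         _count: u64,
///         _block_level_only: bool,
///         _next_is_zero: bool,
///     ) -> Result<(), DiskError> {
///         Ok(())
///     }
///     fn unmap_behavior(&self) -> UnmapBehavior {
///         UnmapBehavior::Zeroes
///     }
/// }
/// ```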
pub trait LayerIo: 'static + Send + Sync + Inspect {
    /// Returns the layer type name as a string.
    ///
    /// This is used for diagnostic purposes.
    fn layer_type(&self) -> &str;

    /// Returns the current sector count.
    ///
    /// For some backing stores, this may change at runtime. If it does, then
    /// the backing store must also implement [`LayerIo::wait_resize`].
    fn sector_count(&self) -> u64;

    /// Returns the logical sector size of the backing store.
    ///
    /// This must not change at runtime.
    fn sector_size(&self) -> u32;

    /// Optionally returns a 16-byte identifier for the disk, if there is a
    /// natural one for this backing store.
    ///
    /// This may be exposed to the guest as a unique disk identifier.
    /// This must not change at runtime.
    fn disk_id(&self) -> Option<[u8; 16]>;

    /// Returns the physical sector size of the backing store.
    ///
    /// This must not change at runtime.
    fn physical_sector_size(&self) -> u32;

    /// Returns true if the `fua` parameter to [`LayerIo::write`] is
    /// respected by the backing store by ensuring that the IO is immediately
    /// committed to disk.
    fn is_fua_respected(&self) -> bool;

    /// Returns true if the layer is logically read only.
    ///
    /// If this returns true, the layer might still be writable via
    /// `write_no_overwrite`, used to populate the layer as a read cache.
    fn is_logically_read_only(&self) -> bool;

    /// Issues an asynchronous flush operation to the disk.
    fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Reads sectors from the layer.
    ///
    /// `marker` is used to specify which sectors have been read. Those that are
    /// not read will be passed to the next layer, or zeroed if there are no
    /// more layers.
    fn read(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        marker: SectorMarker<'_>,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Writes sectors to the layer.
    ///
    /// # Panics
    ///
    /// The caller must pass a buffer with an integer number of sectors.
    fn write(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Unmap sectors from the layer.
    ///
    /// If `next_is_zero` is true, then the next layer's contents are known to
    /// be zero. A layer can use this information to just discard the sectors
    /// rather than putting them in the zero state (which may take more space).
    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Returns the behavior of the unmap operation.
    fn unmap_behavior(&self) -> UnmapBehavior;

    /// Returns the optimal granularity for unmaps, in sectors.
    fn optimal_unmap_sectors(&self) -> u32 {
        1
    }

    /// Optionally returns a write-no-overwrite implementation.
    fn write_no_overwrite(&self) -> Option<impl WriteNoOverwrite> {
        None::<NoIdet>
    }

    /// Waits for the disk sector count to be different from the specified
    /// value.
    fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
        let _ = sector_count;
        std::future::pending()
    }
}

enum NoIdet {}

/// Writes to the layer without overwriting existing data.
pub trait WriteNoOverwrite: Send + Sync {
    /// Write to the layer without overwriting existing data. Existing sectors
    /// must be preserved.
    ///
    /// This is used to support read caching, where the data being written may
    /// be stale by the time it is written back to the layer.
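    ///
    /// A sketch of the expected semantics over a hypothetical in-memory
    /// sector map (`sectors` and `sector_data` are illustrative):
    ///
    /// ```ignore
    /// for (sector, data) in sector_data {
    ///     // Existing sectors win: the incoming data may be stale.
    ///     sectors.entry(sector).or_insert(data);
    /// }
    /// ```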
    fn write_no_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;
}

impl<T: WriteNoOverwrite> WriteNoOverwrite for &T {
    fn write_no_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> impl Future<Output = Result<(), DiskError>> + Send {
        (*self).write_no_overwrite(buffers, sector)
    }
}

impl WriteNoOverwrite for NoIdet {
    async fn write_no_overwrite(
        &self,
        _buffers: &RequestBuffers<'_>,
        _sector: u64,
    ) -> Result<(), DiskError> {
        unreachable!()
    }
}

impl DiskIo for LayeredDisk {
    fn disk_type(&self) -> &str {
        "layered"
    }

    fn sector_count(&self) -> u64 {
        self.layers[0].backing.sector_count()
    }

    fn sector_size(&self) -> u32 {
        1 << self.sector_shift
    }

    fn disk_id(&self) -> Option<[u8; 16]> {
        self.disk_id
    }

    fn physical_sector_size(&self) -> u32 {
        self.physical_sector_size
    }

    fn is_fua_respected(&self) -> bool {
        self.is_fua_respected
    }

    fn is_read_only(&self) -> bool {
        self.read_only
    }

    async fn read_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
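        // The read proceeds in phases:
        // 1. Walk the layers top-down, reading whatever sectors each layer
        //    has present and tracking the rest in a bitmap.
        // 2. Zero any sectors that no layer provided.
        // 3. Write sectors that a read-cache layer missed back into that
        //    layer (using bounce buffers so the cached data is stable even
        //    if the guest modifies its buffer concurrently).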
        let mut bounce_buffers = None::<(OwnedRequestBuffers, GuestMemory)>;
        let sector_count = buffers.len() >> self.sector_shift;
        let mut bitmap = Bitmap::new(sector, sector_count);
        let mut bits_set = 0;
        let mut populate_cache = Vec::new();
        // FUTURE: queue the reads to the layers in parallel.
        'done: for (i, layer) in self.layers.iter().enumerate() {
            if bits_set == sector_count {
                break;
            }
            for mut range in bitmap.unset_iter() {
                let end = if i == 0 {
                    // The visible sector count of the first layer is unknown,
                    // since it could change at any time.
                    range.end_sector()
                } else {
                    // Restrict the range to the visible sector count of the
                    // layer; sectors beyond this are logically zero.
                    let end = range.end_sector().min(layer.visible_sector_count);
                    if range.start_sector() == end {
                        break 'done;
                    }
                    end
                };

                let sectors = end - range.start_sector();

                let this_buffers = if let Some((bounce_buffers, mem)) = &bounce_buffers {
                    &bounce_buffers.buffer(mem)
                } else {
                    buffers
                };
                let this_buffers = this_buffers.subrange(
                    range.start_sector_within_bitmap() << self.sector_shift,
                    (sectors as usize) << self.sector_shift,
                );

                layer
                    .backing
                    .read(&this_buffers, range.start_sector(), range.view(sectors))
                    .await?;

                bits_set += range.set_count();

                if range.set_count() as u64 != range.len() && layer.read_cache {
                    // Allocate bounce buffers to read into to ensure that we get a stable
                    // copy of the data to populate the cache.
                    bounce_buffers.get_or_insert_with(|| {
                        let mem = GuestMemory::allocate(buffers.len());
                        let owned_buf = OwnedRequestBuffers::linear(0, buffers.len(), true);
                        (owned_buf, mem)
                    });

                    populate_cache.extend(range.unset_iter().map(|range| (layer, range)));
                }
            }
        }
        if bits_set != sector_count {
            for range in bitmap.unset_iter() {
                let len = (range.len() as usize) << self.sector_shift;
                buffers
                    .subrange(range.start_sector_within_bitmap() << self.sector_shift, len)
                    .writer()
                    .zero(len)?;
            }
        }
        if !populate_cache.is_empty() {
            let (bounce_buffers, mem) = bounce_buffers.unwrap();
            let bounce_buffers = bounce_buffers.buffer(&mem);
            for &(layer, ref range) in &populate_cache {
                assert!(layer.read_cache);
                let offset = ((range.start - sector) as usize) << self.sector_shift;
                let len = ((range.end - range.start) as usize) << self.sector_shift;
                if let Err(err) = layer
                    .backing
                    .write(
                        &bounce_buffers.subrange(offset, len),
                        range.start,
                        false,
                        true,
                    )
                    .await
                {
                    tracelimit::warn_ratelimited!(
                        error = &err as &dyn std::error::Error,
                        sector = range.start,
                        count = range.end - range.start,
                        "failed to populate read cache",
                    );
                }
            }
            let mut mem = mem.into_inner_buf().ok().unwrap();
            for (_, range) in populate_cache {
                // Write this bounced range back to the original buffer. This
                // might be redundant in the presence of multiple cache layers,
                // but this is the simplest implementation.
                let offset = ((range.start - sector) as usize) << self.sector_shift;
                let len = ((range.end - range.start) as usize) << self.sector_shift;
                buffers
                    .subrange(offset, len)
                    .writer()
                    .write(&mem.as_bytes()[offset..][..len])?;
            }
        }
        Ok(())
    }

    async fn write_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        for layer in &self.layers {
            layer.backing.write(buffers, sector, fua, false).await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    async fn sync_cache(&self) -> Result<(), DiskError> {
        for layer in &self.layers {
            layer.backing.sync_cache().await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
        self.layers[0].backing.wait_resize(sector_count)
    }

    async fn unmap(
        &self,
        sector_offset: u64,
        sector_count: u64,
        block_level_only: bool,
    ) -> Result<(), DiskError> {
        if self.unmap_behavior == UnmapBehavior::Ignored {
            return Ok(());
        }

        for (layer, next_layer) in self
            .layers
            .iter()
            .zip(self.layers.iter().map(Some).skip(1).chain([None]))
        {
            let next_is_zero = if let Some(next_layer) = next_layer {
                // Sectors beyond the layer's visible sector count are logically
                // zero.
                //
                // FUTURE: consider splitting the unmap operation into multiple
                // operations across this boundary.
                sector_offset >= next_layer.visible_sector_count
            } else {
                true
            };

            layer
                .backing
                .unmap(sector_offset, sector_count, block_level_only, next_is_zero)
                .await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    fn unmap_behavior(&self) -> UnmapBehavior {
        self.unmap_behavior
    }

    fn optimal_unmap_sectors(&self) -> u32 {
        self.optimal_unmap_sectors
    }
}

/// A disk layer wrapping a full disk.
#[derive(Inspect)]
#[inspect(transparent)]
struct DiskAsLayer(Disk);

impl LayerIo for DiskAsLayer {
    fn layer_type(&self) -> &str {
        "disk"
    }

    fn sector_count(&self) -> u64 {
        self.0.sector_count()
    }

    fn sector_size(&self) -> u32 {
        self.0.sector_size()
    }

    fn disk_id(&self) -> Option<[u8; 16]> {
        self.0.disk_id()
    }

    fn physical_sector_size(&self) -> u32 {
        self.0.physical_sector_size()
    }

    fn is_fua_respected(&self) -> bool {
        self.0.is_fua_respected()
    }

    fn is_logically_read_only(&self) -> bool {
        self.0.is_read_only()
    }

    fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send {
        self.0.sync_cache()
    }

    async fn read(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        mut bitmap: SectorMarker<'_>,
    ) -> Result<(), DiskError> {
        // The disk is fully populated.
        bitmap.set_all();
        self.0.read_vectored(buffers, sector).await
    }

    async fn write(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        self.0.write_vectored(buffers, sector, fua).await
    }

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        _lower_is_zero: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send {
        self.0.unmap(sector, count, block_level_only)
    }

    fn unmap_behavior(&self) -> UnmapBehavior {
        self.0.unmap_behavior()
    }
}

#[cfg(test)]
mod tests {
    use crate::DiskLayer;
    use crate::LayerConfiguration;
    use crate::LayerIo;
    use crate::LayeredDisk;
    use crate::SectorMarker;
    use crate::WriteNoOverwrite;
    use disk_backend::DiskIo;
    use disk_backend::UnmapBehavior;
    use guestmem::GuestMemory;
    use guestmem::MemoryRead as _;
    use guestmem::MemoryWrite;
    use inspect::Inspect;
    use pal_async::async_test;
    use parking_lot::Mutex;
    use scsi_buffers::OwnedRequestBuffers;
    use std::collections::BTreeMap;
    use std::collections::btree_map::Entry;
    use std::sync::Arc;

    #[derive(Inspect)]
    #[inspect(skip)]
    struct TestLayer {
        sectors: Mutex<BTreeMap<u64, Data>>,
        sector_count: u64,
    }

    impl TestLayer {
        fn new(sector_count: u64) -> Self {
            Self {
                sectors: Mutex::new(BTreeMap::new()),
                sector_count,
            }
        }
    }

    struct Data(Box<[u8]>);

    impl LayerIo for Arc<TestLayer> {
        fn layer_type(&self) -> &str {
            "test"
        }

        fn sector_count(&self) -> u64 {
            self.sector_count
        }

        fn sector_size(&self) -> u32 {
            512
        }

        fn disk_id(&self) -> Option<[u8; 16]> {
            None
        }

        fn physical_sector_size(&self) -> u32 {
            512
        }

        fn is_fua_respected(&self) -> bool {
            false
        }

        fn is_logically_read_only(&self) -> bool {
            false
        }

        async fn sync_cache(&self) -> Result<(), disk_backend::DiskError> {
            Ok(())
        }

        async fn read(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
            mut marker: SectorMarker<'_>,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let Some(data) = sectors.get(&i) else {
                    continue;
                };
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .writer()
                    .write(&data.0)?;
                marker.set(i);
            }
            Ok(())
        }

        async fn write(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
            _fua: bool,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let mut sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                let mut data = Data(vec![0; self.sector_size() as usize].into());
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .reader()
                    .read(&mut data.0)?;
                sectors.insert(i, data);
            }
            Ok(())
        }

        async fn unmap(
            &self,
            sector: u64,
            count: u64,
            _block_level_only: bool,
            next_is_zero: bool,
        ) -> Result<(), disk_backend::DiskError> {
            if !next_is_zero {
                return Ok(());
            }
            let mut sectors = self.sectors.lock();
            let mut next_sector = sector;
            let end = sector + count;
            while next_sector < end {
                let Some((&sector, _)) = sectors.range_mut(next_sector..).next() else {
                    break;
                };
                if sector >= end {
                    break;
                }
                sectors.remove(&sector);
                next_sector = sector + 1;
            }
            Ok(())
        }

        fn unmap_behavior(&self) -> UnmapBehavior {
            UnmapBehavior::Unspecified
        }

        fn write_no_overwrite(&self) -> Option<impl WriteNoOverwrite> {
            Some(self)
        }
    }

    impl WriteNoOverwrite for Arc<TestLayer> {
        async fn write_no_overwrite(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let mut sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let Entry::Vacant(entry) = sectors.entry(i) else {
                    continue;
                };
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                let mut data = Data(vec![0; self.sector_size() as usize].into());
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .reader()
                    .read(&mut data.0)?;
                entry.insert(data);
            }
            Ok(())
        }
    }

    #[async_test]
    async fn test_read_cache() {
        const SIZE: u64 = 2048;
        let bottom = Arc::new(TestLayer::new(SIZE));
        let pattern = |i: u64| {
            let mut acc = (i + 1) * 3;
            Data(
                (0..512)
                    .map(|_| {
                        acc = acc.wrapping_mul(7);
                        acc as u8
                    })
                    .collect::<Vec<_>>()
                    .into(),
            )
        };
        bottom
            .sectors
            .lock()
            .extend((0..SIZE).map(|i| (i, pattern(i))));

        let cache = Arc::new(TestLayer::new(SIZE));
        let cache_cfg = LayerConfiguration {
            layer: DiskLayer::new(cache.clone()),
            read_cache: true,
            write_through: false,
        };
        let bottom_cfg = LayerConfiguration {
            layer: DiskLayer::new(bottom),
            read_cache: false,
            write_through: false,
        };
        let disk = LayeredDisk::new(false, vec![cache_cfg, bottom_cfg])
            .await
            .unwrap();

        let mut mem = GuestMemory::allocate(0x10000);
        let buffers = OwnedRequestBuffers::linear(0, 0x10000, true);

        for i in [0, 2, 4, 6, 8, 0, 2, 4, 6, 8] {
            disk.read_vectored(&buffers.buffer(&mem).subrange(0, 512), i)
                .await
                .unwrap();

            assert_eq!(mem.inner_buf_mut().unwrap()[..512], pattern(i).0[..]);
        }

        assert_eq!(cache.sectors.lock().len(), 5);

        mem.inner_buf_mut().unwrap().fill(0);

        disk.read_vectored(&buffers.buffer(&mem).subrange(0, 15 * 512), 1)
            .await
            .unwrap();

        assert_eq!(cache.sectors.lock().len(), 16);

        for i in 0..15 {
            assert_eq!(
                mem.inner_buf_mut().unwrap()[i as usize * 512..][..512],
                pattern(i + 1).0[..],
                "{i}"
            );
        }
    }
}