disk_layered/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! A layered disk implementation, [`LayeredDisk`].
//!
//! A layered disk is a disk composed of multiple layers. Each layer is a block
//! device made up of sectors, but with the added per-sector state of whether
//! the sector is present or not. When reading a sector, the layered disk will
//! read from the topmost layer that has the sector present. When writing, the
//! disk will write to the topmost layer.
//!
//! A layer can also have caching behavior. If a layer is configured to cache
//! reads, then sectors that are read from lower layers are written back to the
//! layer. If a layer is configured to write through, then writes are written to
//! the layer and the next layer. These can be useful to implement simple
//! persistent and non-persistent caches, primarily designed for lazily
//! populating local backing stores from remote sources.
//!
//! Missing from this implementation is write-back caching and cache eviction,
//! which would be needed for caches that are smaller than the disk. These
//! require potentially complicated cache management policies and are probably
//! best implemented in a separate disk implementation.
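//!
//! # Example
//!
//! A minimal sketch of stacking two existing [`Disk`]s (the names `top` and
//! `bottom` are illustrative; both disks must share a logical sector size, and
//! `top` must be writable for the layered disk to be writable):
//!
//! ```no_run
//! # async fn example(top: disk_backend::Disk, bottom: disk_backend::Disk) {
//! use disk_layered::DiskLayer;
//! use disk_layered::LayerConfiguration;
//! use disk_layered::LayeredDisk;
//!
//! let disk = LayeredDisk::new(
//!     false, // read_only
//!     vec![
//!         // Topmost layer: receives all writes and is read first.
//!         LayerConfiguration {
//!             layer: DiskLayer::from_disk(top),
//!             write_through: false,
//!             read_cache: false,
//!         },
//!         // Bottom layer: read for sectors not present in the top layer.
//!         LayerConfiguration {
//!             layer: DiskLayer::from_disk(bottom),
//!             write_through: false,
//!             read_cache: false,
//!         },
//!     ],
//! )
//! .await
//! .unwrap();
//! # let _ = disk;
//! # }
//! ```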

#![forbid(unsafe_code)]

mod bitmap;
pub mod resolve;
pub mod resolver;

pub use bitmap::SectorMarker;

use bitmap::Bitmap;
use disk_backend::Disk;
use disk_backend::DiskError;
use disk_backend::DiskIo;
use disk_backend::UnmapBehavior;
use guestmem::GuestMemory;
use guestmem::MemoryWrite;
use inspect::Inspect;
use scsi_buffers::OwnedRequestBuffers;
use scsi_buffers::RequestBuffers;
use std::convert::Infallible;
use std::future::Future;
use std::pin::Pin;
use thiserror::Error;

/// A disk composed of multiple layers.
#[derive(Inspect)]
pub struct LayeredDisk {
    #[inspect(iter_by_index)]
    layers: Vec<Layer>,
    read_only: bool,
    is_fua_respected: bool,
    sector_shift: u32,
    disk_id: Option<[u8; 16]>,
    physical_sector_size: u32,
    unmap_behavior: UnmapBehavior,
    optimal_unmap_sectors: u32,
}

#[derive(Inspect)]
struct Layer {
    backing: Box<dyn DynLayerIo>,
    visible_sector_count: u64,
    read_cache: bool,
    write_through: bool,
}

/// A single layer which can be attached to a [`LayeredDisk`].
pub struct DiskLayer(Box<dyn DynLayerAttach>);

impl DiskLayer {
    /// Creates a new layer from a backing store.
    pub fn new<T: LayerAttach>(backing: T) -> Self {
        Self(Box::new(backing))
    }

    /// Creates a layer from a disk. The resulting layer is always fully
    /// present.
    pub fn from_disk(disk: Disk) -> Self {
        Self::new(DiskAsLayer(disk))
    }
}

/// Metadata of a particular layer, collected from various [`LayerIo`] APIs.
#[derive(Clone)]
#[expect(missing_docs)] // self-explanatory names
pub struct DiskLayerMetadata {
    pub disk_id: Option<[u8; 16]>,
    pub sector_size: u32,
    pub sector_count: u64,
    pub physical_sector_size: u32,
    pub unmap_behavior: UnmapBehavior,
    pub optimal_unmap_sectors: u32,
    pub read_only: bool,
    pub can_read_cache: bool,
    pub is_fua_respected: bool,
}

// DEVNOTE: this is a transient object, used solely in LayeredDisk::new.
struct AttachedDiskLayer {
    backing: Box<dyn DynLayerIo>,
    meta: DiskLayerMetadata,
}

/// An error returned when creating a [`DiskLayer`].
#[derive(Debug, Error)]
pub enum InvalidLayer {
    /// Failed to attach the layer.
    #[error("failed to attach layer")]
    AttachFailed(#[source] anyhow::Error),
    /// Read caching was requested but is not supported.
    #[error("read caching was requested but is not supported")]
    ReadCacheNotSupported,
    /// The sector size is invalid.
    #[error("sector size {0} is invalid")]
    InvalidSectorSize(u32),
    /// The sector sizes of the layers do not match.
    #[error("mismatched sector size {found}, expected {expected}")]
    MismatchedSectorSize {
        /// The expected sector size.
        expected: u32,
        /// The sector size found in the layer.
        found: u32,
    },
    /// A write-through layer is preceded by a layer that is not write-through, or
    /// the last layer is write-through.
    #[error("nothing to write through")]
    UselessWriteThrough,
    /// Writing to the layered disk would require this layer to be writable.
    #[error("read only layer in a writable disk")]
    ReadOnly,
}

/// An error returned when creating a [`LayeredDisk`].
#[derive(Debug, Error)]
pub enum InvalidLayeredDisk {
    /// No layers were configured.
    #[error("no layers were configured")]
    NoLayers,
    /// An error occurred in a layer.
    #[error("invalid layer {0}")]
    Layer(usize, #[source] InvalidLayer),
}

/// A configuration for a layer in a [`LayeredDisk`].
pub struct LayerConfiguration<L = DiskLayer> {
    /// The backing store for the layer.
    pub layer: L,
    /// Writes are written both to this layer and the next one.
    pub write_through: bool,
    /// Reads that miss this layer are written back to this layer.
    pub read_cache: bool,
}

impl LayeredDisk {
    /// Creates a new layered disk from a list of layers.
    ///
    /// The layers must be ordered from top to bottom, with the top layer being
    /// the first in the list.
    pub async fn new(
        read_only: bool,
        layers: Vec<LayerConfiguration>,
    ) -> Result<Self, InvalidLayeredDisk> {
        if layers.is_empty() {
            return Err(InvalidLayeredDisk::NoLayers);
        }

        let mut attached_layers: Vec<LayerConfiguration<AttachedDiskLayer>> = {
            let mut attached_layers = Vec::new();

            // layers are attached to one another from the bottom-up, hence the need
            // to iterate in reverse.
            let mut lower_layer_metadata = None;
            for (
                i,
                LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                },
            ) in layers.into_iter().enumerate().rev()
            {
                let layer_error = |e| InvalidLayeredDisk::Layer(i, e);

                let layer = layer
                    .0
                    .attach(lower_layer_metadata.take())
                    .await
                    .map_err(|e| layer_error(InvalidLayer::AttachFailed(e)))?;

                let layer_meta = layer.meta.clone();

                attached_layers.push(LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                });

                // perform some layer validation prior to attaching subsequent layers
                if read_cache && !layer_meta.can_read_cache {
                    return Err(layer_error(InvalidLayer::ReadCacheNotSupported));
                }
                if !layer_meta.sector_size.is_power_of_two() {
                    return Err(layer_error(InvalidLayer::InvalidSectorSize(
                        layer_meta.sector_size,
                    )));
                }
                if layer_meta.sector_size != attached_layers[0].layer.meta.sector_size {
                    // FUTURE: consider supporting different sector sizes, within reason.
                    return Err(layer_error(InvalidLayer::MismatchedSectorSize {
                        expected: attached_layers[0].layer.meta.sector_size,
                        found: layer_meta.sector_size,
                    }));
                }

                lower_layer_metadata = Some(layer_meta);
            }

            attached_layers.reverse();
            attached_layers
        };

        // perform top-down validation of the layer-stack, collecting various
        // common properties of the stack along the way.
        let mut last_write_through = true;
        let mut is_fua_respected = true;
        let mut optimal_unmap_sectors = 1;
        let mut unmap_must_zero = false;
        let mut disk_id = None;
        let mut unmap_behavior = UnmapBehavior::Zeroes;
        for (
            i,
            &LayerConfiguration {
                ref layer,
                write_through,
                read_cache: _,
            },
        ) in attached_layers.iter().enumerate()
        {
            let layer_error = |e| InvalidLayeredDisk::Layer(i, e);

            if last_write_through {
                if layer.meta.read_only && !read_only {
                    return Err(layer_error(InvalidLayer::ReadOnly));
                }
                is_fua_respected &= layer.meta.is_fua_respected;
                // Merge the unmap behavior. If any affected layer ignores
                // unmap, then force the whole disk to. If all affected layers
                // zero the sectors, then report that the disk zeroes sectors.
                //
                // If there is at least one write-through layer, then unmap only
                // works if the unmap operation will produce the same result in
                // all the layers that are being written to. Otherwise, the
                // guest could see inconsistent disk contents when the write
                // through layer is removed.
                unmap_must_zero |= write_through;
                unmap_behavior = match (unmap_behavior, layer.meta.unmap_behavior) {
                    (UnmapBehavior::Zeroes, UnmapBehavior::Zeroes) => UnmapBehavior::Zeroes,
                    _ if unmap_must_zero => UnmapBehavior::Ignored,
                    (UnmapBehavior::Ignored, _) => UnmapBehavior::Ignored,
                    (_, UnmapBehavior::Ignored) => UnmapBehavior::Ignored,
                    _ => UnmapBehavior::Unspecified,
                };
                optimal_unmap_sectors = optimal_unmap_sectors.max(layer.meta.optimal_unmap_sectors);
            } else if write_through {
                // The write-through layers must all come first.
                return Err(layer_error(InvalidLayer::UselessWriteThrough));
            }
            last_write_through = write_through;
            if disk_id.is_none() {
                disk_id = layer.meta.disk_id;
            }
        }

        if last_write_through {
            return Err(InvalidLayeredDisk::Layer(
                attached_layers.len() - 1,
                InvalidLayer::UselessWriteThrough,
            ));
        }

        let sector_size = attached_layers[0].layer.meta.sector_size;
        let physical_sector_size = attached_layers[0].layer.meta.physical_sector_size;

        let mut last_sector_count = None;
        let sector_counts_rev = attached_layers
            .iter_mut()
            .rev()
            .map(|config| *last_sector_count.insert(config.layer.backing.sector_count()))
            .collect::<Vec<_>>();

        let mut visible_sector_count = !0;
        let layers = attached_layers
            .into_iter()
            .zip(sector_counts_rev.into_iter().rev())
            .map(|(config, sector_count)| {
                let LayerConfiguration {
                    layer,
                    write_through,
                    read_cache,
                } = config;
                visible_sector_count = sector_count.min(visible_sector_count);
                Layer {
                    backing: layer.backing,
                    visible_sector_count,
                    read_cache,
                    write_through,
                }
            })
            .collect::<Vec<_>>();

        Ok(Self {
            is_fua_respected,
            read_only,
            sector_shift: sector_size.trailing_zeros(),
            disk_id,
            physical_sector_size,
            unmap_behavior,
            optimal_unmap_sectors,
            layers,
        })
    }
}

trait DynLayerIo: Send + Sync + Inspect {
    fn sector_count(&self) -> u64;

    fn read<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        bitmap: SectorMarker<'a>,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>>;

    fn write<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        fua: bool,
        no_overwrite: bool,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>>;

    fn sync_cache(&self) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>>;

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>>;

    fn wait_resize(&self, sector_count: u64) -> Pin<Box<dyn '_ + Future<Output = u64> + Send>>;
}

impl<T: LayerIo> DynLayerIo for T {
    fn sector_count(&self) -> u64 {
        self.sector_count()
    }

    fn read<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        bitmap: SectorMarker<'a>,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(async move { self.read(buffers, sector, bitmap).await })
    }

    fn write<'a>(
        &'a self,
        buffers: &'a RequestBuffers<'_>,
        sector: u64,
        fua: bool,
        no_overwrite: bool,
    ) -> Pin<Box<dyn 'a + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(async move {
            if no_overwrite {
                self.write_no_overwrite()
                    .unwrap()
                    .write_no_overwrite(buffers, sector)
                    .await
            } else {
                self.write(buffers, sector, fua).await
            }
        })
    }

    fn sync_cache(&self) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(self.sync_cache())
    }

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> Pin<Box<dyn '_ + Future<Output = Result<(), DiskError>> + Send>> {
        Box::pin(self.unmap(sector, count, block_level_only, next_is_zero))
    }

    fn wait_resize(&self, sector_count: u64) -> Pin<Box<dyn '_ + Future<Output = u64> + Send>> {
        Box::pin(self.wait_resize(sector_count))
    }
}

trait DynLayerAttach: Send + Sync {
    fn attach(
        self: Box<Self>,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Pin<Box<dyn Future<Output = anyhow::Result<AttachedDiskLayer>> + Send>>;
}

impl<T: LayerAttach> DynLayerAttach for T {
    fn attach(
        self: Box<Self>,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Pin<Box<dyn Future<Output = anyhow::Result<AttachedDiskLayer>> + Send>> {
        Box::pin(async move {
            Ok({
                let backing = (*self)
                    .attach(lower_layer_metadata)
                    .await
                    .map_err(|e| anyhow::anyhow!(e.into()))?;
                let can_read_cache = backing.write_no_overwrite().is_some();
                AttachedDiskLayer {
                    meta: DiskLayerMetadata {
                        sector_count: LayerIo::sector_count(&backing),
                        disk_id: backing.disk_id(),
                        is_fua_respected: backing.is_fua_respected(),
                        sector_size: backing.sector_size(),
                        physical_sector_size: backing.physical_sector_size(),
                        unmap_behavior: backing.unmap_behavior(),
                        optimal_unmap_sectors: backing.optimal_unmap_sectors(),
                        read_only: backing.is_logically_read_only(),
                        can_read_cache,
                    },
                    backing: Box::new(backing),
                }
            })
        })
    }
}

/// Transition a layer from an unattached type-state into an attached
/// type-state, capable of performing [`LayerIo`].
///
/// Layers which do not require a type-state transition on attach (e.g., those
/// which are pre-initialized with a fixed set of metadata) can simply implement
/// `LayerIo` directly, and leverage the blanket impl of `impl<T: LayerIo>
/// LayerAttach for T`, which simply returns `Self` during the state transition.
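///
/// # Example
///
/// A hypothetical sketch of a layer that sizes itself from the layer below it.
/// The names `UnsizedLayer` and `SizedLayer` are illustrative, and the
/// [`LayerIo`] implementation for `SizedLayer` is elided, so the snippet is not
/// compiled:
///
/// ```ignore
/// struct UnsizedLayer;
///
/// struct SizedLayer {
///     sector_count: u64,
/// }
///
/// impl LayerAttach for UnsizedLayer {
///     type Error = std::convert::Infallible;
///     type Layer = SizedLayer;
///
///     async fn attach(
///         self,
///         lower_layer_metadata: Option<DiskLayerMetadata>,
///     ) -> Result<Self::Layer, Self::Error> {
///         // Inherit the sector count from the layer below, if there is one.
///         let sector_count = lower_layer_metadata.map_or(0x10000, |m| m.sector_count);
///         Ok(SizedLayer { sector_count })
///     }
/// }
/// ```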
pub trait LayerAttach: 'static + Send + Sync {
    /// Error returned on attach failure.
    type Error: Into<Box<dyn std::error::Error + Send + Sync + 'static>>;
    /// Object implementing [`LayerIo`] after being attached.
    type Layer: LayerIo;

    /// Invoked when the layer is being attached to a layer stack.
    ///
    /// If the layer is being attached on top of an existing layer,
    /// `lower_layer_metadata` can be used to initialize and/or reconfigure the
    /// layer using the properties of the layer it is being stacked on top of.
    fn attach(
        self,
        lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> impl Future<Output = Result<Self::Layer, Self::Error>> + Send;
}

impl<T: LayerIo> LayerAttach for T {
    type Error = Infallible;
    type Layer = Self;
    async fn attach(
        self,
        _lower_layer_metadata: Option<DiskLayerMetadata>,
    ) -> Result<Self, Infallible> {
        Ok(self)
    }
}

/// Metadata and IO for disk layers.
pub trait LayerIo: 'static + Send + Sync + Inspect {
    /// Returns the layer type name as a string.
    ///
    /// This is used for diagnostic purposes.
    fn layer_type(&self) -> &str;

    /// Returns the current sector count.
    ///
    /// For some backing stores, this may change at runtime. If it does, then
    /// the backing store must also implement [`LayerIo::wait_resize`].
    fn sector_count(&self) -> u64;

    /// Returns the logical sector size of the backing store.
    ///
    /// This must not change at runtime.
    fn sector_size(&self) -> u32;

    /// Optionally returns a 16-byte identifier for the disk, if there is a
    /// natural one for this backing store.
    ///
    /// This may be exposed to the guest as a unique disk identifier.
    /// This must not change at runtime.
    fn disk_id(&self) -> Option<[u8; 16]>;

    /// Returns the physical sector size of the backing store.
    ///
    /// This must not change at runtime.
    fn physical_sector_size(&self) -> u32;

    /// Returns true if the `fua` parameter to [`LayerIo::write`] is
    /// respected by the backing store by ensuring that the IO is immediately
    /// committed to disk.
    fn is_fua_respected(&self) -> bool;

    /// Returns true if the layer is logically read only.
    ///
    /// If this returns true, the layer might still be writable via
    /// `write_no_overwrite`, used to populate the layer as a read cache.
    fn is_logically_read_only(&self) -> bool;

    /// Issues an asynchronous flush operation to the disk.
    fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Reads sectors from the layer.
    ///
    /// `marker` is used to specify which sectors have been read. Those that are
    /// not read will be passed to the next layer, or zeroed if there are no
    /// more layers.
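    ///
    /// A sketch of `read` for a hypothetical layer that is always fully
    /// present (mirroring how a whole [`Disk`] is adapted into a layer), where
    /// `self.disk` is an assumed field:
    ///
    /// ```ignore
    /// async fn read(
    ///     &self,
    ///     buffers: &RequestBuffers<'_>,
    ///     sector: u64,
    ///     mut marker: SectorMarker<'_>,
    /// ) -> Result<(), DiskError> {
    ///     // Every requested sector is present in this layer, so mark them all.
    ///     marker.set_all();
    ///     self.disk.read_vectored(buffers, sector).await
    /// }
    /// ```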
    fn read(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        marker: SectorMarker<'_>,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Writes sectors to the layer.
    ///
    /// # Panics
    ///
    /// The caller must pass a buffer with an integer number of sectors.
    fn write(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Unmap sectors from the layer.
    ///
    /// If `next_is_zero` is true, then the next layer's contents are known to
    /// be zero. A layer can use this information to just discard the sectors
    /// rather than putting them in the zero state (which may take more space).
    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        next_is_zero: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;

    /// Returns the behavior of the unmap operation.
    fn unmap_behavior(&self) -> UnmapBehavior;

    /// Returns the optimal granularity for unmaps, in sectors.
    fn optimal_unmap_sectors(&self) -> u32 {
        1
    }

    /// Optionally returns a write-no-overwrite implementation.
    fn write_no_overwrite(&self) -> Option<impl WriteNoOverwrite> {
        None::<NoIdet>
    }

    /// Waits for the disk sector count to be different than the specified value.
    fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
        let _ = sector_count;
        std::future::pending()
    }
}

enum NoIdet {}

/// Writes to the layer without overwriting existing data.
pub trait WriteNoOverwrite: Send + Sync {
    /// Write to the layer without overwriting existing data. Existing sectors
    /// must be preserved.
    ///
    /// This is used to support read caching, where the data being written may
    /// be stale by the time it is written back to the layer.
    fn write_no_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> impl Future<Output = Result<(), DiskError>> + Send;
}

impl<T: WriteNoOverwrite> WriteNoOverwrite for &T {
    fn write_no_overwrite(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> impl Future<Output = Result<(), DiskError>> + Send {
        (*self).write_no_overwrite(buffers, sector)
    }
}

impl WriteNoOverwrite for NoIdet {
    async fn write_no_overwrite(
        &self,
        _buffers: &RequestBuffers<'_>,
        _sector: u64,
    ) -> Result<(), DiskError> {
        unreachable!()
    }
}

impl DiskIo for LayeredDisk {
    fn disk_type(&self) -> &str {
        "layered"
    }

    fn sector_count(&self) -> u64 {
        self.layers[0].backing.sector_count()
    }

    fn sector_size(&self) -> u32 {
        1 << self.sector_shift
    }

    fn disk_id(&self) -> Option<[u8; 16]> {
        self.disk_id
    }

    fn physical_sector_size(&self) -> u32 {
        self.physical_sector_size
    }

    fn is_fua_respected(&self) -> bool {
        self.is_fua_respected
    }

    fn is_read_only(&self) -> bool {
        self.read_only
    }

    async fn read_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
        let mut bounce_buffers = None::<(OwnedRequestBuffers, GuestMemory)>;
        let sector_count = buffers.len() >> self.sector_shift;
        let mut bitmap = Bitmap::new(sector, sector_count);
        let mut bits_set = 0;
        let mut populate_cache = Vec::new();
        // FUTURE: queue the reads to the layers in parallel.
        'done: for (i, layer) in self.layers.iter().enumerate() {
            if bits_set == sector_count {
                break;
            }
            for mut range in bitmap.unset_iter() {
                let end = if i == 0 {
                    // The visible sector count of the first layer is unknown,
                    // since it could change at any time.
                    range.end_sector()
                } else {
                    // Restrict the range to the visible sector count of the
                    // layer; sectors beyond this are logically zero.
                    let end = range.end_sector().min(layer.visible_sector_count);
                    if range.start_sector() == end {
                        break 'done;
                    }
                    end
                };

                let sectors = end - range.start_sector();

                let this_buffers = if let Some((bounce_buffers, mem)) = &bounce_buffers {
                    &bounce_buffers.buffer(mem)
                } else {
                    buffers
                };
                let this_buffers = this_buffers.subrange(
                    range.start_sector_within_bitmap() << self.sector_shift,
                    (sectors as usize) << self.sector_shift,
                );

                layer
                    .backing
                    .read(&this_buffers, range.start_sector(), range.view(sectors))
                    .await?;

                bits_set += range.set_count();

                if range.set_count() as u64 != range.len() && layer.read_cache {
                    // Allocate bounce buffers to read into, so that we get a
                    // stable copy of the data to populate the cache.
                    bounce_buffers.get_or_insert_with(|| {
                        let mem = GuestMemory::allocate(buffers.len());
                        let owned_buf = OwnedRequestBuffers::linear(0, buffers.len(), true);
                        (owned_buf, mem)
                    });

                    populate_cache.extend(range.unset_iter().map(|range| (layer, range)));
                }
            }
        }
        if bits_set != sector_count {
            for range in bitmap.unset_iter() {
                let len = (range.len() as usize) << self.sector_shift;
                buffers
                    .subrange(range.start_sector_within_bitmap() << self.sector_shift, len)
                    .writer()
                    .zero(len)?;
            }
        }
        if !populate_cache.is_empty() {
            let (bounce_buffers, mem) = bounce_buffers.unwrap();
            let bounce_buffers = bounce_buffers.buffer(&mem);
            for &(layer, ref range) in &populate_cache {
                assert!(layer.read_cache);
                let offset = ((range.start - sector) as usize) << self.sector_shift;
                let len = ((range.end - range.start) as usize) << self.sector_shift;
                if let Err(err) = layer
                    .backing
                    .write(
                        &bounce_buffers.subrange(offset, len),
                        range.start,
                        false,
                        true,
                    )
                    .await
                {
                    tracelimit::warn_ratelimited!(
                        error = &err as &dyn std::error::Error,
                        sector = range.start,
                        count = range.end - range.start,
                        "failed to populate read cache",
                    );
                }
            }
            let mut mem = mem.into_inner_buf().ok().unwrap();
            for (_, range) in populate_cache {
                // Write this bounced range back to the original buffer. This
                // might be redundant in the presence of multiple cache layers,
                // but this is the simplest implementation.
                let offset = ((range.start - sector) as usize) << self.sector_shift;
                let len = ((range.end - range.start) as usize) << self.sector_shift;
                buffers
                    .subrange(offset, len)
                    .writer()
                    .write(&mem.as_bytes()[offset..][..len])?;
            }
        }
        Ok(())
    }

    async fn write_vectored(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        for layer in &self.layers {
            layer.backing.write(buffers, sector, fua, false).await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    async fn sync_cache(&self) -> Result<(), DiskError> {
        for layer in &self.layers {
            layer.backing.sync_cache().await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    fn wait_resize(&self, sector_count: u64) -> impl Future<Output = u64> + Send {
        self.layers[0].backing.wait_resize(sector_count)
    }

    async fn unmap(
        &self,
        sector_offset: u64,
        sector_count: u64,
        block_level_only: bool,
    ) -> Result<(), DiskError> {
        if self.unmap_behavior == UnmapBehavior::Ignored {
            return Ok(());
        }

        for (layer, next_layer) in self
            .layers
            .iter()
            .zip(self.layers.iter().map(Some).skip(1).chain([None]))
        {
            let next_is_zero = if let Some(next_layer) = next_layer {
                // Sectors beyond the layer's visible sector count are logically
                // zero.
                //
                // FUTURE: consider splitting the unmap operation into multiple
                // operations across this boundary.
                sector_offset >= next_layer.visible_sector_count
            } else {
                true
            };

            layer
                .backing
                .unmap(sector_offset, sector_count, block_level_only, next_is_zero)
                .await?;
            if !layer.write_through {
                break;
            }
        }
        Ok(())
    }

    fn unmap_behavior(&self) -> UnmapBehavior {
        self.unmap_behavior
    }

    fn optimal_unmap_sectors(&self) -> u32 {
        self.optimal_unmap_sectors
    }
}

/// A disk layer wrapping a full disk.
#[derive(Inspect)]
#[inspect(transparent)]
struct DiskAsLayer(Disk);

impl LayerIo for DiskAsLayer {
    fn layer_type(&self) -> &str {
        "disk"
    }

    fn sector_count(&self) -> u64 {
        self.0.sector_count()
    }

    fn sector_size(&self) -> u32 {
        self.0.sector_size()
    }

    fn disk_id(&self) -> Option<[u8; 16]> {
        self.0.disk_id()
    }

    fn physical_sector_size(&self) -> u32 {
        self.0.physical_sector_size()
    }

    fn is_fua_respected(&self) -> bool {
        self.0.is_fua_respected()
    }

    fn is_logically_read_only(&self) -> bool {
        self.0.is_read_only()
    }

    fn sync_cache(&self) -> impl Future<Output = Result<(), DiskError>> + Send {
        self.0.sync_cache()
    }

    async fn read(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        mut bitmap: SectorMarker<'_>,
    ) -> Result<(), DiskError> {
        // The disk is fully populated.
        bitmap.set_all();
        self.0.read_vectored(buffers, sector).await
    }

    async fn write(
        &self,
        buffers: &RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        self.0.write_vectored(buffers, sector, fua).await
    }

    fn unmap(
        &self,
        sector: u64,
        count: u64,
        block_level_only: bool,
        _lower_is_zero: bool,
    ) -> impl Future<Output = Result<(), DiskError>> + Send {
        self.0.unmap(sector, count, block_level_only)
    }

    fn unmap_behavior(&self) -> UnmapBehavior {
        self.0.unmap_behavior()
    }
}

#[cfg(test)]
mod tests {
    use crate::DiskLayer;
    use crate::LayerConfiguration;
    use crate::LayerIo;
    use crate::LayeredDisk;
    use crate::SectorMarker;
    use crate::WriteNoOverwrite;
    use disk_backend::DiskIo;
    use disk_backend::UnmapBehavior;
    use guestmem::GuestMemory;
    use guestmem::MemoryRead as _;
    use guestmem::MemoryWrite;
    use inspect::Inspect;
    use pal_async::async_test;
    use parking_lot::Mutex;
    use scsi_buffers::OwnedRequestBuffers;
    use std::collections::BTreeMap;
    use std::collections::btree_map::Entry;
    use std::sync::Arc;

    #[derive(Inspect)]
    #[inspect(skip)]
    struct TestLayer {
        sectors: Mutex<BTreeMap<u64, Data>>,
        sector_count: u64,
    }

    impl TestLayer {
        fn new(sector_count: u64) -> Self {
            Self {
                sectors: Mutex::new(BTreeMap::new()),
                sector_count,
            }
        }
    }

    struct Data(Box<[u8]>);

    impl LayerIo for Arc<TestLayer> {
        fn layer_type(&self) -> &str {
            "test"
        }

        fn sector_count(&self) -> u64 {
            self.sector_count
        }

        fn sector_size(&self) -> u32 {
            512
        }

        fn disk_id(&self) -> Option<[u8; 16]> {
            None
        }

        fn physical_sector_size(&self) -> u32 {
            512
        }

        fn is_fua_respected(&self) -> bool {
            false
        }

        fn is_logically_read_only(&self) -> bool {
            false
        }

        async fn sync_cache(&self) -> Result<(), disk_backend::DiskError> {
            Ok(())
        }

        async fn read(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
            mut marker: SectorMarker<'_>,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let Some(data) = sectors.get(&i) else {
                    continue;
                };
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .writer()
                    .write(&data.0)?;
                marker.set(i);
            }
            Ok(())
        }

        async fn write(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
            _fua: bool,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let mut sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                let mut data = Data(vec![0; self.sector_size() as usize].into());
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .reader()
                    .read(&mut data.0)?;
                sectors.insert(i, data);
            }
            Ok(())
        }

        async fn unmap(
            &self,
            sector: u64,
            count: u64,
            _block_level_only: bool,
            next_is_zero: bool,
        ) -> Result<(), disk_backend::DiskError> {
            if !next_is_zero {
                return Ok(());
            }
            let mut sectors = self.sectors.lock();
            let mut next_sector = sector;
            let end = sector + count;
            while next_sector < end {
                let Some((&sector, _)) = sectors.range_mut(next_sector..).next() else {
                    break;
                };
                if sector >= end {
                    break;
                }
                sectors.remove(&sector);
                next_sector = sector + 1;
            }
            Ok(())
        }

        fn unmap_behavior(&self) -> UnmapBehavior {
            UnmapBehavior::Unspecified
        }

        fn write_no_overwrite(&self) -> Option<impl WriteNoOverwrite> {
            Some(self)
        }
    }

    impl WriteNoOverwrite for Arc<TestLayer> {
        async fn write_no_overwrite(
            &self,
            buffers: &scsi_buffers::RequestBuffers<'_>,
            sector: u64,
        ) -> Result<(), disk_backend::DiskError> {
            let sector_count = buffers.len() / self.sector_size() as usize;
            let mut sectors = self.sectors.lock();
            for i in sector..sector + sector_count as u64 {
                let Entry::Vacant(entry) = sectors.entry(i) else {
                    continue;
                };
                let offset = ((i - sector) * self.sector_size() as u64) as usize;
                let mut data = Data(vec![0; self.sector_size() as usize].into());
                buffers
                    .subrange(offset, self.sector_size() as usize)
                    .reader()
                    .read(&mut data.0)?;
                entry.insert(data);
            }
            Ok(())
        }
    }

    #[async_test]
    async fn test_read_cache() {
        const SIZE: u64 = 2048;
        let bottom = Arc::new(TestLayer::new(SIZE));
        let pattern = |i: u64| {
            let mut acc = (i + 1) * 3;
            Data(
                (0..512)
                    .map(|_| {
                        acc = acc.wrapping_mul(7);
                        acc as u8
                    })
                    .collect::<Vec<_>>()
                    .into(),
            )
        };
        bottom
            .sectors
            .lock()
            .extend((0..SIZE).map(|i| (i, pattern(i))));

        let cache = Arc::new(TestLayer::new(SIZE));
        let cache_cfg = LayerConfiguration {
            layer: DiskLayer::new(cache.clone()),
            read_cache: true,
            write_through: false,
        };
        let bottom_cfg = LayerConfiguration {
            layer: DiskLayer::new(bottom),
            read_cache: false,
            write_through: false,
        };
        let disk = LayeredDisk::new(false, vec![cache_cfg, bottom_cfg])
            .await
            .unwrap();

        let mut mem = GuestMemory::allocate(0x10000);
        let buffers = OwnedRequestBuffers::linear(0, 0x10000, true);

        for i in [0, 2, 4, 6, 8, 0, 2, 4, 6, 8] {
            disk.read_vectored(&buffers.buffer(&mem).subrange(0, 512), i)
                .await
                .unwrap();

            assert_eq!(mem.inner_buf_mut().unwrap()[..512], pattern(i).0[..]);
        }

        assert_eq!(cache.sectors.lock().len(), 5);

        mem.inner_buf_mut().unwrap().fill(0);

        disk.read_vectored(&buffers.buffer(&mem).subrange(0, 15 * 512), 1)
            .await
            .unwrap();

        assert_eq!(cache.sectors.lock().len(), 16);

        for i in 0..15 {
            assert_eq!(
                mem.inner_buf_mut().unwrap()[i as usize * 512..][..512],
                pattern(i + 1).0[..],
                "{i}"
            );
        }
    }
}