scsi_buffers/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Guest memory buffer abstractions for storage IO.
5//!
6//! This crate provides [`RequestBuffers`], the primary type used by disk
7//! backends to access guest memory during IO operations. It wraps a
8//! [`PagedRange`] (a single contiguous byte range across guest pages)
9//! with direction flags and provides:
10//!
11//! - Byte-level streaming via [`reader()`](RequestBuffers::reader) /
12//!   [`writer()`](RequestBuffers::writer)
13//! - DMA-ready locked buffers via [`lock()`](RequestBuffers::lock)
14//! - Alignment checking via [`is_aligned()`](RequestBuffers::is_aligned)
15//!
16//! # Alignment and bounce buffering
17//!
18//! Disk backends that use direct IO (O_DIRECT, io_uring) require buffers
19//! to be aligned to the disk's sector size. When guest-provided buffers
20//! are not aligned, a [`BounceBuffer`] is used: data is copied to/from a
21//! page-aligned temporary buffer for the actual disk IO.
22//!
23//! [`BounceBufferTracker`] manages a per-thread page budget to limit
24//! memory consumption from concurrent bounce-buffered IOs.
25//!
26//! # Important: `PagedRange` constraints
27//!
28//! `RequestBuffers` wraps a *single* `PagedRange`, which can only
29//! represent a contiguous byte range where interior pages are fully
30//! covered (see [`PagedRange`] docs). This means:
31//!
32//! - You cannot combine two guest memory regions with arbitrary GPAs
33//!   into one `RequestBuffers` unless every boundary falls on a page
34//!   boundary.
35//! - When a device (e.g., virtio-blk) receives multiple descriptors
36//!   forming a scatter-gather list, each descriptor typically gets its
37//!   own `RequestBuffers`. If a descriptor boundary falls mid-sector,
38//!   bounce buffering or coalescing is needed to issue correct IO.
39
40// UNSAFETY: Handling raw pointers and transmuting between types for different use cases.
41#![expect(unsafe_code)]
42
43use guestmem::AccessError;
44use guestmem::GuestMemory;
45use guestmem::LockedRange;
46use guestmem::LockedRangeImpl;
47use guestmem::MemoryRead;
48use guestmem::MemoryWrite;
49use guestmem::ranges::PagedRange;
50use guestmem::ranges::PagedRangeWriter;
51use safeatomic::AsAtomicBytes;
52use smallvec::SmallVec;
53use std::marker::PhantomData;
54use std::ops::Deref;
55use std::sync::atomic::AtomicU8;
56use std::sync::atomic::AtomicUsize;
57use std::sync::atomic::Ordering;
58use zerocopy::FromBytes;
59use zerocopy::Immutable;
60use zerocopy::IntoBytes;
61use zerocopy::KnownLayout;
62
/// A pointer/length pair that is ABI compatible with the iovec type on Linux.
///
/// The `repr(C)` layout (pointer first, then length) is what provides the
/// `iovec` compatibility — do not reorder the fields.
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct AtomicIoVec {
    /// The address of the buffer.
    pub address: *const AtomicU8,
    /// The length of the buffer in bytes.
    pub len: usize,
}
72
73impl Default for AtomicIoVec {
74    fn default() -> Self {
75        Self {
76            address: std::ptr::null(),
77            len: 0,
78        }
79    }
80}
81
82impl From<&'_ [AtomicU8]> for AtomicIoVec {
83    fn from(p: &'_ [AtomicU8]) -> Self {
84        Self {
85            address: p.as_ptr(),
86            len: p.len(),
87        }
88    }
89}
90
91impl AtomicIoVec {
92    /// Returns a pointer to a slice backed by the buffer.
93    ///
94    /// # Safety
95    /// The caller must ensure this iovec points to [valid](std::ptr#Safety)
96    /// data.
97    pub unsafe fn as_slice_unchecked(&self) -> &[AtomicU8] {
98        // SAFETY: guaranteed by caller.
99        unsafe { std::slice::from_raw_parts(self.address, self.len) }
100    }
101}
102
/// SAFETY: AtomicIoVec just represents a pointer and length and can be
/// sent/accessed anywhere freely. It never dereferences the pointer itself;
/// the only dereference path is `as_slice_unchecked`, whose caller assumes
/// the validity obligations.
unsafe impl Send for AtomicIoVec {}
// SAFETY: see above comment
unsafe impl Sync for AtomicIoVec {}
108
/// Wrapper around an &[AtomicU8] guaranteed to be ABI compatible with the
/// `iovec` type on Linux.
///
/// `repr(transparent)` over [`AtomicIoVec`] is what the `from_io_vec` /
/// `from_io_vecs` transmutes rely on; `PhantomData` carries the borrow of
/// the underlying buffer.
#[derive(Debug, Copy, Clone, Default)]
#[repr(transparent)]
pub struct IoBuffer<'a> {
    // The raw pointer/length pair; the sole non-zero-sized field.
    io_vec: AtomicIoVec,
    // Zero-sized lifetime marker tying `Self` to the buffer it wraps.
    phantom: PhantomData<&'a AtomicU8>,
}
117
118impl<'a> IoBuffer<'a> {
119    /// Wraps `buffer` and returns it.
120    pub fn new(buffer: &'a [AtomicU8]) -> Self {
121        Self {
122            io_vec: AtomicIoVec {
123                address: buffer.as_ptr(),
124                len: buffer.len(),
125            },
126            phantom: PhantomData,
127        }
128    }
129
130    /// Reinterprets `io_vec` as `IoBuffer`.
131    ///
132    /// # Safety
133    /// `io_vec` must reference a valid buffer for the lifetime of `Self`.
134    pub unsafe fn from_io_vec(io_vec: &AtomicIoVec) -> &Self {
135        // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
136        unsafe { std::mem::transmute(io_vec) }
137    }
138
139    /// Reinterprets the `io_vecs` slice as `[IoBuffer]`.
140    ///
141    /// # Safety
142    /// `io_vecs` must reference valid buffers for the lifetime of `Self`.
143    pub unsafe fn from_io_vecs(io_vecs: &[AtomicIoVec]) -> &[Self] {
144        // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
145        unsafe { std::mem::transmute(io_vecs) }
146    }
147
148    /// Returns a pointer to the beginning of the buffer.
149    pub fn as_ptr(&self) -> *const AtomicU8 {
150        self.io_vec.address
151    }
152
153    /// Returns the buffer's length in bytes.
154    pub fn len(&self) -> usize {
155        self.io_vec.len
156    }
157}
158
159impl Deref for IoBuffer<'_> {
160    type Target = [AtomicU8];
161
162    fn deref(&self) -> &Self::Target {
163        // SAFETY: the buffer is guaranteed to be valid for the lifetime of
164        // self.
165        unsafe { self.io_vec.as_slice_unchecked() }
166    }
167}
168
// Page size used throughout for alignment checks and bounce-buffer sizing.
const PAGE_SIZE: usize = 4096;

// A single page-sized chunk of bytes. `align(4096)` guarantees that a
// `Vec<Page>`'s backing allocation is page-aligned, which BounceBuffer
// relies on for O_DIRECT-style IO.
#[repr(C, align(4096))]
#[derive(Clone, IntoBytes, Immutable, KnownLayout, FromBytes)]
struct Page([u8; PAGE_SIZE]);

// A fully zeroed page, used to zero-initialize bounce buffers.
const ZERO_PAGE: Page = Page([0; PAGE_SIZE]);
176
/// A page-aligned temporary buffer used to double-buffer IO data.
///
/// When guest-provided buffers are not aligned to the disk's sector size
/// (or when the `PagedRange` constraints prevent direct IO), data is
/// copied through a `BounceBuffer`:
///
/// - **Reads:** IO is performed into the bounce buffer, then copied to
///   guest memory via `RequestBuffers::writer()`.
/// - **Writes:** Data is copied from guest memory via
///   `RequestBuffers::reader()` into the bounce buffer, then IO is
///   performed from the bounce buffer.
///
/// The buffer is always page-aligned (4096 bytes), satisfying the
/// alignment requirements of O_DIRECT and io_uring.
pub struct BounceBuffer {
    // Page-aligned backing storage; must stay alive while io_vec points
    // into it.
    pages: Vec<Page>,
    // Pointer/length view of the first `len` requested bytes of `pages`.
    io_vec: AtomicIoVec,
}
195
196impl BounceBuffer {
197    /// Allocates a new bounce buffer of `size` bytes.
198    pub fn new(size: usize) -> Self {
199        let mut pages = vec![ZERO_PAGE; size.div_ceil(PAGE_SIZE)];
200        let io_vec = pages.as_mut_bytes()[..size].as_atomic_bytes().into();
201        BounceBuffer { pages, io_vec }
202    }
203
204    fn len(&self) -> usize {
205        self.io_vec.len
206    }
207
208    /// Returns the bounce buffer memory.
209    pub fn as_mut_bytes(&mut self) -> &mut [u8] {
210        // SAFETY: while there are no concurrent references (e.g., via io_vec),
211        // the buffer in pages is exclusively owned, and it is accessible as a
212        // byte array.
213        unsafe { std::slice::from_raw_parts_mut(self.pages.as_mut_ptr().cast::<u8>(), self.len()) }
214    }
215
216    /// Returns a reference to the underlying buffer.
217    ///
218    /// This is returned in a form convenient for using with IO functions.
219    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
220        std::slice::from_ref({
221            // SAFETY: io_vec contains a pointer to the live data in pages.
222            unsafe { IoBuffer::from_io_vec(&self.io_vec) }
223        })
224    }
225}
226
/// A set of locked memory ranges, represented by [`IoBuffer`]s.
///
/// Wraps a [`LockedRangeImpl`] whose accumulated sub-ranges are exposed via
/// [`io_vecs`](Self::io_vecs).
pub struct LockedIoBuffers(LockedRangeImpl<LockedIoVecs>);
229
impl LockedIoBuffers {
    /// Returns the slice of IO buffers.
    ///
    /// Reinterprets the stored `AtomicIoVec`s as `IoBuffer`s, which is sound
    /// because `IoBuffer` is `repr(transparent)` over `AtomicIoVec`.
    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
        // SAFETY: the LockedRangeImpl passed to new guarantees that only
        // vectors with valid lifetimes were passed to
        // LockedGuestBuffers::push_sub_range.
        unsafe { IoBuffer::from_io_vecs(&self.0.get().0) }
    }
}
239
240struct LockedIoVecs(SmallVec<[AtomicIoVec; 64]>);
241
242impl LockedIoVecs {
243    fn new() -> Self {
244        Self(Default::default())
245    }
246}
247
248impl LockedRange for LockedIoVecs {
249    fn push_sub_range(&mut self, sub_range: &[AtomicU8]) {
250        self.0.push(sub_range.into());
251    }
252}
253
/// An implementation of [`MemoryWrite`] that provides semantically
/// correct results. Specifically, it always returns a `ReadOnly` error
/// when attempting to write to it.
struct PermissionedMemoryWriter<'a> {
    // The range being written; replaced with an empty range at construction
    // when the buffers are read-only.
    range: PagedRange<'a>,
    // Writer over `range` (and therefore also empty when read-only).
    writer: PagedRangeWriter<'a>,
    // Whether the underlying request buffers permit writes.
    is_write: bool,
}
262
263impl PermissionedMemoryWriter<'_> {
264    /// Creates a new memory writer with the given range and guest memory.
265    fn new<'a>(
266        range: PagedRange<'a>,
267        guest_memory: &'a GuestMemory,
268        is_write: bool,
269    ) -> PermissionedMemoryWriter<'a> {
270        // Simply create an empty range here to avoid branching on hot paths (`write`, `fill`, etc.)
271        let range = if is_write { range } else { PagedRange::empty() };
272        PermissionedMemoryWriter {
273            range,
274            writer: range.writer(guest_memory),
275            is_write,
276        }
277    }
278}
279
280impl MemoryWrite for PermissionedMemoryWriter<'_> {
281    fn write(&mut self, data: &[u8]) -> Result<(), AccessError> {
282        self.writer.write(data).map_err(|e| {
283            if self.is_write {
284                e
285            } else {
286                AccessError::ReadOnly
287            }
288        })
289    }
290
291    fn fill(&mut self, val: u8, len: usize) -> Result<(), AccessError> {
292        self.writer.fill(val, len).map_err(|e| {
293            if self.is_write {
294                e
295            } else {
296                AccessError::ReadOnly
297            }
298        })
299    }
300
301    fn len(&self) -> usize {
302        self.range.len()
303    }
304}
305
/// An accessor for guest memory associated with a storage IO request.
///
/// Wraps a single [`PagedRange`] — a contiguous byte range scattered
/// across guest pages — together with a `GuestMemory` reference and a
/// read/write direction flag.
///
/// # One range per `RequestBuffers`
///
/// Because `PagedRange` requires interior pages to be fully covered,
/// a `RequestBuffers` can only describe memory regions where every
/// page boundary between the first and last page is fully spanned.
/// Two guest memory regions with arbitrary starting GPAs generally
/// cannot be combined into one `RequestBuffers`.
///
/// When a device has multiple disjoint memory regions for a single IO
/// (e.g., a virtio descriptor chain whose descriptors don't align to
/// page boundaries), options include:
///
/// - Issue separate IOs per region (only valid if each region is
///   sector-aligned)
/// - Use a [`BounceBuffer`] to coalesce into one contiguous buffer
/// - Use multiple `RequestBuffers` with a multi-range disk backend
///   API (not currently available)
#[derive(Clone, Debug)]
pub struct RequestBuffers<'a> {
    // The guest page range covered by this request.
    range: PagedRange<'a>,
    // Accessor used to read, write, and lock the pages in `range`.
    guest_memory: &'a GuestMemory,
    // Whether writes to the buffers are permitted.
    is_write: bool,
}
335
336impl<'a> RequestBuffers<'a> {
337    /// Creates a new request buffer from the given memory ranges.
338    pub fn new(guest_memory: &'a GuestMemory, range: PagedRange<'a>, is_write: bool) -> Self {
339        Self {
340            range,
341            guest_memory,
342            is_write,
343        }
344    }
345
346    /// Returns true if the buffer is empty.
347    pub fn is_empty(&self) -> bool {
348        self.range.is_empty()
349    }
350
351    /// Return the total length of the buffers in bytes.
352    pub fn len(&self) -> usize {
353        self.range.len()
354    }
355
356    /// Returns the guest memory accessor.
357    pub fn guest_memory(&self) -> &GuestMemory {
358        self.guest_memory
359    }
360
361    /// Return the internal paged range.
362    pub fn range(&self) -> PagedRange<'_> {
363        self.range
364    }
365
366    /// Returns whether the buffer is aligned to at least `alignment` bytes.
367    ///
368    /// Checks three things (all must be multiples of `alignment`):
369    /// 1. The byte offset into the first page (`range.offset()`)
370    /// 2. The total byte length (`range.len()`)
371    /// 3. The page size (4096) — always true for alignment ≤ 4096
372    ///
373    /// When this returns `false`, disk backends that require aligned
374    /// buffers (e.g., those using O_DIRECT or io_uring) must use a
375    /// [`BounceBuffer`] to perform the IO.
376    ///
377    /// # Panics
378    ///
379    /// Panics if `alignment` is not a power of two.
380    pub fn is_aligned(&self, alignment: usize) -> bool {
381        assert!(alignment.is_power_of_two());
382        ((self.range.offset() | self.range.len() | PAGE_SIZE) & (alignment - 1)) == 0
383    }
384
385    /// Gets a memory writer for the buffers.
386    ///
387    /// Returns an empty writer if the buffers are only available for read access.
388    pub fn writer(&self) -> impl MemoryWrite + '_ {
389        PermissionedMemoryWriter::new(self.range, self.guest_memory, self.is_write)
390    }
391
392    /// Gets a memory reader for the buffers.
393    pub fn reader(&self) -> impl MemoryRead + '_ {
394        self.range.reader(self.guest_memory)
395    }
396
397    /// Locks the guest memory ranges described by this buffer and returns an
398    /// object containing [`IoBuffer`]s, suitable for executing asynchronous I/O
399    /// operations.
400    pub fn lock(&self, for_write: bool) -> Result<LockedIoBuffers, AccessError> {
401        if for_write && !self.is_write {
402            return Err(AccessError::ReadOnly);
403        }
404        Ok(LockedIoBuffers(
405            self.guest_memory
406                .lock_range(self.range, LockedIoVecs::new())?,
407        ))
408    }
409
410    /// Returns a subrange of this set of buffers.
411    ///
412    /// Panics if `offset + len > self.len()`.
413    pub fn subrange(&self, offset: usize, len: usize) -> Self {
414        Self {
415            range: self.range.subrange(offset, len),
416            guest_memory: self.guest_memory,
417            is_write: self.is_write,
418        }
419    }
420}
421
422/// A memory range.
423#[derive(Debug, Clone)]
424pub struct OwnedRequestBuffers {
425    gpns: Vec<u64>,
426    offset: usize,
427    len: usize,
428    is_write: bool,
429}
430
431impl OwnedRequestBuffers {
432    /// A new memory range with the given guest page numbers.
433    pub fn new(gpns: &[u64]) -> Self {
434        Self::new_unaligned(gpns, 0, gpns.len() * PAGE_SIZE)
435    }
436
437    /// A new memory range with the given guest page numbers, offset by `offset`
438    /// bytes, and of `len` bytes length.
439    pub fn new_unaligned(gpns: &[u64], offset: usize, len: usize) -> Self {
440        Self {
441            gpns: gpns.to_vec(),
442            offset,
443            len,
444            is_write: true,
445        }
446    }
447
448    /// A new memory range containing the linear address range from
449    /// `offset..offset+len`.
450    pub fn linear(offset: u64, len: usize, is_write: bool) -> Self {
451        let start_page = offset / PAGE_SIZE as u64;
452        let end_page = start_page + (len as u64).div_ceil(PAGE_SIZE as u64);
453        let gpns: Vec<u64> = (start_page..end_page).collect();
454        Self {
455            gpns,
456            offset: (offset % PAGE_SIZE as u64) as usize,
457            len,
458            is_write,
459        }
460    }
461
462    /// A [`RequestBuffers`] referencing this memory range.
463    pub fn buffer<'a>(&'a self, guest_memory: &'a GuestMemory) -> RequestBuffers<'a> {
464        RequestBuffers::new(
465            guest_memory,
466            PagedRange::new(self.offset, self.len, &self.gpns).unwrap(),
467            self.is_write,
468        )
469    }
470
471    /// The length of the range in bytes.
472    pub fn len(&self) -> usize {
473        self.len
474    }
475}
476
/// Tracks an active bounce buffer, signaling to the bounce buffer tracker
/// upon drop that pages can be reclaimed.
///
/// Created by [`BounceBufferTracker::acquire_bounce_buffers`], which debits
/// the per-thread page budget that this type's `Drop` credits back.
pub struct TrackedBounceBuffer<'a> {
    /// The active bounce buffer being tracked.
    pub buffer: BounceBuffer,
    /// Reference to free page counter for current IO thread.
    free_pages: &'a AtomicUsize,
    /// Used to signal pending bounce buffer requests of newly freed pages.
    event: &'a event_listener::Event,
}
487
488impl Drop for TrackedBounceBuffer<'_> {
489    fn drop(&mut self) {
490        let pages = self.buffer.len().div_ceil(4096);
491        self.free_pages.fetch_add(pages, Ordering::SeqCst);
492        self.event.notify(usize::MAX);
493    }
494}
495
/// Tracks active bounce buffers against a set limit of pages. If no limit is
/// specified a default of 8Mb will be applied. This limit is tracked per thread
/// specified by the backing AffinitizedThreadpool.
///
/// NOTE(review): the 8Mb default is not applied in this type —
/// [`new`](Self::new) always takes an explicit page limit — so the default
/// presumably lives in the caller; confirm there.
#[derive(Debug)]
pub struct BounceBufferTracker {
    /// Active bounce buffer pages on a given thread.
    free_pages: Vec<AtomicUsize>,
    /// Event used by TrackedBounceBuffer to signal pages have been dropped.
    event: Vec<event_listener::Event>,
}
506
507impl BounceBufferTracker {
508    /// Create a new bounce buffer tracker.
509    pub fn new(max_bounce_buffer_pages: usize, threads: usize) -> Self {
510        let mut free_pages = Vec::with_capacity(threads);
511        let mut event = Vec::with_capacity(threads);
512
513        (0..threads).for_each(|_| {
514            event.push(event_listener::Event::new());
515            free_pages.push(AtomicUsize::new(max_bounce_buffer_pages));
516        });
517
518        Self { free_pages, event }
519    }
520
521    /// Attempts to acquire bounce buffers from the tracker proceeding if pages
522    /// are available or waiting until a tracked bounce buffer is dropped, which
523    /// triggers the per-thread event to indicate newly freed pages.
524    pub async fn acquire_bounce_buffers<'a, 'b>(
525        &'b self,
526        size: usize,
527        thread: usize,
528    ) -> Box<TrackedBounceBuffer<'a>>
529    where
530        'b: 'a,
531    {
532        let pages = size.div_ceil(4096);
533        let event = self.event.get(thread).unwrap();
534        let free_pages = self.free_pages.get(thread).unwrap();
535
536        loop {
537            let listener = event.listen();
538            if free_pages
539                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| x.checked_sub(pages))
540                .is_ok()
541            {
542                break;
543            }
544            listener.await;
545        }
546
547        Box::new(TrackedBounceBuffer {
548            buffer: BounceBuffer::new(size),
549            free_pages,
550            event,
551        })
552    }
553}
554
#[cfg(test)]
mod tests {
    use super::*;
    use sparse_mmap::SparseMapping;

    const SIZE_1MB: usize = 1048576;

    /// A read-only `RequestBuffers` must reject `write` and `fill` with
    /// `ReadOnly` and report a zero-length writer.
    #[test]
    fn correct_read_only_behavior() {
        let mapping = SparseMapping::new(SIZE_1MB * 4).unwrap();
        let guest_memory = GuestMemory::new("test-scsi-buffers", mapping);
        let range = PagedRange::new(0, 4096, &[0]).unwrap();
        let buffers = RequestBuffers::new(&guest_memory, range, false);

        let write_result = buffers.writer().write(&[1; 4096]);
        assert!(
            matches!(write_result, Err(AccessError::ReadOnly)),
            "Expected read-only error, got {:?}",
            write_result
        );

        let fill_result = buffers.writer().fill(1, 4096);
        assert!(
            matches!(fill_result, Err(AccessError::ReadOnly)),
            "Expected read-only error, got {:?}",
            fill_result
        );

        assert!(
            buffers.writer().len() == 0,
            "Length should be 0 for read-only writer"
        );
    }
}