// scsi_buffers/lib.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Guest memory buffer abstractions for storage IO.
5//!
6//! This crate provides [`RequestBuffers`], the primary type used by disk
7//! backends to access guest memory during IO operations. It wraps a
8//! [`PagedRange`] (a single contiguous byte range across guest pages)
9//! with direction flags and provides:
10//!
11//! - Byte-level streaming via [`reader()`](RequestBuffers::reader) /
12//! [`writer()`](RequestBuffers::writer)
13//! - DMA-ready locked buffers via [`lock()`](RequestBuffers::lock)
14//! - Alignment checking via [`is_aligned()`](RequestBuffers::is_aligned)
15//!
16//! # Alignment and bounce buffering
17//!
18//! Disk backends that use direct IO (O_DIRECT, io_uring) require buffers
19//! to be aligned to the disk's sector size. When guest-provided buffers
20//! are not aligned, a [`BounceBuffer`] is used: data is copied to/from a
21//! page-aligned temporary buffer for the actual disk IO.
22//!
23//! [`BounceBufferTracker`] manages a per-thread page budget to limit
24//! memory consumption from concurrent bounce-buffered IOs.
25//!
26//! # Important: `PagedRange` constraints
27//!
28//! `RequestBuffers` wraps a *single* `PagedRange`, which can only
29//! represent a contiguous byte range where interior pages are fully
30//! covered (see [`PagedRange`] docs). This means:
31//!
32//! - You cannot combine two guest memory regions with arbitrary GPAs
33//! into one `RequestBuffers` unless every boundary falls on a page
34//! boundary.
35//! - When a device (e.g., virtio-blk) receives multiple descriptors
36//! forming a scatter-gather list, each descriptor typically gets its
37//! own `RequestBuffers`. If a descriptor boundary falls mid-sector,
38//! bounce buffering or coalescing is needed to issue correct IO.
39
40// UNSAFETY: Handling raw pointers and transmuting between types for different use cases.
41#![expect(unsafe_code)]
42
43use guestmem::AccessError;
44use guestmem::GuestMemory;
45use guestmem::LockedRange;
46use guestmem::LockedRangeImpl;
47use guestmem::MemoryRead;
48use guestmem::MemoryWrite;
49use guestmem::ranges::PagedRange;
50use guestmem::ranges::PagedRangeWriter;
51use safeatomic::AsAtomicBytes;
52use smallvec::SmallVec;
53use std::marker::PhantomData;
54use std::ops::Deref;
55use std::sync::atomic::AtomicU8;
56use std::sync::atomic::AtomicUsize;
57use std::sync::atomic::Ordering;
58use zerocopy::FromBytes;
59use zerocopy::Immutable;
60use zerocopy::IntoBytes;
61use zerocopy::KnownLayout;
62
/// A pointer/length pair that is ABI compatible with the iovec type on Linux.
///
/// `#[repr(C)]` with a pointer field followed by a length field matches the
/// layout of `struct iovec { void *iov_base; size_t iov_len; }`, so slices of
/// these can be handed directly to vectored IO interfaces.
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct AtomicIoVec {
    /// The address of the buffer.
    pub address: *const AtomicU8,
    /// The length of the buffer in bytes.
    pub len: usize,
}
72
73impl Default for AtomicIoVec {
74 fn default() -> Self {
75 Self {
76 address: std::ptr::null(),
77 len: 0,
78 }
79 }
80}
81
82impl From<&'_ [AtomicU8]> for AtomicIoVec {
83 fn from(p: &'_ [AtomicU8]) -> Self {
84 Self {
85 address: p.as_ptr(),
86 len: p.len(),
87 }
88 }
89}
90
91impl AtomicIoVec {
92 /// Returns a pointer to a slice backed by the buffer.
93 ///
94 /// # Safety
95 /// The caller must ensure this iovec points to [valid](std::ptr#Safety)
96 /// data.
97 pub unsafe fn as_slice_unchecked(&self) -> &[AtomicU8] {
98 // SAFETY: guaranteed by caller.
99 unsafe { std::slice::from_raw_parts(self.address, self.len) }
100 }
101}
102
/// SAFETY: AtomicIoVec just represents a pointer and length and can be
/// sent/accessed anywhere freely. It does not itself grant access to the
/// pointee; dereferencing is gated behind separate unsafe APIs.
unsafe impl Send for AtomicIoVec {}
// SAFETY: see above comment
unsafe impl Sync for AtomicIoVec {}
108
/// Wrapper around an &[AtomicU8] guaranteed to be ABI compatible with the
/// `iovec` type on Linux.
///
/// Unlike a raw [`AtomicIoVec`], this type carries a lifetime, so holding one
/// proves the underlying buffer is still valid.
#[derive(Debug, Copy, Clone, Default)]
#[repr(transparent)]
pub struct IoBuffer<'a> {
    /// The raw pointer/length pair; must reference memory valid for `'a`.
    io_vec: AtomicIoVec,
    /// Ties the borrow of the underlying buffer to this type without storing
    /// the reference directly.
    phantom: PhantomData<&'a AtomicU8>,
}
117
118impl<'a> IoBuffer<'a> {
119 /// Wraps `buffer` and returns it.
120 pub fn new(buffer: &'a [AtomicU8]) -> Self {
121 Self {
122 io_vec: AtomicIoVec {
123 address: buffer.as_ptr(),
124 len: buffer.len(),
125 },
126 phantom: PhantomData,
127 }
128 }
129
130 /// Reinterprets `io_vec` as `IoBuffer`.
131 ///
132 /// # Safety
133 /// `io_vec` must reference a valid buffer for the lifetime of `Self`.
134 pub unsafe fn from_io_vec(io_vec: &AtomicIoVec) -> &Self {
135 // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
136 unsafe { std::mem::transmute(io_vec) }
137 }
138
139 /// Reinterprets the `io_vecs` slice as `[IoBuffer]`.
140 ///
141 /// # Safety
142 /// `io_vecs` must reference valid buffers for the lifetime of `Self`.
143 pub unsafe fn from_io_vecs(io_vecs: &[AtomicIoVec]) -> &[Self] {
144 // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
145 unsafe { std::mem::transmute(io_vecs) }
146 }
147
148 /// Returns a pointer to the beginning of the buffer.
149 pub fn as_ptr(&self) -> *const AtomicU8 {
150 self.io_vec.address
151 }
152
153 /// Returns the buffer's length in bytes.
154 pub fn len(&self) -> usize {
155 self.io_vec.len
156 }
157}
158
159impl Deref for IoBuffer<'_> {
160 type Target = [AtomicU8];
161
162 fn deref(&self) -> &Self::Target {
163 // SAFETY: the buffer is guaranteed to be valid for the lifetime of
164 // self.
165 unsafe { self.io_vec.as_slice_unchecked() }
166 }
167}
168
/// The page size (4 KiB) used for guest memory and bounce buffer accounting.
const PAGE_SIZE: usize = 4096;

/// A single page of memory, aligned so that buffers built from runs of these
/// satisfy page-alignment requirements (e.g. for O_DIRECT IO).
#[repr(C, align(4096))]
#[derive(Clone, IntoBytes, Immutable, KnownLayout, FromBytes)]
struct Page([u8; PAGE_SIZE]);

/// A zero-filled page used to initialize bounce buffers.
const ZERO_PAGE: Page = Page([0; PAGE_SIZE]);
176
/// A page-aligned temporary buffer used to double-buffer IO data.
///
/// When guest-provided buffers are not aligned to the disk's sector size
/// (or when the `PagedRange` constraints prevent direct IO), data is
/// copied through a `BounceBuffer`:
///
/// - **Reads:** IO is performed into the bounce buffer, then copied to
///   guest memory via `RequestBuffers::writer()`.
/// - **Writes:** Data is copied from guest memory via
///   `RequestBuffers::reader()` into the bounce buffer, then IO is
///   performed from the bounce buffer.
///
/// The buffer is always page-aligned (4096 bytes), satisfying the
/// alignment requirements of O_DIRECT and io_uring.
pub struct BounceBuffer {
    /// Backing storage. Never reallocated after construction, so `io_vec`'s
    /// pointer into the heap allocation stays valid.
    pages: Vec<Page>,
    /// Pointer/length pair covering the first `size` bytes of `pages`.
    io_vec: AtomicIoVec,
}
195
impl BounceBuffer {
    /// Allocates a new bounce buffer of `size` bytes.
    ///
    /// The allocation is rounded up to whole zero-initialized pages, but
    /// `io_vec` (and thus `len()`/`io_vecs()`) covers exactly `size` bytes.
    pub fn new(size: usize) -> Self {
        let mut pages = vec![ZERO_PAGE; size.div_ceil(PAGE_SIZE)];
        // NOTE: this makes the struct self-referential: `io_vec` points into
        // `pages`' heap allocation. Moving the `BounceBuffer` moves only the
        // Vec header, not the heap data, so the pointer remains valid as long
        // as `pages` is neither reallocated nor dropped.
        let io_vec = pages.as_mut_bytes()[..size].as_atomic_bytes().into();
        BounceBuffer { pages, io_vec }
    }

    /// Returns the usable length in bytes (the `size` passed to `new`).
    fn len(&self) -> usize {
        self.io_vec.len
    }

    /// Returns the bounce buffer memory.
    pub fn as_mut_bytes(&mut self) -> &mut [u8] {
        // SAFETY: while there are no concurrent references (e.g., via io_vec),
        // the buffer in pages is exclusively owned, and it is accessible as a
        // byte array.
        unsafe { std::slice::from_raw_parts_mut(self.pages.as_mut_ptr().cast::<u8>(), self.len()) }
    }

    /// Returns a reference to the underlying buffer.
    ///
    /// This is returned in a form convenient for using with IO functions.
    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
        std::slice::from_ref({
            // SAFETY: io_vec contains a pointer to the live data in pages.
            unsafe { IoBuffer::from_io_vec(&self.io_vec) }
        })
    }
}
226
/// A set of locked memory ranges, represented by [`IoBuffer`]s.
///
/// Holding this object keeps the guest memory ranges locked; the returned
/// buffers are valid until it is dropped.
pub struct LockedIoBuffers(LockedRangeImpl<LockedIoVecs>);

impl LockedIoBuffers {
    /// Returns the slice of IO buffers.
    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
        // SAFETY: the LockedRangeImpl passed to new guarantees that only
        // vectors with valid lifetimes were passed to
        // LockedGuestBuffers::push_sub_range.
        unsafe { IoBuffer::from_io_vecs(&self.0.get().0) }
    }
}
239
240struct LockedIoVecs(SmallVec<[AtomicIoVec; 64]>);
241
242impl LockedIoVecs {
243 fn new() -> Self {
244 Self(Default::default())
245 }
246}
247
248impl LockedRange for LockedIoVecs {
249 fn push_sub_range(&mut self, sub_range: &[AtomicU8]) {
250 self.0.push(sub_range.into());
251 }
252}
253
/// An implementation of [`MemoryWrite`] that provides semantically
/// correct results. Specifically, it always returns a `ReadOnly` error
/// when attempting to write to it.
struct PermissionedMemoryWriter<'a> {
    /// The effective range: the real range for writable buffers, or the
    /// empty range for read-only ones (substituted in `new`).
    range: PagedRange<'a>,
    /// Writer over `range`; for read-only buffers this is an empty writer,
    /// so every non-trivial write fails.
    writer: PagedRangeWriter<'a>,
    /// Whether writes are permitted; used to translate write failures into
    /// `AccessError::ReadOnly`.
    is_write: bool,
}
262
263impl PermissionedMemoryWriter<'_> {
264 /// Creates a new memory writer with the given range and guest memory.
265 fn new<'a>(
266 range: PagedRange<'a>,
267 guest_memory: &'a GuestMemory,
268 is_write: bool,
269 ) -> PermissionedMemoryWriter<'a> {
270 // Simply create an empty range here to avoid branching on hot paths (`write`, `fill`, etc.)
271 let range = if is_write { range } else { PagedRange::empty() };
272 PermissionedMemoryWriter {
273 range,
274 writer: range.writer(guest_memory),
275 is_write,
276 }
277 }
278}
279
280impl MemoryWrite for PermissionedMemoryWriter<'_> {
281 fn write(&mut self, data: &[u8]) -> Result<(), AccessError> {
282 self.writer.write(data).map_err(|e| {
283 if self.is_write {
284 e
285 } else {
286 AccessError::ReadOnly
287 }
288 })
289 }
290
291 fn fill(&mut self, val: u8, len: usize) -> Result<(), AccessError> {
292 self.writer.fill(val, len).map_err(|e| {
293 if self.is_write {
294 e
295 } else {
296 AccessError::ReadOnly
297 }
298 })
299 }
300
301 fn len(&self) -> usize {
302 self.range.len()
303 }
304}
305
/// An accessor for guest memory associated with a storage IO request.
///
/// Wraps a single [`PagedRange`] — a contiguous byte range scattered
/// across guest pages — together with a `GuestMemory` reference and a
/// read/write direction flag.
///
/// # One range per `RequestBuffers`
///
/// Because `PagedRange` requires interior pages to be fully covered,
/// a `RequestBuffers` can only describe memory regions where every
/// page boundary between the first and last page is fully spanned.
/// Two guest memory regions with arbitrary starting GPAs generally
/// cannot be combined into one `RequestBuffers`.
///
/// When a device has multiple disjoint memory regions for a single IO
/// (e.g., a virtio descriptor chain whose descriptors don't align to
/// page boundaries), options include:
///
/// - Issue separate IOs per region (only valid if each region is
///   sector-aligned)
/// - Use a [`BounceBuffer`] to coalesce into one contiguous buffer
/// - Use multiple `RequestBuffers` with a multi-range disk backend
///   API (not currently available)
#[derive(Clone, Debug)]
pub struct RequestBuffers<'a> {
    /// The guest page range covered by this IO.
    range: PagedRange<'a>,
    /// Accessor for the guest's memory.
    guest_memory: &'a GuestMemory,
    /// Whether writes to the buffer are permitted.
    is_write: bool,
}
335
336impl<'a> RequestBuffers<'a> {
337 /// Creates a new request buffer from the given memory ranges.
338 pub fn new(guest_memory: &'a GuestMemory, range: PagedRange<'a>, is_write: bool) -> Self {
339 Self {
340 range,
341 guest_memory,
342 is_write,
343 }
344 }
345
346 /// Returns true if the buffer is empty.
347 pub fn is_empty(&self) -> bool {
348 self.range.is_empty()
349 }
350
351 /// Return the total length of the buffers in bytes.
352 pub fn len(&self) -> usize {
353 self.range.len()
354 }
355
356 /// Returns the guest memory accessor.
357 pub fn guest_memory(&self) -> &GuestMemory {
358 self.guest_memory
359 }
360
361 /// Return the internal paged range.
362 pub fn range(&self) -> PagedRange<'_> {
363 self.range
364 }
365
366 /// Returns whether the buffer is aligned to at least `alignment` bytes.
367 ///
368 /// Checks three things (all must be multiples of `alignment`):
369 /// 1. The byte offset into the first page (`range.offset()`)
370 /// 2. The total byte length (`range.len()`)
371 /// 3. The page size (4096) — always true for alignment ≤ 4096
372 ///
373 /// When this returns `false`, disk backends that require aligned
374 /// buffers (e.g., those using O_DIRECT or io_uring) must use a
375 /// [`BounceBuffer`] to perform the IO.
376 ///
377 /// # Panics
378 ///
379 /// Panics if `alignment` is not a power of two.
380 pub fn is_aligned(&self, alignment: usize) -> bool {
381 assert!(alignment.is_power_of_two());
382 ((self.range.offset() | self.range.len() | PAGE_SIZE) & (alignment - 1)) == 0
383 }
384
385 /// Gets a memory writer for the buffers.
386 ///
387 /// Returns an empty writer if the buffers are only available for read access.
388 pub fn writer(&self) -> impl MemoryWrite + '_ {
389 PermissionedMemoryWriter::new(self.range, self.guest_memory, self.is_write)
390 }
391
392 /// Gets a memory reader for the buffers.
393 pub fn reader(&self) -> impl MemoryRead + '_ {
394 self.range.reader(self.guest_memory)
395 }
396
397 /// Locks the guest memory ranges described by this buffer and returns an
398 /// object containing [`IoBuffer`]s, suitable for executing asynchronous I/O
399 /// operations.
400 pub fn lock(&self, for_write: bool) -> Result<LockedIoBuffers, AccessError> {
401 if for_write && !self.is_write {
402 return Err(AccessError::ReadOnly);
403 }
404 Ok(LockedIoBuffers(
405 self.guest_memory
406 .lock_range(self.range, LockedIoVecs::new())?,
407 ))
408 }
409
410 /// Returns a subrange of this set of buffers.
411 ///
412 /// Panics if `offset + len > self.len()`.
413 pub fn subrange(&self, offset: usize, len: usize) -> Self {
414 Self {
415 range: self.range.subrange(offset, len),
416 guest_memory: self.guest_memory,
417 is_write: self.is_write,
418 }
419 }
420}
421
/// A memory range.
///
/// An owned description of a guest memory range (guest page numbers held by
/// value), from which a borrowed [`RequestBuffers`] view can be produced via
/// [`buffer`](Self::buffer).
#[derive(Debug, Clone)]
pub struct OwnedRequestBuffers {
    /// Guest page numbers backing the range.
    gpns: Vec<u64>,
    /// Byte offset into the first page.
    offset: usize,
    /// Total length of the range in bytes.
    len: usize,
    /// Whether writes to the buffer are permitted.
    is_write: bool,
}
430
431impl OwnedRequestBuffers {
432 /// A new memory range with the given guest page numbers.
433 pub fn new(gpns: &[u64]) -> Self {
434 Self::new_unaligned(gpns, 0, gpns.len() * PAGE_SIZE)
435 }
436
437 /// A new memory range with the given guest page numbers, offset by `offset`
438 /// bytes, and of `len` bytes length.
439 pub fn new_unaligned(gpns: &[u64], offset: usize, len: usize) -> Self {
440 Self {
441 gpns: gpns.to_vec(),
442 offset,
443 len,
444 is_write: true,
445 }
446 }
447
448 /// A new memory range containing the linear address range from
449 /// `offset..offset+len`.
450 pub fn linear(offset: u64, len: usize, is_write: bool) -> Self {
451 let start_page = offset / PAGE_SIZE as u64;
452 let end_page = start_page + (len as u64).div_ceil(PAGE_SIZE as u64);
453 let gpns: Vec<u64> = (start_page..end_page).collect();
454 Self {
455 gpns,
456 offset: (offset % PAGE_SIZE as u64) as usize,
457 len,
458 is_write,
459 }
460 }
461
462 /// A [`RequestBuffers`] referencing this memory range.
463 pub fn buffer<'a>(&'a self, guest_memory: &'a GuestMemory) -> RequestBuffers<'a> {
464 RequestBuffers::new(
465 guest_memory,
466 PagedRange::new(self.offset, self.len, &self.gpns).unwrap(),
467 self.is_write,
468 )
469 }
470
471 /// The length of the range in bytes.
472 pub fn len(&self) -> usize {
473 self.len
474 }
475}
476
/// Tracks an active bounce buffer, signaling to the bounce buffer tracker
/// upon drop that pages can be reclaimed.
///
/// Created by [`BounceBufferTracker::acquire_bounce_buffers`].
pub struct TrackedBounceBuffer<'a> {
    /// The active bounce buffer being tracked.
    pub buffer: BounceBuffer,
    /// Reference to free page counter for current IO thread.
    free_pages: &'a AtomicUsize,
    /// Used to signal pending bounce buffer requests of newly freed pages.
    event: &'a event_listener::Event,
}
487
488impl Drop for TrackedBounceBuffer<'_> {
489 fn drop(&mut self) {
490 let pages = self.buffer.len().div_ceil(4096);
491 self.free_pages.fetch_add(pages, Ordering::SeqCst);
492 self.event.notify(usize::MAX);
493 }
494}
495
/// Tracks active bounce buffers against a set limit of pages. If no limit is
/// specified a default of 8Mb will be applied. This limit is tracked per thread
/// specified by the backing AffinitizedThreadpool.
#[derive(Debug)]
pub struct BounceBufferTracker {
    /// Active bounce buffer pages on a given thread, indexed by thread.
    free_pages: Vec<AtomicUsize>,
    /// Event used by TrackedBounceBuffer to signal pages have been dropped;
    /// one per thread, parallel to `free_pages`.
    event: Vec<event_listener::Event>,
}
506
507impl BounceBufferTracker {
508 /// Create a new bounce buffer tracker.
509 pub fn new(max_bounce_buffer_pages: usize, threads: usize) -> Self {
510 let mut free_pages = Vec::with_capacity(threads);
511 let mut event = Vec::with_capacity(threads);
512
513 (0..threads).for_each(|_| {
514 event.push(event_listener::Event::new());
515 free_pages.push(AtomicUsize::new(max_bounce_buffer_pages));
516 });
517
518 Self { free_pages, event }
519 }
520
521 /// Attempts to acquire bounce buffers from the tracker proceeding if pages
522 /// are available or waiting until a tracked bounce buffer is dropped, which
523 /// triggers the per-thread event to indicate newly freed pages.
524 pub async fn acquire_bounce_buffers<'a, 'b>(
525 &'b self,
526 size: usize,
527 thread: usize,
528 ) -> Box<TrackedBounceBuffer<'a>>
529 where
530 'b: 'a,
531 {
532 let pages = size.div_ceil(4096);
533 let event = self.event.get(thread).unwrap();
534 let free_pages = self.free_pages.get(thread).unwrap();
535
536 loop {
537 let listener = event.listen();
538 if free_pages
539 .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| x.checked_sub(pages))
540 .is_ok()
541 {
542 break;
543 }
544 listener.await;
545 }
546
547 Box::new(TrackedBounceBuffer {
548 buffer: BounceBuffer::new(size),
549 free_pages,
550 event,
551 })
552 }
553}
554
#[cfg(test)]
mod tests {
    use super::*;
    use sparse_mmap::SparseMapping;

    const SIZE_1MB: usize = 1048576;

    /// A writer over read-only `RequestBuffers` must reject both `write` and
    /// `fill` with `AccessError::ReadOnly` and must report a length of zero.
    #[test]
    fn correct_read_only_behavior() {
        let mapping = SparseMapping::new(SIZE_1MB * 4).unwrap();
        let guest_memory = GuestMemory::new("test-scsi-buffers", mapping);
        let range = PagedRange::new(0, 4096, &[0]).unwrap();
        // `is_write: false` marks the buffers read-only.
        let buffers = RequestBuffers::new(&guest_memory, range, false);

        let write_result = buffers.writer().write(&[1; 4096]);
        assert!(
            matches!(write_result, Err(AccessError::ReadOnly)),
            "Expected read-only error, got {:?}",
            write_result
        );

        let fill_result = buffers.writer().fill(1, 4096);
        assert!(
            matches!(fill_result, Err(AccessError::ReadOnly)),
            "Expected read-only error, got {:?}",
            fill_result
        );

        let writer_len = buffers.writer().len();
        assert!(writer_len == 0, "Length should be 0 for read-only writer");
    }
}
587}