scsi_buffers/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Functionality for referencing locked memory buffers for the lifetime of an
//! IO.

// UNSAFETY: Handling raw pointers and transmuting between types for different use cases.
#![expect(unsafe_code)]

use guestmem::AccessError;
use guestmem::GuestMemory;
use guestmem::LockedRange;
use guestmem::LockedRangeImpl;
use guestmem::MemoryRead;
use guestmem::MemoryWrite;
use guestmem::ranges::PagedRange;
use safeatomic::AsAtomicBytes;
use smallvec::SmallVec;
use std::marker::PhantomData;
use std::ops::Deref;
use std::sync::atomic::AtomicU8;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use zerocopy::FromBytes;
use zerocopy::Immutable;
use zerocopy::IntoBytes;
use zerocopy::KnownLayout;

/// A pointer/length pair that is ABI compatible with the iovec type on Linux.
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct AtomicIoVec {
    /// The address of the buffer.
    pub address: *const AtomicU8,
    /// The length of the buffer in bytes.
    pub len: usize,
}

impl Default for AtomicIoVec {
    fn default() -> Self {
        Self {
            address: std::ptr::null(),
            len: 0,
        }
    }
}

impl From<&'_ [AtomicU8]> for AtomicIoVec {
    fn from(p: &'_ [AtomicU8]) -> Self {
        Self {
            address: p.as_ptr(),
            len: p.len(),
        }
    }
}

impl AtomicIoVec {
    /// Returns a pointer to a slice backed by the buffer.
    ///
    /// # Safety
    /// The caller must ensure this iovec points to [valid](std::ptr#Safety)
    /// data.
    pub unsafe fn as_slice_unchecked(&self) -> &[AtomicU8] {
        // SAFETY: guaranteed by caller.
        unsafe { std::slice::from_raw_parts(self.address, self.len) }
    }
}

// SAFETY: AtomicIoVec just represents a pointer and length and can be
// sent/accessed anywhere freely.
unsafe impl Send for AtomicIoVec {}
// SAFETY: see above comment
unsafe impl Sync for AtomicIoVec {}

/// Wrapper around an `&[AtomicU8]` guaranteed to be ABI compatible with the
/// `iovec` type on Linux.
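///
/// # Example
///
/// A minimal sketch (illustration only, not from the crate's tests):
///
/// ```ignore
/// use std::sync::atomic::{AtomicU8, Ordering};
///
/// let backing: Vec<AtomicU8> = (0..512).map(|_| AtomicU8::new(0)).collect();
/// let buf = IoBuffer::new(&backing);
/// assert_eq!(buf.len(), 512);
/// // `IoBuffer` derefs to `[AtomicU8]`, so the bytes can be inspected in place.
/// assert_eq!(buf[0].load(Ordering::Relaxed), 0);
/// ```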
#[derive(Debug, Copy, Clone, Default)]
#[repr(transparent)]
pub struct IoBuffer<'a> {
    io_vec: AtomicIoVec,
    phantom: PhantomData<&'a AtomicU8>,
}

impl<'a> IoBuffer<'a> {
    /// Wraps `buffer` and returns it.
    pub fn new(buffer: &'a [AtomicU8]) -> Self {
        Self {
            io_vec: AtomicIoVec {
                address: buffer.as_ptr(),
                len: buffer.len(),
            },
            phantom: PhantomData,
        }
    }

    /// Reinterprets `io_vec` as `IoBuffer`.
    ///
    /// # Safety
    /// `io_vec` must reference a valid buffer for the lifetime of `Self`.
    pub unsafe fn from_io_vec(io_vec: &AtomicIoVec) -> &Self {
        // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
        unsafe { std::mem::transmute(io_vec) }
    }

    /// Reinterprets the `io_vecs` slice as `[IoBuffer]`.
    ///
    /// # Safety
    /// `io_vecs` must reference valid buffers for the lifetime of `Self`.
    pub unsafe fn from_io_vecs(io_vecs: &[AtomicIoVec]) -> &[Self] {
        // SAFETY: IoBuffer is #[repr(transparent)] over AtomicIoVec
        unsafe { std::mem::transmute(io_vecs) }
    }

    /// Returns a pointer to the beginning of the buffer.
    pub fn as_ptr(&self) -> *const AtomicU8 {
        self.io_vec.address
    }

    /// Returns the buffer's length in bytes.
    pub fn len(&self) -> usize {
        self.io_vec.len
    }
}

impl Deref for IoBuffer<'_> {
    type Target = [AtomicU8];

    fn deref(&self) -> &Self::Target {
        // SAFETY: the buffer is guaranteed to be valid for the lifetime of
        // self.
        unsafe { self.io_vec.as_slice_unchecked() }
    }
}

const PAGE_SIZE: usize = 4096;

#[repr(C, align(4096))]
#[derive(Clone, IntoBytes, Immutable, KnownLayout, FromBytes)]
struct Page([u8; PAGE_SIZE]);

const ZERO_PAGE: Page = Page([0; PAGE_SIZE]);

/// A page-aligned buffer used to double-buffer IO data.
pub struct BounceBuffer {
    pages: Vec<Page>,
    io_vec: AtomicIoVec,
}

impl BounceBuffer {
    /// Allocates a new bounce buffer of `size` bytes.
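    ///
    /// # Example
    ///
    /// A minimal sketch of staging data for a write (illustration only):
    ///
    /// ```ignore
    /// let mut bounce = BounceBuffer::new(8192);
    /// // Fill the page-aligned buffer with the data to be written...
    /// bounce.as_mut_bytes().fill(0xab);
    /// // ...then hand the iovec form to an IO submission path.
    /// let io_vecs = bounce.io_vecs();
    /// assert_eq!(io_vecs.len(), 1);
    /// assert_eq!(io_vecs[0].len(), 8192);
    /// ```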
    pub fn new(size: usize) -> Self {
        let mut pages = vec![ZERO_PAGE; size.div_ceil(PAGE_SIZE)];
        let io_vec = pages.as_mut_bytes()[..size].as_atomic_bytes().into();
        BounceBuffer { pages, io_vec }
    }

    fn len(&self) -> usize {
        self.io_vec.len
    }

    /// Returns the bounce buffer memory.
    pub fn as_mut_bytes(&mut self) -> &mut [u8] {
        // SAFETY: `&mut self` ensures there are no concurrent references
        // (e.g., via io_vec), so the buffer in pages is exclusively owned,
        // and it is accessible as a byte array.
        unsafe { std::slice::from_raw_parts_mut(self.pages.as_mut_ptr().cast::<u8>(), self.len()) }
    }

    /// Returns a reference to the underlying buffer.
    ///
    /// This is returned in a form convenient for use with IO functions.
    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
        std::slice::from_ref({
            // SAFETY: io_vec contains a pointer to the live data in pages.
            unsafe { IoBuffer::from_io_vec(&self.io_vec) }
        })
    }
}

/// A set of locked memory ranges, represented by [`IoBuffer`]s.
pub struct LockedIoBuffers(LockedRangeImpl<LockedIoVecs>);

impl LockedIoBuffers {
    /// Returns the slice of IO buffers.
    pub fn io_vecs(&self) -> &[IoBuffer<'_>] {
        // SAFETY: the LockedRangeImpl passed to new guarantees that only
        // vectors with valid lifetimes were passed to
        // LockedIoVecs::push_sub_range.
        unsafe { IoBuffer::from_io_vecs(&self.0.get().0) }
    }
}

struct LockedIoVecs(SmallVec<[AtomicIoVec; 64]>);

impl LockedIoVecs {
    fn new() -> Self {
        Self(Default::default())
    }
}

impl LockedRange for LockedIoVecs {
    fn push_sub_range(&mut self, sub_range: &[AtomicU8]) {
        self.0.push(sub_range.into());
    }

    fn pop_sub_range(&mut self) -> Option<(*const AtomicU8, usize)> {
        self.0.pop().map(|buffer| (buffer.address, buffer.len))
    }
}

/// An accessor for the memory associated with an IO request.
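///
/// # Example
///
/// A minimal sketch of copying request data out of guest memory (illustration
/// only; assumes `buffers` is a `RequestBuffers` over readable guest memory
/// and that the `guestmem::MemoryRead::read` accessor is available):
///
/// ```ignore
/// let mut data = vec![0u8; buffers.len()];
/// buffers.reader().read(&mut data)?;
/// ```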
#[derive(Clone, Debug)]
pub struct RequestBuffers<'a> {
    range: PagedRange<'a>,
    guest_memory: &'a GuestMemory,
    is_write: bool,
}

impl<'a> RequestBuffers<'a> {
    /// Creates a new request buffer from the given memory ranges.
    pub fn new(guest_memory: &'a GuestMemory, range: PagedRange<'a>, is_write: bool) -> Self {
        Self {
            range,
            guest_memory,
            is_write,
        }
    }

    /// Returns true if the buffer is empty.
    pub fn is_empty(&self) -> bool {
        self.range.is_empty()
    }

    /// Returns the total length of the buffers in bytes.
    pub fn len(&self) -> usize {
        self.range.len()
    }

    /// Returns the guest memory accessor.
    pub fn guest_memory(&self) -> &GuestMemory {
        self.guest_memory
    }

    /// Returns the internal paged range.
    pub fn range(&self) -> PagedRange<'_> {
        self.range
    }

    /// Returns whether the buffers are all aligned to at least `alignment`
    /// bytes.
    ///
    /// `alignment` must be a power of two.
    pub fn is_aligned(&self, alignment: usize) -> bool {
        assert!(alignment.is_power_of_two());
        // Including PAGE_SIZE in the OR caps the reported alignment at the
        // page size, since the backing guest pages are not necessarily
        // contiguous.
        ((self.range.offset() | self.range.len() | PAGE_SIZE) & (alignment - 1)) == 0
    }

    /// Gets a memory writer for the buffers.
    ///
    /// Returns an empty writer if the buffers are only available for read access.
    pub fn writer(&self) -> impl MemoryWrite + '_ {
        let range = if self.is_write {
            self.range
        } else {
            PagedRange::empty()
        };
        range.writer(self.guest_memory)
    }

    /// Gets a memory reader for the buffers.
    pub fn reader(&self) -> impl MemoryRead + '_ {
        self.range.reader(self.guest_memory)
    }

    /// Locks the guest memory ranges described by this buffer and returns an
    /// object containing [`IoBuffer`]s, suitable for executing asynchronous I/O
    /// operations.
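    ///
    /// # Example
    ///
    /// A minimal sketch (illustration only; assumes `buffers` is a writable
    /// `RequestBuffers` and `submit_io` is a hypothetical IO submission
    /// routine):
    ///
    /// ```ignore
    /// let locked = buffers.lock(true)?;
    /// // The locked vectors remain valid until `locked` is dropped.
    /// submit_io(locked.io_vecs());
    /// ```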
    pub fn lock(&self, for_write: bool) -> Result<LockedIoBuffers, AccessError> {
        if for_write && !self.is_write {
            return Err(AccessError::ReadOnly);
        }
        Ok(LockedIoBuffers(
            self.guest_memory
                .lock_range(self.range, LockedIoVecs::new())?,
        ))
    }

    /// Returns a subrange of this set of buffers.
    ///
    /// Panics if `offset + len > self.len()`.
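    ///
    /// For example (illustration only), calling `subrange(512, 4096)` narrows
    /// the view to bytes `512..4608` of the original request.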
    pub fn subrange(&self, offset: usize, len: usize) -> Self {
        Self {
            range: self.range.subrange(offset, len),
            guest_memory: self.guest_memory,
            is_write: self.is_write,
        }
    }
}

/// A memory range.
#[derive(Debug, Clone)]
pub struct OwnedRequestBuffers {
    gpns: Vec<u64>,
    offset: usize,
    len: usize,
    is_write: bool,
}

impl OwnedRequestBuffers {
    /// A new memory range with the given guest page numbers.
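    ///
    /// # Example
    ///
    /// A minimal sketch (illustration only; assumes `guest_memory` is a
    /// `GuestMemory` whose pages 0 and 1 are accessible):
    ///
    /// ```ignore
    /// let owned = OwnedRequestBuffers::new(&[0, 1]);
    /// let buffers = owned.buffer(&guest_memory);
    /// assert_eq!(buffers.len(), 2 * 4096);
    /// ```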
    pub fn new(gpns: &[u64]) -> Self {
        Self::new_unaligned(gpns, 0, gpns.len() * PAGE_SIZE)
    }

    /// A new memory range with the given guest page numbers, offset by `offset`
    /// bytes, and `len` bytes in length.
    pub fn new_unaligned(gpns: &[u64], offset: usize, len: usize) -> Self {
        Self {
            gpns: gpns.to_vec(),
            offset,
            len,
            is_write: true,
        }
    }

    /// A new memory range containing the linear address range from
    /// `offset..offset+len`.
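    ///
    /// For example (illustration only), `linear(0x1800, 0x2000, true)` spans
    /// guest pages 1 through 3; the range starts 0x800 bytes into page 1, so
    /// the resulting buffers have a page offset of `0x800` and `len = 0x2000`.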
    pub fn linear(offset: u64, len: usize, is_write: bool) -> Self {
        let start_page = offset / PAGE_SIZE as u64;
        let end_page = (offset + len as u64).div_ceil(PAGE_SIZE as u64);
        let gpns: Vec<u64> = (start_page..end_page).collect();
        Self {
            gpns,
            offset: (offset % PAGE_SIZE as u64) as usize,
            len,
            is_write,
        }
    }

    /// A [`RequestBuffers`] referencing this memory range.
    pub fn buffer<'a>(&'a self, guest_memory: &'a GuestMemory) -> RequestBuffers<'a> {
        RequestBuffers::new(
            guest_memory,
            PagedRange::new(self.offset, self.len, &self.gpns).unwrap(),
            self.is_write,
        )
    }

    /// The length of the range in bytes.
    pub fn len(&self) -> usize {
        self.len
    }
}

/// Tracks an active bounce buffer, signaling to the bounce buffer tracker
/// upon drop that pages can be reclaimed.
pub struct TrackedBounceBuffer<'a> {
    /// The active bounce buffer being tracked.
    pub buffer: BounceBuffer,
    /// Reference to the free page counter for the current IO thread.
    free_pages: &'a AtomicUsize,
    /// Used to signal pending bounce buffer requests of newly freed pages.
    event: &'a event_listener::Event,
}

impl Drop for TrackedBounceBuffer<'_> {
    fn drop(&mut self) {
        let pages = self.buffer.len().div_ceil(4096);
        self.free_pages.fetch_add(pages, Ordering::SeqCst);
        self.event.notify(usize::MAX);
    }
}

/// Tracks active bounce buffers against a set limit of pages. If no limit is
/// specified, a default of 8 MB is applied. The limit is tracked per thread of
/// the backing `AffinitizedThreadpool`.
#[derive(Debug)]
pub struct BounceBufferTracker {
    /// Free bounce buffer pages remaining for each thread.
    free_pages: Vec<AtomicUsize>,
    /// Per-thread events used by `TrackedBounceBuffer` to signal that pages
    /// have been freed.
    event: Vec<event_listener::Event>,
}

impl BounceBufferTracker {
    /// Creates a new bounce buffer tracker.
    pub fn new(max_bounce_buffer_pages: usize, threads: usize) -> Self {
        let mut free_pages = Vec::with_capacity(threads);
        let mut event = Vec::with_capacity(threads);

        (0..threads).for_each(|_| {
            event.push(event_listener::Event::new());
            free_pages.push(AtomicUsize::new(max_bounce_buffer_pages));
        });

        Self { free_pages, event }
    }

    /// Attempts to acquire bounce buffers from the tracker, proceeding if pages
    /// are available or waiting until a tracked bounce buffer is dropped, which
    /// triggers the per-thread event to indicate newly freed pages.
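    ///
    /// # Example
    ///
    /// A minimal sketch (illustration only; `thread_count` and `thread_index`
    /// are placeholders for the caller's threadpool bookkeeping):
    ///
    /// ```ignore
    /// let tracker = BounceBufferTracker::new(2048, thread_count);
    /// let tracked = tracker.acquire_bounce_buffers(64 * 1024, thread_index).await;
    /// // Use `tracked.buffer`; dropping `tracked` returns the pages to the
    /// // tracker and wakes any waiters on this thread.
    /// ```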
    pub async fn acquire_bounce_buffers<'a, 'b>(
        &'b self,
        size: usize,
        thread: usize,
    ) -> Box<TrackedBounceBuffer<'a>>
    where
        'b: 'a,
    {
        let pages = size.div_ceil(4096);
        let event = self.event.get(thread).unwrap();
        let free_pages = self.free_pages.get(thread).unwrap();

        loop {
            let listener = event.listen();
            if free_pages
                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| x.checked_sub(pages))
                .is_ok()
            {
                break;
            }
            listener.await;
        }

        Box::new(TrackedBounceBuffer {
            buffer: BounceBuffer::new(size),
            free_pages,
            event,
        })
    }
}