disk_nvme/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Disk backend implementation that uses a user-mode NVMe driver based on VFIO.
5
6#![cfg(any(windows, target_os = "linux"))]
7#![forbid(unsafe_code)]
8#![expect(missing_docs)]
9
10use async_trait::async_trait;
11use disk_backend::DiskError;
12use disk_backend::DiskIo;
13use disk_backend::MediumErrorDetails;
14use disk_backend::pr;
15use inspect::Inspect;
16use nvme_common::from_nvme_reservation_report;
17use nvme_spec::Status;
18use nvme_spec::nvm;
19#[cfg(target_os = "linux")]
20use pal::unix::affinity::get_cpu_number;
21#[cfg(windows)]
22use pal::windows::affinity::get_cpu_number;
23use std::io;
24
#[derive(Debug, Inspect)]
pub struct NvmeDisk {
    /// NVMe namespace mapped to the disk representation.
    #[inspect(flatten)]
    namespace: nvme_driver::NamespaceHandle,
    // log2 of the namespace block size, cached so byte<->block conversions
    // are shifts rather than divisions. Derived from `namespace`, so it is
    // excluded from inspect output.
    #[inspect(skip)]
    block_shift: u32,
}
33
34impl NvmeDisk {
35    pub fn new(namespace: nvme_driver::NamespaceHandle) -> Self {
36        Self {
37            block_shift: namespace.block_size().trailing_zeros(),
38            namespace,
39        }
40    }
41}
42
43impl DiskIo for NvmeDisk {
44    fn disk_type(&self) -> &str {
45        "nvme"
46    }
47
48    fn sector_count(&self) -> u64 {
49        self.namespace.block_count()
50    }
51
52    fn sector_size(&self) -> u32 {
53        self.namespace.block_size()
54    }
55
56    fn disk_id(&self) -> Option<[u8; 16]> {
57        None // TODO
58    }
59
60    fn physical_sector_size(&self) -> u32 {
61        4096 // TODO
62    }
63
64    fn is_fua_respected(&self) -> bool {
65        // NVMe does not provide a way to specify that FUA is ignored.
66        true
67    }
68
69    fn is_read_only(&self) -> bool {
70        false // TODO
71    }
72
73    fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
74        (u8::from(self.namespace.reservation_capabilities()) != 0).then_some(self)
75    }
76
77    async fn read_vectored(
78        &self,
79        buffers: &scsi_buffers::RequestBuffers<'_>,
80        sector: u64,
81    ) -> Result<(), DiskError> {
82        let block_count = buffers.len() as u64 >> self.block_shift;
83        let mut block_offset = 0;
84        while block_offset < block_count {
85            let this_block_count = (block_count - block_offset)
86                .min(self.namespace.max_transfer_block_count().into())
87                as u32;
88
89            self.namespace
90                .read(
91                    get_cpu_number(),
92                    sector + block_offset,
93                    this_block_count,
94                    buffers.guest_memory(),
95                    buffers.range().subrange(
96                        (block_offset as usize) << self.block_shift,
97                        (this_block_count as usize) << self.block_shift,
98                    ),
99                )
100                .await
101                .map_err(map_nvme_error)?;
102
103            block_offset += this_block_count as u64;
104        }
105        Ok(())
106    }
107
108    async fn write_vectored(
109        &self,
110        buffers: &scsi_buffers::RequestBuffers<'_>,
111        sector: u64,
112        fua: bool,
113    ) -> Result<(), DiskError> {
114        let block_count = buffers.len() as u64 >> self.block_shift;
115        let mut block_offset = 0;
116        while block_offset < block_count {
117            let this_block_count = (block_count - block_offset)
118                .min(self.namespace.max_transfer_block_count().into())
119                as u32;
120
121            self.namespace
122                .write(
123                    get_cpu_number(),
124                    sector + block_offset,
125                    this_block_count,
126                    fua,
127                    buffers.guest_memory(),
128                    buffers.range().subrange(
129                        (block_offset as usize) << self.block_shift,
130                        (this_block_count as usize) << self.block_shift,
131                    ),
132                )
133                .await
134                .map_err(map_nvme_error)?;
135
136            block_offset += this_block_count as u64;
137        }
138        Ok(())
139    }
140
141    async fn sync_cache(&self) -> Result<(), DiskError> {
142        self.namespace
143            .flush(get_cpu_number())
144            .await
145            .map_err(map_nvme_error)?;
146        Ok(())
147    }
148
149    async fn wait_resize(&self, sector_count: u64) -> u64 {
150        self.namespace.wait_resize(sector_count).await
151    }
152
153    async fn unmap(
154        &self,
155        sector_offset: u64,
156        sector_count: u64,
157        _block_level_only: bool,
158    ) -> Result<(), DiskError> {
159        if !self.namespace.supports_dataset_management() {
160            return Ok(());
161        }
162        let mut processed = 0;
163        let max = self.namespace.dataset_management_range_size_limit();
164        while processed < sector_count {
165            let lba_count = (sector_count - processed).min(max.into());
166            self.namespace
167                .deallocate(
168                    get_cpu_number(),
169                    &[nvm::DsmRange {
170                        context_attributes: 0,
171                        lba_count: lba_count as u32,
172                        starting_lba: sector_offset + processed,
173                    }],
174                )
175                .await
176                .map_err(map_nvme_error)?;
177
178            processed += lba_count;
179        }
180        Ok(())
181    }
182
183    fn unmap_behavior(&self) -> disk_backend::UnmapBehavior {
184        if self.namespace.supports_dataset_management() {
185            disk_backend::UnmapBehavior::Unspecified
186        } else {
187            disk_backend::UnmapBehavior::Ignored
188        }
189    }
190
191    fn optimal_unmap_sectors(&self) -> u32 {
192        self.namespace.preferred_deallocate_granularity().into()
193    }
194}
195
196#[async_trait]
197impl pr::PersistentReservation for NvmeDisk {
198    fn capabilities(&self) -> pr::ReservationCapabilities {
199        nvme_common::from_nvme_reservation_capabilities(self.namespace.reservation_capabilities())
200    }
201
202    async fn report(&self) -> Result<pr::ReservationReport, DiskError> {
203        let (report, controllers) = self
204            .namespace
205            .reservation_report_extended(get_cpu_number())
206            .await
207            .map_err(map_nvme_error)?;
208
209        from_nvme_reservation_report(&report.report, &controllers)
210            .map_err(|err| DiskError::Io(io::Error::new(io::ErrorKind::InvalidInput, err)))
211    }
212
213    async fn register(
214        &self,
215        current_key: Option<u64>,
216        new_key: u64,
217        ptpl: Option<bool>,
218    ) -> Result<(), DiskError> {
219        let action = if new_key == 0 {
220            nvm::ReservationRegisterAction::UNREGISTER
221        } else if current_key.is_some() {
222            nvm::ReservationRegisterAction::REPLACE
223        } else {
224            nvm::ReservationRegisterAction::REGISTER
225        };
226        self.namespace
227            .reservation_register(get_cpu_number(), action, current_key, new_key, ptpl)
228            .await
229            .map_err(map_nvme_error)?;
230
231        Ok(())
232    }
233
234    async fn reserve(
235        &self,
236        key: u64,
237        reservation_type: pr::ReservationType,
238    ) -> Result<(), DiskError> {
239        self.namespace
240            .reservation_acquire(
241                get_cpu_number(),
242                nvm::ReservationAcquireAction::ACQUIRE,
243                key,
244                0,
245                nvme_common::to_nvme_reservation_type(reservation_type),
246            )
247            .await
248            .map_err(map_nvme_error)?;
249
250        Ok(())
251    }
252
253    async fn release(
254        &self,
255        key: u64,
256        reservation_type: pr::ReservationType,
257    ) -> Result<(), DiskError> {
258        self.namespace
259            .reservation_release(
260                get_cpu_number(),
261                nvm::ReservationReleaseAction::RELEASE,
262                key,
263                nvme_common::to_nvme_reservation_type(reservation_type),
264            )
265            .await
266            .map_err(map_nvme_error)?;
267
268        Ok(())
269    }
270
271    async fn clear(&self, key: u64) -> Result<(), DiskError> {
272        self.namespace
273            .reservation_release(
274                get_cpu_number(),
275                nvm::ReservationReleaseAction::CLEAR,
276                key,
277                nvm::ReservationType(0),
278            )
279            .await
280            .map_err(map_nvme_error)?;
281
282        Ok(())
283    }
284
285    async fn preempt(
286        &self,
287        current_key: u64,
288        preempt_key: u64,
289        reservation_type: pr::ReservationType,
290        abort: bool,
291    ) -> Result<(), DiskError> {
292        self.namespace
293            .reservation_acquire(
294                get_cpu_number(),
295                if abort {
296                    nvm::ReservationAcquireAction::PREEMPT_AND_ABORT
297                } else {
298                    nvm::ReservationAcquireAction::PREEMPT
299                },
300                current_key,
301                preempt_key,
302                nvme_common::to_nvme_reservation_type(reservation_type),
303            )
304            .await
305            .map_err(map_nvme_error)?;
306
307        Ok(())
308    }
309}
310
311fn map_nvme_error(err: nvme_driver::RequestError) -> DiskError {
312    match err {
313        err @ nvme_driver::RequestError::Gone(_) => {
314            DiskError::Io(io::Error::new(io::ErrorKind::NotConnected, err))
315        }
316        nvme_driver::RequestError::Nvme(err) => {
317            match err.status() {
318                Status::RESERVATION_CONFLICT => DiskError::ReservationConflict,
319
320                Status::INVALID_FIELD_IN_COMMAND => DiskError::InvalidInput,
321
322                Status::LBA_OUT_OF_RANGE => DiskError::IllegalBlock,
323
324                // MediumError
325                Status::DATA_TRANSFER_ERROR | Status::CAPACITY_EXCEEDED => {
326                    DiskError::Io(io::Error::other(err))
327                }
328                Status::MEDIA_WRITE_FAULT => {
329                    DiskError::MediumError(io::Error::other(err), MediumErrorDetails::WriteFault)
330                }
331                Status::MEDIA_UNRECOVERED_READ_ERROR => DiskError::MediumError(
332                    io::Error::other(err),
333                    MediumErrorDetails::UnrecoveredReadError,
334                ),
335                Status::MEDIA_END_TO_END_GUARD_CHECK_ERROR => DiskError::MediumError(
336                    io::Error::other(err),
337                    MediumErrorDetails::GuardCheckFailed,
338                ),
339                Status::MEDIA_END_TO_END_APPLICATION_TAG_CHECK_ERROR => DiskError::MediumError(
340                    io::Error::other(err),
341                    MediumErrorDetails::ApplicationTagCheckFailed,
342                ),
343                Status::MEDIA_END_TO_END_REFERENCE_TAG_CHECK_ERROR => DiskError::MediumError(
344                    io::Error::other(err),
345                    MediumErrorDetails::ReferenceTagCheckFailed,
346                ),
347
348                Status::COMMAND_ABORTED_DUE_TO_PREEMPT_AND_ABORT => {
349                    DiskError::AbortDueToPreemptAndAbort
350                }
351
352                _ => DiskError::Io(io::Error::other(err)),
353            }
354        }
355        nvme_driver::RequestError::Memory(err) => DiskError::MemoryAccess(err.into()),
356        err @ nvme_driver::RequestError::TooLarge => {
357            DiskError::Io(io::Error::new(io::ErrorKind::InvalidInput, err))
358        }
359    }
360}