disk_nvme/lib.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! Disk backend implementation that uses a user-mode NVMe driver based on VFIO.

#![cfg(target_os = "linux")]
#![forbid(unsafe_code)]
#![expect(missing_docs)]

use async_trait::async_trait;
use disk_backend::DiskError;
use disk_backend::DiskIo;
use disk_backend::MediumErrorDetails;
use disk_backend::pr;
use inspect::Inspect;
use nvme_common::from_nvme_reservation_report;
use nvme_spec::Status;
use nvme_spec::nvm;
use pal::unix::affinity::get_cpu_number;
use std::io;

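// A disk backend backed by a single namespace of the user-mode NVMe driver.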
#[derive(Debug, Inspect)]
pub struct NvmeDisk {
    /// NVMe namespace mapped to the disk representation.
    #[inspect(flatten)]
    namespace: nvme_driver::Namespace,
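    /// Log2 of the namespace block size, cached so byte offsets and lengths
    /// can be converted to block counts with shifts.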
    #[inspect(skip)]
    block_shift: u32,
}

impl NvmeDisk {
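    // Wraps the given namespace, precomputing the block-size shift used for
    // byte/block conversions.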
    pub fn new(namespace: nvme_driver::Namespace) -> Self {
        Self {
            block_shift: namespace.block_size().trailing_zeros(),
            namespace,
        }
    }
}

impl DiskIo for NvmeDisk {
    fn disk_type(&self) -> &str {
        "nvme"
    }

    fn sector_count(&self) -> u64 {
        self.namespace.block_count()
    }

    fn sector_size(&self) -> u32 {
        self.namespace.block_size()
    }

    fn disk_id(&self) -> Option<[u8; 16]> {
        None // TODO
    }

    fn physical_sector_size(&self) -> u32 {
        4096 // TODO
    }

    fn is_fua_respected(&self) -> bool {
        // NVMe does not provide a way to specify that FUA is ignored.
        true
    }

    fn is_read_only(&self) -> bool {
        false // TODO
    }

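    // Only expose persistent reservation support if the namespace reports at
    // least one reservation capability.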
    fn pr(&self) -> Option<&dyn pr::PersistentReservation> {
        (u8::from(self.namespace.reservation_capabilities()) != 0).then_some(self)
    }

    async fn read_vectored(
        &self,
        buffers: &scsi_buffers::RequestBuffers<'_>,
        sector: u64,
    ) -> Result<(), DiskError> {
        let block_count = buffers.len() as u64 >> self.block_shift;
        let mut block_offset = 0;
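        // Issue the read in chunks no larger than the namespace's maximum
        // transfer size.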
        while block_offset < block_count {
            let this_block_count = (block_count - block_offset)
                .min(self.namespace.max_transfer_block_count().into())
                as u32;

            self.namespace
                .read(
                    get_cpu_number(),
                    sector + block_offset,
                    this_block_count,
                    buffers.guest_memory(),
                    buffers.range().subrange(
                        (block_offset as usize) << self.block_shift,
                        (this_block_count as usize) << self.block_shift,
                    ),
                )
                .await
                .map_err(map_nvme_error)?;

            block_offset += this_block_count as u64;
        }
        Ok(())
    }

    async fn write_vectored(
        &self,
        buffers: &scsi_buffers::RequestBuffers<'_>,
        sector: u64,
        fua: bool,
    ) -> Result<(), DiskError> {
        let block_count = buffers.len() as u64 >> self.block_shift;
        let mut block_offset = 0;
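        // As with reads, split the write into maximum-transfer-sized chunks,
        // passing the FUA bit through on each one.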
        while block_offset < block_count {
            let this_block_count = (block_count - block_offset)
                .min(self.namespace.max_transfer_block_count().into())
                as u32;

            self.namespace
                .write(
                    get_cpu_number(),
                    sector + block_offset,
                    this_block_count,
                    fua,
                    buffers.guest_memory(),
                    buffers.range().subrange(
                        (block_offset as usize) << self.block_shift,
                        (this_block_count as usize) << self.block_shift,
                    ),
                )
                .await
                .map_err(map_nvme_error)?;

            block_offset += this_block_count as u64;
        }
        Ok(())
    }

    async fn sync_cache(&self) -> Result<(), DiskError> {
        self.namespace
            .flush(get_cpu_number())
            .await
            .map_err(map_nvme_error)?;
        Ok(())
    }

    async fn wait_resize(&self, sector_count: u64) -> u64 {
        self.namespace.wait_resize(sector_count).await
    }

    async fn unmap(
        &self,
        sector_offset: u64,
        sector_count: u64,
        _block_level_only: bool,
    ) -> Result<(), DiskError> {
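        // Treat unmap as a no-op when the namespace does not support Dataset
        // Management.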
        if !self.namespace.supports_dataset_management() {
            return Ok(());
        }
        let mut processed = 0;
        let max = self.namespace.dataset_management_range_size_limit();
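        // Deallocate one range at a time, each capped at the namespace's
        // per-range size limit.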
        while processed < sector_count {
            let lba_count = (sector_count - processed).min(max.into());
            self.namespace
                .deallocate(
                    get_cpu_number(),
                    &[nvm::DsmRange {
                        context_attributes: 0,
                        lba_count: lba_count as u32,
                        starting_lba: sector_offset + processed,
                    }],
                )
                .await
                .map_err(map_nvme_error)?;

            processed += lba_count;
        }
        Ok(())
    }

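    // NVMe gives no guarantee about what deallocated blocks read back as, so
    // report the behavior as unspecified rather than zeroed.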
    fn unmap_behavior(&self) -> disk_backend::UnmapBehavior {
        if self.namespace.supports_dataset_management() {
            disk_backend::UnmapBehavior::Unspecified
        } else {
            disk_backend::UnmapBehavior::Ignored
        }
    }

    fn optimal_unmap_sectors(&self) -> u32 {
        self.namespace.preferred_deallocate_granularity().into()
    }
}

#[async_trait]
impl pr::PersistentReservation for NvmeDisk {
    fn capabilities(&self) -> pr::ReservationCapabilities {
        nvme_common::from_nvme_reservation_capabilities(self.namespace.reservation_capabilities())
    }

    async fn report(&self) -> Result<pr::ReservationReport, DiskError> {
        let (report, controllers) = self
            .namespace
            .reservation_report_extended(get_cpu_number())
            .await
            .map_err(map_nvme_error)?;

        from_nvme_reservation_report(&report.report, &controllers)
            .map_err(|err| DiskError::Io(io::Error::new(io::ErrorKind::InvalidInput, err)))
    }

    async fn register(
        &self,
        current_key: Option<u64>,
        new_key: u64,
        ptpl: Option<bool>,
    ) -> Result<(), DiskError> {
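        // Choose the NVMe register action: a zero key unregisters, a provided
        // current key replaces, and otherwise this is a new registration.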
        let action = if new_key == 0 {
            nvm::ReservationRegisterAction::UNREGISTER
        } else if current_key.is_some() {
            nvm::ReservationRegisterAction::REPLACE
        } else {
            nvm::ReservationRegisterAction::REGISTER
        };
        self.namespace
            .reservation_register(get_cpu_number(), action, current_key, new_key, ptpl)
            .await
            .map_err(map_nvme_error)?;

        Ok(())
    }

    async fn reserve(
        &self,
        key: u64,
        reservation_type: pr::ReservationType,
    ) -> Result<(), DiskError> {
        self.namespace
            .reservation_acquire(
                get_cpu_number(),
                nvm::ReservationAcquireAction::ACQUIRE,
                key,
                0,
                nvme_common::to_nvme_reservation_type(reservation_type),
            )
            .await
            .map_err(map_nvme_error)?;

        Ok(())
    }

    async fn release(
        &self,
        key: u64,
        reservation_type: pr::ReservationType,
    ) -> Result<(), DiskError> {
        self.namespace
            .reservation_release(
                get_cpu_number(),
                nvm::ReservationReleaseAction::RELEASE,
                key,
                nvme_common::to_nvme_reservation_type(reservation_type),
            )
            .await
            .map_err(map_nvme_error)?;

        Ok(())
    }

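    // Clearing is expressed as a reservation release with the CLEAR action;
    // the reservation type is not meaningful for a clear, so zero is passed.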
    async fn clear(&self, key: u64) -> Result<(), DiskError> {
        self.namespace
            .reservation_release(
                get_cpu_number(),
                nvm::ReservationReleaseAction::CLEAR,
                key,
                nvm::ReservationType(0),
            )
            .await
            .map_err(map_nvme_error)?;

        Ok(())
    }

    async fn preempt(
        &self,
        current_key: u64,
        preempt_key: u64,
        reservation_type: pr::ReservationType,
        abort: bool,
    ) -> Result<(), DiskError> {
        self.namespace
            .reservation_acquire(
                get_cpu_number(),
                if abort {
                    nvm::ReservationAcquireAction::PREEMPT_AND_ABORT
                } else {
                    nvm::ReservationAcquireAction::PREEMPT
                },
                current_key,
                preempt_key,
                nvme_common::to_nvme_reservation_type(reservation_type),
            )
            .await
            .map_err(map_nvme_error)?;

        Ok(())
    }
}

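/// Maps an NVMe driver request failure onto the disk backend's [`DiskError`]
/// type.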
fn map_nvme_error(err: nvme_driver::RequestError) -> DiskError {
    match err {
        err @ nvme_driver::RequestError::Gone(_) => {
            DiskError::Io(io::Error::new(io::ErrorKind::NotConnected, err))
        }
        nvme_driver::RequestError::Nvme(err) => {
            match err.status() {
                Status::RESERVATION_CONFLICT => DiskError::ReservationConflict,

                Status::INVALID_FIELD_IN_COMMAND => DiskError::InvalidInput,

                Status::LBA_OUT_OF_RANGE => DiskError::IllegalBlock,

                // MediumError
                Status::DATA_TRANSFER_ERROR | Status::CAPACITY_EXCEEDED => {
                    DiskError::Io(io::Error::other(err))
                }
                Status::MEDIA_WRITE_FAULT => {
                    DiskError::MediumError(io::Error::other(err), MediumErrorDetails::WriteFault)
                }
                Status::MEDIA_UNRECOVERED_READ_ERROR => DiskError::MediumError(
                    io::Error::other(err),
                    MediumErrorDetails::UnrecoveredReadError,
                ),
                Status::MEDIA_END_TO_END_GUARD_CHECK_ERROR => DiskError::MediumError(
                    io::Error::other(err),
                    MediumErrorDetails::GuardCheckFailed,
                ),
                Status::MEDIA_END_TO_END_APPLICATION_TAG_CHECK_ERROR => DiskError::MediumError(
                    io::Error::other(err),
                    MediumErrorDetails::ApplicationTagCheckFailed,
                ),
                Status::MEDIA_END_TO_END_REFERENCE_TAG_CHECK_ERROR => DiskError::MediumError(
                    io::Error::other(err),
                    MediumErrorDetails::ReferenceTagCheckFailed,
                ),

                Status::COMMAND_ABORTED_DUE_TO_PREEMPT_AND_ABORT => {
                    DiskError::AbortDueToPreemptAndAbort
                }

                _ => DiskError::Io(io::Error::other(err)),
            }
        }
        nvme_driver::RequestError::Memory(err) => DiskError::MemoryAccess(err.into()),
        err @ nvme_driver::RequestError::TooLarge => {
            DiskError::Io(io::Error::new(io::ErrorKind::InvalidInput, err))
        }
    }
}