// nvme_driver/namespace.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! NVMe namespace frontend.
5
6use super::spec;
7use super::spec::nvm;
8use crate::NVME_PAGE_SHIFT;
9use crate::driver::IoIssuers;
10use crate::driver::save_restore::SavedNamespaceData;
11use crate::queue_pair::Issuer;
12use crate::queue_pair::RequestError;
13use crate::queue_pair::admin_cmd;
14use guestmem::GuestMemory;
15use guestmem::ranges::PagedRange;
16use inspect::Inspect;
17use mesh::CancelContext;
18use pal_async::task::Spawn;
19use parking_lot::Mutex;
20use std::sync::Arc;
21use std::sync::atomic::AtomicBool;
22use std::sync::atomic::AtomicU64;
23use std::sync::atomic::Ordering;
24use thiserror::Error;
25use vmcore::vm_task::VmTaskDriver;
26use zerocopy::FromBytes;
27use zerocopy::FromZeros;
28use zerocopy::IntoBytes;
29
30/// An error getting a namespace.
31#[derive(Debug, Error)]
32#[expect(missing_docs)]
33pub enum NamespaceError {
34    #[error("namespace not found")]
35    NotFound,
36    #[error("formatted lba size invalid")]
37    FlbasInvalid,
38    #[error("lba format invalid: {0:?}")]
39    LbaFormatInvalid(nvm::Lbaf),
40    #[error("nvme request failed")]
41    Request(#[source] RequestError),
42    #[error("maximum data transfer size too small: 2^{0} pages")]
43    MdtsInvalid(u8),
44}
45
/// An NVMe namespace.
#[derive(Debug, Inspect)]
pub struct Namespace {
    /// The namespace ID (NSID) this object operates on.
    nsid: u32,
    /// State that may change at runtime via rescans (size, removal).
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    /// log2 of the block size in bytes (validated to be in 9..=16).
    block_shift: u32,
    /// Maximum size of a single read or write, in blocks.
    max_transfer_block_count: u32,
    /// Preferred granularity for deallocate (unmap) requests.
    preferred_deallocate_granularity: u16,
    /// Reservation capabilities; empty when the controller does not support
    /// reservations.
    reservation_capabilities: nvm::ReservationCapabilities,
    /// Controller identify data, shared with the rest of the driver.
    controller_identify: Arc<spec::IdentifyController>,
    /// Per-CPU I/O queue issuers.
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
    /// Cancels the background rescan task when this object is dropped.
    #[inspect(skip)]
    _cancel_rescan: mesh::Cancel,
}
62
/// Namespace state that may change at runtime in response to rescans.
#[derive(Debug, Inspect)]
struct DynamicState {
    /// Current namespace size in blocks (identify `nsze`).
    block_count: AtomicU64,
    /// Notified whenever `block_count` changes; see `Namespace::wait_resize`.
    #[inspect(skip)]
    resize_event: event_listener::Event,
    /// Set when a rescan observes the namespace has been hot removed.
    removed: AtomicBool,
    /// Most recent identify data returned by the controller.
    identify: Mutex<nvm::IdentifyNamespace>,
}
71
72impl Namespace {
73    pub(super) async fn new(
74        driver: &VmTaskDriver,
75        admin: Arc<Issuer>,
76        rescan_event: Arc<event_listener::Event>,
77        controller_identify: Arc<spec::IdentifyController>,
78        io_issuers: &Arc<IoIssuers>,
79        nsid: u32,
80    ) -> Result<Self, NamespaceError> {
81        let identify = identify_namespace(&admin, nsid)
82            .await
83            .map_err(NamespaceError::Request)?;
84
85        Namespace::new_from_identify(
86            driver,
87            admin,
88            rescan_event,
89            controller_identify.clone(),
90            io_issuers,
91            nsid,
92            identify,
93        )
94    }
95
    /// Create Namespace object from Identify data structure.
    ///
    /// Validates the identify data (namespace existence, LBA format, and
    /// maximum transfer size), then spawns a detached background task that
    /// re-queries the namespace whenever `rescan_event` fires.
    fn new_from_identify(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: Arc<event_listener::Event>,
        controller_identify: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        nsid: u32,
        identify: nvm::IdentifyNamespace,
    ) -> Result<Self, NamespaceError> {
        // A zero-sized namespace indicates the NSID is not in use.
        if identify.nsze == 0 {
            return Err(NamespaceError::NotFound);
        }

        // NOTE(review): only the low bits of FLBAS are consulted; extended
        // LBA format indexes are presumably not supported here -- confirm.
        let lba_format_index = identify.flbas.low_index();
        if lba_format_index > identify.nlbaf {
            return Err(NamespaceError::FlbasInvalid);
        }

        // Only block sizes from 512 bytes (2^9) to 64 KiB (2^16) are
        // supported.
        let lbaf = identify.lbaf[lba_format_index as usize];
        let block_shift = lbaf.lbads();
        if !matches!(block_shift, 9..=16) {
            return Err(NamespaceError::LbaFormatInvalid(lbaf));
        }

        // Derive the maximum transfer size in blocks from the controller's
        // MDTS (a power-of-two page count; 0 is treated as unlimited). The
        // shift is capped at 16 so the result stays a reasonable u32.
        let max_transfer_block_count = {
            let mdts = if controller_identify.mdts != 0 {
                controller_identify.mdts
            } else {
                u8::MAX
            };
            let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
            1 << max_transfer_bits
                .checked_sub(block_shift)
                .ok_or(NamespaceError::MdtsInvalid(mdts))?
                .min(16)
        };

        // NPDG is only valid when the namespace reports the optimal
        // performance fields (NSFEAT.OPTPERF); otherwise use a single block.
        let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
            identify.npdg
        } else {
            1
        };

        // Reservation capabilities are only meaningful when the controller
        // supports reservations (ONCS).
        let reservation_capabilities = if controller_identify.oncs.reservations() {
            identify.rescap
        } else {
            nvm::ReservationCapabilities::new()
        };

        let state = Arc::new(DynamicState {
            block_count: identify.nsze.into(),
            removed: false.into(),
            identify: Mutex::new(identify),
            resize_event: Default::default(),
        });

        // Spawn a task, but detach it so that it doesn't get dropped while an
        // NVMe request is in flight. Use a cancel context, whose cancel gets
        // dropped when `self` gets dropped, so that it terminates after
        // finishing any requests.
        let (mut ctx, cancel_rescan) = CancelContext::new().with_cancel();
        driver
            .spawn(format!("nvme_poll_rescan_{nsid}"), {
                let state = state.clone();
                async move {
                    state
                        .poll_for_rescans(&mut ctx, &admin, nsid, &rescan_event)
                        .await
                }
            })
            .detach();

        Ok(Self {
            nsid,
            state,
            max_transfer_block_count,
            block_shift: block_shift.into(),
            preferred_deallocate_granularity,
            reservation_capabilities,
            controller_identify,
            issuers: io_issuers.clone(),
            _cancel_rescan: cancel_rescan,
        })
    }
181
    /// Gets the current block count.
    ///
    /// This may change at runtime if the namespace is resized; use
    /// [`Self::wait_resize`] to observe changes.
    pub fn block_count(&self) -> u64 {
        self.state.block_count.load(Ordering::Relaxed)
    }
186
    /// Wait for the block count to be different from `block_count`.
    ///
    /// Returns the new block count once it differs.
    pub async fn wait_resize(&self, block_count: u64) -> u64 {
        loop {
            // Register the listener before reading the current count so that
            // a resize notification arriving between the read and the await
            // is not lost.
            let listen = self.state.resize_event.listen();
            let current = self.block_count();
            if current != block_count {
                break current;
            }
            listen.await;
        }
    }
198
    /// Gets the block size in bytes.
    pub fn block_size(&self) -> u32 {
        // block_shift is validated at construction to be in 9..=16, so this
        // cannot overflow.
        1 << self.block_shift
    }
203
204    fn check_active(&self) -> Result<(), RequestError> {
205        if self.state.removed.load(Ordering::Relaxed) {
206            // The namespace has been removed. Return invalid namespace even if
207            // the namespace has returned to avoid accidentally accessing the
208            // wrong disk.
209            return Err(RequestError::Nvme(
210                spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
211            ));
212        }
213        Ok(())
214    }
215
    /// Gets the I/O issuer to use for requests submitted from `cpu`.
    async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
        self.issuers.get(cpu).await
    }
219
220    /// Reads from the namespace.
221    pub async fn read(
222        &self,
223        target_cpu: u32,
224        lba: u64,
225        block_count: u32,
226        guest_memory: &GuestMemory,
227        mem: PagedRange<'_>,
228    ) -> Result<(), RequestError> {
229        self.check_active()?;
230        if block_count == 0 {
231            return Ok(());
232        }
233        assert!(block_count <= self.max_transfer_block_count);
234        let len = (block_count as usize) << self.block_shift;
235        if len > mem.len() {
236            panic!(
237                "invalid block count: {len} > {mem_len}",
238                mem_len = mem.len()
239            );
240        }
241        self.issuer(target_cpu)
242            .await?
243            .issue_external(
244                spec::Command {
245                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
246                    cdw11: nvm::Cdw11ReadWrite::new()
247                        .with_sbla_high((lba >> 32) as u32)
248                        .into(),
249                    cdw12: nvm::Cdw12ReadWrite::new()
250                        .with_nlb_z((block_count - 1) as u16)
251                        .into(),
252                    ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
253                },
254                guest_memory,
255                mem.subrange(0, len),
256            )
257            .await?;
258        Ok(())
259    }
260
261    /// Writes to the namespace.
262    pub async fn write(
263        &self,
264        target_cpu: u32,
265        lba: u64,
266        block_count: u32,
267        fua: bool,
268        guest_memory: &GuestMemory,
269        mem: PagedRange<'_>,
270    ) -> Result<(), RequestError> {
271        self.check_active()?;
272        if block_count == 0 {
273            return Ok(());
274        }
275        assert!(block_count <= self.max_transfer_block_count);
276        let len = (block_count as usize) << self.block_shift;
277        if len > mem.len() {
278            panic!(
279                "invalid block count: {len} > {mem_len}",
280                mem_len = mem.len()
281            );
282        }
283        self.issuer(target_cpu)
284            .await?
285            .issue_external(
286                spec::Command {
287                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
288                    cdw11: nvm::Cdw11ReadWrite::new()
289                        .with_sbla_high((lba >> 32) as u32)
290                        .into(),
291                    cdw12: nvm::Cdw12ReadWrite::new()
292                        .with_nlb_z((block_count - 1) as u16)
293                        .with_fua(fua)
294                        .into(),
295                    ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
296                },
297                guest_memory,
298                mem.subrange(0, len),
299            )
300            .await?;
301        Ok(())
302    }
303
304    /// Flushes the namespace to persistent media.
305    pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
306        self.check_active()?;
307        self.issuer(target_cpu)
308            .await?
309            .issue_neither(spec::Command {
310                ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
311            })
312            .await?;
313        Ok(())
314    }
315
    /// Returns the maximum size for a read or write, in blocks.
    ///
    /// Derived from the controller's MDTS at construction time.
    pub fn max_transfer_block_count(&self) -> u32 {
        self.max_transfer_block_count
    }
320
    /// Returns whether the namespace supports dataset management, needed to
    /// call [`Self::deallocate`].
    pub fn supports_dataset_management(&self) -> bool {
        self.controller_identify.oncs.dataset_management()
    }
326
    /// The preferred granularity for unmap requests.
    ///
    /// Taken from the namespace's NPDG field when NSFEAT.OPTPERF is set;
    /// otherwise 1.
    pub fn preferred_deallocate_granularity(&self) -> u16 {
        self.preferred_deallocate_granularity
    }
331
    /// Returns the maximum number of ranges to pass to [`Self::deallocate`].
    ///
    /// Currently a fixed conservative value rather than the
    /// controller-reported limit.
    pub fn dataset_management_range_limit(&self) -> usize {
        // TODO: query DMRL
        256
    }
337
    /// Returns the maximum size of a single range to pass to
    /// [`Self::deallocate`].
    ///
    /// Currently unbounded rather than the controller-reported limit.
    pub fn dataset_management_range_size_limit(&self) -> u32 {
        // TODO: query DMRSL
        u32::MAX
    }
344
345    /// Issues a dataset management command to deallocate the specified ranges.
346    ///
347    /// The device may ignore ranges or LBA counts beyond a certain point. Use
348    /// [`Self::dataset_management_range_limit`] and
349    /// [`Self::dataset_management_range_size_limit`] to get the
350    /// controller-reported bounds.
351    pub async fn deallocate(
352        &self,
353        target_cpu: u32,
354        ranges: &[nvm::DsmRange],
355    ) -> Result<(), RequestError> {
356        self.check_active()?;
357        // Limit the requested ranges.
358        let ranges = &ranges[..ranges.len().min(256)];
359        self.issuer(target_cpu)
360            .await?
361            .issue_in(
362                spec::Command {
363                    cdw10: nvm::Cdw10Dsm::new()
364                        .with_nr_z((ranges.len() - 1) as u8)
365                        .into(),
366                    cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
367                    ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
368                },
369                ranges.as_bytes(),
370            )
371            .await?;
372        Ok(())
373    }
374
    /// Gets the namespace's reservation capabilities.
    ///
    /// Empty when the controller does not support reservations.
    pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
        self.reservation_capabilities
    }
379
    /// Gets the namespace's reservation report.
    ///
    /// Returns the extended report header plus one entry per registered
    /// controller.
    pub async fn reservation_report_extended(
        &self,
        target_cpu: u32,
    ) -> Result<
        (
            nvm::ReservationReportExtended,
            Vec<nvm::RegisteredControllerExtended>,
        ),
        RequestError,
    > {
        // Start with one page; grow and retry if the report does not fit.
        let mut data = vec![0; 4096];
        let issuer = self.issuer(target_cpu).await?;
        loop {
            issuer
                .issue_out(
                    spec::Command {
                        // NUMD is a zero-based count of dwords.
                        cdw10: nvm::Cdw10ReservationReport::new()
                            .with_numd_z((data.len() / 4 - 1) as u32)
                            .into(),
                        // EDS: request the extended data structure.
                        cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
                        ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
                    },
                    &mut data,
                )
                .await?;

            let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
                .unwrap()
                .0; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
            // Full report size: header plus one entry per registered
            // controller.
            let len = size_of_val(&header)
                + header.report.regctl.get() as usize
                    * size_of::<nvm::RegisteredControllerExtended>();

            // Buffer was too small to hold every entry; enlarge it and issue
            // the command again so the data is consistent.
            if len > data.len() {
                data.resize(len, 0);
                continue;
            }

            let mut controllers = vec![
                nvm::RegisteredControllerExtended::new_zeroed();
                header.report.regctl.get().into()
            ];

            controllers
                .as_mut_bytes()
                .copy_from_slice(&data[size_of_val(&header)..len]);

            break Ok((header, controllers));
        }
    }
431
432    /// Acquires a reservation.
433    pub async fn reservation_acquire(
434        &self,
435        target_cpu: u32,
436        action: nvm::ReservationAcquireAction,
437        crkey: u64,
438        prkey: u64,
439        reservation_type: nvm::ReservationType,
440    ) -> Result<(), RequestError> {
441        let data = nvm::ReservationAcquire { crkey, prkey };
442        self.issuer(target_cpu)
443            .await?
444            .issue_in(
445                spec::Command {
446                    cdw10: nvm::Cdw10ReservationAcquire::new()
447                        .with_racqa(action.0)
448                        .with_rtype(reservation_type.0)
449                        .into(),
450                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
451                },
452                data.as_bytes(),
453            )
454            .await?;
455
456        Ok(())
457    }
458
459    /// Releases a reservation.
460    pub async fn reservation_release(
461        &self,
462        target_cpu: u32,
463        action: nvm::ReservationReleaseAction,
464        crkey: u64,
465        reservation_type: nvm::ReservationType,
466    ) -> Result<(), RequestError> {
467        let data = nvm::ReservationRelease { crkey };
468        self.issuer(target_cpu)
469            .await?
470            .issue_in(
471                spec::Command {
472                    cdw10: nvm::Cdw10ReservationRelease::new()
473                        .with_rrela(action.0)
474                        .with_rtype(reservation_type.0)
475                        .into(),
476                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
477                },
478                data.as_bytes(),
479            )
480            .await?;
481
482        Ok(())
483    }
484
485    /// Modifies a reservation registration.
486    pub async fn reservation_register(
487        &self,
488        target_cpu: u32,
489        action: nvm::ReservationRegisterAction,
490        crkey: Option<u64>,
491        nrkey: u64,
492        ptpl: Option<bool>,
493    ) -> Result<(), RequestError> {
494        let data = nvm::ReservationRegister {
495            crkey: crkey.unwrap_or(0),
496            nrkey,
497        };
498        let cptpl = match ptpl {
499            None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
500            Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
501            Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
502        };
503        self.issuer(target_cpu)
504            .await?
505            .issue_in(
506                spec::Command {
507                    cdw10: nvm::Cdw10ReservationRegister::new()
508                        .with_rrega(action.0)
509                        .with_iekey(crkey.is_none())
510                        .with_cptpl(cptpl.0)
511                        .into(),
512                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
513                },
514                data.as_bytes(),
515            )
516            .await?;
517
518        Ok(())
519    }
520
    /// Returns the namespace ID (NSID).
    pub fn nsid(&self) -> u32 {
        self.nsid
    }
525
526    /// Save namespace object data for servicing.
527    /// Initially we will re-query namespace state after restore
528    /// to avoid possible contention if namespace was changed
529    /// during servicing.
530    /// TODO: Re-enable namespace save/restore once we confirm
531    /// that we can process namespace change AEN.
532    pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
533        Ok(SavedNamespaceData {
534            nsid: self.nsid,
535            identify_ns: self.state.identify.lock().clone(),
536        })
537    }
538
539    /// Restore namespace object data after servicing.
540    pub(super) fn restore(
541        driver: &VmTaskDriver,
542        admin: Arc<Issuer>,
543        rescan_event: Arc<event_listener::Event>,
544        identify_ctrl: Arc<spec::IdentifyController>,
545        io_issuers: &Arc<IoIssuers>,
546        saved_state: &SavedNamespaceData,
547    ) -> Result<Self, NamespaceError> {
548        let SavedNamespaceData { nsid, identify_ns } = saved_state;
549
550        Namespace::new_from_identify(
551            driver,
552            admin,
553            rescan_event,
554            identify_ctrl.clone(),
555            io_issuers,
556            *nsid,
557            identify_ns.clone(),
558        )
559    }
560}
561
impl DynamicState {
    /// Background task body: re-queries the namespace's identify data each
    /// time `rescan_event` fires, updating the size/removal state, until
    /// `ctx` is cancelled.
    async fn poll_for_rescans(
        &self,
        ctx: &mut CancelContext,
        admin: &Issuer,
        nsid: u32,
        rescan_event: &event_listener::Event,
    ) {
        loop {
            // Register the listener before querying so that a notification
            // arriving during the query is not missed.
            let listen = rescan_event.listen();
            tracing::debug!("rescan");
            // Query again even the first time through the loop to make sure
            // we didn't miss the initial rescan notification.
            match identify_namespace(admin, nsid).await {
                Ok(identify) => {
                    // A zero size means the NSID is no longer in use.
                    if identify.nsze == 0 {
                        tracing::info!(nsid, "namespace was hot removed");
                        self.removed.store(true, Ordering::Relaxed);
                    } else {
                        let old_block_count = self.block_count.load(Ordering::Relaxed);
                        let new_block_count = identify.nsze;
                        if old_block_count != new_block_count {
                            tracing::info!(
                                old_block_count,
                                new_block_count,
                                "nvme disk size changed"
                            );
                            self.block_count.store(new_block_count, Ordering::Relaxed);
                            // Wake every waiter in `Namespace::wait_resize`.
                            self.resize_event.notify(usize::MAX);
                        } else {
                            tracing::debug!("rescanned, no change");
                        }
                    }
                    *self.identify.lock() = identify;
                }
                Err(err) => {
                    // Keep the previous state; a later rescan may succeed.
                    tracing::warn!(
                        nsid,
                        error = &err as &dyn std::error::Error,
                        "failed to query namespace during rescan"
                    );
                }
            }

            // Wait for the next rescan notification. Cancellation (the owning
            // `Namespace` dropped its cancel handle) terminates the task.
            if ctx.until_cancelled(listen).await.is_err() {
                break;
            }
        }
    }
}
612
613async fn identify_namespace(
614    admin: &Issuer,
615    nsid: u32,
616) -> Result<nvm::IdentifyNamespace, RequestError> {
617    let mut identify = nvm::IdentifyNamespace::new_zeroed();
618    admin
619        .issue_out(
620            spec::Command {
621                nsid,
622                cdw10: spec::Cdw10Identify::new()
623                    .with_cns(spec::Cns::NAMESPACE.0)
624                    .into(),
625                ..admin_cmd(spec::AdminOpcode::IDENTIFY)
626            },
627            identify.as_mut_bytes(),
628        )
629        .await?;
630    Ok(identify)
631}
632
633fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
634    spec::Command {
635        cdw0: spec::Cdw0::new().with_opcode(opcode.0),
636        nsid,
637        ..FromZeros::new_zeroed()
638    }
639}