nvme_driver/namespace.rs

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

//! NVMe namespace frontend.

use super::spec;
use super::spec::nvm;
use crate::NVME_PAGE_SHIFT;
use crate::driver::IoIssuers;
use crate::driver::save_restore::SavedNamespaceData;
use crate::queue_pair::Issuer;
use crate::queue_pair::RequestError;
use crate::queue_pair::admin_cmd;
use futures::StreamExt;
use guestmem::GuestMemory;
use guestmem::ranges::PagedRange;
use inspect::Inspect;
use pal_async::task::Spawn;
use parking_lot::Mutex;
use std::ops::Deref;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use thiserror::Error;
use vmcore::vm_task::VmTaskDriver;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::IntoBytes;

/// An error getting a namespace.
#[derive(Debug, Error)]
#[expect(missing_docs)]
pub enum NamespaceError {
    #[error("namespace not found")]
    NotFound,
    #[error("formatted lba size invalid")]
    FlbasInvalid,
    #[error("lba format invalid: {0:?}")]
    LbaFormatInvalid(nvm::Lbaf),
    #[error("nvme request failed")]
    Request(#[source] RequestError),
    #[error("maximum data transfer size too small: 2^{0} pages")]
    MdtsInvalid(u8),
    #[error("requesting a duplicate namespace: {0}")]
    Duplicate(u32),
}

/// A thin Namespace wrapper to revoke cloning permissions on `Arc<Namespace>`.
/// This type allows the nvme_driver to force system-wide single-ownership
/// semantics for `Namespace` objects.
/// Because the end-user can no longer call namespace.clone(), `weak.upgrade()` can
/// safely be used to determine when a Namespace is no longer in use by the disk.
#[derive(Debug, Inspect)]
pub struct NamespaceHandle {
    namespace: Arc<Namespace>,
}

impl NamespaceHandle {
    /// Creates a new handle.
    pub fn new(namespace: Arc<Namespace>) -> Self {
        Self { namespace }
    }
}

impl Deref for NamespaceHandle {
    type Target = Namespace;
    fn deref(&self) -> &Self::Target {
        &self.namespace
    }
}
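
// A minimal sketch of the ownership model described above (illustrative only;
// `namespace`, `weak`, and `handle` are hypothetical): the driver keeps a
// `Weak<Namespace>` and hands out a single `NamespaceHandle`. Once the handle
// is dropped, `Weak::upgrade` returns `None`, signaling that the namespace is
// no longer in use by the disk.
//
//     let namespace: Arc<Namespace> = /* built by the driver */;
//     let weak = Arc::downgrade(&namespace);
//     let handle = NamespaceHandle::new(namespace);
//     // ... hand `handle` to the disk backend ...
//     drop(handle);
//     assert!(weak.upgrade().is_none());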

/// An NVMe namespace.
#[derive(Debug, Inspect)]
pub struct Namespace {
    nsid: u32,
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    block_shift: u32,
    max_transfer_block_count: u32,
    preferred_deallocate_granularity: u16,
    reservation_capabilities: nvm::ReservationCapabilities,
    controller_identify: Arc<spec::IdentifyController>,
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
}

#[derive(Debug, Inspect)]
struct DynamicState {
    block_count: AtomicU64,
    #[inspect(skip)]
    resize_event: event_listener::Event,
    removed: AtomicBool,
    identify: Mutex<nvm::IdentifyNamespace>,
}

impl Namespace {
    pub(super) async fn new(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        controller_identify: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        nsid: u32,
    ) -> Result<Self, NamespaceError> {
        let identify = identify_namespace(&admin, nsid)
            .await
            .map_err(NamespaceError::Request)?;

        tracing::debug!(
            "created namespace from identify nsid={}, nsze={}, nsguid={:?}",
            nsid,
            identify.nsze,
            identify.nguid
        );

        Namespace::new_from_identify(
            driver,
            admin,
            rescan_event,
            controller_identify.clone(),
            io_issuers,
            nsid,
            identify,
        )
    }

    /// Creates a Namespace object from an Identify data structure.
    fn new_from_identify(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        controller_identify: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        nsid: u32,
        identify: nvm::IdentifyNamespace,
    ) -> Result<Self, NamespaceError> {
        if identify.nsze == 0 {
            return Err(NamespaceError::NotFound);
        }

        let lba_format_index = identify.flbas.low_index();
        if lba_format_index > identify.nlbaf {
            return Err(NamespaceError::FlbasInvalid);
        }

        let lbaf = identify.lbaf[lba_format_index as usize];
        let block_shift = lbaf.lbads();
        if !matches!(block_shift, 9..=16) {
            return Err(NamespaceError::LbaFormatInvalid(lbaf));
        }

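        // Worked example of the computation below (assuming the crate's
        // NVME_PAGE_SHIFT of 12, i.e. 4 KiB pages): with mdts = 5 and a
        // 512-byte block (block_shift = 9), max_transfer_bits = 5 + 12 = 17,
        // so the limit is 1 << (17 - 9) = 256 blocks, i.e. 128 KiB per
        // request. An mdts of 0 means no limit is reported, so u8::MAX is
        // substituted; the final shift is clamped to 16, capping the result
        // at 65536 blocks.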
        let max_transfer_block_count = {
            let mdts = if controller_identify.mdts != 0 {
                controller_identify.mdts
            } else {
                u8::MAX
            };
            let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
            1 << max_transfer_bits
                .checked_sub(block_shift)
                .ok_or(NamespaceError::MdtsInvalid(mdts))?
                .min(16)
        };

        let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
            identify.npdg
        } else {
            1
        };

        let reservation_capabilities = if controller_identify.oncs.reservations() {
            identify.rescap
        } else {
            nvm::ReservationCapabilities::new()
        };

        let state = Arc::new(DynamicState {
            block_count: identify.nsze.into(),
            removed: false.into(),
            identify: Mutex::new(identify),
            resize_event: Default::default(),
        });

        // NOTE: Detach the `poll_for_rescans` task because its lifetime is not
        // tied to that of the Namespace object. `poll_for_rescans` terminates
        // when the sender of rescan_event is dropped, i.e., its lifetime is
        // tied to the NvmeDriver and its `handle_asynchronous_events` task.
        // Because the driver stores references to Namespaces, this task will
        // never outlive the Namespace object.
        driver
            .spawn(format!("nvme_poll_rescan_{nsid}"), {
                let state = state.clone();
                async move { state.poll_for_rescans(&admin, nsid, rescan_event).await }
            })
            .detach();

        Ok(Self {
            nsid,
            state,
            max_transfer_block_count,
            block_shift: block_shift.into(),
            preferred_deallocate_granularity,
            reservation_capabilities,
            controller_identify,
            issuers: io_issuers.clone(),
        })
    }

    /// Gets the current block count.
    pub fn block_count(&self) -> u64 {
        self.state.block_count.load(Ordering::Relaxed)
    }

    /// Waits for the block count to be different from `block_count`, returning
    /// the new count.
    pub async fn wait_resize(&self, block_count: u64) -> u64 {
        loop {
            let listen = self.state.resize_event.listen();
            let current = self.block_count();
            if current != block_count {
                break current;
            }
            listen.await;
        }
    }
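
    // Illustrative usage of `wait_resize` (names outside this module are
    // hypothetical): a caller can track resizes by looping on the last
    // observed block count.
    //
    //     let mut blocks = namespace.block_count();
    //     loop {
    //         blocks = namespace.wait_resize(blocks).await;
    //         // propagate the new size to the guest-visible disk
    //     }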

    /// Gets the block size in bytes.
    pub fn block_size(&self) -> u32 {
        1 << self.block_shift
    }

    /// Fails with an NVMe error if the namespace has been removed.
    pub fn check_active(&self) -> Result<(), RequestError> {
        if self.state.removed.load(Ordering::Relaxed) {
            // The namespace has been removed. Return an invalid-namespace
            // error even if the namespace has since reappeared, to avoid
            // accidentally accessing the wrong disk.
            return Err(RequestError::Nvme(
                spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
            ));
        }
        Ok(())
    }

    async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
        self.issuers.get(cpu).await
    }

    /// Reads from the namespace.
    pub async fn read(
        &self,
        target_cpu: u32,
        lba: u64,
        block_count: u32,
        guest_memory: &GuestMemory,
        mem: PagedRange<'_>,
    ) -> Result<(), RequestError> {
        self.check_active()?;
        if block_count == 0 {
            return Ok(());
        }
        assert!(block_count <= self.max_transfer_block_count);
        let len = (block_count as usize) << self.block_shift;
        if len > mem.len() {
            panic!(
                "invalid block count: {len} > {mem_len}",
                mem_len = mem.len()
            );
        }
        self.issuer(target_cpu)
            .await?
            .issue_external(
                spec::Command {
                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
                    cdw11: nvm::Cdw11ReadWrite::new()
                        .with_sbla_high((lba >> 32) as u32)
                        .into(),
                    cdw12: nvm::Cdw12ReadWrite::new()
                        .with_nlb_z((block_count - 1) as u16)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
                },
                guest_memory,
                mem.subrange(0, len),
            )
            .await?;
        Ok(())
    }
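
    // Callers must keep each request within `max_transfer_block_count()`. An
    // illustrative sketch of splitting a larger transfer (variable names are
    // hypothetical, not part of this module):
    //
    //     let max = ns.max_transfer_block_count();
    //     let shift = ns.block_size().trailing_zeros();
    //     let mut done = 0;
    //     while done < total_blocks {
    //         let count = (total_blocks - done).min(max as u64) as u32;
    //         let (off, len) = ((done as usize) << shift, (count as usize) << shift);
    //         ns.read(cpu, lba + done, count, gm, mem.subrange(off, len)).await?;
    //         done += count as u64;
    //     }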

    /// Writes to the namespace.
    pub async fn write(
        &self,
        target_cpu: u32,
        lba: u64,
        block_count: u32,
        fua: bool,
        guest_memory: &GuestMemory,
        mem: PagedRange<'_>,
    ) -> Result<(), RequestError> {
        self.check_active()?;
        if block_count == 0 {
            return Ok(());
        }
        assert!(block_count <= self.max_transfer_block_count);
        let len = (block_count as usize) << self.block_shift;
        if len > mem.len() {
            panic!(
                "invalid block count: {len} > {mem_len}",
                mem_len = mem.len()
            );
        }
        self.issuer(target_cpu)
            .await?
            .issue_external(
                spec::Command {
                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
                    cdw11: nvm::Cdw11ReadWrite::new()
                        .with_sbla_high((lba >> 32) as u32)
                        .into(),
                    cdw12: nvm::Cdw12ReadWrite::new()
                        .with_nlb_z((block_count - 1) as u16)
                        .with_fua(fua)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
                },
                guest_memory,
                mem.subrange(0, len),
            )
            .await?;
        Ok(())
    }

    /// Flushes the namespace to persistent media.
    pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
        self.check_active()?;
        self.issuer(target_cpu)
            .await?
            .issue_neither(spec::Command {
                ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
            })
            .await?;
        Ok(())
    }

    /// Returns the maximum size for a read or write, in blocks.
    pub fn max_transfer_block_count(&self) -> u32 {
        self.max_transfer_block_count
    }

    /// Returns whether the namespace supports dataset management, needed to call
    /// [`Self::deallocate`].
    pub fn supports_dataset_management(&self) -> bool {
        self.controller_identify.oncs.dataset_management()
    }

    /// The preferred granularity for unmap requests.
    pub fn preferred_deallocate_granularity(&self) -> u16 {
        self.preferred_deallocate_granularity
    }

    /// Returns the maximum number of ranges to pass to [`Self::deallocate`].
    pub fn dataset_management_range_limit(&self) -> usize {
        // TODO: query DMRL
        256
    }

    /// Returns the maximum size of a single range to pass to
    /// [`Self::deallocate`].
    pub fn dataset_management_range_size_limit(&self) -> u32 {
        // TODO: query DMRSL
        u32::MAX
    }

    /// Issues a dataset management command to deallocate the specified ranges.
    ///
    /// The device may ignore ranges or LBA counts beyond a certain point. Use
    /// [`Self::dataset_management_range_limit`] and
    /// [`Self::dataset_management_range_size_limit`] to get the
    /// controller-reported bounds.
    pub async fn deallocate(
        &self,
        target_cpu: u32,
        ranges: &[nvm::DsmRange],
    ) -> Result<(), RequestError> {
        self.check_active()?;
        // Limit the request to the DSM maximum of 256 ranges (the NR field is
        // 8 bits, zero-based).
        let ranges = &ranges[..ranges.len().min(256)];
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10Dsm::new()
                        .with_nr_z((ranges.len() - 1) as u8)
                        .into(),
                    cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
                    ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
                },
                ranges.as_bytes(),
            )
            .await?;
        Ok(())
    }
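
    // Illustrative sketch (not part of this module): honoring the range-count
    // limit by deallocating in chunks.
    //
    //     for chunk in ranges.chunks(ns.dataset_management_range_limit()) {
    //         ns.deallocate(cpu, chunk).await?;
    //     }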

    /// Gets the namespace's reservation capabilities.
    pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
        self.reservation_capabilities
    }

    /// Gets the namespace's reservation report.
    pub async fn reservation_report_extended(
        &self,
        target_cpu: u32,
    ) -> Result<
        (
            nvm::ReservationReportExtended,
            Vec<nvm::RegisteredControllerExtended>,
        ),
        RequestError,
    > {
        let mut data = vec![0; 4096];
        let issuer = self.issuer(target_cpu).await?;
        loop {
            issuer
                .issue_out(
                    spec::Command {
                        cdw10: nvm::Cdw10ReservationReport::new()
                            .with_numd_z((data.len() / 4 - 1) as u32)
                            .into(),
                        cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
                        ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
                    },
                    &mut data,
                )
                .await?;

            let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
                .unwrap()
                .0; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
            let len = size_of_val(&header)
                + header.report.regctl.get() as usize
                    * size_of::<nvm::RegisteredControllerExtended>();

            if len > data.len() {
                data.resize(len, 0);
                continue;
            }

            let mut controllers = vec![
                nvm::RegisteredControllerExtended::new_zeroed();
                header.report.regctl.get().into()
            ];

            controllers
                .as_mut_bytes()
                .copy_from_slice(&data[size_of_val(&header)..len]);

            break Ok((header, controllers));
        }
    }
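
    // The loop above retries with a larger buffer when the header reports more
    // registered controllers than the current buffer can hold. For example, if
    // the extended header and each extended controller entry are 64 bytes (as
    // in the NVMe spec), regctl = 3 gives 64 + 3 * 64 = 256 bytes, well within
    // the initial 4096-byte buffer, so a resize only happens for unusually
    // large registrations.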

    /// Acquires a reservation.
    pub async fn reservation_acquire(
        &self,
        target_cpu: u32,
        action: nvm::ReservationAcquireAction,
        crkey: u64,
        prkey: u64,
        reservation_type: nvm::ReservationType,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationAcquire { crkey, prkey };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationAcquire::new()
                        .with_racqa(action.0)
                        .with_rtype(reservation_type.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }

    /// Releases a reservation.
    pub async fn reservation_release(
        &self,
        target_cpu: u32,
        action: nvm::ReservationReleaseAction,
        crkey: u64,
        reservation_type: nvm::ReservationType,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationRelease { crkey };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationRelease::new()
                        .with_rrela(action.0)
                        .with_rtype(reservation_type.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }

    /// Modifies a reservation registration.
    pub async fn reservation_register(
        &self,
        target_cpu: u32,
        action: nvm::ReservationRegisterAction,
        crkey: Option<u64>,
        nrkey: u64,
        ptpl: Option<bool>,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationRegister {
            crkey: crkey.unwrap_or(0),
            nrkey,
        };
        let cptpl = match ptpl {
            None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
            Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
            Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
        };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationRegister::new()
                        .with_rrega(action.0)
                        .with_iekey(crkey.is_none())
                        .with_cptpl(cptpl.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }
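
    // Illustrative persistent-reservation flow (values are hypothetical; the
    // action and type values come from the `nvm` spec module):
    //
    //     // Register a key, persisting the registration through power loss.
    //     ns.reservation_register(cpu, register_action, None, key, Some(true))
    //         .await?;
    //     // Acquire the reservation using the newly registered key.
    //     ns.reservation_acquire(cpu, acquire_action, key, 0, rtype).await?;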

    /// Returns the namespace ID.
    pub fn nsid(&self) -> u32 {
        self.nsid
    }

    /// Saves namespace object data for servicing.
    ///
    /// For now, the namespace state is re-queried after restore to avoid
    /// possible inconsistency if the namespace was changed during servicing.
    pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
        Ok(SavedNamespaceData {
            nsid: self.nsid,
            identify_ns: self.state.identify.lock().clone(),
        })
    }

    /// Restores namespace object data after servicing.
    pub(super) fn restore(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        identify_ctrl: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        saved_state: &SavedNamespaceData,
    ) -> Result<Self, NamespaceError> {
        let SavedNamespaceData { nsid, identify_ns } = saved_state;

        Namespace::new_from_identify(
            driver,
            admin,
            rescan_event,
            identify_ctrl.clone(),
            io_issuers,
            *nsid,
            identify_ns.clone(),
        )
    }
}
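
// Servicing sketch (illustrative; names outside this module are hypothetical):
// the driver saves each namespace before servicing and rebuilds it from the
// saved identify data afterwards; the restored namespace is later refreshed by
// rescan notifications rather than relying solely on the saved copy.
//
//     let saved = namespace.save()?;
//     // ... servicing ...
//     let restored = Namespace::restore(
//         &driver,
//         admin,
//         rescan_rx,
//         identify_ctrl,
//         &io_issuers,
//         &saved,
//     )?;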

impl DynamicState {
    async fn poll_for_rescans(
        &self,
        admin: &Issuer,
        nsid: u32,
        mut rescan_event: mesh::Receiver<()>,
    ) {
        loop {
            tracing::debug!("rescan task started nsid={}", nsid);

            // This relies on a mesh channel, so notifications will NOT be missed
            // even if the task was not started when the first AEN was processed.
            let event = rescan_event.next().await;

            // Once the sender is dropped, no more rescan signals can be received,
            // so there is no point in continuing.
            if event.is_none() {
                tracing::debug!("rescan task exiting nsid={}", nsid);
                break;
            }

            match identify_namespace(admin, nsid).await {
                Ok(identify) => {
                    if identify.nsze == 0 {
                        tracing::info!(nsid, "namespace was hot removed");
                        self.removed.store(true, Ordering::Relaxed);
                    } else {
                        let old_block_count = self.block_count.load(Ordering::Relaxed);
                        let new_block_count = identify.nsze;
                        if old_block_count != new_block_count {
                            tracing::info!(
                                old_block_count,
                                new_block_count,
                                "nvme disk size changed"
                            );
                            self.block_count.store(new_block_count, Ordering::Relaxed);
                            self.resize_event.notify(usize::MAX);
                        } else {
                            tracing::debug!("rescanned, no change");
                        }
                    }
                    *self.identify.lock() = identify;
                }
                Err(err) => {
                    tracing::warn!(
                        nsid,
                        error = &err as &dyn std::error::Error,
                        "failed to query namespace during rescan"
                    );
                }
            }
        }
    }
}

async fn identify_namespace(
    admin: &Issuer,
    nsid: u32,
) -> Result<nvm::IdentifyNamespace, RequestError> {
    let mut identify = nvm::IdentifyNamespace::new_zeroed();
    admin
        .issue_out(
            spec::Command {
                nsid,
                cdw10: spec::Cdw10Identify::new()
                    .with_cns(spec::Cns::NAMESPACE.0)
                    .into(),
                ..admin_cmd(spec::AdminOpcode::IDENTIFY)
            },
            identify.as_mut_bytes(),
        )
        .await?;
    Ok(identify)
}

fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
    spec::Command {
        cdw0: spec::Cdw0::new().with_opcode(opcode.0),
        nsid,
        ..FromZeros::new_zeroed()
    }
}