nvme_driver/
namespace.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! NVMe namespace frontend.
5
6use super::spec;
7use super::spec::nvm;
8use crate::NVME_PAGE_SHIFT;
9use crate::driver::IoIssuers;
10use crate::driver::save_restore::SavedNamespaceData;
11use crate::queue_pair::Issuer;
12use crate::queue_pair::RequestError;
13use crate::queue_pair::admin_cmd;
14use futures::StreamExt;
15use guestmem::GuestMemory;
16use guestmem::ranges::PagedRange;
17use inspect::Inspect;
18use pal_async::task::Spawn;
19use parking_lot::Mutex;
20use std::sync::Arc;
21use std::sync::atomic::AtomicBool;
22use std::sync::atomic::AtomicU64;
23use std::sync::atomic::Ordering;
24use thiserror::Error;
25use vmcore::vm_task::VmTaskDriver;
26use zerocopy::FromBytes;
27use zerocopy::FromZeros;
28use zerocopy::IntoBytes;
29
/// An error getting a namespace.
#[derive(Debug, Error)]
#[expect(missing_docs)]
pub enum NamespaceError {
    // The namespace is inactive or does not exist (identify reported a zero
    // namespace size).
    #[error("namespace not found")]
    NotFound,
    // The formatted LBA size index (FLBAS) exceeds the number of supported
    // LBA formats (NLBAF).
    #[error("formatted lba size invalid")]
    FlbasInvalid,
    // The selected LBA format has a block size outside the supported
    // 512-byte..=64KiB range.
    #[error("lba format invalid: {0:?}")]
    LbaFormatInvalid(nvm::Lbaf),
    // An admin or I/O command to the controller failed.
    #[error("nvme request failed")]
    Request(#[source] RequestError),
    // The controller's maximum data transfer size is smaller than one block.
    #[error("maximum data transfer size too small: 2^{0} pages")]
    MdtsInvalid(u8),
    // NOTE(review): not constructed in this file; presumably returned by the
    // driver when the same NSID is requested twice — confirm at the call site.
    #[error("namespace ID {nsid} already exists")]
    DuplicateRequest { nsid: u32 },
}
47
/// An NVMe namespace.
#[derive(Debug, Inspect)]
pub struct Namespace {
    // Namespace ID (NSID) targeted by all commands issued through this object.
    nsid: u32,
    // Mutable state shared with the detached rescan task.
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    // log2 of the block size in bytes; validated to be in 9..=16 at
    // construction.
    block_shift: u32,
    // Maximum read/write size in blocks, derived from the controller's MDTS
    // and the block size.
    max_transfer_block_count: u32,
    // Preferred deallocate granularity in blocks (NPDG when OPTPERF is set,
    // otherwise 1).
    preferred_deallocate_granularity: u16,
    // Reservation capabilities (zero if the controller does not support
    // reservations).
    reservation_capabilities: nvm::ReservationCapabilities,
    // Controller identify data, used for feature queries (ONCS, etc.).
    controller_identify: Arc<spec::IdentifyController>,
    // Per-CPU I/O queue issuers used to submit NVM commands.
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
}
62
#[derive(Debug, Inspect)]
struct DynamicState {
    // Current namespace size in logical blocks (NSZE); updated by the rescan
    // task when the disk is resized.
    block_count: AtomicU64,
    // Notified (via `notify`) whenever `block_count` changes.
    #[inspect(skip)]
    resize_event: event_listener::Event,
    // Set once a rescan observes the namespace was hot removed; never cleared.
    removed: AtomicBool,
    // Most recent identify data, refreshed on each successful rescan.
    identify: Mutex<nvm::IdentifyNamespace>,
}
71
72impl Namespace {
73    pub(super) async fn new(
74        driver: &VmTaskDriver,
75        admin: Arc<Issuer>,
76        rescan_event: mesh::Receiver<()>,
77        controller_identify: Arc<spec::IdentifyController>,
78        io_issuers: &Arc<IoIssuers>,
79        nsid: u32,
80    ) -> Result<Self, NamespaceError> {
81        let identify = identify_namespace(&admin, nsid)
82            .await
83            .map_err(NamespaceError::Request)?;
84
85        Namespace::new_from_identify(
86            driver,
87            admin,
88            rescan_event,
89            controller_identify.clone(),
90            io_issuers,
91            nsid,
92            identify,
93        )
94    }
95
96    /// Create Namespace object from Identify data structure.
97    fn new_from_identify(
98        driver: &VmTaskDriver,
99        admin: Arc<Issuer>,
100        rescan_event: mesh::Receiver<()>,
101        controller_identify: Arc<spec::IdentifyController>,
102        io_issuers: &Arc<IoIssuers>,
103        nsid: u32,
104        identify: nvm::IdentifyNamespace,
105    ) -> Result<Self, NamespaceError> {
106        if identify.nsze == 0 {
107            return Err(NamespaceError::NotFound);
108        }
109
110        let lba_format_index = identify.flbas.low_index();
111        if lba_format_index > identify.nlbaf {
112            return Err(NamespaceError::FlbasInvalid);
113        }
114
115        let lbaf = identify.lbaf[lba_format_index as usize];
116        let block_shift = lbaf.lbads();
117        if !matches!(block_shift, 9..=16) {
118            return Err(NamespaceError::LbaFormatInvalid(lbaf));
119        }
120
121        let max_transfer_block_count = {
122            let mdts = if controller_identify.mdts != 0 {
123                controller_identify.mdts
124            } else {
125                u8::MAX
126            };
127            let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
128            1 << max_transfer_bits
129                .checked_sub(block_shift)
130                .ok_or(NamespaceError::MdtsInvalid(mdts))?
131                .min(16)
132        };
133
134        let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
135            identify.npdg
136        } else {
137            1
138        };
139
140        let reservation_capabilities = if controller_identify.oncs.reservations() {
141            identify.rescap
142        } else {
143            nvm::ReservationCapabilities::new()
144        };
145
146        let state = Arc::new(DynamicState {
147            block_count: identify.nsze.into(),
148            removed: false.into(),
149            identify: Mutex::new(identify),
150            resize_event: Default::default(),
151        });
152
153        // NOTE: Detach `poll_for_rescans` task because its lifetime is not tied
154        // to that of the Namespace object. `poll_for_rescans` terminates when the sender of
155        // rescan_event is dropped. i.e. lifetime of this task is tied to the NvmeDriver
156        // & `handle_asynchronous_events` task within the driver. Because the
157        // driver stores references to Namespaces, this task will never outlive
158        // the Namespace object.
159        driver
160            .spawn(format!("nvme_poll_rescan_{nsid}"), {
161                let state = state.clone();
162                async move { state.poll_for_rescans(&admin, nsid, rescan_event).await }
163            })
164            .detach();
165
166        Ok(Self {
167            nsid,
168            state,
169            max_transfer_block_count,
170            block_shift: block_shift.into(),
171            preferred_deallocate_granularity,
172            reservation_capabilities,
173            controller_identify,
174            issuers: io_issuers.clone(),
175        })
176    }
177
178    /// Gets the current block count.
179    pub fn block_count(&self) -> u64 {
180        self.state.block_count.load(Ordering::Relaxed)
181    }
182
183    /// Wait for the block count to be different from `block_count`.
184    pub async fn wait_resize(&self, block_count: u64) -> u64 {
185        loop {
186            let listen = self.state.resize_event.listen();
187            let current = self.block_count();
188            if current != block_count {
189                break current;
190            }
191            listen.await;
192        }
193    }
194
195    /// Gets the block size in bytes.
196    pub fn block_size(&self) -> u32 {
197        1 << self.block_shift
198    }
199
200    fn check_active(&self) -> Result<(), RequestError> {
201        if self.state.removed.load(Ordering::Relaxed) {
202            // The namespace has been removed. Return invalid namespace even if
203            // the namespace has returned to avoid accidentally accessing the
204            // wrong disk.
205            return Err(RequestError::Nvme(
206                spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
207            ));
208        }
209        Ok(())
210    }
211
212    async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
213        self.issuers.get(cpu).await
214    }
215
216    /// Reads from the namespace.
217    pub async fn read(
218        &self,
219        target_cpu: u32,
220        lba: u64,
221        block_count: u32,
222        guest_memory: &GuestMemory,
223        mem: PagedRange<'_>,
224    ) -> Result<(), RequestError> {
225        self.check_active()?;
226        if block_count == 0 {
227            return Ok(());
228        }
229        assert!(block_count <= self.max_transfer_block_count);
230        let len = (block_count as usize) << self.block_shift;
231        if len > mem.len() {
232            panic!(
233                "invalid block count: {len} > {mem_len}",
234                mem_len = mem.len()
235            );
236        }
237        self.issuer(target_cpu)
238            .await?
239            .issue_external(
240                spec::Command {
241                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
242                    cdw11: nvm::Cdw11ReadWrite::new()
243                        .with_sbla_high((lba >> 32) as u32)
244                        .into(),
245                    cdw12: nvm::Cdw12ReadWrite::new()
246                        .with_nlb_z((block_count - 1) as u16)
247                        .into(),
248                    ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
249                },
250                guest_memory,
251                mem.subrange(0, len),
252            )
253            .await?;
254        Ok(())
255    }
256
257    /// Writes to the namespace.
258    pub async fn write(
259        &self,
260        target_cpu: u32,
261        lba: u64,
262        block_count: u32,
263        fua: bool,
264        guest_memory: &GuestMemory,
265        mem: PagedRange<'_>,
266    ) -> Result<(), RequestError> {
267        self.check_active()?;
268        if block_count == 0 {
269            return Ok(());
270        }
271        assert!(block_count <= self.max_transfer_block_count);
272        let len = (block_count as usize) << self.block_shift;
273        if len > mem.len() {
274            panic!(
275                "invalid block count: {len} > {mem_len}",
276                mem_len = mem.len()
277            );
278        }
279        self.issuer(target_cpu)
280            .await?
281            .issue_external(
282                spec::Command {
283                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
284                    cdw11: nvm::Cdw11ReadWrite::new()
285                        .with_sbla_high((lba >> 32) as u32)
286                        .into(),
287                    cdw12: nvm::Cdw12ReadWrite::new()
288                        .with_nlb_z((block_count - 1) as u16)
289                        .with_fua(fua)
290                        .into(),
291                    ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
292                },
293                guest_memory,
294                mem.subrange(0, len),
295            )
296            .await?;
297        Ok(())
298    }
299
300    /// Flushes the namespace to persistent media.
301    pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
302        self.check_active()?;
303        self.issuer(target_cpu)
304            .await?
305            .issue_neither(spec::Command {
306                ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
307            })
308            .await?;
309        Ok(())
310    }
311
312    /// Returns the maximum size for a read or write, in blocks.
313    pub fn max_transfer_block_count(&self) -> u32 {
314        self.max_transfer_block_count
315    }
316
317    /// Returns whether the namespace support dataset management, needed to call
318    /// [`Self::deallocate`].
319    pub fn supports_dataset_management(&self) -> bool {
320        self.controller_identify.oncs.dataset_management()
321    }
322
323    /// The preferred granularity for unmap requests.
324    pub fn preferred_deallocate_granularity(&self) -> u16 {
325        self.preferred_deallocate_granularity
326    }
327
328    /// Returns the maximum number of ranges to pass to [`Self::deallocate`].
329    pub fn dataset_management_range_limit(&self) -> usize {
330        // TODO: query DMRL
331        256
332    }
333
334    /// Returns the maximum size of a single range to pass to
335    /// [`Self::deallocate`].
336    pub fn dataset_management_range_size_limit(&self) -> u32 {
337        // TODO: query DMRSL
338        u32::MAX
339    }
340
341    /// Issues a dataset management command to deallocate the specified ranges.
342    ///
343    /// The device may ignore ranges or LBA counts beyond a certain point. Use
344    /// [`Self::dataset_management_range_limit`] and
345    /// [`Self::dataset_management_range_size_limit`] to get the
346    /// controller-reported bounds.
347    pub async fn deallocate(
348        &self,
349        target_cpu: u32,
350        ranges: &[nvm::DsmRange],
351    ) -> Result<(), RequestError> {
352        self.check_active()?;
353        // Limit the requested ranges.
354        let ranges = &ranges[..ranges.len().min(256)];
355        self.issuer(target_cpu)
356            .await?
357            .issue_in(
358                spec::Command {
359                    cdw10: nvm::Cdw10Dsm::new()
360                        .with_nr_z((ranges.len() - 1) as u8)
361                        .into(),
362                    cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
363                    ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
364                },
365                ranges.as_bytes(),
366            )
367            .await?;
368        Ok(())
369    }
370
371    /// Gets the namespace's reservation capabilities.
372    pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
373        self.reservation_capabilities
374    }
375
376    /// Gets the namespace's reservation report.
377    pub async fn reservation_report_extended(
378        &self,
379        target_cpu: u32,
380    ) -> Result<
381        (
382            nvm::ReservationReportExtended,
383            Vec<nvm::RegisteredControllerExtended>,
384        ),
385        RequestError,
386    > {
387        let mut data = vec![0; 4096];
388        let issuer = self.issuer(target_cpu).await?;
389        loop {
390            issuer
391                .issue_out(
392                    spec::Command {
393                        cdw10: nvm::Cdw10ReservationReport::new()
394                            .with_numd_z((data.len() / 4 - 1) as u32)
395                            .into(),
396                        cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
397                        ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
398                    },
399                    &mut data,
400                )
401                .await?;
402
403            let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
404                .unwrap()
405                .0; // TODO: zerocopy: use-rest-of-range (https://github.com/microsoft/openvmm/issues/759)
406            let len = size_of_val(&header)
407                + header.report.regctl.get() as usize
408                    * size_of::<nvm::RegisteredControllerExtended>();
409
410            if len > data.len() {
411                data.resize(len, 0);
412                continue;
413            }
414
415            let mut controllers = vec![
416                nvm::RegisteredControllerExtended::new_zeroed();
417                header.report.regctl.get().into()
418            ];
419
420            controllers
421                .as_mut_bytes()
422                .copy_from_slice(&data[size_of_val(&header)..len]);
423
424            break Ok((header, controllers));
425        }
426    }
427
428    /// Acquires a reservation.
429    pub async fn reservation_acquire(
430        &self,
431        target_cpu: u32,
432        action: nvm::ReservationAcquireAction,
433        crkey: u64,
434        prkey: u64,
435        reservation_type: nvm::ReservationType,
436    ) -> Result<(), RequestError> {
437        let data = nvm::ReservationAcquire { crkey, prkey };
438        self.issuer(target_cpu)
439            .await?
440            .issue_in(
441                spec::Command {
442                    cdw10: nvm::Cdw10ReservationAcquire::new()
443                        .with_racqa(action.0)
444                        .with_rtype(reservation_type.0)
445                        .into(),
446                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
447                },
448                data.as_bytes(),
449            )
450            .await?;
451
452        Ok(())
453    }
454
455    /// Releases a reservation.
456    pub async fn reservation_release(
457        &self,
458        target_cpu: u32,
459        action: nvm::ReservationReleaseAction,
460        crkey: u64,
461        reservation_type: nvm::ReservationType,
462    ) -> Result<(), RequestError> {
463        let data = nvm::ReservationRelease { crkey };
464        self.issuer(target_cpu)
465            .await?
466            .issue_in(
467                spec::Command {
468                    cdw10: nvm::Cdw10ReservationRelease::new()
469                        .with_rrela(action.0)
470                        .with_rtype(reservation_type.0)
471                        .into(),
472                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
473                },
474                data.as_bytes(),
475            )
476            .await?;
477
478        Ok(())
479    }
480
481    /// Modifies a reservation registration.
482    pub async fn reservation_register(
483        &self,
484        target_cpu: u32,
485        action: nvm::ReservationRegisterAction,
486        crkey: Option<u64>,
487        nrkey: u64,
488        ptpl: Option<bool>,
489    ) -> Result<(), RequestError> {
490        let data = nvm::ReservationRegister {
491            crkey: crkey.unwrap_or(0),
492            nrkey,
493        };
494        let cptpl = match ptpl {
495            None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
496            Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
497            Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
498        };
499        self.issuer(target_cpu)
500            .await?
501            .issue_in(
502                spec::Command {
503                    cdw10: nvm::Cdw10ReservationRegister::new()
504                        .with_rrega(action.0)
505                        .with_iekey(crkey.is_none())
506                        .with_cptpl(cptpl.0)
507                        .into(),
508                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
509                },
510                data.as_bytes(),
511            )
512            .await?;
513
514        Ok(())
515    }
516
517    /// Return Namespace ID.
518    pub fn nsid(&self) -> u32 {
519        self.nsid
520    }
521
522    /// Save namespace object data for servicing.
523    /// Initially we will re-query namespace state after restore
524    /// to avoid possible contention if namespace was changed
525    /// during servicing.
526    pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
527        Ok(SavedNamespaceData {
528            nsid: self.nsid,
529            identify_ns: self.state.identify.lock().clone(),
530        })
531    }
532
533    /// Restore namespace object data after servicing.
534    pub(super) fn restore(
535        driver: &VmTaskDriver,
536        admin: Arc<Issuer>,
537        rescan_event: mesh::Receiver<()>,
538        identify_ctrl: Arc<spec::IdentifyController>,
539        io_issuers: &Arc<IoIssuers>,
540        saved_state: &SavedNamespaceData,
541    ) -> Result<Self, NamespaceError> {
542        let SavedNamespaceData { nsid, identify_ns } = saved_state;
543
544        Namespace::new_from_identify(
545            driver,
546            admin,
547            rescan_event,
548            identify_ctrl.clone(),
549            io_issuers,
550            *nsid,
551            identify_ns.clone(),
552        )
553    }
554}
555
556impl DynamicState {
557    async fn poll_for_rescans(
558        &self,
559        admin: &Issuer,
560        nsid: u32,
561        mut rescan_event: mesh::Receiver<()>,
562    ) {
563        loop {
564            tracing::debug!("rescan");
565
566            // This relies on a mesh channel so notifications will NOT be missed
567            // even if the task was not started when the first AEN was processed.
568            let event = rescan_event.next().await;
569
570            // Once the sender is dropped, no more repoll signals can be received so
571            // there is no point in continuing.
572            if event.is_none() {
573                tracing::debug!("rescan task exiting");
574                break;
575            }
576
577            match identify_namespace(admin, nsid).await {
578                Ok(identify) => {
579                    if identify.nsze == 0 {
580                        tracing::info!(nsid, "namespace was hot removed");
581                        self.removed.store(true, Ordering::Relaxed);
582                    } else {
583                        let old_block_count = self.block_count.load(Ordering::Relaxed);
584                        let new_block_count = identify.nsze;
585                        if old_block_count != new_block_count {
586                            tracing::info!(
587                                old_block_count,
588                                new_block_count,
589                                "nvme disk size changed"
590                            );
591                            self.block_count.store(new_block_count, Ordering::Relaxed);
592                            self.resize_event.notify(usize::MAX);
593                        } else {
594                            tracing::debug!("rescanned, no change");
595                        }
596                    }
597                    *self.identify.lock() = identify;
598                }
599                Err(err) => {
600                    tracing::warn!(
601                        nsid,
602                        error = &err as &dyn std::error::Error,
603                        "failed to query namespace during rescan"
604                    );
605                }
606            }
607        }
608    }
609}
610
611async fn identify_namespace(
612    admin: &Issuer,
613    nsid: u32,
614) -> Result<nvm::IdentifyNamespace, RequestError> {
615    let mut identify = nvm::IdentifyNamespace::new_zeroed();
616    admin
617        .issue_out(
618            spec::Command {
619                nsid,
620                cdw10: spec::Cdw10Identify::new()
621                    .with_cns(spec::Cns::NAMESPACE.0)
622                    .into(),
623                ..admin_cmd(spec::AdminOpcode::IDENTIFY)
624            },
625            identify.as_mut_bytes(),
626        )
627        .await?;
628    Ok(identify)
629}
630
631fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
632    spec::Command {
633        cdw0: spec::Cdw0::new().with_opcode(opcode.0),
634        nsid,
635        ..FromZeros::new_zeroed()
636    }
637}