1use super::spec;
7use super::spec::nvm;
8use crate::NVME_PAGE_SHIFT;
9use crate::driver::IoIssuers;
10use crate::driver::save_restore::SavedNamespaceData;
11use crate::queue_pair::Issuer;
12use crate::queue_pair::RequestError;
13use crate::queue_pair::admin_cmd;
14use futures::StreamExt;
15use guestmem::GuestMemory;
16use guestmem::ranges::PagedRange;
17use inspect::Inspect;
18use pal_async::task::Spawn;
19use parking_lot::Mutex;
20use std::ops::Deref;
21use std::sync::Arc;
22use std::sync::atomic::AtomicBool;
23use std::sync::atomic::AtomicU64;
24use std::sync::atomic::Ordering;
25use thiserror::Error;
26use vmcore::vm_task::VmTaskDriver;
27use zerocopy::FromBytes;
28use zerocopy::FromZeros;
29use zerocopy::IntoBytes;
30
31#[derive(Debug, Error)]
33#[expect(missing_docs)]
34pub enum NamespaceError {
35 #[error("namespace not found")]
36 NotFound,
37 #[error("formatted lba size invalid")]
38 FlbasInvalid,
39 #[error("lba format invalid: {0:?}")]
40 LbaFormatInvalid(nvm::Lbaf),
41 #[error("nvme request failed")]
42 Request(#[source] RequestError),
43 #[error("maximum data transfer size too small: 2^{0} pages")]
44 MdtsInvalid(u8),
45 #[error("requesting a duplicate namespace: {0}")]
46 Duplicate(u32),
47}
48
/// An owned handle to a [`Namespace`].
///
/// Dereferences to the underlying [`Namespace`] so it can be used directly
/// to issue IO.
#[derive(Debug, Inspect)]
pub struct NamespaceHandle {
    // Shared so other parts of the driver can keep the namespace alive.
    namespace: Arc<Namespace>,
}
58
impl NamespaceHandle {
    /// Creates a new handle wrapping `namespace`.
    pub fn new(namespace: Arc<Namespace>) -> Self {
        Self { namespace }
    }
}
65
// Let a `NamespaceHandle` be used anywhere a `&Namespace` is expected.
impl Deref for NamespaceHandle {
    type Target = Namespace;
    fn deref(&self) -> &Self::Target {
        &self.namespace
    }
}
72
/// An NVMe namespace: a single logical block device exposed by the
/// controller, with parameters cached from its identify data.
#[derive(Debug, Inspect)]
pub struct Namespace {
    /// The namespace ID on the controller.
    nsid: u32,
    /// State that can change at runtime via rescans (size, removal).
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    /// log2 of the block size in bytes.
    block_shift: u32,
    /// Maximum number of blocks per IO, derived from the controller's MDTS.
    max_transfer_block_count: u32,
    /// Preferred deallocate granularity in blocks (NPDG when optperf is set).
    preferred_deallocate_granularity: u16,
    /// Reservation capabilities (zeroed if the controller lacks support).
    reservation_capabilities: nvm::ReservationCapabilities,
    /// The controller's identify data, shared across namespaces.
    controller_identify: Arc<spec::IdentifyController>,
    /// Per-CPU IO issuers used to submit NVM commands.
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
}
87
/// Namespace state that may change at runtime as rescans are processed.
#[derive(Debug, Inspect)]
struct DynamicState {
    /// Current namespace size in blocks (identify `nsze`).
    block_count: AtomicU64,
    /// Notified whenever `block_count` changes.
    #[inspect(skip)]
    resize_event: event_listener::Event,
    /// Set once the namespace is observed to have been hot removed.
    removed: AtomicBool,
    /// The most recent identify data, refreshed on each rescan.
    identify: Mutex<nvm::IdentifyNamespace>,
}
96
impl Namespace {
    /// Opens namespace `nsid`, querying the controller over the admin queue
    /// for its identify data.
    ///
    /// Fails if the namespace is inactive or reports unsupported parameters.
    pub(super) async fn new(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        controller_identify: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        nsid: u32,
    ) -> Result<Self, NamespaceError> {
        let identify = identify_namespace(&admin, nsid)
            .await
            .map_err(NamespaceError::Request)?;

        tracing::debug!(
            "created namespace from identify nsid={}, nsze={}, nsguid={:?}",
            nsid,
            identify.nsze,
            identify.nguid
        );

        Namespace::new_from_identify(
            driver,
            admin,
            rescan_event,
            controller_identify.clone(),
            io_issuers,
            nsid,
            identify,
        )
    }

    /// Validates the identify data, derives the cached namespace parameters,
    /// and spawns the background rescan polling task.
    fn new_from_identify(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        controller_identify: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        nsid: u32,
        identify: nvm::IdentifyNamespace,
    ) -> Result<Self, NamespaceError> {
        // A namespace size of zero means the NSID is not active.
        if identify.nsze == 0 {
            return Err(NamespaceError::NotFound);
        }

        // The LBA format index must be within the number of formats reported.
        let lba_format_index = identify.flbas.low_index();
        if lba_format_index > identify.nlbaf {
            return Err(NamespaceError::FlbasInvalid);
        }

        // Only block sizes from 512 bytes (2^9) through 64KiB (2^16) are
        // supported.
        let lbaf = identify.lbaf[lba_format_index as usize];
        let block_shift = lbaf.lbads();
        if !matches!(block_shift, 9..=16) {
            return Err(NamespaceError::LbaFormatInvalid(lbaf));
        }

        let max_transfer_block_count = {
            // mdts == 0 means the controller reports no transfer size limit;
            // treat it as effectively unbounded.
            let mdts = if controller_identify.mdts != 0 {
                controller_identify.mdts
            } else {
                u8::MAX
            };
            let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
            // Fail if a single block doesn't fit in MDTS. Cap the per-IO
            // limit at 2^16 blocks since the NLB field in read/write
            // commands is only 16 bits wide.
            1 << max_transfer_bits
                .checked_sub(block_shift)
                .ok_or(NamespaceError::MdtsInvalid(mdts))?
                .min(16)
        };

        // NPDG is only meaningful when the optperf feature bit is set.
        let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
            identify.npdg
        } else {
            1
        };

        // Only expose reservation capabilities if the controller supports
        // reservations (ONCS bit).
        let reservation_capabilities = if controller_identify.oncs.reservations() {
            identify.rescap
        } else {
            nvm::ReservationCapabilities::new()
        };

        let state = Arc::new(DynamicState {
            block_count: identify.nsze.into(),
            removed: false.into(),
            identify: Mutex::new(identify),
            resize_event: Default::default(),
        });

        // Watch for rescan notifications in the background, updating the
        // dynamic state (resize / hot removal) as the namespace changes.
        driver
            .spawn(format!("nvme_poll_rescan_{nsid}"), {
                let state = state.clone();
                async move { state.poll_for_rescans(&admin, nsid, rescan_event).await }
            })
            .detach();

        Ok(Self {
            nsid,
            state,
            max_transfer_block_count,
            block_shift: block_shift.into(),
            preferred_deallocate_granularity,
            reservation_capabilities,
            controller_identify,
            issuers: io_issuers.clone(),
        })
    }

    /// Returns the current number of logical blocks in the namespace.
    pub fn block_count(&self) -> u64 {
        self.state.block_count.load(Ordering::Relaxed)
    }

    /// Waits until the namespace's block count differs from `block_count`,
    /// returning the new value.
    pub async fn wait_resize(&self, block_count: u64) -> u64 {
        loop {
            // Register the listener before sampling the count so that a
            // notification arriving between the check and the await is not
            // lost.
            let listen = self.state.resize_event.listen();
            let current = self.block_count();
            if current != block_count {
                break current;
            }
            listen.await;
        }
    }

    /// Returns the namespace's block size in bytes.
    pub fn block_size(&self) -> u32 {
        1 << self.block_shift
    }

    /// Fails with `INVALID_NAMESPACE_OR_FORMAT` if the namespace has been
    /// hot removed.
    pub fn check_active(&self) -> Result<(), RequestError> {
        if self.state.removed.load(Ordering::Relaxed) {
            return Err(RequestError::Nvme(
                spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
            ));
        }
        Ok(())
    }

    /// Gets the IO issuer to use for requests initiated on `cpu`.
    async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
        self.issuers.get(cpu).await
    }

    /// Reads `block_count` blocks starting at `lba` into `mem`.
    ///
    /// # Panics
    ///
    /// Panics if `block_count` exceeds [`Self::max_transfer_block_count`] or
    /// if `mem` is shorter than `block_count` blocks.
    pub async fn read(
        &self,
        target_cpu: u32,
        lba: u64,
        block_count: u32,
        guest_memory: &GuestMemory,
        mem: PagedRange<'_>,
    ) -> Result<(), RequestError> {
        self.check_active()?;
        // A zero-length read is a no-op.
        if block_count == 0 {
            return Ok(());
        }
        assert!(block_count <= self.max_transfer_block_count);
        let len = (block_count as usize) << self.block_shift;
        if len > mem.len() {
            panic!(
                "invalid block count: {len} > {mem_len}",
                mem_len = mem.len()
            );
        }
        self.issuer(target_cpu)
            .await?
            .issue_external(
                spec::Command {
                    // The 64-bit starting LBA is split across cdw10/cdw11.
                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
                    cdw11: nvm::Cdw11ReadWrite::new()
                        .with_sbla_high((lba >> 32) as u32)
                        .into(),
                    // NLB is a zeroes-based block count.
                    cdw12: nvm::Cdw12ReadWrite::new()
                        .with_nlb_z((block_count - 1) as u16)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
                },
                guest_memory,
                mem.subrange(0, len),
            )
            .await?;
        Ok(())
    }

    /// Writes `block_count` blocks starting at `lba` from `mem`, with
    /// force-unit-access semantics when `fua` is set.
    ///
    /// # Panics
    ///
    /// Panics if `block_count` exceeds [`Self::max_transfer_block_count`] or
    /// if `mem` is shorter than `block_count` blocks.
    pub async fn write(
        &self,
        target_cpu: u32,
        lba: u64,
        block_count: u32,
        fua: bool,
        guest_memory: &GuestMemory,
        mem: PagedRange<'_>,
    ) -> Result<(), RequestError> {
        self.check_active()?;
        // A zero-length write is a no-op.
        if block_count == 0 {
            return Ok(());
        }
        assert!(block_count <= self.max_transfer_block_count);
        let len = (block_count as usize) << self.block_shift;
        if len > mem.len() {
            panic!(
                "invalid block count: {len} > {mem_len}",
                mem_len = mem.len()
            );
        }
        self.issuer(target_cpu)
            .await?
            .issue_external(
                spec::Command {
                    // The 64-bit starting LBA is split across cdw10/cdw11.
                    cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
                    cdw11: nvm::Cdw11ReadWrite::new()
                        .with_sbla_high((lba >> 32) as u32)
                        .into(),
                    // NLB is a zeroes-based block count.
                    cdw12: nvm::Cdw12ReadWrite::new()
                        .with_nlb_z((block_count - 1) as u16)
                        .with_fua(fua)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
                },
                guest_memory,
                mem.subrange(0, len),
            )
            .await?;
        Ok(())
    }

    /// Issues a FLUSH command for the namespace.
    pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
        self.check_active()?;
        self.issuer(target_cpu)
            .await?
            .issue_neither(spec::Command {
                ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
            })
            .await?;
        Ok(())
    }

    /// Returns the maximum number of blocks per read/write request.
    pub fn max_transfer_block_count(&self) -> u32 {
        self.max_transfer_block_count
    }

    /// Returns whether the controller supports dataset management (DSM)
    /// commands.
    pub fn supports_dataset_management(&self) -> bool {
        self.controller_identify.oncs.dataset_management()
    }

    /// Returns the preferred deallocate granularity in blocks.
    pub fn preferred_deallocate_granularity(&self) -> u16 {
        self.preferred_deallocate_granularity
    }

    /// Returns the maximum number of ranges per deallocate (DSM) request.
    pub fn dataset_management_range_limit(&self) -> usize {
        // The DSM NR field is an 8-bit zeroes-based count, so at most 256
        // ranges fit in one command.
        256
    }

    /// Returns the maximum length in blocks of a single deallocate range.
    pub fn dataset_management_range_size_limit(&self) -> u32 {
        // The DSM range length field is a full 32-bit block count.
        u32::MAX
    }

    /// Issues a DSM deallocate (trim) for `ranges`, truncated to the
    /// per-command range limit.
    ///
    /// NOTE(review): `ranges.len() - 1` below underflows if `ranges` is
    /// empty — callers must pass at least one range; confirm at call sites.
    pub async fn deallocate(
        &self,
        target_cpu: u32,
        ranges: &[nvm::DsmRange],
    ) -> Result<(), RequestError> {
        self.check_active()?;
        // Silently truncate to the 256-range command limit.
        let ranges = &ranges[..ranges.len().min(256)];
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    // NR is a zeroes-based range count.
                    cdw10: nvm::Cdw10Dsm::new()
                        .with_nr_z((ranges.len() - 1) as u8)
                        .into(),
                    // AD = attribute deallocate.
                    cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
                    ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
                },
                ranges.as_bytes(),
            )
            .await?;
        Ok(())
    }

    /// Returns the namespace's reservation capabilities (zeroed if the
    /// controller does not support reservations).
    pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
        self.reservation_capabilities
    }

    /// Issues an extended reservation report, returning the report header
    /// and the list of registered controllers.
    pub async fn reservation_report_extended(
        &self,
        target_cpu: u32,
    ) -> Result<
        (
            nvm::ReservationReportExtended,
            Vec<nvm::RegisteredControllerExtended>,
        ),
        RequestError,
    > {
        let mut data = vec![0; 4096];
        let issuer = self.issuer(target_cpu).await?;
        loop {
            issuer
                .issue_out(
                    spec::Command {
                        // NUMD is a zeroes-based count of dwords.
                        cdw10: nvm::Cdw10ReservationReport::new()
                            .with_numd_z((data.len() / 4 - 1) as u32)
                            .into(),
                        // EDS requests the extended data structure.
                        cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
                        ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
                    },
                    &mut data,
                )
                .await?;

            // The buffer starts at 4096 bytes and only grows, so the header
            // prefix read cannot fail.
            let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
                .unwrap()
                .0;
            let len = size_of_val(&header)
                + header.report.regctl.get() as usize
                    * size_of::<nvm::RegisteredControllerExtended>();

            // If the full report did not fit, grow the buffer and reissue.
            if len > data.len() {
                data.resize(len, 0);
                continue;
            }

            let mut controllers = vec![
                nvm::RegisteredControllerExtended::new_zeroed();
                header.report.regctl.get().into()
            ];

            controllers
                .as_mut_bytes()
                .copy_from_slice(&data[size_of_val(&header)..len]);

            break Ok((header, controllers));
        }
    }

    /// Issues a reservation acquire command with the given action, current
    /// key, preempt key, and reservation type.
    pub async fn reservation_acquire(
        &self,
        target_cpu: u32,
        action: nvm::ReservationAcquireAction,
        crkey: u64,
        prkey: u64,
        reservation_type: nvm::ReservationType,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationAcquire { crkey, prkey };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationAcquire::new()
                        .with_racqa(action.0)
                        .with_rtype(reservation_type.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }

    /// Issues a reservation release command with the given action, current
    /// key, and reservation type.
    pub async fn reservation_release(
        &self,
        target_cpu: u32,
        action: nvm::ReservationReleaseAction,
        crkey: u64,
        reservation_type: nvm::ReservationType,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationRelease { crkey };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationRelease::new()
                        .with_rrela(action.0)
                        .with_rtype(reservation_type.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }

    /// Issues a reservation register command.
    ///
    /// When `crkey` is `None`, the command sets IEKEY so the current key is
    /// ignored. `ptpl` selects the persist-through-power-loss change:
    /// `None` = no change, `Some(false)` = clear, `Some(true)` = set.
    pub async fn reservation_register(
        &self,
        target_cpu: u32,
        action: nvm::ReservationRegisterAction,
        crkey: Option<u64>,
        nrkey: u64,
        ptpl: Option<bool>,
    ) -> Result<(), RequestError> {
        let data = nvm::ReservationRegister {
            crkey: crkey.unwrap_or(0),
            nrkey,
        };
        let cptpl = match ptpl {
            None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
            Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
            Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
        };
        self.issuer(target_cpu)
            .await?
            .issue_in(
                spec::Command {
                    cdw10: nvm::Cdw10ReservationRegister::new()
                        .with_rrega(action.0)
                        .with_iekey(crkey.is_none())
                        .with_cptpl(cptpl.0)
                        .into(),
                    ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
                },
                data.as_bytes(),
            )
            .await?;

        Ok(())
    }

    /// Returns the namespace ID.
    pub fn nsid(&self) -> u32 {
        self.nsid
    }

    /// Saves the namespace state (NSID plus the latest identify data) for
    /// servicing.
    pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
        Ok(SavedNamespaceData {
            nsid: self.nsid,
            identify_ns: self.state.identify.lock().clone(),
        })
    }

    /// Restores a namespace from saved state without re-querying the
    /// controller, re-validating the saved identify data.
    pub(super) fn restore(
        driver: &VmTaskDriver,
        admin: Arc<Issuer>,
        rescan_event: mesh::Receiver<()>,
        identify_ctrl: Arc<spec::IdentifyController>,
        io_issuers: &Arc<IoIssuers>,
        saved_state: &SavedNamespaceData,
    ) -> Result<Self, NamespaceError> {
        let SavedNamespaceData { nsid, identify_ns } = saved_state;

        Namespace::new_from_identify(
            driver,
            admin,
            rescan_event,
            identify_ctrl.clone(),
            io_issuers,
            *nsid,
            identify_ns.clone(),
        )
    }
}
587
impl DynamicState {
    /// Background task: waits for rescan notifications and refreshes the
    /// namespace's identify data, detecting hot removal and resizes.
    ///
    /// Exits when the rescan channel is closed.
    async fn poll_for_rescans(
        &self,
        admin: &Issuer,
        nsid: u32,
        mut rescan_event: mesh::Receiver<()>,
    ) {
        loop {
            tracing::debug!("rescan task started nsid={}", nsid);

            // Wait for the next rescan notification from the driver.
            let event = rescan_event.next().await;

            // A closed channel means the sender is gone; stop polling.
            if event.is_none() {
                tracing::debug!("rescan task exiting nsid={}", nsid);
                break;
            }

            match identify_namespace(admin, nsid).await {
                Ok(identify) => {
                    if identify.nsze == 0 {
                        // A zero size means the namespace is no longer
                        // active; mark it removed so IO fails fast.
                        tracing::info!(nsid, "namespace was hot removed");
                        self.removed.store(true, Ordering::Relaxed);
                    } else {
                        let old_block_count = self.block_count.load(Ordering::Relaxed);
                        let new_block_count = identify.nsze;
                        if old_block_count != new_block_count {
                            tracing::info!(
                                old_block_count,
                                new_block_count,
                                "nvme disk size changed"
                            );
                            self.block_count.store(new_block_count, Ordering::Relaxed);
                            // Wake all waiters in `wait_resize`.
                            self.resize_event.notify(usize::MAX);
                        } else {
                            tracing::debug!("rescanned, no change");
                        }
                    }
                    // Keep the cached identify data fresh for save/restore.
                    *self.identify.lock() = identify;
                }
                Err(err) => {
                    // Best effort: log and wait for the next notification.
                    tracing::warn!(
                        nsid,
                        error = &err as &dyn std::error::Error,
                        "failed to query namespace during rescan"
                    );
                }
            }
        }
    }
}
642
643async fn identify_namespace(
644 admin: &Issuer,
645 nsid: u32,
646) -> Result<nvm::IdentifyNamespace, RequestError> {
647 let mut identify = nvm::IdentifyNamespace::new_zeroed();
648 admin
649 .issue_out(
650 spec::Command {
651 nsid,
652 cdw10: spec::Cdw10Identify::new()
653 .with_cns(spec::Cns::NAMESPACE.0)
654 .into(),
655 ..admin_cmd(spec::AdminOpcode::IDENTIFY)
656 },
657 identify.as_mut_bytes(),
658 )
659 .await?;
660 Ok(identify)
661}
662
663fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
664 spec::Command {
665 cdw0: spec::Cdw0::new().with_opcode(opcode.0),
666 nsid,
667 ..FromZeros::new_zeroed()
668 }
669}