1use super::spec;
7use super::spec::nvm;
8use crate::NVME_PAGE_SHIFT;
9use crate::driver::IoIssuers;
10use crate::driver::save_restore::SavedNamespaceData;
11use crate::queue_pair::Issuer;
12use crate::queue_pair::RequestError;
13use crate::queue_pair::admin_cmd;
14use futures::StreamExt;
15use guestmem::GuestMemory;
16use guestmem::ranges::PagedRange;
17use inspect::Inspect;
18use pal_async::task::Spawn;
19use parking_lot::Mutex;
20use std::sync::Arc;
21use std::sync::atomic::AtomicBool;
22use std::sync::atomic::AtomicU64;
23use std::sync::atomic::Ordering;
24use thiserror::Error;
25use vmcore::vm_task::VmTaskDriver;
26use zerocopy::FromBytes;
27use zerocopy::FromZeros;
28use zerocopy::IntoBytes;
29
// Errors that can occur while opening or validating an NVMe namespace.
// (Plain `//` comments are used here so the `#[expect(missing_docs)]`
// expectation below remains fulfilled.)
#[derive(Debug, Error)]
#[expect(missing_docs)]
pub enum NamespaceError {
    // Identify returned nsze == 0: the namespace does not exist.
    #[error("namespace not found")]
    NotFound,
    // The formatted LBA size index (flbas) exceeds the number of formats (nlbaf).
    #[error("formatted lba size invalid")]
    FlbasInvalid,
    // The selected LBA format has an unsupported data size (outside 2^9..=2^16).
    #[error("lba format invalid: {0:?}")]
    LbaFormatInvalid(nvm::Lbaf),
    // An admin command to the controller failed.
    #[error("nvme request failed")]
    Request(#[source] RequestError),
    // MDTS is smaller than one block, so no transfer could ever fit.
    #[error("maximum data transfer size too small: 2^{0} pages")]
    MdtsInvalid(u8),
    // A namespace with this ID has already been opened.
    #[error("namespace ID {nsid} already exists")]
    DuplicateRequest { nsid: u32 },
}
47
/// An open NVMe namespace, used to issue I/O (read/write/flush/deallocate)
/// and reservation commands against a single namespace ID.
#[derive(Debug, Inspect)]
pub struct Namespace {
    /// The namespace ID this handle targets.
    nsid: u32,
    /// State that can change at runtime via rescan (size, removal, identify
    /// data); shared with the background rescan task.
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    /// log2 of the block size in bytes (validated to be in 9..=16).
    block_shift: u32,
    /// Maximum blocks per transfer, derived from the controller's MDTS.
    max_transfer_block_count: u32,
    /// Preferred deallocate granularity in blocks (npdg if optperf is set,
    /// otherwise 1).
    preferred_deallocate_granularity: u16,
    /// Reservation capabilities (zeroed when the controller does not
    /// support reservations).
    reservation_capabilities: nvm::ReservationCapabilities,
    /// Cached controller identify data.
    controller_identify: Arc<spec::IdentifyController>,
    /// Per-CPU I/O queue issuers.
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
}
62
/// Namespace state that may change at runtime, updated by the rescan task
/// and read by I/O paths.
#[derive(Debug, Inspect)]
struct DynamicState {
    /// Current size of the namespace in blocks (nsze); updated on resize.
    block_count: AtomicU64,
    /// Notified (after `block_count` is updated) when the size changes, so
    /// `wait_resize` callers wake up.
    #[inspect(skip)]
    resize_event: event_listener::Event,
    /// Set when a rescan observes nsze == 0, i.e. the namespace was hot
    /// removed; subsequent I/O fails with INVALID_NAMESPACE_OR_FORMAT.
    removed: AtomicBool,
    /// Latest identify data, refreshed on each successful rescan; used to
    /// save namespace state across servicing.
    identify: Mutex<nvm::IdentifyNamespace>,
}
71
72impl Namespace {
73 pub(super) async fn new(
74 driver: &VmTaskDriver,
75 admin: Arc<Issuer>,
76 rescan_event: mesh::Receiver<()>,
77 controller_identify: Arc<spec::IdentifyController>,
78 io_issuers: &Arc<IoIssuers>,
79 nsid: u32,
80 ) -> Result<Self, NamespaceError> {
81 let identify = identify_namespace(&admin, nsid)
82 .await
83 .map_err(NamespaceError::Request)?;
84
85 Namespace::new_from_identify(
86 driver,
87 admin,
88 rescan_event,
89 controller_identify.clone(),
90 io_issuers,
91 nsid,
92 identify,
93 )
94 }
95
96 fn new_from_identify(
98 driver: &VmTaskDriver,
99 admin: Arc<Issuer>,
100 rescan_event: mesh::Receiver<()>,
101 controller_identify: Arc<spec::IdentifyController>,
102 io_issuers: &Arc<IoIssuers>,
103 nsid: u32,
104 identify: nvm::IdentifyNamespace,
105 ) -> Result<Self, NamespaceError> {
106 if identify.nsze == 0 {
107 return Err(NamespaceError::NotFound);
108 }
109
110 let lba_format_index = identify.flbas.low_index();
111 if lba_format_index > identify.nlbaf {
112 return Err(NamespaceError::FlbasInvalid);
113 }
114
115 let lbaf = identify.lbaf[lba_format_index as usize];
116 let block_shift = lbaf.lbads();
117 if !matches!(block_shift, 9..=16) {
118 return Err(NamespaceError::LbaFormatInvalid(lbaf));
119 }
120
121 let max_transfer_block_count = {
122 let mdts = if controller_identify.mdts != 0 {
123 controller_identify.mdts
124 } else {
125 u8::MAX
126 };
127 let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
128 1 << max_transfer_bits
129 .checked_sub(block_shift)
130 .ok_or(NamespaceError::MdtsInvalid(mdts))?
131 .min(16)
132 };
133
134 let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
135 identify.npdg
136 } else {
137 1
138 };
139
140 let reservation_capabilities = if controller_identify.oncs.reservations() {
141 identify.rescap
142 } else {
143 nvm::ReservationCapabilities::new()
144 };
145
146 let state = Arc::new(DynamicState {
147 block_count: identify.nsze.into(),
148 removed: false.into(),
149 identify: Mutex::new(identify),
150 resize_event: Default::default(),
151 });
152
153 driver
160 .spawn(format!("nvme_poll_rescan_{nsid}"), {
161 let state = state.clone();
162 async move { state.poll_for_rescans(&admin, nsid, rescan_event).await }
163 })
164 .detach();
165
166 Ok(Self {
167 nsid,
168 state,
169 max_transfer_block_count,
170 block_shift: block_shift.into(),
171 preferred_deallocate_granularity,
172 reservation_capabilities,
173 controller_identify,
174 issuers: io_issuers.clone(),
175 })
176 }
177
178 pub fn block_count(&self) -> u64 {
180 self.state.block_count.load(Ordering::Relaxed)
181 }
182
183 pub async fn wait_resize(&self, block_count: u64) -> u64 {
185 loop {
186 let listen = self.state.resize_event.listen();
187 let current = self.block_count();
188 if current != block_count {
189 break current;
190 }
191 listen.await;
192 }
193 }
194
195 pub fn block_size(&self) -> u32 {
197 1 << self.block_shift
198 }
199
200 fn check_active(&self) -> Result<(), RequestError> {
201 if self.state.removed.load(Ordering::Relaxed) {
202 return Err(RequestError::Nvme(
206 spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
207 ));
208 }
209 Ok(())
210 }
211
212 async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
213 self.issuers.get(cpu).await
214 }
215
216 pub async fn read(
218 &self,
219 target_cpu: u32,
220 lba: u64,
221 block_count: u32,
222 guest_memory: &GuestMemory,
223 mem: PagedRange<'_>,
224 ) -> Result<(), RequestError> {
225 self.check_active()?;
226 if block_count == 0 {
227 return Ok(());
228 }
229 assert!(block_count <= self.max_transfer_block_count);
230 let len = (block_count as usize) << self.block_shift;
231 if len > mem.len() {
232 panic!(
233 "invalid block count: {len} > {mem_len}",
234 mem_len = mem.len()
235 );
236 }
237 self.issuer(target_cpu)
238 .await?
239 .issue_external(
240 spec::Command {
241 cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
242 cdw11: nvm::Cdw11ReadWrite::new()
243 .with_sbla_high((lba >> 32) as u32)
244 .into(),
245 cdw12: nvm::Cdw12ReadWrite::new()
246 .with_nlb_z((block_count - 1) as u16)
247 .into(),
248 ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
249 },
250 guest_memory,
251 mem.subrange(0, len),
252 )
253 .await?;
254 Ok(())
255 }
256
257 pub async fn write(
259 &self,
260 target_cpu: u32,
261 lba: u64,
262 block_count: u32,
263 fua: bool,
264 guest_memory: &GuestMemory,
265 mem: PagedRange<'_>,
266 ) -> Result<(), RequestError> {
267 self.check_active()?;
268 if block_count == 0 {
269 return Ok(());
270 }
271 assert!(block_count <= self.max_transfer_block_count);
272 let len = (block_count as usize) << self.block_shift;
273 if len > mem.len() {
274 panic!(
275 "invalid block count: {len} > {mem_len}",
276 mem_len = mem.len()
277 );
278 }
279 self.issuer(target_cpu)
280 .await?
281 .issue_external(
282 spec::Command {
283 cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
284 cdw11: nvm::Cdw11ReadWrite::new()
285 .with_sbla_high((lba >> 32) as u32)
286 .into(),
287 cdw12: nvm::Cdw12ReadWrite::new()
288 .with_nlb_z((block_count - 1) as u16)
289 .with_fua(fua)
290 .into(),
291 ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
292 },
293 guest_memory,
294 mem.subrange(0, len),
295 )
296 .await?;
297 Ok(())
298 }
299
300 pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
302 self.check_active()?;
303 self.issuer(target_cpu)
304 .await?
305 .issue_neither(spec::Command {
306 ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
307 })
308 .await?;
309 Ok(())
310 }
311
312 pub fn max_transfer_block_count(&self) -> u32 {
314 self.max_transfer_block_count
315 }
316
317 pub fn supports_dataset_management(&self) -> bool {
320 self.controller_identify.oncs.dataset_management()
321 }
322
323 pub fn preferred_deallocate_granularity(&self) -> u16 {
325 self.preferred_deallocate_granularity
326 }
327
328 pub fn dataset_management_range_limit(&self) -> usize {
330 256
332 }
333
334 pub fn dataset_management_range_size_limit(&self) -> u32 {
337 u32::MAX
339 }
340
341 pub async fn deallocate(
348 &self,
349 target_cpu: u32,
350 ranges: &[nvm::DsmRange],
351 ) -> Result<(), RequestError> {
352 self.check_active()?;
353 let ranges = &ranges[..ranges.len().min(256)];
355 self.issuer(target_cpu)
356 .await?
357 .issue_in(
358 spec::Command {
359 cdw10: nvm::Cdw10Dsm::new()
360 .with_nr_z((ranges.len() - 1) as u8)
361 .into(),
362 cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
363 ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
364 },
365 ranges.as_bytes(),
366 )
367 .await?;
368 Ok(())
369 }
370
371 pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
373 self.reservation_capabilities
374 }
375
376 pub async fn reservation_report_extended(
378 &self,
379 target_cpu: u32,
380 ) -> Result<
381 (
382 nvm::ReservationReportExtended,
383 Vec<nvm::RegisteredControllerExtended>,
384 ),
385 RequestError,
386 > {
387 let mut data = vec![0; 4096];
388 let issuer = self.issuer(target_cpu).await?;
389 loop {
390 issuer
391 .issue_out(
392 spec::Command {
393 cdw10: nvm::Cdw10ReservationReport::new()
394 .with_numd_z((data.len() / 4 - 1) as u32)
395 .into(),
396 cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
397 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
398 },
399 &mut data,
400 )
401 .await?;
402
403 let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
404 .unwrap()
405 .0; let len = size_of_val(&header)
407 + header.report.regctl.get() as usize
408 * size_of::<nvm::RegisteredControllerExtended>();
409
410 if len > data.len() {
411 data.resize(len, 0);
412 continue;
413 }
414
415 let mut controllers = vec![
416 nvm::RegisteredControllerExtended::new_zeroed();
417 header.report.regctl.get().into()
418 ];
419
420 controllers
421 .as_mut_bytes()
422 .copy_from_slice(&data[size_of_val(&header)..len]);
423
424 break Ok((header, controllers));
425 }
426 }
427
428 pub async fn reservation_acquire(
430 &self,
431 target_cpu: u32,
432 action: nvm::ReservationAcquireAction,
433 crkey: u64,
434 prkey: u64,
435 reservation_type: nvm::ReservationType,
436 ) -> Result<(), RequestError> {
437 let data = nvm::ReservationAcquire { crkey, prkey };
438 self.issuer(target_cpu)
439 .await?
440 .issue_in(
441 spec::Command {
442 cdw10: nvm::Cdw10ReservationAcquire::new()
443 .with_racqa(action.0)
444 .with_rtype(reservation_type.0)
445 .into(),
446 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
447 },
448 data.as_bytes(),
449 )
450 .await?;
451
452 Ok(())
453 }
454
455 pub async fn reservation_release(
457 &self,
458 target_cpu: u32,
459 action: nvm::ReservationReleaseAction,
460 crkey: u64,
461 reservation_type: nvm::ReservationType,
462 ) -> Result<(), RequestError> {
463 let data = nvm::ReservationRelease { crkey };
464 self.issuer(target_cpu)
465 .await?
466 .issue_in(
467 spec::Command {
468 cdw10: nvm::Cdw10ReservationRelease::new()
469 .with_rrela(action.0)
470 .with_rtype(reservation_type.0)
471 .into(),
472 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
473 },
474 data.as_bytes(),
475 )
476 .await?;
477
478 Ok(())
479 }
480
481 pub async fn reservation_register(
483 &self,
484 target_cpu: u32,
485 action: nvm::ReservationRegisterAction,
486 crkey: Option<u64>,
487 nrkey: u64,
488 ptpl: Option<bool>,
489 ) -> Result<(), RequestError> {
490 let data = nvm::ReservationRegister {
491 crkey: crkey.unwrap_or(0),
492 nrkey,
493 };
494 let cptpl = match ptpl {
495 None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
496 Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
497 Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
498 };
499 self.issuer(target_cpu)
500 .await?
501 .issue_in(
502 spec::Command {
503 cdw10: nvm::Cdw10ReservationRegister::new()
504 .with_rrega(action.0)
505 .with_iekey(crkey.is_none())
506 .with_cptpl(cptpl.0)
507 .into(),
508 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
509 },
510 data.as_bytes(),
511 )
512 .await?;
513
514 Ok(())
515 }
516
517 pub fn nsid(&self) -> u32 {
519 self.nsid
520 }
521
522 pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
527 Ok(SavedNamespaceData {
528 nsid: self.nsid,
529 identify_ns: self.state.identify.lock().clone(),
530 })
531 }
532
533 pub(super) fn restore(
535 driver: &VmTaskDriver,
536 admin: Arc<Issuer>,
537 rescan_event: mesh::Receiver<()>,
538 identify_ctrl: Arc<spec::IdentifyController>,
539 io_issuers: &Arc<IoIssuers>,
540 saved_state: &SavedNamespaceData,
541 ) -> Result<Self, NamespaceError> {
542 let SavedNamespaceData { nsid, identify_ns } = saved_state;
543
544 Namespace::new_from_identify(
545 driver,
546 admin,
547 rescan_event,
548 identify_ctrl.clone(),
549 io_issuers,
550 *nsid,
551 identify_ns.clone(),
552 )
553 }
554}
555
impl DynamicState {
    /// Background task body: waits for rescan notifications and refreshes
    /// the namespace's identify data, detecting hot resize and hot removal.
    /// Exits when the notification channel is closed.
    async fn poll_for_rescans(
        &self,
        admin: &Issuer,
        nsid: u32,
        mut rescan_event: mesh::Receiver<()>,
    ) {
        loop {
            tracing::debug!("rescan");

            // Wait for the next rescan notification.
            let event = rescan_event.next().await;

            // `None` means the sender side was dropped; stop polling.
            if event.is_none() {
                tracing::debug!("rescan task exiting");
                break;
            }

            match identify_namespace(admin, nsid).await {
                Ok(identify) => {
                    // nsze == 0 is how the controller reports that the
                    // namespace no longer exists.
                    if identify.nsze == 0 {
                        tracing::info!(nsid, "namespace was hot removed");
                        self.removed.store(true, Ordering::Relaxed);
                    } else {
                        let old_block_count = self.block_count.load(Ordering::Relaxed);
                        let new_block_count = identify.nsze;
                        if old_block_count != new_block_count {
                            tracing::info!(
                                old_block_count,
                                new_block_count,
                                "nvme disk size changed"
                            );
                            // Publish the new size before waking waiters so
                            // `wait_resize` observes the updated count.
                            self.block_count.store(new_block_count, Ordering::Relaxed);
                            self.resize_event.notify(usize::MAX);
                        } else {
                            tracing::debug!("rescanned, no change");
                        }
                    }
                    // Cache the latest identify data for save/restore.
                    *self.identify.lock() = identify;
                }
                Err(err) => {
                    // Best effort: log and keep polling on query failure.
                    tracing::warn!(
                        nsid,
                        error = &err as &dyn std::error::Error,
                        "failed to query namespace during rescan"
                    );
                }
            }
        }
    }
}
610
611async fn identify_namespace(
612 admin: &Issuer,
613 nsid: u32,
614) -> Result<nvm::IdentifyNamespace, RequestError> {
615 let mut identify = nvm::IdentifyNamespace::new_zeroed();
616 admin
617 .issue_out(
618 spec::Command {
619 nsid,
620 cdw10: spec::Cdw10Identify::new()
621 .with_cns(spec::Cns::NAMESPACE.0)
622 .into(),
623 ..admin_cmd(spec::AdminOpcode::IDENTIFY)
624 },
625 identify.as_mut_bytes(),
626 )
627 .await?;
628 Ok(identify)
629}
630
631fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
632 spec::Command {
633 cdw0: spec::Cdw0::new().with_opcode(opcode.0),
634 nsid,
635 ..FromZeros::new_zeroed()
636 }
637}