1use super::spec;
7use super::spec::nvm;
8use crate::NVME_PAGE_SHIFT;
9use crate::driver::IoIssuers;
10use crate::driver::save_restore::SavedNamespaceData;
11use crate::queue_pair::Issuer;
12use crate::queue_pair::RequestError;
13use crate::queue_pair::admin_cmd;
14use guestmem::GuestMemory;
15use guestmem::ranges::PagedRange;
16use inspect::Inspect;
17use mesh::CancelContext;
18use pal_async::task::Spawn;
19use parking_lot::Mutex;
20use std::sync::Arc;
21use std::sync::atomic::AtomicBool;
22use std::sync::atomic::AtomicU64;
23use std::sync::atomic::Ordering;
24use thiserror::Error;
25use vmcore::vm_task::VmTaskDriver;
26use zerocopy::FromBytes;
27use zerocopy::FromZeros;
28use zerocopy::IntoBytes;
29
30#[derive(Debug, Error)]
32#[expect(missing_docs)]
33pub enum NamespaceError {
34 #[error("namespace not found")]
35 NotFound,
36 #[error("formatted lba size invalid")]
37 FlbasInvalid,
38 #[error("lba format invalid: {0:?}")]
39 LbaFormatInvalid(nvm::Lbaf),
40 #[error("nvme request failed")]
41 Request(#[source] RequestError),
42 #[error("maximum data transfer size too small: 2^{0} pages")]
43 MdtsInvalid(u8),
44}
45
/// A handle to an NVMe namespace, used to issue I/O against a single
/// namespace ID on a controller.
#[derive(Debug, Inspect)]
pub struct Namespace {
    // Namespace ID stamped into every command issued through this handle.
    nsid: u32,
    // Mutable state shared with the background rescan task.
    #[inspect(flatten)]
    state: Arc<DynamicState>,
    // log2 of the block size in bytes (validated to be 9..=16 at creation).
    block_shift: u32,
    // Largest single-transfer size in blocks, derived from controller MDTS.
    max_transfer_block_count: u32,
    // Preferred deallocate granularity (NPDG), or 1 when not reported.
    preferred_deallocate_granularity: u16,
    reservation_capabilities: nvm::ReservationCapabilities,
    controller_identify: Arc<spec::IdentifyController>,
    // Per-CPU I/O queue issuers, shared with the rest of the driver.
    #[inspect(skip)]
    issuers: Arc<IoIssuers>,
    // Cancels the rescan poller when the namespace is dropped.
    #[inspect(skip)]
    _cancel_rescan: mesh::Cancel,
}
62
/// Namespace state that can change at runtime, updated by the rescan task.
#[derive(Debug, Inspect)]
struct DynamicState {
    // Current size of the namespace in blocks; updated on resize.
    block_count: AtomicU64,
    // Signaled (notify all) whenever `block_count` changes.
    #[inspect(skip)]
    resize_event: event_listener::Event,
    // Set when a rescan observes the namespace was hot removed.
    removed: AtomicBool,
    // Cached identify data, kept fresh for save/restore.
    identify: Mutex<nvm::IdentifyNamespace>,
}
71
72impl Namespace {
73 pub(super) async fn new(
74 driver: &VmTaskDriver,
75 admin: Arc<Issuer>,
76 rescan_event: Arc<event_listener::Event>,
77 controller_identify: Arc<spec::IdentifyController>,
78 io_issuers: &Arc<IoIssuers>,
79 nsid: u32,
80 ) -> Result<Self, NamespaceError> {
81 let identify = identify_namespace(&admin, nsid)
82 .await
83 .map_err(NamespaceError::Request)?;
84
85 Namespace::new_from_identify(
86 driver,
87 admin,
88 rescan_event,
89 controller_identify.clone(),
90 io_issuers,
91 nsid,
92 identify,
93 )
94 }
95
96 fn new_from_identify(
98 driver: &VmTaskDriver,
99 admin: Arc<Issuer>,
100 rescan_event: Arc<event_listener::Event>,
101 controller_identify: Arc<spec::IdentifyController>,
102 io_issuers: &Arc<IoIssuers>,
103 nsid: u32,
104 identify: nvm::IdentifyNamespace,
105 ) -> Result<Self, NamespaceError> {
106 if identify.nsze == 0 {
107 return Err(NamespaceError::NotFound);
108 }
109
110 let lba_format_index = identify.flbas.low_index();
111 if lba_format_index > identify.nlbaf {
112 return Err(NamespaceError::FlbasInvalid);
113 }
114
115 let lbaf = identify.lbaf[lba_format_index as usize];
116 let block_shift = lbaf.lbads();
117 if !matches!(block_shift, 9..=16) {
118 return Err(NamespaceError::LbaFormatInvalid(lbaf));
119 }
120
121 let max_transfer_block_count = {
122 let mdts = if controller_identify.mdts != 0 {
123 controller_identify.mdts
124 } else {
125 u8::MAX
126 };
127 let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
128 1 << max_transfer_bits
129 .checked_sub(block_shift)
130 .ok_or(NamespaceError::MdtsInvalid(mdts))?
131 .min(16)
132 };
133
134 let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
135 identify.npdg
136 } else {
137 1
138 };
139
140 let reservation_capabilities = if controller_identify.oncs.reservations() {
141 identify.rescap
142 } else {
143 nvm::ReservationCapabilities::new()
144 };
145
146 let state = Arc::new(DynamicState {
147 block_count: identify.nsze.into(),
148 removed: false.into(),
149 identify: Mutex::new(identify),
150 resize_event: Default::default(),
151 });
152
153 let (mut ctx, cancel_rescan) = CancelContext::new().with_cancel();
158 driver
159 .spawn(format!("nvme_poll_rescan_{nsid}"), {
160 let state = state.clone();
161 async move {
162 state
163 .poll_for_rescans(&mut ctx, &admin, nsid, &rescan_event)
164 .await
165 }
166 })
167 .detach();
168
169 Ok(Self {
170 nsid,
171 state,
172 max_transfer_block_count,
173 block_shift: block_shift.into(),
174 preferred_deallocate_granularity,
175 reservation_capabilities,
176 controller_identify,
177 issuers: io_issuers.clone(),
178 _cancel_rescan: cancel_rescan,
179 })
180 }
181
182 pub fn block_count(&self) -> u64 {
184 self.state.block_count.load(Ordering::Relaxed)
185 }
186
187 pub async fn wait_resize(&self, block_count: u64) -> u64 {
189 loop {
190 let listen = self.state.resize_event.listen();
191 let current = self.block_count();
192 if current != block_count {
193 break current;
194 }
195 listen.await;
196 }
197 }
198
199 pub fn block_size(&self) -> u32 {
201 1 << self.block_shift
202 }
203
204 fn check_active(&self) -> Result<(), RequestError> {
205 if self.state.removed.load(Ordering::Relaxed) {
206 return Err(RequestError::Nvme(
210 spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
211 ));
212 }
213 Ok(())
214 }
215
216 async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
217 self.issuers.get(cpu).await
218 }
219
220 pub async fn read(
222 &self,
223 target_cpu: u32,
224 lba: u64,
225 block_count: u32,
226 guest_memory: &GuestMemory,
227 mem: PagedRange<'_>,
228 ) -> Result<(), RequestError> {
229 self.check_active()?;
230 if block_count == 0 {
231 return Ok(());
232 }
233 assert!(block_count <= self.max_transfer_block_count);
234 let len = (block_count as usize) << self.block_shift;
235 if len > mem.len() {
236 panic!(
237 "invalid block count: {len} > {mem_len}",
238 mem_len = mem.len()
239 );
240 }
241 self.issuer(target_cpu)
242 .await?
243 .issue_external(
244 spec::Command {
245 cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
246 cdw11: nvm::Cdw11ReadWrite::new()
247 .with_sbla_high((lba >> 32) as u32)
248 .into(),
249 cdw12: nvm::Cdw12ReadWrite::new()
250 .with_nlb_z((block_count - 1) as u16)
251 .into(),
252 ..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
253 },
254 guest_memory,
255 mem.subrange(0, len),
256 )
257 .await?;
258 Ok(())
259 }
260
261 pub async fn write(
263 &self,
264 target_cpu: u32,
265 lba: u64,
266 block_count: u32,
267 fua: bool,
268 guest_memory: &GuestMemory,
269 mem: PagedRange<'_>,
270 ) -> Result<(), RequestError> {
271 self.check_active()?;
272 if block_count == 0 {
273 return Ok(());
274 }
275 assert!(block_count <= self.max_transfer_block_count);
276 let len = (block_count as usize) << self.block_shift;
277 if len > mem.len() {
278 panic!(
279 "invalid block count: {len} > {mem_len}",
280 mem_len = mem.len()
281 );
282 }
283 self.issuer(target_cpu)
284 .await?
285 .issue_external(
286 spec::Command {
287 cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
288 cdw11: nvm::Cdw11ReadWrite::new()
289 .with_sbla_high((lba >> 32) as u32)
290 .into(),
291 cdw12: nvm::Cdw12ReadWrite::new()
292 .with_nlb_z((block_count - 1) as u16)
293 .with_fua(fua)
294 .into(),
295 ..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
296 },
297 guest_memory,
298 mem.subrange(0, len),
299 )
300 .await?;
301 Ok(())
302 }
303
304 pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
306 self.check_active()?;
307 self.issuer(target_cpu)
308 .await?
309 .issue_neither(spec::Command {
310 ..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
311 })
312 .await?;
313 Ok(())
314 }
315
316 pub fn max_transfer_block_count(&self) -> u32 {
318 self.max_transfer_block_count
319 }
320
321 pub fn supports_dataset_management(&self) -> bool {
324 self.controller_identify.oncs.dataset_management()
325 }
326
327 pub fn preferred_deallocate_granularity(&self) -> u16 {
329 self.preferred_deallocate_granularity
330 }
331
332 pub fn dataset_management_range_limit(&self) -> usize {
334 256
336 }
337
338 pub fn dataset_management_range_size_limit(&self) -> u32 {
341 u32::MAX
343 }
344
345 pub async fn deallocate(
352 &self,
353 target_cpu: u32,
354 ranges: &[nvm::DsmRange],
355 ) -> Result<(), RequestError> {
356 self.check_active()?;
357 let ranges = &ranges[..ranges.len().min(256)];
359 self.issuer(target_cpu)
360 .await?
361 .issue_in(
362 spec::Command {
363 cdw10: nvm::Cdw10Dsm::new()
364 .with_nr_z((ranges.len() - 1) as u8)
365 .into(),
366 cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
367 ..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
368 },
369 ranges.as_bytes(),
370 )
371 .await?;
372 Ok(())
373 }
374
375 pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
377 self.reservation_capabilities
378 }
379
380 pub async fn reservation_report_extended(
382 &self,
383 target_cpu: u32,
384 ) -> Result<
385 (
386 nvm::ReservationReportExtended,
387 Vec<nvm::RegisteredControllerExtended>,
388 ),
389 RequestError,
390 > {
391 let mut data = vec![0; 4096];
392 let issuer = self.issuer(target_cpu).await?;
393 loop {
394 issuer
395 .issue_out(
396 spec::Command {
397 cdw10: nvm::Cdw10ReservationReport::new()
398 .with_numd_z((data.len() / 4 - 1) as u32)
399 .into(),
400 cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
401 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
402 },
403 &mut data,
404 )
405 .await?;
406
407 let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
408 .unwrap()
409 .0; let len = size_of_val(&header)
411 + header.report.regctl.get() as usize
412 * size_of::<nvm::RegisteredControllerExtended>();
413
414 if len > data.len() {
415 data.resize(len, 0);
416 continue;
417 }
418
419 let mut controllers = vec![
420 nvm::RegisteredControllerExtended::new_zeroed();
421 header.report.regctl.get().into()
422 ];
423
424 controllers
425 .as_mut_bytes()
426 .copy_from_slice(&data[size_of_val(&header)..len]);
427
428 break Ok((header, controllers));
429 }
430 }
431
432 pub async fn reservation_acquire(
434 &self,
435 target_cpu: u32,
436 action: nvm::ReservationAcquireAction,
437 crkey: u64,
438 prkey: u64,
439 reservation_type: nvm::ReservationType,
440 ) -> Result<(), RequestError> {
441 let data = nvm::ReservationAcquire { crkey, prkey };
442 self.issuer(target_cpu)
443 .await?
444 .issue_in(
445 spec::Command {
446 cdw10: nvm::Cdw10ReservationAcquire::new()
447 .with_racqa(action.0)
448 .with_rtype(reservation_type.0)
449 .into(),
450 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
451 },
452 data.as_bytes(),
453 )
454 .await?;
455
456 Ok(())
457 }
458
459 pub async fn reservation_release(
461 &self,
462 target_cpu: u32,
463 action: nvm::ReservationReleaseAction,
464 crkey: u64,
465 reservation_type: nvm::ReservationType,
466 ) -> Result<(), RequestError> {
467 let data = nvm::ReservationRelease { crkey };
468 self.issuer(target_cpu)
469 .await?
470 .issue_in(
471 spec::Command {
472 cdw10: nvm::Cdw10ReservationRelease::new()
473 .with_rrela(action.0)
474 .with_rtype(reservation_type.0)
475 .into(),
476 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
477 },
478 data.as_bytes(),
479 )
480 .await?;
481
482 Ok(())
483 }
484
485 pub async fn reservation_register(
487 &self,
488 target_cpu: u32,
489 action: nvm::ReservationRegisterAction,
490 crkey: Option<u64>,
491 nrkey: u64,
492 ptpl: Option<bool>,
493 ) -> Result<(), RequestError> {
494 let data = nvm::ReservationRegister {
495 crkey: crkey.unwrap_or(0),
496 nrkey,
497 };
498 let cptpl = match ptpl {
499 None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
500 Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
501 Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
502 };
503 self.issuer(target_cpu)
504 .await?
505 .issue_in(
506 spec::Command {
507 cdw10: nvm::Cdw10ReservationRegister::new()
508 .with_rrega(action.0)
509 .with_iekey(crkey.is_none())
510 .with_cptpl(cptpl.0)
511 .into(),
512 ..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
513 },
514 data.as_bytes(),
515 )
516 .await?;
517
518 Ok(())
519 }
520
521 pub fn nsid(&self) -> u32 {
523 self.nsid
524 }
525
526 pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
533 Ok(SavedNamespaceData {
534 nsid: self.nsid,
535 identify_ns: self.state.identify.lock().clone(),
536 })
537 }
538
539 pub(super) fn restore(
541 driver: &VmTaskDriver,
542 admin: Arc<Issuer>,
543 rescan_event: Arc<event_listener::Event>,
544 identify_ctrl: Arc<spec::IdentifyController>,
545 io_issuers: &Arc<IoIssuers>,
546 saved_state: &SavedNamespaceData,
547 ) -> Result<Self, NamespaceError> {
548 let SavedNamespaceData { nsid, identify_ns } = saved_state;
549
550 Namespace::new_from_identify(
551 driver,
552 admin,
553 rescan_event,
554 identify_ctrl.clone(),
555 io_issuers,
556 *nsid,
557 identify_ns.clone(),
558 )
559 }
560}
561
562impl DynamicState {
563 async fn poll_for_rescans(
564 &self,
565 ctx: &mut CancelContext,
566 admin: &Issuer,
567 nsid: u32,
568 rescan_event: &event_listener::Event,
569 ) {
570 loop {
571 let listen = rescan_event.listen();
572 tracing::debug!("rescan");
573 match identify_namespace(admin, nsid).await {
576 Ok(identify) => {
577 if identify.nsze == 0 {
578 tracing::info!(nsid, "namespace was hot removed");
579 self.removed.store(true, Ordering::Relaxed);
580 } else {
581 let old_block_count = self.block_count.load(Ordering::Relaxed);
582 let new_block_count = identify.nsze;
583 if old_block_count != new_block_count {
584 tracing::info!(
585 old_block_count,
586 new_block_count,
587 "nvme disk size changed"
588 );
589 self.block_count.store(new_block_count, Ordering::Relaxed);
590 self.resize_event.notify(usize::MAX);
591 } else {
592 tracing::debug!("rescanned, no change");
593 }
594 }
595 *self.identify.lock() = identify;
596 }
597 Err(err) => {
598 tracing::warn!(
599 nsid,
600 error = &err as &dyn std::error::Error,
601 "failed to query namespace during rescan"
602 );
603 }
604 }
605
606 if ctx.until_cancelled(listen).await.is_err() {
607 break;
608 }
609 }
610 }
611}
612
613async fn identify_namespace(
614 admin: &Issuer,
615 nsid: u32,
616) -> Result<nvm::IdentifyNamespace, RequestError> {
617 let mut identify = nvm::IdentifyNamespace::new_zeroed();
618 admin
619 .issue_out(
620 spec::Command {
621 nsid,
622 cdw10: spec::Cdw10Identify::new()
623 .with_cns(spec::Cns::NAMESPACE.0)
624 .into(),
625 ..admin_cmd(spec::AdminOpcode::IDENTIFY)
626 },
627 identify.as_mut_bytes(),
628 )
629 .await?;
630 Ok(identify)
631}
632
633fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
634 spec::Command {
635 cdw0: spec::Cdw0::new().with_opcode(opcode.0),
636 nsid,
637 ..FromZeros::new_zeroed()
638 }
639}