use super::spec;
use super::spec::nvm;
use crate::NVME_PAGE_SHIFT;
use crate::driver::IoIssuers;
use crate::driver::save_restore::SavedNamespaceData;
use crate::queue_pair::Issuer;
use crate::queue_pair::RequestError;
use crate::queue_pair::admin_cmd;
use guestmem::GuestMemory;
use guestmem::ranges::PagedRange;
use inspect::Inspect;
use mesh::CancelContext;
use pal_async::task::Spawn;
use parking_lot::Mutex;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use thiserror::Error;
use vmcore::vm_task::VmTaskDriver;
use zerocopy::FromBytes;
use zerocopy::FromZeros;
use zerocopy::IntoBytes;
#[derive(Debug, Error)]
#[expect(missing_docs)]
pub enum NamespaceError {
#[error("namespace not found")]
NotFound,
#[error("formatted lba size invalid")]
FlbasInvalid,
#[error("lba format invalid: {0:?}")]
LbaFormatInvalid(nvm::Lbaf),
#[error("nvme request failed")]
Request(#[source] RequestError),
#[error("maximum data transfer size too small: 2^{0} pages")]
MdtsInvalid(u8),
}
#[derive(Debug, Inspect)]
pub struct Namespace {
nsid: u32,
#[inspect(flatten)]
state: Arc<DynamicState>,
block_shift: u32,
max_transfer_block_count: u32,
preferred_deallocate_granularity: u16,
reservation_capabilities: nvm::ReservationCapabilities,
controller_identify: Arc<spec::IdentifyController>,
#[inspect(skip)]
issuers: Arc<IoIssuers>,
#[inspect(skip)]
_cancel_rescan: mesh::Cancel,
}
#[derive(Debug, Inspect)]
struct DynamicState {
block_count: AtomicU64,
#[inspect(skip)]
resize_event: event_listener::Event,
removed: AtomicBool,
identify: Mutex<nvm::IdentifyNamespace>,
}
impl Namespace {
pub(super) async fn new(
driver: &VmTaskDriver,
admin: Arc<Issuer>,
rescan_event: Arc<event_listener::Event>,
controller_identify: Arc<spec::IdentifyController>,
io_issuers: &Arc<IoIssuers>,
nsid: u32,
) -> Result<Self, NamespaceError> {
let identify = identify_namespace(&admin, nsid)
.await
.map_err(NamespaceError::Request)?;
Namespace::new_from_identify(
driver,
admin,
rescan_event,
controller_identify.clone(),
io_issuers,
nsid,
identify,
)
}
fn new_from_identify(
driver: &VmTaskDriver,
admin: Arc<Issuer>,
rescan_event: Arc<event_listener::Event>,
controller_identify: Arc<spec::IdentifyController>,
io_issuers: &Arc<IoIssuers>,
nsid: u32,
identify: nvm::IdentifyNamespace,
) -> Result<Self, NamespaceError> {
if identify.nsze == 0 {
return Err(NamespaceError::NotFound);
}
let lba_format_index = identify.flbas.low_index();
if lba_format_index > identify.nlbaf {
return Err(NamespaceError::FlbasInvalid);
}
let lbaf = identify.lbaf[lba_format_index as usize];
let block_shift = lbaf.lbads();
if !matches!(block_shift, 9..=16) {
return Err(NamespaceError::LbaFormatInvalid(lbaf));
}
let max_transfer_block_count = {
let mdts = if controller_identify.mdts != 0 {
controller_identify.mdts
} else {
u8::MAX
};
let max_transfer_bits = mdts.saturating_add(NVME_PAGE_SHIFT);
1 << max_transfer_bits
.checked_sub(block_shift)
.ok_or(NamespaceError::MdtsInvalid(mdts))?
.min(16)
};
let preferred_deallocate_granularity = if identify.nsfeat.optperf() {
identify.npdg
} else {
1
};
let reservation_capabilities = if controller_identify.oncs.reservations() {
identify.rescap
} else {
nvm::ReservationCapabilities::new()
};
let state = Arc::new(DynamicState {
block_count: identify.nsze.into(),
removed: false.into(),
identify: Mutex::new(identify),
resize_event: Default::default(),
});
let (mut ctx, cancel_rescan) = CancelContext::new().with_cancel();
driver
.spawn(format!("nvme_poll_rescan_{nsid}"), {
let state = state.clone();
async move {
state
.poll_for_rescans(&mut ctx, &admin, nsid, &rescan_event)
.await
}
})
.detach();
Ok(Self {
nsid,
state,
max_transfer_block_count,
block_shift: block_shift.into(),
preferred_deallocate_granularity,
reservation_capabilities,
controller_identify,
issuers: io_issuers.clone(),
_cancel_rescan: cancel_rescan,
})
}
pub fn block_count(&self) -> u64 {
self.state.block_count.load(Ordering::Relaxed)
}
pub async fn wait_resize(&self, block_count: u64) -> u64 {
loop {
let listen = self.state.resize_event.listen();
let current = self.block_count();
if current != block_count {
break current;
}
listen.await;
}
}
pub fn block_size(&self) -> u32 {
1 << self.block_shift
}
fn check_active(&self) -> Result<(), RequestError> {
if self.state.removed.load(Ordering::Relaxed) {
return Err(RequestError::Nvme(
spec::Status::INVALID_NAMESPACE_OR_FORMAT.into(),
));
}
Ok(())
}
async fn issuer(&self, cpu: u32) -> Result<&Issuer, RequestError> {
self.issuers.get(cpu).await
}
pub async fn read(
&self,
target_cpu: u32,
lba: u64,
block_count: u32,
guest_memory: &GuestMemory,
mem: PagedRange<'_>,
) -> Result<(), RequestError> {
self.check_active()?;
if block_count == 0 {
return Ok(());
}
assert!(block_count <= self.max_transfer_block_count);
let len = (block_count as usize) << self.block_shift;
if len > mem.len() {
panic!(
"invalid block count: {len} > {mem_len}",
mem_len = mem.len()
);
}
self.issuer(target_cpu)
.await?
.issue_external(
spec::Command {
cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
cdw11: nvm::Cdw11ReadWrite::new()
.with_sbla_high((lba >> 32) as u32)
.into(),
cdw12: nvm::Cdw12ReadWrite::new()
.with_nlb_z((block_count - 1) as u16)
.into(),
..nvm_cmd(nvm::NvmOpcode::READ, self.nsid)
},
guest_memory,
mem.subrange(0, len),
)
.await?;
Ok(())
}
pub async fn write(
&self,
target_cpu: u32,
lba: u64,
block_count: u32,
fua: bool,
guest_memory: &GuestMemory,
mem: PagedRange<'_>,
) -> Result<(), RequestError> {
self.check_active()?;
if block_count == 0 {
return Ok(());
}
assert!(block_count <= self.max_transfer_block_count);
let len = (block_count as usize) << self.block_shift;
if len > mem.len() {
panic!(
"invalid block count: {len} > {mem_len}",
mem_len = mem.len()
);
}
self.issuer(target_cpu)
.await?
.issue_external(
spec::Command {
cdw10: nvm::Cdw10ReadWrite::new().with_sbla_low(lba as u32).into(),
cdw11: nvm::Cdw11ReadWrite::new()
.with_sbla_high((lba >> 32) as u32)
.into(),
cdw12: nvm::Cdw12ReadWrite::new()
.with_nlb_z((block_count - 1) as u16)
.with_fua(fua)
.into(),
..nvm_cmd(nvm::NvmOpcode::WRITE, self.nsid)
},
guest_memory,
mem.subrange(0, len),
)
.await?;
Ok(())
}
pub async fn flush(&self, target_cpu: u32) -> Result<(), RequestError> {
self.check_active()?;
self.issuer(target_cpu)
.await?
.issue_neither(spec::Command {
..nvm_cmd(nvm::NvmOpcode::FLUSH, self.nsid)
})
.await?;
Ok(())
}
pub fn max_transfer_block_count(&self) -> u32 {
self.max_transfer_block_count
}
pub fn supports_dataset_management(&self) -> bool {
self.controller_identify.oncs.dataset_management()
}
pub fn preferred_deallocate_granularity(&self) -> u16 {
self.preferred_deallocate_granularity
}
pub fn dataset_management_range_limit(&self) -> usize {
256
}
pub fn dataset_management_range_size_limit(&self) -> u32 {
u32::MAX
}
pub async fn deallocate(
&self,
target_cpu: u32,
ranges: &[nvm::DsmRange],
) -> Result<(), RequestError> {
self.check_active()?;
let ranges = &ranges[..ranges.len().min(256)];
self.issuer(target_cpu)
.await?
.issue_in(
spec::Command {
cdw10: nvm::Cdw10Dsm::new()
.with_nr_z((ranges.len() - 1) as u8)
.into(),
cdw11: nvm::Cdw11Dsm::new().with_ad(true).into(),
..nvm_cmd(nvm::NvmOpcode::DSM, self.nsid)
},
ranges.as_bytes(),
)
.await?;
Ok(())
}
pub fn reservation_capabilities(&self) -> nvm::ReservationCapabilities {
self.reservation_capabilities
}
pub async fn reservation_report_extended(
&self,
target_cpu: u32,
) -> Result<
(
nvm::ReservationReportExtended,
Vec<nvm::RegisteredControllerExtended>,
),
RequestError,
> {
let mut data = vec![0; 4096];
let issuer = self.issuer(target_cpu).await?;
loop {
issuer
.issue_out(
spec::Command {
cdw10: nvm::Cdw10ReservationReport::new()
.with_numd_z((data.len() / 4 - 1) as u32)
.into(),
cdw11: nvm::Cdw11ReservationReport::new().with_eds(true).into(),
..nvm_cmd(nvm::NvmOpcode::RESERVATION_REPORT, self.nsid)
},
&mut data,
)
.await?;
let header = nvm::ReservationReportExtended::read_from_prefix(&data[..])
.unwrap()
.0; let len = size_of_val(&header)
+ header.report.regctl.get() as usize
* size_of::<nvm::RegisteredControllerExtended>();
if len > data.len() {
data.resize(len, 0);
continue;
}
let mut controllers = vec![
nvm::RegisteredControllerExtended::new_zeroed();
header.report.regctl.get().into()
];
controllers
.as_mut_bytes()
.copy_from_slice(&data[size_of_val(&header)..len]);
break Ok((header, controllers));
}
}
pub async fn reservation_acquire(
&self,
target_cpu: u32,
action: nvm::ReservationAcquireAction,
crkey: u64,
prkey: u64,
reservation_type: nvm::ReservationType,
) -> Result<(), RequestError> {
let data = nvm::ReservationAcquire { crkey, prkey };
self.issuer(target_cpu)
.await?
.issue_in(
spec::Command {
cdw10: nvm::Cdw10ReservationAcquire::new()
.with_racqa(action.0)
.with_rtype(reservation_type.0)
.into(),
..nvm_cmd(nvm::NvmOpcode::RESERVATION_ACQUIRE, self.nsid)
},
data.as_bytes(),
)
.await?;
Ok(())
}
pub async fn reservation_release(
&self,
target_cpu: u32,
action: nvm::ReservationReleaseAction,
crkey: u64,
reservation_type: nvm::ReservationType,
) -> Result<(), RequestError> {
let data = nvm::ReservationRelease { crkey };
self.issuer(target_cpu)
.await?
.issue_in(
spec::Command {
cdw10: nvm::Cdw10ReservationRelease::new()
.with_rrela(action.0)
.with_rtype(reservation_type.0)
.into(),
..nvm_cmd(nvm::NvmOpcode::RESERVATION_RELEASE, self.nsid)
},
data.as_bytes(),
)
.await?;
Ok(())
}
pub async fn reservation_register(
&self,
target_cpu: u32,
action: nvm::ReservationRegisterAction,
crkey: Option<u64>,
nrkey: u64,
ptpl: Option<bool>,
) -> Result<(), RequestError> {
let data = nvm::ReservationRegister {
crkey: crkey.unwrap_or(0),
nrkey,
};
let cptpl = match ptpl {
None => nvm::ChangePersistThroughPowerLoss::NO_CHANGE,
Some(false) => nvm::ChangePersistThroughPowerLoss::CLEAR,
Some(true) => nvm::ChangePersistThroughPowerLoss::SET,
};
self.issuer(target_cpu)
.await?
.issue_in(
spec::Command {
cdw10: nvm::Cdw10ReservationRegister::new()
.with_rrega(action.0)
.with_iekey(crkey.is_none())
.with_cptpl(cptpl.0)
.into(),
..nvm_cmd(nvm::NvmOpcode::RESERVATION_REGISTER, self.nsid)
},
data.as_bytes(),
)
.await?;
Ok(())
}
pub fn nsid(&self) -> u32 {
self.nsid
}
pub fn save(&self) -> anyhow::Result<SavedNamespaceData> {
Ok(SavedNamespaceData {
nsid: self.nsid,
identify_ns: self.state.identify.lock().clone(),
})
}
pub(super) fn restore(
driver: &VmTaskDriver,
admin: Arc<Issuer>,
rescan_event: Arc<event_listener::Event>,
identify_ctrl: Arc<spec::IdentifyController>,
io_issuers: &Arc<IoIssuers>,
saved_state: &SavedNamespaceData,
) -> Result<Self, NamespaceError> {
let SavedNamespaceData { nsid, identify_ns } = saved_state;
Namespace::new_from_identify(
driver,
admin,
rescan_event,
identify_ctrl.clone(),
io_issuers,
*nsid,
identify_ns.clone(),
)
}
}
impl DynamicState {
async fn poll_for_rescans(
&self,
ctx: &mut CancelContext,
admin: &Issuer,
nsid: u32,
rescan_event: &event_listener::Event,
) {
loop {
let listen = rescan_event.listen();
tracing::debug!("rescan");
match identify_namespace(admin, nsid).await {
Ok(identify) => {
if identify.nsze == 0 {
tracing::info!(nsid, "namespace was hot removed");
self.removed.store(true, Ordering::Relaxed);
} else {
let old_block_count = self.block_count.load(Ordering::Relaxed);
let new_block_count = identify.nsze;
if old_block_count != new_block_count {
tracing::info!(
old_block_count,
new_block_count,
"nvme disk size changed"
);
self.block_count.store(new_block_count, Ordering::Relaxed);
self.resize_event.notify(usize::MAX);
} else {
tracing::debug!("rescanned, no change");
}
}
*self.identify.lock() = identify;
}
Err(err) => {
tracing::warn!(
nsid,
error = &err as &dyn std::error::Error,
"failed to query namespace during rescan"
);
}
}
if ctx.until_cancelled(listen).await.is_err() {
break;
}
}
}
}
async fn identify_namespace(
admin: &Issuer,
nsid: u32,
) -> Result<nvm::IdentifyNamespace, RequestError> {
let mut identify = nvm::IdentifyNamespace::new_zeroed();
admin
.issue_out(
spec::Command {
nsid,
cdw10: spec::Cdw10Identify::new()
.with_cns(spec::Cns::NAMESPACE.0)
.into(),
..admin_cmd(spec::AdminOpcode::IDENTIFY)
},
identify.as_mut_bytes(),
)
.await?;
Ok(identify)
}
fn nvm_cmd(opcode: nvm::NvmOpcode, nsid: u32) -> spec::Command {
spec::Command {
cdw0: spec::Cdw0::new().with_opcode(opcode.0),
nsid,
..FromZeros::new_zeroed()
}
}