nvme_resources/fault.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Provides an interface to programmatically and deterministically inject faults in the NVMe fault controller.
5
6use mesh::Cell;
7use mesh::MeshPayload;
8use mesh::OneshotSender;
9use mesh::rpc::Rpc;
10use nvme_spec::Command;
11use nvme_spec::Completion;
12use std::sync::Arc;
13use std::time::Duration;
14
/// Supported fault behaviour for NVMe admin queues.
///
/// The type parameter `T` is the queue entry type carried by
/// [`Update`](Self::Update).
#[derive(Debug, MeshPayload)]
pub enum AdminQueueFaultBehavior<T> {
    /// Update the queue entry with the returned data
    Update(T),
    /// Drop the queue entry
    Drop,
    /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
    /// commands will not be processed until the delay is over.
    Delay(Duration),
    /// Panic
    Panic(String),
    /// Writes the given payload to the PRP range. The test should ensure
    /// that the payload is of valid size. If the size is too large, the fault
    /// controller will panic. This behavior is not yet supported by the submission
    /// queue fault.
    CustomPayload(Vec<u8>),
    /// Verify that a command was seen.
    // NOTE(review): presumably the optional sender is signalled once the
    // matching command is observed -- confirm against the fault controller.
    Verify(Option<OneshotSender<()>>),
}
35
/// Supported fault behaviour for NVMe IO queues
#[derive(Debug, MeshPayload, Clone)]
pub enum IoQueueFaultBehavior {
    /// Writes the given payload to the PRP range. The test should ensure
    /// that the payload is of valid size. If the size is too large, the fault
    /// controller will panic. This behavior is not yet supported by the submission
    /// queue fault.
    CustomPayload(Vec<u8>),
    /// Panic
    Panic(String),
}
47
/// Supported fault behaviour for PCI faults
#[derive(MeshPayload)]
pub enum PciFaultBehavior {
    /// Introduce a delay to the PCI operation. This WILL block the processing
    /// thread for the delay duration.
    Delay(Duration),
    /// Do nothing
    Default,
    /// Verify that the fault was triggered.
    // NOTE(review): presumably the optional sender is signalled when the
    // fault fires -- confirm against the fault controller.
    Verify(Option<OneshotSender<()>>),
}
59
/// A notification to the test confirming namespace change processing.
#[derive(MeshPayload)]
pub enum NamespaceChange {
    /// Input: Namespace ID to notify, Output: Empty confirmation.
    ChangeNotification(Rpc<u32, ()>),
}
66
/// A fault configuration to apply [`PciFaultBehavior`] to the controller management layer.
///
/// Currently the only supported fault is to delay enabling the controller via
/// cc.en().
///
/// # Example
/// Delay enabling the controller by 500ms.
///
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::PciFaultBehavior;
/// use nvme_resources::fault::PciFaultConfig;
/// use std::time::Duration;
///
/// pub fn pci_enable_delay_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///     FaultConfiguration::new(fault_start_updater.cell())
///         .with_pci_fault(
///             PciFaultConfig::new().with_cc_enable_fault(
///                 PciFaultBehavior::Delay(Duration::from_millis(500)),
///             )
///         )
/// }
/// ```
#[derive(MeshPayload)]
pub struct PciFaultConfig {
    /// Fault to apply to cc.en() bit during enablement
    pub controller_management_fault_enable: PciFaultBehavior,
}
97
/// A fault config to trigger spurious namespace change notifications from the controller.
///
/// The fault controller listens on the provided channel for notifications containing
/// a `u32` value representing the NSID (Namespace Identifier) that has changed.
/// This does not actually modify the namespace; instead, it triggers the controller
/// to process a namespace change notification. The fault is modeled as an
/// RPC, which the controller completes once it has processed the change and sent
/// the corresponding Asynchronous Event Notification (AEN).
///
/// As per the NVMe spec: if multiple namespace changes are notified, only the
/// first triggers an AEN. Subsequent changes do not trigger additional AENs
/// until the driver issues a GET_LOG_PAGE command.
///
/// For implementation simplicity, the namespace fault is not gated by the
/// `fault_active` flag. Since only test code can send notifications on the
/// fault channel, it is safe to bypass this check.
///
/// # Example
/// Send a namespace change notification for NSID 1 and wait for it to be processed.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::NamespaceChange;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::NamespaceFaultConfig;
/// use nvme_resources::NvmeFaultControllerHandle;
/// use guid::Guid;
/// use mesh::rpc::RpcSend;
///
/// pub async fn send_namespace_change_fault() {
///     let mut fault_start_updater = CellUpdater::new(false);
///     let (ns_change_send, ns_change_recv) = mesh::channel::<NamespaceChange>();
///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
///         .with_namespace_fault(
///             NamespaceFaultConfig::new(ns_change_recv),
///         );
///     // Complete setup
///     let fault_controller_handle = NvmeFaultControllerHandle {
///         subsystem_id: Guid::new_random(),
///         msix_count: 10,
///         max_io_queues: 10,
///         namespaces: vec![
///             // Define `NamespaceDefinitions` here
///         ],
///         fault_config: fault_configuration,
///     };
///
///     // Send the namespace change notification and await processing.
///     ns_change_send.call(NamespaceChange::ChangeNotification, 1).await.unwrap();
/// }
/// ```
#[derive(MeshPayload)]
pub struct NamespaceFaultConfig {
    /// Receiver for changed namespace notifications
    pub recv_changed_namespace: mesh::Receiver<NamespaceChange>,
}
150
/// A fault configuration to inject faults into the admin submission and completion queues.
///
/// This struct maintains a mapping from [`CommandMatch`] to [`AdminQueueFaultBehavior`] for
/// submission and completion queues. When a command match is found, (and `fault_active == true`)
/// the associated fault is applied.
/// Both submission and completion queue faults match on commands
/// because completions do not contain enough identifying information to
/// match against. If there is more than one match for a given command, the
/// match defined first is prioritized. Faults are added via the
/// `with_submission_queue_fault` and `with_completion_queue_fault` methods and
/// can be chained. `AdminQueueFaultConfig::new()` creates an empty fault.
///
/// # Panics
/// Panics if a duplicate `CommandMatch` is added for either submission or
/// completion queues
///
/// # Example
/// Panic on CREATE_IO_COMPLETION_QUEUE and delay before sending completion for 500ms after
/// GET_LOG_PAGE command is processed.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::AdminQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::AdminQueueFaultBehavior;
/// use nvme_spec::Command;
/// use std::time::Duration;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn build_admin_queue_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command_io_queue = Command::new_zeroed();
///     let mut command_log_page = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
///     command_log_page.cdw0 = command_log_page.cdw0.with_opcode(nvme_spec::AdminOpcode::GET_LOG_PAGE.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     return FaultConfiguration::new(fault_start_updater.cell())
///         .with_admin_queue_fault(
///             AdminQueueFaultConfig::new().with_submission_queue_fault(
///                 CommandMatch {
///                     command: command_io_queue,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
///             ).with_completion_queue_fault(
///                 CommandMatch {
///                     command: command_log_page,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Delay(Duration::from_millis(500)),
///             )
///         );
/// }
/// ```
#[derive(MeshPayload)]
pub struct AdminQueueFaultConfig {
    /// An ordered list of ([`CommandMatch`], behavior) pairs describing the
    /// submission queue faults. (This would ideally be a `HashMap`, but `mesh`
    /// doesn't support that type. Given that this is not performance
    /// sensitive, the lookup is okay)
    pub admin_submission_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Command>)>,
    /// An ordered list of ([`CommandMatch`], behavior) pairs describing the
    /// completion queue faults.
    pub admin_completion_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Completion>)>,
}
220
/// A fault configuration to inject faults into the io completions.
///
/// This struct maintains a mapping from [`CommandMatch`] to [`IoQueueFaultBehavior`] for
/// completions. When a command match is found, (and `fault_active == true`)
/// the associated fault is applied.
/// If there is more than one match for a given command, the
/// match defined first is prioritized. Faults are added via the
/// `with_completion_queue_fault` method and calls
/// can be chained. `IoQueueFaultConfig::new()` creates an empty fault.
///
/// # Panics
/// Panics if a duplicate `CommandMatch` is added
///
/// # Example
/// Panic when RESERVATION_REPORT command is seen.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::IoQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::IoQueueFaultBehavior;
/// use nvme_spec::Command;
/// use nvme_spec::nvm;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn build_io_queue_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command_io_queue = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvm::NvmOpcode::RESERVATION_REPORT.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     return FaultConfiguration::new(fault_start_updater.cell())
///         .with_io_queue_fault(
///             IoQueueFaultConfig::new(fault_start_updater.cell()).with_completion_queue_fault(
///                 CommandMatch {
///                     command: command_io_queue,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 IoQueueFaultBehavior::Panic("Received a RESERVATION_REPORT command".to_string()),
///             )
///         );
/// }
/// ```
#[derive(MeshPayload, Clone)]
pub struct IoQueueFaultConfig {
    /// An ordered list of ([`CommandMatch`], behavior) pairs describing the
    /// completion queue faults.
    pub io_completion_queue_faults: Vec<(CommandMatch, IoQueueFaultBehavior)>,
    /// Fault active state. (Repeated here because FaultConfiguration is not Cloneable).
    pub fault_active: Cell<bool>,
}
277
/// A versatile definition to command match [`NVMe commands`](nvme_spec::Command)
///
/// Matches NVMe commands using a 512-bit mask: (command_bytes & mask) == (pattern_bytes & mask).
/// A convenient way to build the patterns is to treat both the command and the mask as
/// `nvme_spec::Command` and max out the fields in the mask that should be
/// matched.
///
/// # Example
/// Builds a command match that matches on all CREATE_IO_COMPLETION_QUEUE admin commands.
/// ```no_run
/// use nvme_resources::fault::CommandMatch;
/// use nvme_spec::Command;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn build_command_match() -> CommandMatch {
///     let mut command = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///     CommandMatch {
///         command,
///         mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///     }
/// }
/// ```
#[derive(Clone, MeshPayload, PartialEq)]
pub struct CommandMatch {
    /// Command to match against
    pub command: Command,
    /// Bitmask that defines the bits to match against (64 bytes, i.e. 512 bits)
    pub mask: [u8; 64],
}
311
/// Fault configuration for the NVMe fault controller.
///
/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
/// such as delaying or dropping commands, triggering namespace change notifications,
/// or customizing completion payloads. Fault injection is controlled by the
/// `fault_active` flag, unless specified otherwise in the fault description.
/// `fault_active` is managed by the test via [`mesh::CellUpdater`]. An
/// exception to the `fault_active` check is the [`NamespaceFaultConfig`] which
/// is processed regardless of `fault_active` state. (See `nvme_test` crate for
/// details on how the faults are applied.)
///
/// # Example
/// Panic when a command that matches CREATE_IO_COMPLETION_QUEUE is seen in the
/// admin queue:
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::AdminQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_spec::Command;
/// use nvme_resources::fault::AdminQueueFaultBehavior;
/// use nvme_resources::NvmeFaultControllerHandle;
/// use guid::Guid;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn example_fault() {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
///         .with_admin_queue_fault(
///             AdminQueueFaultConfig::new().with_submission_queue_fault(
///                 CommandMatch {
///                     command: command,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
///             )
///         );
///     let fault_controller_handle = NvmeFaultControllerHandle {
///         subsystem_id: Guid::new_random(),
///         msix_count: 10,
///         max_io_queues: 10,
///         namespaces: vec![
///             // Define NamespaceDefinitions here
///         ],
///         fault_config: fault_configuration,
///     };
///     // Pass the controller handle in to the vm config to create and attach the fault controller. At this point the fault is inactive.
///     fault_start_updater.set(true); // Activate the fault injection.
///     // ... run test ...
///     fault_start_updater.set(false); // Deactivate the fault injection.
/// }
/// ```
#[derive(MeshPayload)]
pub struct FaultConfiguration {
    /// Fault active state
    pub fault_active: Cell<bool>,
    /// Fault to apply to the admin queues
    pub admin_fault: AdminQueueFaultConfig,
    /// Fault to apply to management layer of the controller. Option because it
    /// needs to be extracted by the PCI layer during initialization.
    pub pci_fault: Option<PciFaultConfig>,
    /// Fault for test triggered namespace change notifications
    pub namespace_fault: NamespaceFaultConfig,
    /// Fault to apply to all IO queues
    pub io_fault: Arc<IoQueueFaultConfig>,
}
387
388impl FaultConfiguration {
389 /// Create a new empty fault configuration
390 pub fn new(fault_active: Cell<bool>) -> Self {
391 // Ideally the faults should begin life as Option::None.
392 // For now, use a dummy mesh channel for namespace fault to avoid
393 // test setup complexity & special cases in the AdminHandler run loop.
394 Self {
395 fault_active: fault_active.clone(),
396 admin_fault: AdminQueueFaultConfig::new(),
397 pci_fault: Some(PciFaultConfig::new()),
398 namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
399 io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
400 }
401 }
402
403 /// Add a PCI fault configuration to the fault configuration
404 pub fn with_pci_fault(mut self, pci_fault: PciFaultConfig) -> Self {
405 self.pci_fault = Some(pci_fault);
406 self
407 }
408
409 /// Add an admin queue fault configuration to the fault configuration
410 pub fn with_admin_queue_fault(mut self, admin_fault: AdminQueueFaultConfig) -> Self {
411 self.admin_fault = admin_fault;
412 self
413 }
414
415 /// Add an IO queue fault configuration to the fault configuration
416 pub fn with_io_queue_fault(mut self, io_fault: IoQueueFaultConfig) -> Self {
417 self.io_fault = Arc::new(io_fault);
418 self
419 }
420
421 /// Add a namespace fault configuration to the fault configuration
422 pub fn with_namespace_fault(mut self, namespace_fault: NamespaceFaultConfig) -> Self {
423 self.namespace_fault = namespace_fault;
424 self
425 }
426}
427
428impl PciFaultConfig {
429 /// Create a new no-op fault configuration
430 pub fn new() -> Self {
431 Self {
432 controller_management_fault_enable: PciFaultBehavior::Default,
433 }
434 }
435
436 /// Add a cc.en() fault
437 pub fn with_cc_enable_fault(mut self, behaviour: PciFaultBehavior) -> Self {
438 self.controller_management_fault_enable = behaviour;
439 self
440 }
441}
442
443impl AdminQueueFaultConfig {
444 /// Create an empty fault configuration
445 pub fn new() -> Self {
446 Self {
447 admin_submission_queue_faults: vec![],
448 admin_completion_queue_faults: vec![],
449 }
450 }
451
452 /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the submission queue.
453 ///
454 /// # Panics
455 /// Panics if an identical [`CommandMatch`] has already been configured.
456 pub fn with_submission_queue_fault(
457 mut self,
458 pattern: CommandMatch,
459 behaviour: AdminQueueFaultBehavior<Command>,
460 ) -> Self {
461 if self
462 .admin_submission_queue_faults
463 .iter()
464 .any(|(c, _)| pattern == *c)
465 {
466 panic!(
467 "Duplicate submission queue fault for Compare {:?} and Mask {:?}",
468 pattern.command, pattern.mask
469 );
470 }
471
472 self.admin_submission_queue_faults
473 .push((pattern, behaviour));
474 self
475 }
476
477 /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the completion queue.
478 ///
479 /// # Panics
480 /// Panics if an identical [`CommandMatch`] has already been configured.
481 pub fn with_completion_queue_fault(
482 mut self,
483 pattern: CommandMatch,
484 behaviour: AdminQueueFaultBehavior<Completion>,
485 ) -> Self {
486 if self
487 .admin_completion_queue_faults
488 .iter()
489 .any(|(c, _)| pattern == *c)
490 {
491 panic!(
492 "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
493 pattern.command, pattern.mask
494 );
495 }
496
497 self.admin_completion_queue_faults
498 .push((pattern, behaviour));
499 self
500 }
501}
502
impl NamespaceFaultConfig {
    /// Creates a new `NamespaceFaultConfig` that listens on the supplied
    /// receiver.
    ///
    /// The caller creates the channel and keeps the sending half; this
    /// constructor only stores `recv_changed_namespace`.
    pub fn new(recv_changed_namespace: mesh::Receiver<NamespaceChange>) -> Self {
        Self {
            recv_changed_namespace,
        }
    }
}
511
512impl IoQueueFaultConfig {
513 /// Create an empty IO queue fault configuration
514 pub fn new(fault_active: Cell<bool>) -> Self {
515 Self {
516 io_completion_queue_faults: vec![],
517 fault_active,
518 }
519 }
520
521 /// Add a [`CommandMatch`] -> [`IoQueueFaultBehavior`] mapping for the completion queue.
522 ///
523 /// # Panics
524 /// Panics if an identical [`CommandMatch`] has already been configured.
525 pub fn with_completion_queue_fault(
526 mut self,
527 pattern: CommandMatch,
528 behaviour: IoQueueFaultBehavior,
529 ) -> Self {
530 if self
531 .io_completion_queue_faults
532 .iter()
533 .any(|(c, _)| pattern == *c)
534 {
535 panic!(
536 "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
537 pattern.command, pattern.mask
538 );
539 }
540
541 self.io_completion_queue_faults.push((pattern, behaviour));
542 self
543 }
544}