nvme_resources/fault.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Provides an interface to programmatically and deterministically inject faults in the NVMe fault controller.
5
6use mesh::Cell;
7use mesh::MeshPayload;
8use mesh::OneshotSender;
9use mesh::rpc::Rpc;
10use nvme_spec::Command;
11use nvme_spec::Completion;
12use std::sync::Arc;
13use std::time::Duration;
14
15/// Supported fault behaviour for NVMe admin queues
16#[derive(Debug, MeshPayload)]
17pub enum AdminQueueFaultBehavior<T> {
18 /// Update the queue entry with the returned data
19 Update(T),
20 /// Drop the queue entry
21 Drop,
22 /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
23 /// commands will be processed until the delay is over.
24 Delay(Duration),
25 /// Panic
26 Panic(String),
27 /// Writes the given payload to the PRP range. The test should ensure
28 /// that the payload is of valid size. If the size is too large, the fault
29 /// controller will panic. This behavior is not yet supported by the submission
30 /// queue fault.
31 CustomPayload(Vec<u8>),
32 /// Verify that a command was seen.
33 Verify(Option<OneshotSender<()>>),
34}
35
36/// Supported fault behaviour for NVMe IO queues
37#[derive(Debug, MeshPayload, Clone)]
38pub enum IoQueueFaultBehavior {
39 /// Writes the given payload to the PRP range. The test should ensure
40 /// that the payload is of valid size. If the size is too large, the fault
41 /// controller will panic. This behavior is not yet supported by the submission
42 /// queue fault.
43 CustomPayload(Vec<u8>),
44 /// Panic
45 Panic(String),
46 /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
47 /// commands will be processed until the delay is over.
48 Delay(Duration),
49}
50
51/// Supported fault behaviour for PCI faults
52#[derive(MeshPayload)]
53pub enum PciFaultBehavior {
54 /// Introduce a delay to the PCI operation. This WILL block the processing
55 /// thread for the delay duration.
56 Delay(Duration),
57 /// Do nothing
58 Default,
59 /// Verify that the fault was triggered.
60 Verify(Option<OneshotSender<()>>),
61}
62
63/// A notification to the test confirming namespace change processing.
64#[derive(MeshPayload)]
65pub enum NamespaceChange {
66 /// Input: Namespace ID to notify, Output: Empty confirmation.
67 ChangeNotification(Rpc<u32, ()>),
68}
69
70/// A fault configuration to apply [`PciFaultBehavior`] to the controller management layer.
71///
72/// Currently the only supported fault is to delay enabling the controller via
73/// cc.en().
74///
75/// # Example
76/// Delay enabling the controller by 500ms.
77///
78/// ```no_run
79/// use mesh::CellUpdater;
80/// use nvme_resources::fault::FaultConfiguration;
81/// use nvme_resources::fault::PciFaultBehavior;
82/// use nvme_resources::fault::PciFaultConfig;
83/// use std::time::Duration;
84///
85/// pub fn pci_enable_delay_fault() -> FaultConfiguration{
86/// let mut fault_start_updater = CellUpdater::new(false);
87/// FaultConfiguration::new(fault_start_updater.cell())
88/// .with_pci_fault(
89/// PciFaultConfig::new().with_cc_enable_fault(
90/// PciFaultBehavior::Delay(Duration::from_millis(500)),
91/// )
92/// )
93/// }
94/// ```
95#[derive(MeshPayload)]
96pub struct PciFaultConfig {
97 /// Fault to apply to cc.en() bit during enablement
98 pub controller_management_fault_enable: PciFaultBehavior,
99 /// Custom MQES value to return in CAP register reads. 1 based value.
100 pub max_queue_size: Option<u16>,
101}
102
103/// A fault config to trigger spurious namespace change notifications from the controller.
104///
105/// The fault controller listens on the provided channel for notifications containing
106/// a `u32` value representing the NSID (Namespace Identifier) that has changed.
107/// This does not actually modify the namespace; instead, it triggers the controller
108/// to process a namespace change notification. The fault is modeled as an
109/// RPC, which the controller completes once it has processed the change and sent
110/// the corresponding Asynchronous Event Notification (AEN).
111/// As per NVMe spec: If multiple namespace changes are notified, only the first triggers an AEN.
112/// Subsequent changes do not trigger additional AENs until the driver issues a
113/// GET_LOG_PAGE command. For implementation simplicity, namespace fault is not
114/// gated by the `fault_active` flag. Since only test code can send
115/// notifications on the fault channel, it is safe to bypass this check.
116///
117/// # Example
118/// Send a namespace change notification for NSID 1 and wait for it to be processed.
119/// ```no_run
120/// use mesh::CellUpdater;
121/// use nvme_resources::fault::NamespaceChange;
122/// use nvme_resources::fault::FaultConfiguration;
123/// use nvme_resources::fault::NamespaceFaultConfig;
124/// use nvme_resources::NvmeFaultControllerHandle;
125/// use guid::Guid;
126/// use mesh::rpc::RpcSend;
127///
128/// pub async fn send_namespace_change_fault() {
129/// let mut fault_start_updater = CellUpdater::new(false);
130/// let (ns_change_send, ns_change_recv) = mesh::channel::<NamespaceChange>();
131/// let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
132/// .with_namespace_fault(
133/// NamespaceFaultConfig::new(ns_change_recv),
134/// );
135/// // Complete setup
136/// let fault_controller_handle = NvmeFaultControllerHandle {
137/// subsystem_id: Guid::new_random(),
138/// msix_count: 10,
139/// max_io_queues: 10,
140/// namespaces: vec![
141/// // Define `NamespaceDefinitions` here
142/// ],
143/// fault_config: fault_configuration,
144/// enable_tdisp_tests: false,
145/// };
146///
147/// // Send the namespace change notification and await processing.
148/// ns_change_send.call(NamespaceChange::ChangeNotification, 1).await.unwrap();
149/// }
150/// ```
151#[derive(MeshPayload)]
152pub struct NamespaceFaultConfig {
153 /// Receiver for changed namespace notifications
154 pub recv_changed_namespace: mesh::Receiver<NamespaceChange>,
155}
156
157/// A fault configuration to inject faults into the admin submission and completion queues.
158///
159/// This struct maintains a mapping from [`CommandMatch`] to [`AdminQueueFaultBehavior`] for
160/// submission and completion queues. When a command match is found, (and `fault_active == true`)
161/// the associated fault is applied.
162/// Both submission and completion queue faults match on commands
163/// because completions do not contain enough identifying information to
164/// match against. If there is more than one match for a given command, the
165/// match defined first is prioritized. Faults are added via the
166/// `with_submission_queue_fault` and `with_completion_queue_fault` methods and
167/// can be chained. AdminQueueFaultConfig::new() creates an empty fault.
168///
169/// # Panics
170/// Panics if a duplicate `CommandMatch` is added for either submission or
171/// completion queues
172///
173/// # Example
174/// Panic on CREATE_IO_COMPLETION_QUEUE and delay before sending completion for 500ms after
175/// GET_LOG_PAGE command is processed.
176/// ```no_run
177/// use mesh::CellUpdater;
178/// use nvme_resources::fault::AdminQueueFaultConfig;
179/// use nvme_resources::fault::CommandMatch;
180/// use nvme_resources::fault::FaultConfiguration;
181/// use nvme_resources::fault::AdminQueueFaultBehavior;
182/// use nvme_spec::Command;
183/// use std::time::Duration;
184/// use zerocopy::FromZeros;
185/// use zerocopy::IntoBytes;
186///
187/// pub fn build_admin_queue_fault() -> FaultConfiguration {
188/// let mut fault_start_updater = CellUpdater::new(false);
189///
190/// // Setup command matches
191/// let mut command_io_queue = Command::new_zeroed();
192/// let mut command_log_page = Command::new_zeroed();
193/// let mut mask = Command::new_zeroed();
194///
195/// command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
196/// command_log_page.cdw0 = command_log_page.cdw0.with_opcode(nvme_spec::AdminOpcode::GET_LOG_PAGE.0);
197/// mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
198///
199/// return FaultConfiguration::new(fault_start_updater.cell())
200/// .with_admin_queue_fault(
201/// AdminQueueFaultConfig::new().with_submission_queue_fault(
202/// CommandMatch {
203/// command: command_io_queue,
204/// mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
205/// },
206/// AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
207/// ).with_completion_queue_fault(
208/// CommandMatch {
209/// command: command_log_page,
210/// mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
211/// },
212/// AdminQueueFaultBehavior::Delay(Duration::from_millis(500)),
213/// )
214/// );
215/// }
216/// ```
217#[derive(MeshPayload)]
218pub struct AdminQueueFaultConfig {
219 /// A map of NVME opcodes to the submission fault behavior for each. (This
220 /// would ideally be a `HashMap`, but `mesh` doesn't support that type.
221 /// Given that this is not performance sensitive, the lookup is okay)
222 pub admin_submission_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Command>)>,
223 /// A map of NVME opcodes to the completion fault behavior for each.
224 pub admin_completion_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Completion>)>,
225}
226
227/// A fault configuration to inject faults into the io completions.
228///
229/// This struct maintains a mapping from [`CommandMatch`] to [`IoQueueFaultBehavior`] for
230/// completions. When a command match is found, (and `fault_active == true`)
231/// the associated fault is applied.
232/// If there is more than one match for a given command, the
233/// match defined first is prioritized. Faults are added via the
234/// `with_completion_queue_fault` method and calls
235/// can be chained. IoQueueFaultConfig::new() creates an empty fault.
236///
237/// # Panics
238/// Panics if a duplicate `CommandMatch` is added
239///
240/// # Example
241/// Panic when RESERVATION_REPORT command is seen.
242/// ```no_run
243/// use mesh::CellUpdater;
244/// use nvme_resources::fault::IoQueueFaultConfig;
245/// use nvme_resources::fault::CommandMatch;
246/// use nvme_resources::fault::FaultConfiguration;
247/// use nvme_resources::fault::IoQueueFaultBehavior;
248/// use nvme_spec::Command;
249/// use nvme_spec::nvm;
250/// use zerocopy::FromZeros;
251/// use zerocopy::IntoBytes;
252///
253/// pub fn build_admin_queue_fault() -> FaultConfiguration {
254/// let mut fault_start_updater = CellUpdater::new(false);
255///
256/// // Setup command matches
257/// let mut command_io_queue = Command::new_zeroed();
258/// let mut command_log_page = Command::new_zeroed();
259/// let mut mask = Command::new_zeroed();
260///
261/// command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvm::NvmOpcode::RESERVATION_REPORT.0);
262/// mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
263///
264/// return FaultConfiguration::new(fault_start_updater.cell())
265/// .with_io_queue_fault(
266/// IoQueueFaultConfig::new(fault_start_updater.cell()).with_completion_queue_fault(
267/// CommandMatch {
268/// command: command_io_queue,
269/// mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
270/// },
271/// IoQueueFaultBehavior::Panic("Received a RESERVATION_REPORT command".to_string()),
272/// )
273/// );
274/// }
275/// ```
276#[derive(MeshPayload, Clone)]
277pub struct IoQueueFaultConfig {
278 /// A map of NVME opcodes to the completion fault behavior for each.
279 pub io_completion_queue_faults: Vec<(CommandMatch, IoQueueFaultBehavior)>,
280 /// Fault active state. (Repeated here because FaultConfiguration is not Cloneable).
281 pub fault_active: Cell<bool>,
282}
283
284/// A versatile definition to command match [`NVMe commands`](nvme_spec::Command)
285///
286/// Matches NVMe commands using a 512-bit mask: (command_bytes & mask) == (pattern_bytes & mask).
287/// A convenient way to build the patterns is to treat both the command and the mask as
288/// `nvme_spec::Command` and max out the fields in the mask that should be
289/// matched.
290///
291/// # Example
292/// Builds a command match that matches on all CREATE_IO_COMPLETION_QUEUE admin commands.
293/// ```no_run
294/// use nvme_resources::fault::CommandMatch;
295/// use nvme_spec::Command;
296/// use zerocopy::FromZeros;
297/// use zerocopy::IntoBytes;
298///
299/// pub fn build_command_match() -> CommandMatch {
300/// let mut command = Command::new_zeroed();
301/// let mut mask = Command::new_zeroed();
302/// command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
303/// mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
304/// CommandMatch {
305/// command,
306/// mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
307/// }
308/// }
309/// ```
310#[derive(Clone, MeshPayload, PartialEq)]
311pub struct CommandMatch {
312 /// Command to match against
313 pub command: Command,
314 /// Bitmask that defines the bits to match against
315 pub mask: [u8; 64],
316}
317
318/// Fault configuration for the NVMe fault controller.
319///
320/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
321/// such as delaying or dropping commands, triggering namespace change notifications,
322/// or customizing completion payloads. Fault injection is controlled by the
323/// `fault_active` flag, unless specified otherwise in the fault description.
324/// `fault_active` is managed by the test via [`mesh::CellUpdater`]. An
325/// exception to the `fault_active` check is the [`NamespaceFaultConfig`] which
326/// is processed regardless of `fault_active` state. (See `nvme_test` crate for
327/// details on how the faults are applied.)
328///
329/// # Example
330/// Panic when a command that matches CREATE_IO_COMPLETION_QUEUE is seen in the
331/// admin queue:
332/// ```no_run
333/// use mesh::CellUpdater;
334/// use nvme_resources::fault::FaultConfiguration;
335/// use nvme_resources::fault::AdminQueueFaultConfig;
336/// use nvme_resources::fault::CommandMatch;
337/// use nvme_spec::Command;
338/// use nvme_resources::fault::AdminQueueFaultBehavior;
339/// use nvme_resources::NvmeFaultControllerHandle;
340/// use guid::Guid;
341/// use zerocopy::FromZeros;
342/// use zerocopy::IntoBytes;
343///
344/// pub fn example_fault() {
345/// let mut fault_start_updater = CellUpdater::new(false);
346///
347/// // Setup command matches
348/// let mut command = Command::new_zeroed();
349/// let mut mask = Command::new_zeroed();
350///
351/// command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
352/// mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
353///
354/// let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
355/// .with_admin_queue_fault(
356/// AdminQueueFaultConfig::new().with_submission_queue_fault(
357/// CommandMatch {
358/// command: command,
359/// mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
360/// },
361/// AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
362/// )
363/// );
364/// let fault_controller_handle = NvmeFaultControllerHandle {
365/// subsystem_id: Guid::new_random(),
366/// msix_count: 10,
367/// max_io_queues: 10,
368/// namespaces: vec![
369/// // Define NamespaceDefinitions here
370/// ],
371/// fault_config: fault_configuration,
372/// enable_tdisp_tests: false,
373/// };
374/// // Pass the controller handle in to the vm config to create and attach the fault controller. At this point the fault is inactive.
375/// fault_start_updater.set(true); // Activate the fault injection.
376/// // ... run test ...
377/// fault_start_updater.set(false); // Deactivate the fault injection.
378/// }
379/// ```
380#[derive(MeshPayload)]
381pub struct FaultConfiguration {
382 /// Fault active state
383 pub fault_active: Cell<bool>,
384 /// Fault to apply to the admin queues
385 pub admin_fault: AdminQueueFaultConfig,
386 /// Fault to apply to management layer of the controller. Option because it
387 /// needs to be extracted by the PCI layer during initialization.
388 pub pci_fault: Option<PciFaultConfig>,
389 /// Fault for test triggered namespace change notifications
390 pub namespace_fault: NamespaceFaultConfig,
391 /// Fault to apply to all IO queues
392 pub io_fault: Arc<IoQueueFaultConfig>,
393}
394
395impl FaultConfiguration {
396 /// Create a new empty fault configuration
397 pub fn new(fault_active: Cell<bool>) -> Self {
398 // Ideally the faults should begin life as Option::None.
399 // For now, use a dummy mesh channel for namespace fault to avoid
400 // test setup complexity & special cases in the AdminHandler run loop.
401 Self {
402 fault_active: fault_active.clone(),
403 admin_fault: AdminQueueFaultConfig::new(),
404 pci_fault: Some(PciFaultConfig::new()),
405 namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
406 io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
407 }
408 }
409
410 /// Add a PCI fault configuration to the fault configuration
411 pub fn with_pci_fault(mut self, pci_fault: PciFaultConfig) -> Self {
412 self.pci_fault = Some(pci_fault);
413 self
414 }
415
416 /// Add an admin queue fault configuration to the fault configuration
417 pub fn with_admin_queue_fault(mut self, admin_fault: AdminQueueFaultConfig) -> Self {
418 self.admin_fault = admin_fault;
419 self
420 }
421
422 /// Add an IO queue fault configuration to the fault configuration
423 pub fn with_io_queue_fault(mut self, io_fault: IoQueueFaultConfig) -> Self {
424 self.io_fault = Arc::new(io_fault);
425 self
426 }
427
428 /// Add a namespace fault configuration to the fault configuration
429 pub fn with_namespace_fault(mut self, namespace_fault: NamespaceFaultConfig) -> Self {
430 self.namespace_fault = namespace_fault;
431 self
432 }
433}
434
435impl PciFaultConfig {
436 /// Create a new no-op fault configuration
437 pub fn new() -> Self {
438 Self {
439 controller_management_fault_enable: PciFaultBehavior::Default,
440 max_queue_size: None,
441 }
442 }
443
444 /// Add a cc.en() fault
445 pub fn with_cc_enable_fault(mut self, behaviour: PciFaultBehavior) -> Self {
446 self.controller_management_fault_enable = behaviour;
447 self
448 }
449
450 /// Add a custom CAP.MQES value to return on register reads
451 pub fn with_max_queue_size(mut self, max_queue_size: u16) -> Self {
452 self.max_queue_size = Some(max_queue_size);
453 self
454 }
455}
456
457impl AdminQueueFaultConfig {
458 /// Create an empty fault configuration
459 pub fn new() -> Self {
460 Self {
461 admin_submission_queue_faults: vec![],
462 admin_completion_queue_faults: vec![],
463 }
464 }
465
466 /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the submission queue.
467 ///
468 /// # Panics
469 /// Panics if an identical [`CommandMatch`] has already been configured.
470 pub fn with_submission_queue_fault(
471 mut self,
472 pattern: CommandMatch,
473 behaviour: AdminQueueFaultBehavior<Command>,
474 ) -> Self {
475 if self
476 .admin_submission_queue_faults
477 .iter()
478 .any(|(c, _)| pattern == *c)
479 {
480 panic!(
481 "Duplicate submission queue fault for Compare {:?} and Mask {:?}",
482 pattern.command, pattern.mask
483 );
484 }
485
486 self.admin_submission_queue_faults
487 .push((pattern, behaviour));
488 self
489 }
490
491 /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the completion queue.
492 ///
493 /// # Panics
494 /// Panics if an identical [`CommandMatch`] has already been configured.
495 pub fn with_completion_queue_fault(
496 mut self,
497 pattern: CommandMatch,
498 behaviour: AdminQueueFaultBehavior<Completion>,
499 ) -> Self {
500 if self
501 .admin_completion_queue_faults
502 .iter()
503 .any(|(c, _)| pattern == *c)
504 {
505 panic!(
506 "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
507 pattern.command, pattern.mask
508 );
509 }
510
511 self.admin_completion_queue_faults
512 .push((pattern, behaviour));
513 self
514 }
515}
516
517impl NamespaceFaultConfig {
518 /// Creates a new NamespaceFaultConfig with a fresh channel.
519 pub fn new(recv_changed_namespace: mesh::Receiver<NamespaceChange>) -> Self {
520 Self {
521 recv_changed_namespace,
522 }
523 }
524}
525
526impl IoQueueFaultConfig {
527 /// Create an empty IO queue fault configuration
528 pub fn new(fault_active: Cell<bool>) -> Self {
529 Self {
530 io_completion_queue_faults: vec![],
531 fault_active,
532 }
533 }
534
535 /// Add a [`CommandMatch`] -> [`IoQueueFaultBehavior`] mapping for the completion queue.
536 ///
537 /// # Panics
538 /// Panics if an identical [`CommandMatch`] has already been configured.
539 pub fn with_completion_queue_fault(
540 mut self,
541 pattern: CommandMatch,
542 behaviour: IoQueueFaultBehavior,
543 ) -> Self {
544 if self
545 .io_completion_queue_faults
546 .iter()
547 .any(|(c, _)| pattern == *c)
548 {
549 panic!(
550 "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
551 pattern.command, pattern.mask
552 );
553 }
554
555 self.io_completion_queue_faults.push((pattern, behaviour));
556 self
557 }
558}