nvme_resources/
fault.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Provides an interface to programmatically and deterministically inject faults in the NVMe fault controller.
5
6use mesh::Cell;
7use mesh::MeshPayload;
8use mesh::OneshotSender;
9use mesh::rpc::Rpc;
10use nvme_spec::Command;
11use nvme_spec::Completion;
12use std::sync::Arc;
13use std::time::Duration;
14
15/// Supported fault behaviour for NVMe admin queues
16#[derive(Debug, MeshPayload)]
17pub enum AdminQueueFaultBehavior<T> {
18    /// Update the queue entry with the returned data
19    Update(T),
20    /// Drop the queue entry
21    Drop,
22    /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
23    /// commands will be processed until the delay is over.
24    Delay(Duration),
25    /// Panic
26    Panic(String),
27    /// Writes the given payload to the PRP range. The test should ensure
28    /// that the payload is of valid size. If the size is too large, the fault
29    /// controller will panic. This behavior is not yet supported by the submission
30    /// queue fault.
31    CustomPayload(Vec<u8>),
32    /// Verify that a command was seen.
33    Verify(Option<OneshotSender<()>>),
34}
35
36/// Supported fault behaviour for NVMe IO queues
37#[derive(Debug, MeshPayload, Clone)]
38pub enum IoQueueFaultBehavior {
39    /// Writes the given payload to the PRP range. The test should ensure
40    /// that the payload is of valid size. If the size is too large, the fault
41    /// controller will panic. This behavior is not yet supported by the submission
42    /// queue fault.
43    CustomPayload(Vec<u8>),
44    /// Panic
45    Panic(String),
46}
47
48/// Supported fault behaviour for PCI faults
49#[derive(Clone, MeshPayload)]
50pub enum PciFaultBehavior {
51    /// Introduce a delay to the PCI operation. This WILL block the processing
52    /// thread for the delay duration.
53    Delay(Duration),
54    /// Do nothing
55    Default,
56}
57
58/// A notification to the test confirming namespace change processing.
59#[derive(MeshPayload)]
60pub enum NamespaceChange {
61    /// Input: Namespace ID to notify, Output: Empty confirmation.
62    ChangeNotification(Rpc<u32, ()>),
63}
64
65/// A fault configuration to apply [`PciFaultBehavior`] to the controller management layer.
66///
67/// Currently the only supported fault is to delay enabling the controller via
68/// cc.en().
69///
70/// # Example
71/// Delay enabling the controller by 500ms.
72///
73/// ```no_run
74/// use mesh::CellUpdater;
75/// use nvme_resources::fault::FaultConfiguration;
76/// use nvme_resources::fault::PciFaultBehavior;
77/// use nvme_resources::fault::PciFaultConfig;
78/// use std::time::Duration;
79///
80/// pub fn pci_enable_delay_fault() -> FaultConfiguration{
81///     let mut fault_start_updater = CellUpdater::new(false);
82///     FaultConfiguration::new(fault_start_updater.cell())
83///         .with_pci_fault(
84///             PciFaultConfig::new().with_cc_enable_fault(
85///                 PciFaultBehavior::Delay(Duration::from_millis(500)),
86///             )
87///         )
88/// }
89/// ```
90#[derive(MeshPayload, Clone)]
91pub struct PciFaultConfig {
92    /// Fault to apply to cc.en() bit during enablement
93    pub controller_management_fault_enable: PciFaultBehavior,
94}
95
96/// A fault config to trigger spurious namespace change notifications from the controller.
97///
98/// The fault controller listens on the provided channel for notifications containing
99/// a `u32` value representing the NSID (Namespace Identifier) that has changed.
100/// This does not actually modify the namespace; instead, it triggers the controller
101/// to process a namespace change notification. The fault is modeled as an
102/// RPC, which the controller completes once it has processed the change and sent
103/// the corresponding Asynchronous Event Notification (AEN).
104/// As per NVMe spec: If multiple namespace changes are notified, only the first triggers an AEN.
105/// Subsequent changes do not trigger additional AENs until the driver issues a
106/// GET_LOG_PAGE command. For implementation simplicity, namespace fault is not
107/// gated by the `fault_active` flag. Since only test code can send
108/// notifications on the fault channel, it is safe to bypass this check.
109///
110/// # Example
111/// Send a namespace change notification for NSID 1 and wait for it to be processed.
112/// ```no_run
113/// use mesh::CellUpdater;
114/// use nvme_resources::fault::NamespaceChange;
115/// use nvme_resources::fault::FaultConfiguration;
116/// use nvme_resources::fault::NamespaceFaultConfig;
117/// use nvme_resources::NvmeFaultControllerHandle;
118/// use guid::Guid;
119/// use mesh::rpc::RpcSend;
120///
121/// pub async fn send_namespace_change_fault() {
122///     let mut fault_start_updater = CellUpdater::new(false);
123///     let (ns_change_send, ns_change_recv) = mesh::channel::<NamespaceChange>();
124///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
125///         .with_namespace_fault(
126///             NamespaceFaultConfig::new(ns_change_recv),
127///         );
128///     // Complete setup
129///     let fault_controller_handle = NvmeFaultControllerHandle {
130///         subsystem_id: Guid::new_random(),
131///         msix_count: 10,
132///         max_io_queues: 10,
133///         namespaces: vec![
134///             // Define `NamespaceDefinitions` here
135///         ],
136///         fault_config: fault_configuration,
137///     };
138///
139///     // Send the namespace change notification and await processing.
140///     ns_change_send.call(NamespaceChange::ChangeNotification, 1).await.unwrap();
141/// }
142/// ```
143#[derive(MeshPayload)]
144pub struct NamespaceFaultConfig {
145    /// Receiver for changed namespace notifications
146    pub recv_changed_namespace: mesh::Receiver<NamespaceChange>,
147}
148
149/// A fault configuration to inject faults into the admin submission and completion queues.
150///
151/// This struct maintains a mapping from [`CommandMatch`] to [`AdminQueueFaultBehavior`] for
152/// submission and completion queues. When a command match is found, (and `fault_active == true`)
153/// the associated fault is applied.
154/// Both submission and completion queue faults match on commands
155/// because completions do not contain enough identifying information to
156/// match against. If there is more than one match for a given command, the
157/// match defined first is prioritized. Faults are added via the
158/// `with_submission_queue_fault` and `with_completion_queue_fault` methods and
159/// can be chained. AdminQueueFaultConfig::new() creates an empty fault.
160///
161/// # Panics
162/// Panics if a duplicate `CommandMatch` is added for either submission or
163/// completion queues
164///
165/// # Example
166/// Panic on CREATE_IO_COMPLETION_QUEUE and delay before sending completion for 500ms after
167/// GET_LOG_PAGE command is processed.
168/// ```no_run
169/// use mesh::CellUpdater;
170/// use nvme_resources::fault::AdminQueueFaultConfig;
171/// use nvme_resources::fault::CommandMatch;
172/// use nvme_resources::fault::FaultConfiguration;
173/// use nvme_resources::fault::AdminQueueFaultBehavior;
174/// use nvme_spec::Command;
175/// use std::time::Duration;
176/// use zerocopy::FromZeros;
177/// use zerocopy::IntoBytes;
178///
179/// pub fn build_admin_queue_fault() -> FaultConfiguration {
180///     let mut fault_start_updater = CellUpdater::new(false);
181///
182///     // Setup command matches
183///     let mut command_io_queue = Command::new_zeroed();
184///     let mut command_log_page = Command::new_zeroed();
185///     let mut mask = Command::new_zeroed();
186///
187///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
188///     command_log_page.cdw0 = command_log_page.cdw0.with_opcode(nvme_spec::AdminOpcode::GET_LOG_PAGE.0);
189///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
190///
191///     return FaultConfiguration::new(fault_start_updater.cell())
192///         .with_admin_queue_fault(
193///             AdminQueueFaultConfig::new().with_submission_queue_fault(
194///                 CommandMatch {
195///                     command: command_io_queue,
196///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
197///                 },
198///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
199///             ).with_completion_queue_fault(
200///                 CommandMatch {
201///                     command: command_log_page,
202///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
203///                 },
204///                 AdminQueueFaultBehavior::Delay(Duration::from_millis(500)),
205///             )
206///         );
207/// }
208/// ```
209#[derive(MeshPayload)]
210pub struct AdminQueueFaultConfig {
211    /// A map of NVME opcodes to the submission fault behavior for each. (This
212    /// would ideally be a `HashMap`, but `mesh` doesn't support that type.
213    /// Given that this is not performance sensitive, the lookup is okay)
214    pub admin_submission_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Command>)>,
215    /// A map of NVME opcodes to the completion fault behavior for each.
216    pub admin_completion_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Completion>)>,
217}
218
219/// A fault configuration to inject faults into the io completions.
220///
221/// This struct maintains a mapping from [`CommandMatch`] to [`IoQueueFaultBehavior`] for
222/// completions. When a command match is found, (and `fault_active == true`)
223/// the associated fault is applied.
224/// If there is more than one match for a given command, the
225/// match defined first is prioritized. Faults are added via the
226/// `with_completion_queue_fault` method and calls
227/// can be chained. IoQueueFaultConfig::new() creates an empty fault.
228///
229/// # Panics
230/// Panics if a duplicate `CommandMatch` is added
231///
232/// # Example
233/// Panic when RESERVATION_REPORT command is seen.
234/// ```no_run
235/// use mesh::CellUpdater;
236/// use nvme_resources::fault::IoQueueFaultConfig;
237/// use nvme_resources::fault::CommandMatch;
238/// use nvme_resources::fault::FaultConfiguration;
239/// use nvme_resources::fault::IoQueueFaultBehavior;
240/// use nvme_spec::Command;
241/// use nvme_spec::nvm;
242/// use zerocopy::FromZeros;
243/// use zerocopy::IntoBytes;
244///
245/// pub fn build_admin_queue_fault() -> FaultConfiguration {
246///     let mut fault_start_updater = CellUpdater::new(false);
247///
248///     // Setup command matches
249///     let mut command_io_queue = Command::new_zeroed();
250///     let mut command_log_page = Command::new_zeroed();
251///     let mut mask = Command::new_zeroed();
252///
253///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvm::NvmOpcode::RESERVATION_REPORT.0);
254///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
255///
256///     return FaultConfiguration::new(fault_start_updater.cell())
257///         .with_io_queue_fault(
258///             IoQueueFaultConfig::new(fault_start_updater.cell()).with_completion_queue_fault(
259///                 CommandMatch {
260///                     command: command_io_queue,
261///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
262///                 },
263///                 IoQueueFaultBehavior::Panic("Received a RESERVATION_REPORT command".to_string()),
264///             )
265///         );
266/// }
267/// ```
268#[derive(MeshPayload, Clone)]
269pub struct IoQueueFaultConfig {
270    /// A map of NVME opcodes to the completion fault behavior for each.
271    pub io_completion_queue_faults: Vec<(CommandMatch, IoQueueFaultBehavior)>,
272    /// Fault active state. (Repeated here because FaultConfiguration is not Cloneable).
273    pub fault_active: Cell<bool>,
274}
275
276/// A versatile definition to command match [`NVMe commands`](nvme_spec::Command)
277///
278/// Matches NVMe commands using a 512-bit mask: (command_bytes & mask) == (pattern_bytes & mask).
279/// A convenient way to build the patterns is to treat both the command and the mask as
280/// `nvme_spec::Command` and max out the fields in the mask that should be
281/// matched.
282///
283/// # Example
284/// Builds a command match that matches on all CREATE_IO_COMPLETION_QUEUE admin commands.
285/// ```no_run
286/// use nvme_resources::fault::CommandMatch;
287/// use nvme_spec::Command;
288/// use zerocopy::FromZeros;
289/// use zerocopy::IntoBytes;
290///
291/// pub fn build_command_match() -> CommandMatch {
292///     let mut command = Command::new_zeroed();
293///     let mut mask = Command::new_zeroed();
294///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
295///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
296///     CommandMatch {
297///         command,
298///         mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
299///     }
300/// }
301/// ```
302#[derive(Clone, MeshPayload, PartialEq)]
303pub struct CommandMatch {
304    /// Command to match against
305    pub command: Command,
306    /// Bitmask that defines the bits to match against
307    pub mask: [u8; 64],
308}
309
310/// Fault configuration for the NVMe fault controller.
311///
312/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
313/// such as delaying or dropping commands, triggering namespace change notifications,
314/// or customizing completion payloads. Fault injection is controlled by the
315/// `fault_active` flag, unless specified otherwise in the fault description.
316/// `fault_active` is managed by the test via [`mesh::CellUpdater`]. An
317/// exception to the `fault_active` check is the [`NamespaceFaultConfig`] which
318/// is processed regardless of `fault_active` state. (See `nvme_test` crate for
319/// details on how the faults are applied.)
320///
321/// # Example
322/// Panic when a command that matches CREATE_IO_COMPLETION_QUEUE is seen in the
323/// admin queue:
324/// ```no_run
325/// use mesh::CellUpdater;
326/// use nvme_resources::fault::FaultConfiguration;
327/// use nvme_resources::fault::AdminQueueFaultConfig;
328/// use nvme_resources::fault::CommandMatch;
329/// use nvme_spec::Command;
330/// use nvme_resources::fault::AdminQueueFaultBehavior;
331/// use nvme_resources::NvmeFaultControllerHandle;
332/// use guid::Guid;
333/// use zerocopy::FromZeros;
334/// use zerocopy::IntoBytes;
335///
336/// pub fn example_fault() {
337///     let mut fault_start_updater = CellUpdater::new(false);
338///
339///     // Setup command matches
340///     let mut command = Command::new_zeroed();
341///     let mut mask = Command::new_zeroed();
342///
343///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
344///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
345///
346///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
347///         .with_admin_queue_fault(
348///             AdminQueueFaultConfig::new().with_submission_queue_fault(
349///                 CommandMatch {
350///                     command: command,
351///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
352///                 },
353///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
354///             )
355///         );
356///     let fault_controller_handle = NvmeFaultControllerHandle {
357///         subsystem_id: Guid::new_random(),
358///         msix_count: 10,
359///         max_io_queues: 10,
360///         namespaces: vec![
361///             // Define NamespaceDefinitions here
362///         ],
363///         fault_config: fault_configuration,
364///     };
365///     // Pass the controller handle in to the vm config to create and attach the fault controller. At this point the fault is inactive.
366///     fault_start_updater.set(true); // Activate the fault injection.
367///     // ... run test ...
368///     fault_start_updater.set(false); // Deactivate the fault injection.
369/// }
370/// ```
371#[derive(MeshPayload)]
372pub struct FaultConfiguration {
373    /// Fault active state
374    pub fault_active: Cell<bool>,
375    /// Fault to apply to the admin queues
376    pub admin_fault: AdminQueueFaultConfig,
377    /// Fault to apply to management layer of the controller
378    pub pci_fault: PciFaultConfig,
379    /// Fault for test triggered namespace change notifications
380    pub namespace_fault: NamespaceFaultConfig,
381    /// Fault to apply to all IO queues
382    pub io_fault: Arc<IoQueueFaultConfig>,
383}
384
385impl FaultConfiguration {
386    /// Create a new empty fault configuration
387    pub fn new(fault_active: Cell<bool>) -> Self {
388        // Ideally the faults should begin life as Option::None.
389        // For now, use a dummy mesh channel for namespace fault to avoid
390        // test setup complexity & special cases in the AdminHandler run loop.
391        Self {
392            fault_active: fault_active.clone(),
393            admin_fault: AdminQueueFaultConfig::new(),
394            pci_fault: PciFaultConfig::new(),
395            namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
396            io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
397        }
398    }
399
400    /// Add a PCI fault configuration to the fault configuration
401    pub fn with_pci_fault(mut self, pci_fault: PciFaultConfig) -> Self {
402        self.pci_fault = pci_fault;
403        self
404    }
405
406    /// Add an admin queue fault configuration to the fault configuration
407    pub fn with_admin_queue_fault(mut self, admin_fault: AdminQueueFaultConfig) -> Self {
408        self.admin_fault = admin_fault;
409        self
410    }
411
412    /// Add an IO queue fault configuration to the fault configuration
413    pub fn with_io_queue_fault(mut self, io_fault: IoQueueFaultConfig) -> Self {
414        self.io_fault = Arc::new(io_fault);
415        self
416    }
417
418    /// Add a namespace fault configuration to the fault configuration
419    pub fn with_namespace_fault(mut self, namespace_fault: NamespaceFaultConfig) -> Self {
420        self.namespace_fault = namespace_fault;
421        self
422    }
423}
424
425impl PciFaultConfig {
426    /// Create a new no-op fault configuration
427    pub fn new() -> Self {
428        Self {
429            controller_management_fault_enable: PciFaultBehavior::Default,
430        }
431    }
432
433    /// Add a cc.en() fault
434    pub fn with_cc_enable_fault(mut self, behaviour: PciFaultBehavior) -> Self {
435        self.controller_management_fault_enable = behaviour;
436        self
437    }
438}
439
440impl AdminQueueFaultConfig {
441    /// Create an empty fault configuration
442    pub fn new() -> Self {
443        Self {
444            admin_submission_queue_faults: vec![],
445            admin_completion_queue_faults: vec![],
446        }
447    }
448
449    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the submission queue.
450    ///
451    /// # Panics
452    /// Panics if an identical [`CommandMatch`] has already been configured.
453    pub fn with_submission_queue_fault(
454        mut self,
455        pattern: CommandMatch,
456        behaviour: AdminQueueFaultBehavior<Command>,
457    ) -> Self {
458        if self
459            .admin_submission_queue_faults
460            .iter()
461            .any(|(c, _)| pattern == *c)
462        {
463            panic!(
464                "Duplicate submission queue fault for Compare {:?} and Mask {:?}",
465                pattern.command, pattern.mask
466            );
467        }
468
469        self.admin_submission_queue_faults
470            .push((pattern, behaviour));
471        self
472    }
473
474    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the completion queue.
475    ///
476    /// # Panics
477    /// Panics if an identical [`CommandMatch`] has already been configured.
478    pub fn with_completion_queue_fault(
479        mut self,
480        pattern: CommandMatch,
481        behaviour: AdminQueueFaultBehavior<Completion>,
482    ) -> Self {
483        if self
484            .admin_completion_queue_faults
485            .iter()
486            .any(|(c, _)| pattern == *c)
487        {
488            panic!(
489                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
490                pattern.command, pattern.mask
491            );
492        }
493
494        self.admin_completion_queue_faults
495            .push((pattern, behaviour));
496        self
497    }
498}
499
500impl NamespaceFaultConfig {
501    /// Creates a new NamespaceFaultConfig with a fresh channel.
502    pub fn new(recv_changed_namespace: mesh::Receiver<NamespaceChange>) -> Self {
503        Self {
504            recv_changed_namespace,
505        }
506    }
507}
508
509impl IoQueueFaultConfig {
510    /// Create an empty IO queue fault configuration
511    pub fn new(fault_active: Cell<bool>) -> Self {
512        Self {
513            io_completion_queue_faults: vec![],
514            fault_active,
515        }
516    }
517
518    /// Add a [`CommandMatch`] -> [`IoQueueFaultBehavior`] mapping for the completion queue.
519    ///
520    /// # Panics
521    /// Panics if an identical [`CommandMatch`] has already been configured.
522    pub fn with_completion_queue_fault(
523        mut self,
524        pattern: CommandMatch,
525        behaviour: IoQueueFaultBehavior,
526    ) -> Self {
527        if self
528            .io_completion_queue_faults
529            .iter()
530            .any(|(c, _)| pattern == *c)
531        {
532            panic!(
533                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
534                pattern.command, pattern.mask
535            );
536        }
537
538        self.io_completion_queue_faults.push((pattern, behaviour));
539        self
540    }
541}