nvme_resources/
fault.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Provides an interface to programmatically and deterministically inject faults in the NVMe fault controller.
5
6use mesh::Cell;
7use mesh::MeshPayload;
8use mesh::OneshotSender;
9use mesh::rpc::Rpc;
10use nvme_spec::Command;
11use nvme_spec::Completion;
12use std::sync::Arc;
13use std::time::Duration;
14
15/// Supported fault behaviour for NVMe admin queues
16#[derive(Debug, MeshPayload)]
17pub enum AdminQueueFaultBehavior<T> {
18    /// Update the queue entry with the returned data
19    Update(T),
20    /// Drop the queue entry
21    Drop,
22    /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
23    /// commands will be processed until the delay is over.
24    Delay(Duration),
25    /// Panic
26    Panic(String),
27    /// Writes the given payload to the PRP range. The test should ensure
28    /// that the payload is of valid size. If the size is too large, the fault
29    /// controller will panic. This behavior is not yet supported by the submission
30    /// queue fault.
31    CustomPayload(Vec<u8>),
32    /// Verify that a command was seen.
33    Verify(Option<OneshotSender<()>>),
34}
35
36/// Supported fault behaviour for NVMe IO queues
37#[derive(Debug, MeshPayload, Clone)]
38pub enum IoQueueFaultBehavior {
39    /// Writes the given payload to the PRP range. The test should ensure
40    /// that the payload is of valid size. If the size is too large, the fault
41    /// controller will panic. This behavior is not yet supported by the submission
42    /// queue fault.
43    CustomPayload(Vec<u8>),
44    /// Panic
45    Panic(String),
46}
47
48/// Supported fault behaviour for PCI faults
49#[derive(MeshPayload)]
50pub enum PciFaultBehavior {
51    /// Introduce a delay to the PCI operation. This WILL block the processing
52    /// thread for the delay duration.
53    Delay(Duration),
54    /// Do nothing
55    Default,
56    /// Verify that the fault was triggered.
57    Verify(Option<OneshotSender<()>>),
58}
59
60/// A notification to the test confirming namespace change processing.
61#[derive(MeshPayload)]
62pub enum NamespaceChange {
63    /// Input: Namespace ID to notify, Output: Empty confirmation.
64    ChangeNotification(Rpc<u32, ()>),
65}
66
67/// A fault configuration to apply [`PciFaultBehavior`] to the controller management layer.
68///
69/// Currently the only supported fault is to delay enabling the controller via
70/// cc.en().
71///
72/// # Example
73/// Delay enabling the controller by 500ms.
74///
75/// ```no_run
76/// use mesh::CellUpdater;
77/// use nvme_resources::fault::FaultConfiguration;
78/// use nvme_resources::fault::PciFaultBehavior;
79/// use nvme_resources::fault::PciFaultConfig;
80/// use std::time::Duration;
81///
82/// pub fn pci_enable_delay_fault() -> FaultConfiguration{
83///     let mut fault_start_updater = CellUpdater::new(false);
84///     FaultConfiguration::new(fault_start_updater.cell())
85///         .with_pci_fault(
86///             PciFaultConfig::new().with_cc_enable_fault(
87///                 PciFaultBehavior::Delay(Duration::from_millis(500)),
88///             )
89///         )
90/// }
91/// ```
92#[derive(MeshPayload)]
93pub struct PciFaultConfig {
94    /// Fault to apply to cc.en() bit during enablement
95    pub controller_management_fault_enable: PciFaultBehavior,
96}
97
98/// A fault config to trigger spurious namespace change notifications from the controller.
99///
100/// The fault controller listens on the provided channel for notifications containing
101/// a `u32` value representing the NSID (Namespace Identifier) that has changed.
102/// This does not actually modify the namespace; instead, it triggers the controller
103/// to process a namespace change notification. The fault is modeled as an
104/// RPC, which the controller completes once it has processed the change and sent
105/// the corresponding Asynchronous Event Notification (AEN).
106/// As per NVMe spec: If multiple namespace changes are notified, only the first triggers an AEN.
107/// Subsequent changes do not trigger additional AENs until the driver issues a
108/// GET_LOG_PAGE command. For implementation simplicity, namespace fault is not
109/// gated by the `fault_active` flag. Since only test code can send
110/// notifications on the fault channel, it is safe to bypass this check.
111///
112/// # Example
113/// Send a namespace change notification for NSID 1 and wait for it to be processed.
114/// ```no_run
115/// use mesh::CellUpdater;
116/// use nvme_resources::fault::NamespaceChange;
117/// use nvme_resources::fault::FaultConfiguration;
118/// use nvme_resources::fault::NamespaceFaultConfig;
119/// use nvme_resources::NvmeFaultControllerHandle;
120/// use guid::Guid;
121/// use mesh::rpc::RpcSend;
122///
123/// pub async fn send_namespace_change_fault() {
124///     let mut fault_start_updater = CellUpdater::new(false);
125///     let (ns_change_send, ns_change_recv) = mesh::channel::<NamespaceChange>();
126///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
127///         .with_namespace_fault(
128///             NamespaceFaultConfig::new(ns_change_recv),
129///         );
130///     // Complete setup
131///     let fault_controller_handle = NvmeFaultControllerHandle {
132///         subsystem_id: Guid::new_random(),
133///         msix_count: 10,
134///         max_io_queues: 10,
135///         namespaces: vec![
136///             // Define `NamespaceDefinitions` here
137///         ],
138///         fault_config: fault_configuration,
139///     };
140///
141///     // Send the namespace change notification and await processing.
142///     ns_change_send.call(NamespaceChange::ChangeNotification, 1).await.unwrap();
143/// }
144/// ```
145#[derive(MeshPayload)]
146pub struct NamespaceFaultConfig {
147    /// Receiver for changed namespace notifications
148    pub recv_changed_namespace: mesh::Receiver<NamespaceChange>,
149}
150
151/// A fault configuration to inject faults into the admin submission and completion queues.
152///
153/// This struct maintains a mapping from [`CommandMatch`] to [`AdminQueueFaultBehavior`] for
154/// submission and completion queues. When a command match is found, (and `fault_active == true`)
155/// the associated fault is applied.
156/// Both submission and completion queue faults match on commands
157/// because completions do not contain enough identifying information to
158/// match against. If there is more than one match for a given command, the
159/// match defined first is prioritized. Faults are added via the
160/// `with_submission_queue_fault` and `with_completion_queue_fault` methods and
161/// can be chained. AdminQueueFaultConfig::new() creates an empty fault.
162///
163/// # Panics
164/// Panics if a duplicate `CommandMatch` is added for either submission or
165/// completion queues
166///
167/// # Example
168/// Panic on CREATE_IO_COMPLETION_QUEUE and delay before sending completion for 500ms after
169/// GET_LOG_PAGE command is processed.
170/// ```no_run
171/// use mesh::CellUpdater;
172/// use nvme_resources::fault::AdminQueueFaultConfig;
173/// use nvme_resources::fault::CommandMatch;
174/// use nvme_resources::fault::FaultConfiguration;
175/// use nvme_resources::fault::AdminQueueFaultBehavior;
176/// use nvme_spec::Command;
177/// use std::time::Duration;
178/// use zerocopy::FromZeros;
179/// use zerocopy::IntoBytes;
180///
181/// pub fn build_admin_queue_fault() -> FaultConfiguration {
182///     let mut fault_start_updater = CellUpdater::new(false);
183///
184///     // Setup command matches
185///     let mut command_io_queue = Command::new_zeroed();
186///     let mut command_log_page = Command::new_zeroed();
187///     let mut mask = Command::new_zeroed();
188///
189///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
190///     command_log_page.cdw0 = command_log_page.cdw0.with_opcode(nvme_spec::AdminOpcode::GET_LOG_PAGE.0);
191///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
192///
193///     return FaultConfiguration::new(fault_start_updater.cell())
194///         .with_admin_queue_fault(
195///             AdminQueueFaultConfig::new().with_submission_queue_fault(
196///                 CommandMatch {
197///                     command: command_io_queue,
198///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
199///                 },
200///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
201///             ).with_completion_queue_fault(
202///                 CommandMatch {
203///                     command: command_log_page,
204///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
205///                 },
206///                 AdminQueueFaultBehavior::Delay(Duration::from_millis(500)),
207///             )
208///         );
209/// }
210/// ```
211#[derive(MeshPayload)]
212pub struct AdminQueueFaultConfig {
213    /// A map of NVME opcodes to the submission fault behavior for each. (This
214    /// would ideally be a `HashMap`, but `mesh` doesn't support that type.
215    /// Given that this is not performance sensitive, the lookup is okay)
216    pub admin_submission_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Command>)>,
217    /// A map of NVME opcodes to the completion fault behavior for each.
218    pub admin_completion_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Completion>)>,
219}
220
221/// A fault configuration to inject faults into the io completions.
222///
223/// This struct maintains a mapping from [`CommandMatch`] to [`IoQueueFaultBehavior`] for
224/// completions. When a command match is found, (and `fault_active == true`)
225/// the associated fault is applied.
226/// If there is more than one match for a given command, the
227/// match defined first is prioritized. Faults are added via the
228/// `with_completion_queue_fault` method and calls
229/// can be chained. IoQueueFaultConfig::new() creates an empty fault.
230///
231/// # Panics
232/// Panics if a duplicate `CommandMatch` is added
233///
234/// # Example
235/// Panic when RESERVATION_REPORT command is seen.
236/// ```no_run
237/// use mesh::CellUpdater;
238/// use nvme_resources::fault::IoQueueFaultConfig;
239/// use nvme_resources::fault::CommandMatch;
240/// use nvme_resources::fault::FaultConfiguration;
241/// use nvme_resources::fault::IoQueueFaultBehavior;
242/// use nvme_spec::Command;
243/// use nvme_spec::nvm;
244/// use zerocopy::FromZeros;
245/// use zerocopy::IntoBytes;
246///
247/// pub fn build_admin_queue_fault() -> FaultConfiguration {
248///     let mut fault_start_updater = CellUpdater::new(false);
249///
250///     // Setup command matches
251///     let mut command_io_queue = Command::new_zeroed();
252///     let mut command_log_page = Command::new_zeroed();
253///     let mut mask = Command::new_zeroed();
254///
255///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvm::NvmOpcode::RESERVATION_REPORT.0);
256///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
257///
258///     return FaultConfiguration::new(fault_start_updater.cell())
259///         .with_io_queue_fault(
260///             IoQueueFaultConfig::new(fault_start_updater.cell()).with_completion_queue_fault(
261///                 CommandMatch {
262///                     command: command_io_queue,
263///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
264///                 },
265///                 IoQueueFaultBehavior::Panic("Received a RESERVATION_REPORT command".to_string()),
266///             )
267///         );
268/// }
269/// ```
270#[derive(MeshPayload, Clone)]
271pub struct IoQueueFaultConfig {
272    /// A map of NVME opcodes to the completion fault behavior for each.
273    pub io_completion_queue_faults: Vec<(CommandMatch, IoQueueFaultBehavior)>,
274    /// Fault active state. (Repeated here because FaultConfiguration is not Cloneable).
275    pub fault_active: Cell<bool>,
276}
277
278/// A versatile definition to command match [`NVMe commands`](nvme_spec::Command)
279///
280/// Matches NVMe commands using a 512-bit mask: (command_bytes & mask) == (pattern_bytes & mask).
281/// A convenient way to build the patterns is to treat both the command and the mask as
282/// `nvme_spec::Command` and max out the fields in the mask that should be
283/// matched.
284///
285/// # Example
286/// Builds a command match that matches on all CREATE_IO_COMPLETION_QUEUE admin commands.
287/// ```no_run
288/// use nvme_resources::fault::CommandMatch;
289/// use nvme_spec::Command;
290/// use zerocopy::FromZeros;
291/// use zerocopy::IntoBytes;
292///
293/// pub fn build_command_match() -> CommandMatch {
294///     let mut command = Command::new_zeroed();
295///     let mut mask = Command::new_zeroed();
296///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
297///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
298///     CommandMatch {
299///         command,
300///         mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
301///     }
302/// }
303/// ```
304#[derive(Clone, MeshPayload, PartialEq)]
305pub struct CommandMatch {
306    /// Command to match against
307    pub command: Command,
308    /// Bitmask that defines the bits to match against
309    pub mask: [u8; 64],
310}
311
312/// Fault configuration for the NVMe fault controller.
313///
314/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
315/// such as delaying or dropping commands, triggering namespace change notifications,
316/// or customizing completion payloads. Fault injection is controlled by the
317/// `fault_active` flag, unless specified otherwise in the fault description.
318/// `fault_active` is managed by the test via [`mesh::CellUpdater`]. An
319/// exception to the `fault_active` check is the [`NamespaceFaultConfig`] which
320/// is processed regardless of `fault_active` state. (See `nvme_test` crate for
321/// details on how the faults are applied.)
322///
323/// # Example
324/// Panic when a command that matches CREATE_IO_COMPLETION_QUEUE is seen in the
325/// admin queue:
326/// ```no_run
327/// use mesh::CellUpdater;
328/// use nvme_resources::fault::FaultConfiguration;
329/// use nvme_resources::fault::AdminQueueFaultConfig;
330/// use nvme_resources::fault::CommandMatch;
331/// use nvme_spec::Command;
332/// use nvme_resources::fault::AdminQueueFaultBehavior;
333/// use nvme_resources::NvmeFaultControllerHandle;
334/// use guid::Guid;
335/// use zerocopy::FromZeros;
336/// use zerocopy::IntoBytes;
337///
338/// pub fn example_fault() {
339///     let mut fault_start_updater = CellUpdater::new(false);
340///
341///     // Setup command matches
342///     let mut command = Command::new_zeroed();
343///     let mut mask = Command::new_zeroed();
344///
345///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
346///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
347///
348///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
349///         .with_admin_queue_fault(
350///             AdminQueueFaultConfig::new().with_submission_queue_fault(
351///                 CommandMatch {
352///                     command: command,
353///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
354///                 },
355///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
356///             )
357///         );
358///     let fault_controller_handle = NvmeFaultControllerHandle {
359///         subsystem_id: Guid::new_random(),
360///         msix_count: 10,
361///         max_io_queues: 10,
362///         namespaces: vec![
363///             // Define NamespaceDefinitions here
364///         ],
365///         fault_config: fault_configuration,
366///     };
367///     // Pass the controller handle in to the vm config to create and attach the fault controller. At this point the fault is inactive.
368///     fault_start_updater.set(true); // Activate the fault injection.
369///     // ... run test ...
370///     fault_start_updater.set(false); // Deactivate the fault injection.
371/// }
372/// ```
373#[derive(MeshPayload)]
374pub struct FaultConfiguration {
375    /// Fault active state
376    pub fault_active: Cell<bool>,
377    /// Fault to apply to the admin queues
378    pub admin_fault: AdminQueueFaultConfig,
379    /// Fault to apply to management layer of the controller. Option because it
380    /// needs to be extracted by the PCI layer during initialization.
381    pub pci_fault: Option<PciFaultConfig>,
382    /// Fault for test triggered namespace change notifications
383    pub namespace_fault: NamespaceFaultConfig,
384    /// Fault to apply to all IO queues
385    pub io_fault: Arc<IoQueueFaultConfig>,
386}
387
388impl FaultConfiguration {
389    /// Create a new empty fault configuration
390    pub fn new(fault_active: Cell<bool>) -> Self {
391        // Ideally the faults should begin life as Option::None.
392        // For now, use a dummy mesh channel for namespace fault to avoid
393        // test setup complexity & special cases in the AdminHandler run loop.
394        Self {
395            fault_active: fault_active.clone(),
396            admin_fault: AdminQueueFaultConfig::new(),
397            pci_fault: Some(PciFaultConfig::new()),
398            namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
399            io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
400        }
401    }
402
403    /// Add a PCI fault configuration to the fault configuration
404    pub fn with_pci_fault(mut self, pci_fault: PciFaultConfig) -> Self {
405        self.pci_fault = Some(pci_fault);
406        self
407    }
408
409    /// Add an admin queue fault configuration to the fault configuration
410    pub fn with_admin_queue_fault(mut self, admin_fault: AdminQueueFaultConfig) -> Self {
411        self.admin_fault = admin_fault;
412        self
413    }
414
415    /// Add an IO queue fault configuration to the fault configuration
416    pub fn with_io_queue_fault(mut self, io_fault: IoQueueFaultConfig) -> Self {
417        self.io_fault = Arc::new(io_fault);
418        self
419    }
420
421    /// Add a namespace fault configuration to the fault configuration
422    pub fn with_namespace_fault(mut self, namespace_fault: NamespaceFaultConfig) -> Self {
423        self.namespace_fault = namespace_fault;
424        self
425    }
426}
427
428impl PciFaultConfig {
429    /// Create a new no-op fault configuration
430    pub fn new() -> Self {
431        Self {
432            controller_management_fault_enable: PciFaultBehavior::Default,
433        }
434    }
435
436    /// Add a cc.en() fault
437    pub fn with_cc_enable_fault(mut self, behaviour: PciFaultBehavior) -> Self {
438        self.controller_management_fault_enable = behaviour;
439        self
440    }
441}
442
443impl AdminQueueFaultConfig {
444    /// Create an empty fault configuration
445    pub fn new() -> Self {
446        Self {
447            admin_submission_queue_faults: vec![],
448            admin_completion_queue_faults: vec![],
449        }
450    }
451
452    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the submission queue.
453    ///
454    /// # Panics
455    /// Panics if an identical [`CommandMatch`] has already been configured.
456    pub fn with_submission_queue_fault(
457        mut self,
458        pattern: CommandMatch,
459        behaviour: AdminQueueFaultBehavior<Command>,
460    ) -> Self {
461        if self
462            .admin_submission_queue_faults
463            .iter()
464            .any(|(c, _)| pattern == *c)
465        {
466            panic!(
467                "Duplicate submission queue fault for Compare {:?} and Mask {:?}",
468                pattern.command, pattern.mask
469            );
470        }
471
472        self.admin_submission_queue_faults
473            .push((pattern, behaviour));
474        self
475    }
476
477    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the completion queue.
478    ///
479    /// # Panics
480    /// Panics if an identical [`CommandMatch`] has already been configured.
481    pub fn with_completion_queue_fault(
482        mut self,
483        pattern: CommandMatch,
484        behaviour: AdminQueueFaultBehavior<Completion>,
485    ) -> Self {
486        if self
487            .admin_completion_queue_faults
488            .iter()
489            .any(|(c, _)| pattern == *c)
490        {
491            panic!(
492                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
493                pattern.command, pattern.mask
494            );
495        }
496
497        self.admin_completion_queue_faults
498            .push((pattern, behaviour));
499        self
500    }
501}
502
503impl NamespaceFaultConfig {
504    /// Creates a new NamespaceFaultConfig with a fresh channel.
505    pub fn new(recv_changed_namespace: mesh::Receiver<NamespaceChange>) -> Self {
506        Self {
507            recv_changed_namespace,
508        }
509    }
510}
511
512impl IoQueueFaultConfig {
513    /// Create an empty IO queue fault configuration
514    pub fn new(fault_active: Cell<bool>) -> Self {
515        Self {
516            io_completion_queue_faults: vec![],
517            fault_active,
518        }
519    }
520
521    /// Add a [`CommandMatch`] -> [`IoQueueFaultBehavior`] mapping for the completion queue.
522    ///
523    /// # Panics
524    /// Panics if an identical [`CommandMatch`] has already been configured.
525    pub fn with_completion_queue_fault(
526        mut self,
527        pattern: CommandMatch,
528        behaviour: IoQueueFaultBehavior,
529    ) -> Self {
530        if self
531            .io_completion_queue_faults
532            .iter()
533            .any(|(c, _)| pattern == *c)
534        {
535            panic!(
536                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
537                pattern.command, pattern.mask
538            );
539        }
540
541        self.io_completion_queue_faults.push((pattern, behaviour));
542        self
543    }
544}