nvme_resources/
fault.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Provides an interface to programmatically and deterministically inject faults in the NVMe fault controller.
5
6use mesh::Cell;
7use mesh::MeshPayload;
8use mesh::OneshotSender;
9use mesh::rpc::Rpc;
10use nvme_spec::Command;
11use nvme_spec::Completion;
12use std::sync::Arc;
13use std::time::Duration;
14
/// Supported fault behaviour for NVMe admin queues.
///
/// The type parameter `T` is the queue entry type carried by
/// [`AdminQueueFaultBehavior::Update`]. [`AdminQueueFaultConfig`] instantiates
/// it with [`Command`] for submission queue faults and with [`Completion`] for
/// completion queue faults.
#[derive(Debug, MeshPayload)]
pub enum AdminQueueFaultBehavior<T> {
    /// Update the queue entry with the returned data
    Update(T),
    /// Drop the queue entry
    Drop,
    /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
    /// commands will not be processed until the delay is over.
    Delay(Duration),
    /// Panic. The `String` is a caller-supplied description of the panic.
    Panic(String),
    /// Writes the given payload to the PRP range. The test should ensure
    /// that the payload is of valid size. If the size is too large, the fault
    /// controller will panic. This behavior is not yet supported by the submission
    /// queue fault.
    CustomPayload(Vec<u8>),
    /// Verify that a command was seen.
    // NOTE(review): presumably the sender is signalled once the matching
    // command is observed - confirm against the `nvme_test` crate.
    Verify(Option<OneshotSender<()>>),
}
35
/// Supported fault behaviour for NVMe IO queues.
///
/// These behaviours are attached to a [`CommandMatch`] through
/// [`IoQueueFaultConfig`].
#[derive(Debug, MeshPayload, Clone)]
pub enum IoQueueFaultBehavior {
    /// Writes the given payload to the PRP range. The test should ensure
    /// that the payload is of valid size. If the size is too large, the fault
    /// controller will panic. This behavior is not yet supported by the submission
    /// queue fault.
    CustomPayload(Vec<u8>),
    /// Panic. The `String` is a caller-supplied description of the panic.
    Panic(String),
    /// Delay. Note: This delay is not asynchronously applied. i.e. Subsequent
    /// commands will not be processed until the delay is over.
    Delay(Duration),
}
50
51/// Supported fault behaviour for PCI faults
52#[derive(MeshPayload)]
53pub enum PciFaultBehavior {
54    /// Introduce a delay to the PCI operation. This WILL block the processing
55    /// thread for the delay duration.
56    Delay(Duration),
57    /// Do nothing
58    Default,
59    /// Verify that the fault was triggered.
60    Verify(Option<OneshotSender<()>>),
61}
62
/// A namespace change request sent by the test to the fault controller.
///
/// The request is modeled as an RPC so that the test can await confirmation
/// that the controller has processed the change (see [`NamespaceFaultConfig`]).
#[derive(MeshPayload)]
pub enum NamespaceChange {
    /// Input: Namespace ID to notify, Output: Empty confirmation.
    ChangeNotification(Rpc<u32, ()>),
}
69
/// A fault configuration to apply [`PciFaultBehavior`] to the controller management layer.
///
/// Two faults can be configured:
/// - a [`PciFaultBehavior`] applied when the controller is enabled via
///   cc.en() (for example, delaying enablement), and
/// - a custom MQES value to return in CAP register reads.
///
/// # Example
/// Delay enabling the controller by 500ms.
///
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::PciFaultBehavior;
/// use nvme_resources::fault::PciFaultConfig;
/// use std::time::Duration;
///
/// pub fn pci_enable_delay_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///     FaultConfiguration::new(fault_start_updater.cell())
///         .with_pci_fault(
///             PciFaultConfig::new().with_cc_enable_fault(
///                 PciFaultBehavior::Delay(Duration::from_millis(500)),
///             )
///         )
/// }
/// ```
#[derive(MeshPayload)]
pub struct PciFaultConfig {
    /// Fault to apply to cc.en() bit during enablement
    pub controller_management_fault_enable: PciFaultBehavior,
    /// Custom MQES value to return in CAP register reads. 1 based value.
    /// `None` means no custom value has been configured.
    pub max_queue_size: Option<u16>,
}
102
/// A fault config to trigger spurious namespace change notifications from the controller.
///
/// The fault controller listens on the provided channel for notifications containing
/// a `u32` value representing the NSID (Namespace Identifier) that has changed.
/// This does not actually modify the namespace; instead, it triggers the controller
/// to process a namespace change notification. The fault is modeled as an
/// RPC, which the controller completes once it has processed the change and sent
/// the corresponding Asynchronous Event Notification (AEN).
///
/// As per NVMe spec: If multiple namespace changes are notified, only the first triggers an AEN.
/// Subsequent changes do not trigger additional AENs until the driver issues a
/// GET_LOG_PAGE command.
///
/// For implementation simplicity, the namespace fault is not gated by the
/// `fault_active` flag. Since only test code can send notifications on the
/// fault channel, it is safe to bypass this check.
///
/// # Example
/// Send a namespace change notification for NSID 1 and wait for it to be processed.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::NamespaceChange;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::NamespaceFaultConfig;
/// use nvme_resources::NvmeFaultControllerHandle;
/// use guid::Guid;
/// use mesh::rpc::RpcSend;
///
/// pub async fn send_namespace_change_fault() {
///     let mut fault_start_updater = CellUpdater::new(false);
///     let (ns_change_send, ns_change_recv) = mesh::channel::<NamespaceChange>();
///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
///         .with_namespace_fault(
///             NamespaceFaultConfig::new(ns_change_recv),
///         );
///     // Complete setup
///     let fault_controller_handle = NvmeFaultControllerHandle {
///         subsystem_id: Guid::new_random(),
///         msix_count: 10,
///         max_io_queues: 10,
///         namespaces: vec![
///             // Define `NamespaceDefinitions` here
///         ],
///         fault_config: fault_configuration,
///         enable_tdisp_tests: false,
///     };
///
///     // Send the namespace change notification and await processing.
///     ns_change_send.call(NamespaceChange::ChangeNotification, 1).await.unwrap();
/// }
/// ```
#[derive(MeshPayload)]
pub struct NamespaceFaultConfig {
    /// Receiver on which the test delivers [`NamespaceChange`] notifications.
    pub recv_changed_namespace: mesh::Receiver<NamespaceChange>,
}
156
/// A fault configuration to inject faults into the admin submission and completion queues.
///
/// This struct maintains a mapping from [`CommandMatch`] to [`AdminQueueFaultBehavior`] for
/// submission and completion queues. When a command match is found, (and `fault_active == true`)
/// the associated fault is applied.
/// Both submission and completion queue faults match on commands
/// because completions do not contain enough identifying information to
/// match against. If there is more than one match for a given command, the
/// match defined first is prioritized. Faults are added via the
/// `with_submission_queue_fault` and `with_completion_queue_fault` methods and
/// can be chained. `AdminQueueFaultConfig::new()` creates an empty fault.
///
/// # Panics
/// The `with_submission_queue_fault` and `with_completion_queue_fault` methods
/// panic if a duplicate `CommandMatch` is added for the respective queue.
///
/// # Example
/// Panic on CREATE_IO_COMPLETION_QUEUE and delay before sending completion for 500ms after
/// GET_LOG_PAGE command is processed.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::AdminQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::AdminQueueFaultBehavior;
/// use nvme_spec::Command;
/// use std::time::Duration;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn build_admin_queue_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command_io_queue = Command::new_zeroed();
///     let mut command_log_page = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
///     command_log_page.cdw0 = command_log_page.cdw0.with_opcode(nvme_spec::AdminOpcode::GET_LOG_PAGE.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     FaultConfiguration::new(fault_start_updater.cell())
///         .with_admin_queue_fault(
///             AdminQueueFaultConfig::new().with_submission_queue_fault(
///                 CommandMatch {
///                     command: command_io_queue,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
///             ).with_completion_queue_fault(
///                 CommandMatch {
///                     command: command_log_page,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Delay(Duration::from_millis(500)),
///             )
///         )
/// }
/// ```
#[derive(MeshPayload)]
pub struct AdminQueueFaultConfig {
    /// An ordered list of ([`CommandMatch`], fault behavior) pairs for the
    /// submission queue. (This would ideally be a `HashMap`, but `mesh`
    /// doesn't support that type. Given that this is not performance
    /// sensitive, the lookup is okay)
    pub admin_submission_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Command>)>,
    /// An ordered list of ([`CommandMatch`], fault behavior) pairs for the
    /// completion queue.
    pub admin_completion_queue_faults: Vec<(CommandMatch, AdminQueueFaultBehavior<Completion>)>,
}
226
/// A fault configuration to inject faults into the io completions.
///
/// This struct maintains a mapping from [`CommandMatch`] to [`IoQueueFaultBehavior`] for
/// completions. When a command match is found, (and `fault_active == true`)
/// the associated fault is applied.
/// If there is more than one match for a given command, the
/// match defined first is prioritized. Faults are added via the
/// `with_completion_queue_fault` method and calls
/// can be chained. `IoQueueFaultConfig::new()` creates an empty fault.
///
/// # Panics
/// The `with_completion_queue_fault` method panics if a duplicate
/// `CommandMatch` is added.
///
/// # Example
/// Panic when RESERVATION_REPORT command is seen.
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::IoQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::IoQueueFaultBehavior;
/// use nvme_spec::Command;
/// use nvme_spec::nvm;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn build_io_queue_fault() -> FaultConfiguration {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command_io_queue = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command_io_queue.cdw0 = command_io_queue.cdw0.with_opcode(nvm::NvmOpcode::RESERVATION_REPORT.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     FaultConfiguration::new(fault_start_updater.cell())
///         .with_io_queue_fault(
///             IoQueueFaultConfig::new(fault_start_updater.cell()).with_completion_queue_fault(
///                 CommandMatch {
///                     command: command_io_queue,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 IoQueueFaultBehavior::Panic("Received a RESERVATION_REPORT command".to_string()),
///             )
///         )
/// }
/// ```
#[derive(MeshPayload, Clone)]
pub struct IoQueueFaultConfig {
    /// An ordered list of ([`CommandMatch`], fault behavior) pairs for the
    /// IO completion queues.
    pub io_completion_queue_faults: Vec<(CommandMatch, IoQueueFaultBehavior)>,
    /// Fault active state. (Repeated here because FaultConfiguration is not Cloneable).
    pub fault_active: Cell<bool>,
}
283
284/// A versatile definition to command match [`NVMe commands`](nvme_spec::Command)
285///
286/// Matches NVMe commands using a 512-bit mask: (command_bytes & mask) == (pattern_bytes & mask).
287/// A convenient way to build the patterns is to treat both the command and the mask as
288/// `nvme_spec::Command` and max out the fields in the mask that should be
289/// matched.
290///
291/// # Example
292/// Builds a command match that matches on all CREATE_IO_COMPLETION_QUEUE admin commands.
293/// ```no_run
294/// use nvme_resources::fault::CommandMatch;
295/// use nvme_spec::Command;
296/// use zerocopy::FromZeros;
297/// use zerocopy::IntoBytes;
298///
299/// pub fn build_command_match() -> CommandMatch {
300///     let mut command = Command::new_zeroed();
301///     let mut mask = Command::new_zeroed();
302///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
303///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
304///     CommandMatch {
305///         command,
306///         mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
307///     }
308/// }
309/// ```
310#[derive(Clone, MeshPayload, PartialEq)]
311pub struct CommandMatch {
312    /// Command to match against
313    pub command: Command,
314    /// Bitmask that defines the bits to match against
315    pub mask: [u8; 64],
316}
317
/// Fault configuration for the NVMe fault controller.
///
/// This struct defines behaviors that inject faults into the NVMe fault controller logic,
/// such as delaying or dropping commands, triggering namespace change notifications,
/// or customizing completion payloads. Fault injection is controlled by the
/// `fault_active` flag, unless specified otherwise in the fault description.
/// `fault_active` is managed by the test via [`mesh::CellUpdater`]. An
/// exception to the `fault_active` check is the [`NamespaceFaultConfig`] which
/// is processed regardless of `fault_active` state. (See `nvme_test` crate for
/// details on how the faults are applied.)
///
/// # Example
/// Panic when a command that matches CREATE_IO_COMPLETION_QUEUE is seen in the
/// admin queue:
/// ```no_run
/// use mesh::CellUpdater;
/// use nvme_resources::fault::FaultConfiguration;
/// use nvme_resources::fault::AdminQueueFaultConfig;
/// use nvme_resources::fault::CommandMatch;
/// use nvme_spec::Command;
/// use nvme_resources::fault::AdminQueueFaultBehavior;
/// use nvme_resources::NvmeFaultControllerHandle;
/// use guid::Guid;
/// use zerocopy::FromZeros;
/// use zerocopy::IntoBytes;
///
/// pub fn example_fault() {
///     let mut fault_start_updater = CellUpdater::new(false);
///
///     // Setup command matches
///     let mut command = Command::new_zeroed();
///     let mut mask = Command::new_zeroed();
///
///     command.cdw0 = command.cdw0.with_opcode(nvme_spec::AdminOpcode::CREATE_IO_COMPLETION_QUEUE.0);
///     mask.cdw0 = mask.cdw0.with_opcode(u8::MAX);
///
///     let fault_configuration = FaultConfiguration::new(fault_start_updater.cell())
///         .with_admin_queue_fault(
///             AdminQueueFaultConfig::new().with_submission_queue_fault(
///                 CommandMatch {
///                     command,
///                     mask: mask.as_bytes().try_into().expect("mask should be 64 bytes"),
///                 },
///                 AdminQueueFaultBehavior::Panic("Received a CREATE_IO_COMPLETION_QUEUE command".to_string()),
///             )
///         );
///     let fault_controller_handle = NvmeFaultControllerHandle {
///         subsystem_id: Guid::new_random(),
///         msix_count: 10,
///         max_io_queues: 10,
///         namespaces: vec![
///             // Define NamespaceDefinitions here
///         ],
///         fault_config: fault_configuration,
///         enable_tdisp_tests: false,
///     };
///     // Pass the controller handle in to the vm config to create and attach the fault controller. At this point the fault is inactive.
///     fault_start_updater.set(true); // Activate the fault injection.
///     // ... run test ...
///     fault_start_updater.set(false); // Deactivate the fault injection.
/// }
/// ```
#[derive(MeshPayload)]
pub struct FaultConfiguration {
    /// Fault active state
    pub fault_active: Cell<bool>,
    /// Fault to apply to the admin queues
    pub admin_fault: AdminQueueFaultConfig,
    /// Fault to apply to management layer of the controller. Option because it
    /// needs to be extracted by the PCI layer during initialization.
    pub pci_fault: Option<PciFaultConfig>,
    /// Fault for test triggered namespace change notifications
    pub namespace_fault: NamespaceFaultConfig,
    /// Fault to apply to all IO queues. Note that [`IoQueueFaultConfig`]
    /// carries its own `fault_active` cell.
    pub io_fault: Arc<IoQueueFaultConfig>,
}
394
395impl FaultConfiguration {
396    /// Create a new empty fault configuration
397    pub fn new(fault_active: Cell<bool>) -> Self {
398        // Ideally the faults should begin life as Option::None.
399        // For now, use a dummy mesh channel for namespace fault to avoid
400        // test setup complexity & special cases in the AdminHandler run loop.
401        Self {
402            fault_active: fault_active.clone(),
403            admin_fault: AdminQueueFaultConfig::new(),
404            pci_fault: Some(PciFaultConfig::new()),
405            namespace_fault: NamespaceFaultConfig::new(mesh::channel().1),
406            io_fault: Arc::new(IoQueueFaultConfig::new(fault_active)),
407        }
408    }
409
410    /// Add a PCI fault configuration to the fault configuration
411    pub fn with_pci_fault(mut self, pci_fault: PciFaultConfig) -> Self {
412        self.pci_fault = Some(pci_fault);
413        self
414    }
415
416    /// Add an admin queue fault configuration to the fault configuration
417    pub fn with_admin_queue_fault(mut self, admin_fault: AdminQueueFaultConfig) -> Self {
418        self.admin_fault = admin_fault;
419        self
420    }
421
422    /// Add an IO queue fault configuration to the fault configuration
423    pub fn with_io_queue_fault(mut self, io_fault: IoQueueFaultConfig) -> Self {
424        self.io_fault = Arc::new(io_fault);
425        self
426    }
427
428    /// Add a namespace fault configuration to the fault configuration
429    pub fn with_namespace_fault(mut self, namespace_fault: NamespaceFaultConfig) -> Self {
430        self.namespace_fault = namespace_fault;
431        self
432    }
433}
434
435impl PciFaultConfig {
436    /// Create a new no-op fault configuration
437    pub fn new() -> Self {
438        Self {
439            controller_management_fault_enable: PciFaultBehavior::Default,
440            max_queue_size: None,
441        }
442    }
443
444    /// Add a cc.en() fault
445    pub fn with_cc_enable_fault(mut self, behaviour: PciFaultBehavior) -> Self {
446        self.controller_management_fault_enable = behaviour;
447        self
448    }
449
450    /// Add a custom CAP.MQES value to return on register reads
451    pub fn with_max_queue_size(mut self, max_queue_size: u16) -> Self {
452        self.max_queue_size = Some(max_queue_size);
453        self
454    }
455}
456
457impl AdminQueueFaultConfig {
458    /// Create an empty fault configuration
459    pub fn new() -> Self {
460        Self {
461            admin_submission_queue_faults: vec![],
462            admin_completion_queue_faults: vec![],
463        }
464    }
465
466    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the submission queue.
467    ///
468    /// # Panics
469    /// Panics if an identical [`CommandMatch`] has already been configured.
470    pub fn with_submission_queue_fault(
471        mut self,
472        pattern: CommandMatch,
473        behaviour: AdminQueueFaultBehavior<Command>,
474    ) -> Self {
475        if self
476            .admin_submission_queue_faults
477            .iter()
478            .any(|(c, _)| pattern == *c)
479        {
480            panic!(
481                "Duplicate submission queue fault for Compare {:?} and Mask {:?}",
482                pattern.command, pattern.mask
483            );
484        }
485
486        self.admin_submission_queue_faults
487            .push((pattern, behaviour));
488        self
489    }
490
491    /// Add a [`CommandMatch`] -> [`AdminQueueFaultBehavior`] mapping for the completion queue.
492    ///
493    /// # Panics
494    /// Panics if an identical [`CommandMatch`] has already been configured.
495    pub fn with_completion_queue_fault(
496        mut self,
497        pattern: CommandMatch,
498        behaviour: AdminQueueFaultBehavior<Completion>,
499    ) -> Self {
500        if self
501            .admin_completion_queue_faults
502            .iter()
503            .any(|(c, _)| pattern == *c)
504        {
505            panic!(
506                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
507                pattern.command, pattern.mask
508            );
509        }
510
511        self.admin_completion_queue_faults
512            .push((pattern, behaviour));
513        self
514    }
515}
516
impl NamespaceFaultConfig {
    /// Creates a new NamespaceFaultConfig from a caller-supplied receiver.
    ///
    /// No channel is created here: the caller creates the channel, keeps the
    /// sending half to issue [`NamespaceChange`] notifications, and passes the
    /// receiving half in as `recv_changed_namespace`.
    pub fn new(recv_changed_namespace: mesh::Receiver<NamespaceChange>) -> Self {
        Self {
            recv_changed_namespace,
        }
    }
}
525
526impl IoQueueFaultConfig {
527    /// Create an empty IO queue fault configuration
528    pub fn new(fault_active: Cell<bool>) -> Self {
529        Self {
530            io_completion_queue_faults: vec![],
531            fault_active,
532        }
533    }
534
535    /// Add a [`CommandMatch`] -> [`IoQueueFaultBehavior`] mapping for the completion queue.
536    ///
537    /// # Panics
538    /// Panics if an identical [`CommandMatch`] has already been configured.
539    pub fn with_completion_queue_fault(
540        mut self,
541        pattern: CommandMatch,
542        behaviour: IoQueueFaultBehavior,
543    ) -> Self {
544        if self
545            .io_completion_queue_faults
546            .iter()
547            .any(|(c, _)| pattern == *c)
548        {
549            panic!(
550                "Duplicate completion queue fault for Compare {:?} and Mask {:?}",
551                pattern.command, pattern.mask
552            );
553        }
554
555        self.io_completion_queue_faults.push((pattern, behaviour));
556        self
557    }
558}