nvme_test/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe (Fault Injection) PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DOORBELL_STRIDE_BITS;
8use crate::IOCQES;
9use crate::IOSQES;
10use crate::MAX_QES;
11use crate::NVME_VERSION;
12use crate::NvmeFaultControllerClient;
13use crate::PAGE_MASK;
14use crate::VENDOR_ID;
15use crate::spec;
16use crate::workers::IoQueueEntrySizes;
17use crate::workers::NvmeWorkers;
18use chipset_device::ChipsetDevice;
19use chipset_device::io::IoError;
20use chipset_device::io::IoError::InvalidRegister;
21use chipset_device::io::IoResult;
22use chipset_device::mmio::MmioIntercept;
23use chipset_device::mmio::RegisterMmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use device_emulators::ReadWriteRequestType;
26use device_emulators::read_as_u32_chunks;
27use device_emulators::write_as_u32_chunks;
28use guestmem::GuestMemory;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use nvme_resources::fault::FaultConfiguration;
33use nvme_resources::fault::PciFaultBehavior;
34use nvme_resources::fault::PciFaultConfig;
35use parking_lot::Mutex;
36use pci_core::capabilities::msix::MsixEmulator;
37use pci_core::cfg_space_emu::BarMemoryKind;
38use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
39use pci_core::cfg_space_emu::DeviceBars;
40use pci_core::msi::MsiTarget;
41use pci_core::spec::hwid::ClassCode;
42use pci_core::spec::hwid::HardwareIds;
43use pci_core::spec::hwid::ProgrammingInterface;
44use pci_core::spec::hwid::Subclass;
45use std::sync::Arc;
46use tdisp::TdispHostDeviceTarget;
47use vmcore::device_state::ChangeDeviceState;
48use vmcore::save_restore::SaveError;
49use vmcore::save_restore::SaveRestore;
50use vmcore::save_restore::SavedStateNotSupported;
51use vmcore::vm_task::VmTaskDriverSource;
52
/// An NVMe controller.
#[derive(InspectMut)]
pub struct NvmeFaultController {
    // Emulated PCI type-0 configuration space (BARs + capabilities).
    cfg_space: ConfigSpaceType0Emulator,
    // MSI-X table emulation, backed by BAR 4.
    #[inspect(skip)]
    msix: MsixEmulator,
    // BAR 0 register state (interrupt mask, CC, CSTS, AQA, ASQ, ACQ).
    registers: RegState,
    // Queue entry sizes shared with the workers; rewritten on CC writes.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    // Admin/IO queue processing engine.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
    // Fault behavior applied at the PCI layer (CAP reads, CC.EN writes).
    #[inspect(skip)]
    pci_fault_config: PciFaultConfig,
    // Shared flag gating whether configured faults are currently applied.
    #[inspect(skip)]
    fault_active: mesh::Cell<bool>,
    /// The NVMe fault controller is repurposed for use in TDISP tests.
    #[inspect(skip)]
    tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
}
72
/// BAR 0 register state for the controller.
#[derive(Inspect)]
struct RegState {
    // Aggregate interrupt mask, manipulated via INTMS/INTMC writes.
    #[inspect(hex)]
    interrupt_mask: u32,
    // Controller configuration (CC) register.
    cc: spec::Cc,
    // Controller status (CSTS) register.
    csts: spec::Csts,
    // Admin queue attributes (AQA) register.
    aqa: spec::Aqa,
    // Admin submission queue base address (ASQ).
    #[inspect(hex)]
    asq: u64,
    // Admin completion queue base address (ACQ).
    #[inspect(hex)]
    acq: u64,
}
85
86impl RegState {
87    fn new() -> Self {
88        Self {
89            interrupt_mask: 0,
90            cc: spec::Cc::new(),
91            csts: spec::Csts::new(),
92            aqa: spec::Aqa::new(),
93            asq: 0,
94            acq: 0,
95        }
96    }
97}
98
// Baseline controller capabilities (CAP) register value: doorbell stride,
// zero-based maximum queue entries, contiguous queues required, NVM command
// set supported, and the maximum timeout field (all bits set).
const CAP: spec::Cap = spec::Cap::new()
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    .with_mqes_z(MAX_QES - 1)
    .with_cqr(true)
    .with_css_nvm(true)
    .with_to(!0);
105
/// The NVMe controller's capabilities, fixed at construction time.
#[derive(Debug, Copy, Clone)]
pub struct NvmeFaultControllerCaps {
    /// The number of entries in the MSI-X table.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
117
118impl NvmeFaultController {
119    /// Creates a new NVMe controller.
120    pub fn new(
121        driver_source: &VmTaskDriverSource,
122        guest_memory: GuestMemory,
123        msi_target: &MsiTarget,
124        register_mmio: &mut dyn RegisterMmioIntercept,
125        caps: NvmeFaultControllerCaps,
126        mut fault_configuration: FaultConfiguration,
127        tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
128    ) -> Self {
129        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
130        let bars = DeviceBars::new()
131            .bar0(
132                BAR0_LEN,
133                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
134            )
135            .bar4(
136                msix.bar_len(),
137                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
138            );
139
140        let cfg_space = ConfigSpaceType0Emulator::new(
141            HardwareIds {
142                vendor_id: VENDOR_ID,
143                device_id: 0x00a9,
144                revision_id: 0,
145                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
146                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
147                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
148                type0_sub_vendor_id: 0,
149                type0_sub_system_id: 0,
150            },
151            vec![Box::new(msix_cap)],
152            bars,
153        );
154
155        let interrupts = (0..caps.msix_count)
156            .map(|i| msix.interrupt(i).unwrap())
157            .collect();
158
159        let pci_fault_config = fault_configuration
160            .pci_fault
161            .take()
162            .unwrap_or(PciFaultConfig::new());
163
164        let fault_active = fault_configuration.fault_active.clone();
165
166        let qe_sizes = Arc::new(Default::default());
167        let admin = NvmeWorkers::new(
168            driver_source,
169            guest_memory,
170            interrupts,
171            caps.max_io_queues,
172            caps.max_io_queues,
173            Arc::clone(&qe_sizes),
174            caps.subsystem_id,
175            fault_configuration,
176        );
177
178        Self {
179            cfg_space,
180            msix,
181            registers: RegState::new(),
182            workers: admin,
183            qe_sizes,
184            pci_fault_config,
185            fault_active,
186            tdisp_interface,
187        }
188    }
189
    /// Returns a client for manipulating the NVMe controller at runtime.
    ///
    /// Thin delegation to the queue workers' client.
    pub fn client(&self) -> NvmeFaultControllerClient {
        self.workers.client()
    }
194
    /// Reads from the virtual BAR 0.
    ///
    /// Accesses must be 4 or 8 bytes and naturally aligned; 64-bit
    /// registers may also be read as two aligned 32-bit halves.
    pub fn read_bar0(&mut self, addr: u16, data: &mut [u8]) -> IoResult {
        if data.len() < 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }
        // Require the access to be naturally aligned to its size.
        if addr & (data.len() - 1) as u16 != 0 {
            return IoResult::Err(IoError::UnalignedAccess);
        }

        // Check for 64-bit registers. Masking the low 3 address bits lets a
        // 32-bit read of either half match the register's base offset.
        let d: Option<u64> = match spec::Register(addr & !7) {
            spec::Register::CAP => {
                // A configured max-queue-size fault overrides the advertised
                // MQES field (which is zero-based, hence the `- 1`).
                if let Some(mqes) = self.pci_fault_config.max_queue_size {
                    Some(CAP.with_mqes_z(mqes - 1).into())
                } else {
                    Some(CAP.into())
                }
            }
            spec::Register::ASQ => Some(self.registers.asq),
            spec::Register::ACQ => Some(self.registers.acq),
            spec::Register::BPMBL => Some(0),
            _ => None,
        };
        if let Some(d) = d {
            if data.len() == 8 {
                data.copy_from_slice(&d.to_ne_bytes());
            } else if addr & 7 == 0 {
                // 32-bit read of the low dword.
                data.copy_from_slice(&(d as u32).to_ne_bytes());
            } else {
                // 32-bit read of the high dword.
                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
            }
            return IoResult::Ok;
        }

        if data.len() != 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }

        // Handle 32-bit registers.
        let d: u32 = match spec::Register(addr) {
            spec::Register::VS => NVME_VERSION,
            // Both INTMS and INTMC read back the current aggregate mask.
            spec::Register::INTMS => self.registers.interrupt_mask,
            spec::Register::INTMC => self.registers.interrupt_mask,
            spec::Register::CC => self.registers.cc.into(),
            spec::Register::RESERVED => 0,
            // CSTS reads also advance any pending enable/disable transition.
            spec::Register::CSTS => self.get_csts(),
            spec::Register::NSSR => 0,
            spec::Register::AQA => self.registers.aqa.into(),
            spec::Register::CMBLOC => 0,
            spec::Register::CMBSZ => 0,
            spec::Register::BPINFO => 0,
            spec::Register::BPRSEL => 0,
            _ => return IoResult::Err(InvalidRegister),
        };
        data.copy_from_slice(&d.to_ne_bytes());
        IoResult::Ok
    }
252
    /// Writes to the virtual BAR 0.
    ///
    /// Offsets at or above 0x1000 are doorbell writes; everything below is
    /// a controller register, which must be written with a naturally
    /// aligned 4- or 8-byte access.
    pub fn write_bar0(&mut self, addr: u16, data: &[u8]) -> IoResult {
        if addr >= 0x1000 {
            // Doorbell write.
            let base = addr - 0x1000;
            let db_id = base >> DOORBELL_STRIDE_BITS;
            // Reject writes that are not at the start of a doorbell slot.
            if (db_id << DOORBELL_STRIDE_BITS) != base {
                return IoResult::Err(InvalidRegister);
            }
            // Doorbell values are exactly 4 bytes.
            let Ok(data) = data.try_into() else {
                return IoResult::Err(IoError::InvalidAccessSize);
            };
            let value = u32::from_ne_bytes(data);
            self.workers.doorbell(db_id, value);
            return IoResult::Ok;
        }

        if data.len() < 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }
        // Require the access to be naturally aligned to its size.
        if addr & (data.len() - 1) as u16 != 0 {
            return IoResult::Err(IoError::UnalignedAccess);
        }

        // Merges `data` into the 64-bit register value `x`: either a full
        // 8-byte write, or a 4-byte write of the low or high dword.
        let update_reg = |x: u64| {
            if data.len() == 8 {
                u64::from_ne_bytes(data.try_into().unwrap())
            } else {
                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
                if addr & 7 == 0 {
                    (x & !(u32::MAX as u64)) | data
                } else {
                    (x & u32::MAX as u64) | (data << 32)
                }
            }
        };

        // Check for 64-bit registers. ASQ/ACQ may only change while the
        // controller is disabled, and are forced to page alignment.
        let handled = match spec::Register(addr & !7) {
            spec::Register::ASQ => {
                if !self.registers.cc.en() {
                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
                }
                true
            }
            spec::Register::ACQ => {
                if !self.registers.cc.en() {
                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
                }
                true
            }
            _ => false,
        };
        if handled {
            return IoResult::Ok;
        }

        // Remaining registers are 32-bit only.
        let Ok(data) = data.try_into() else {
            return IoResult::Err(IoError::InvalidAccessSize);
        };
        let data = u32::from_ne_bytes(data);

        // Handle 32-bit registers.
        match spec::Register(addr) {
            // INTMS sets mask bits; INTMC clears them.
            spec::Register::INTMS => self.registers.interrupt_mask |= data,
            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
            spec::Register::CC => self.set_cc(data.into()),
            spec::Register::AQA => self.registers.aqa = data.into(),
            _ => return IoResult::Err(InvalidRegister),
        }
        IoResult::Ok
    }
329
    /// Handles a write to the controller configuration (CC) register:
    /// validates unsupported fields, processes shutdown requests, applies
    /// any configured enable-time fault, and drives enable/disable
    /// transitions through the workers.
    fn set_cc(&mut self, cc: spec::Cc) {
        tracing::debug!(?cc, "set cc");

        if cc.mps() != 0 {
            tracelimit::warn_ratelimited!(
                "This implementation only supports memory page sizes of 4K."
            );
            self.fatal_error();
            return;
        }

        if cc.css() != 0 {
            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
            self.fatal_error();
            return;
        }

        if let 2..=6 = cc.ams() {
            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
            self.fatal_error();
            // NOTE(review): unlike the mps/css checks above, this branch
            // does not return, so processing continues with CSTS.CFS set —
            // confirm intentional.
        }

        // Keep only the fields this implementation honors: EN, SHN, IOSQES,
        // and IOCQES.
        let mask: u32 = u32::from(
            spec::Cc::new()
                .with_en(true)
                .with_shn(0b11)
                .with_iosqes(0b1111)
                .with_iocqes(0b1111),
        );
        let mut cc: spec::Cc = (u32::from(cc) & mask).into();

        if cc.shn() != 0 {
            // It is unclear in the spec (to me) what guarantees a
            // controller is supposed to make after shutdown. For now, just
            // complete shutdown immediately.
            self.registers.csts.set_shst(0b10);
        }

        if cc.en() != self.registers.cc.en() {
            if cc.en() {
                // If any fault was configured for cc.en() process it here
                if self.fault_active.get() {
                    match &mut self.pci_fault_config.controller_management_fault_enable {
                        PciFaultBehavior::Delay(duration) => {
                            // Intentional fault: stall the enable path for
                            // the configured duration.
                            std::thread::sleep(*duration);
                        }
                        PciFaultBehavior::Default => {}
                        PciFaultBehavior::Verify(send) => {
                            // One-shot notification that the enable was
                            // observed; `take` ensures it fires only once.
                            if let Some(send) = send.take() {
                                send.send(());
                            }
                        }
                    }
                }

                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
                if cc.iocqes() == 0 {
                    cc.set_iocqes(IOCQES);
                } else if cc.iocqes() != IOCQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports CQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                if cc.iosqes() == 0 {
                    cc.set_iosqes(IOSQES);
                } else if cc.iosqes() != IOSQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports SQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                if self.registers.csts.rdy() {
                    tracelimit::warn_ratelimited!("enabling during reset");
                    return;
                }
                if cc.shn() == 0 {
                    self.registers.csts.set_shst(0);
                }

                // AQA queue sizes are zero-based; `.max(1) + 1` yields at
                // least two entries.
                self.workers.enable(
                    self.registers.asq,
                    self.registers.aqa.asqs_z().max(1) + 1,
                    self.registers.acq,
                    self.registers.aqa.acqs_z().max(1) + 1,
                );
            } else if self.registers.csts.rdy() {
                self.workers.controller_reset();
            } else {
                tracelimit::warn_ratelimited!("disabling while not ready");
                return;
            }
        }

        self.registers.cc = cc;
        // Publish the (possibly defaulted) queue entry sizes to the workers.
        *self.qe_sizes.lock() = IoQueueEntrySizes {
            sqe_bits: cc.iosqes(),
            cqe_bits: cc.iocqes(),
        };
    }
434
435    fn get_csts(&mut self) -> u32 {
436        if !self.registers.cc.en() && self.registers.csts.rdy() {
437            // Keep trying to disable.
438            if self.workers.poll_controller_reset() {
439                // AQA, ASQ, and ACQ are not reset by controller reset.
440                self.registers.csts = 0.into();
441                self.registers.cc = 0.into();
442                self.registers.interrupt_mask = 0;
443            }
444        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
445            if self.workers.poll_enabled() {
446                self.registers.csts.set_rdy(true);
447            }
448        }
449
450        let csts = self.registers.csts;
451        tracing::debug!(?csts, "get csts");
452        csts.into()
453    }
454
    /// Sets the CFS bit in the controller status register (CSTS), indicating
    /// that the controller has experienced "undefined" behavior.
    ///
    /// The bit stays set until the register state is reset (see
    /// `get_csts` / `ChangeDeviceState::reset`).
    pub fn fatal_error(&mut self) {
        self.registers.csts.set_cfs(true);
    }
460}
461
impl ChangeDeviceState for NvmeFaultController {
    fn start(&mut self) {}

    async fn stop(&mut self) {}

    async fn reset(&mut self) {
        // Destructure `self` so that adding a field forces this function to
        // be revisited.
        let Self {
            cfg_space,
            msix: _,
            registers,
            qe_sizes,
            workers,
            pci_fault_config: _,
            fault_active: _,
            tdisp_interface: _,
        } = self;
        // Quiesce the workers before resetting register state.
        workers.reset().await;
        cfg_space.reset();
        *registers = RegState::new();
        *qe_sizes.lock() = Default::default();
    }
}
484
485impl ChipsetDevice for NvmeFaultController {
486    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
487        Some(self)
488    }
489
490    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
491        Some(self)
492    }
493
494    /// The NVMe fault controller is repurposed for use in TDISP tests.
495    fn supports_tdisp(&mut self) -> Option<&mut dyn TdispHostDeviceTarget> {
496        tracing::debug!(
497            supported = self.tdisp_interface.is_some(),
498            "fault controller TDISP support in ChipsetDevice"
499        );
500
501        match &mut self.tdisp_interface {
502            Some(tdisp) => Some(tdisp.as_mut()),
503            None => None,
504        }
505    }
506}
507
508impl MmioIntercept for NvmeFaultController {
509    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
510        match self.cfg_space.find_bar(addr) {
511            Some((0, offset)) => self.read_bar0(offset, data),
512            Some((4, offset)) => {
513                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
514                IoResult::Ok
515            }
516            _ => IoResult::Err(InvalidRegister),
517        }
518    }
519
520    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
521        match self.cfg_space.find_bar(addr) {
522            Some((0, offset)) => self.write_bar0(offset, data),
523            Some((4, offset)) => {
524                write_as_u32_chunks(offset, data, |offset, ty| match ty {
525                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
526                    ReadWriteRequestType::Write(val) => {
527                        self.msix.write_u32(offset, val);
528                        None
529                    }
530                });
531                IoResult::Ok
532            }
533            _ => IoResult::Err(InvalidRegister),
534        }
535    }
536}
537
impl PciConfigSpace for NvmeFaultController {
    // Reads a 32-bit value from PCI configuration space at `offset`.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        self.cfg_space.read_u32(offset, value)
    }

    // Writes a 32-bit value to PCI configuration space at `offset`.
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        self.cfg_space.write_u32(offset, value)
    }
}
547
impl SaveRestore for NvmeFaultController {
    // Saved states are not supported for this test controller.
    type SavedState = SavedStateNotSupported;

    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        // `SavedStateNotSupported` is uninhabited, so this empty match is
        // exhaustive and this function can never actually be called.
        match state {}
    }
}
561}