nvme_test/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe (Fault Injection) PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DOORBELL_STRIDE_BITS;
8use crate::IOCQES;
9use crate::IOSQES;
10use crate::MAX_QES;
11use crate::NVME_VERSION;
12use crate::NvmeFaultControllerClient;
13use crate::PAGE_MASK;
14use crate::VENDOR_ID;
15use crate::spec;
16use crate::workers::IoQueueEntrySizes;
17use crate::workers::NvmeWorkers;
18use chipset_device::ChipsetDevice;
19use chipset_device::io::IoError;
20use chipset_device::io::IoError::InvalidRegister;
21use chipset_device::io::IoResult;
22use chipset_device::mmio::MmioIntercept;
23use chipset_device::mmio::RegisterMmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use device_emulators::ReadWriteRequestType;
26use device_emulators::read_as_u32_chunks;
27use device_emulators::write_as_u32_chunks;
28use guestmem::GuestMemory;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use nvme_resources::fault::FaultConfiguration;
33use nvme_resources::fault::PciFaultBehavior;
34use nvme_resources::fault::PciFaultConfig;
35use parking_lot::Mutex;
36use pci_core::capabilities::msix::MsixEmulator;
37use pci_core::cfg_space_emu::BarMemoryKind;
38use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
39use pci_core::cfg_space_emu::DeviceBars;
40use pci_core::msi::MsiTarget;
41use pci_core::spec::hwid::ClassCode;
42use pci_core::spec::hwid::HardwareIds;
43use pci_core::spec::hwid::ProgrammingInterface;
44use pci_core::spec::hwid::Subclass;
45use std::sync::Arc;
46use tdisp::TdispHostDeviceTarget;
47use vmcore::device_state::ChangeDeviceState;
48use vmcore::save_restore::SaveError;
49use vmcore::save_restore::SaveRestore;
50use vmcore::save_restore::SavedStateNotSupported;
51use vmcore::vm_task::VmTaskDriverSource;
52
/// An NVMe controller.
#[derive(InspectMut)]
pub struct NvmeFaultController {
    /// PCI type-0 configuration space emulation (hardware IDs, BARs, and the
    /// MSI-X capability).
    cfg_space: ConfigSpaceType0Emulator,
    /// MSI-X table/PBA emulation, exposed through BAR4.
    #[inspect(skip)]
    msix: MsixEmulator,
    /// Guest-visible BAR0 register state (CC, CSTS, AQA, ASQ, ACQ, mask).
    registers: RegState,
    /// SQ/CQ entry sizes, shared with the queue workers and updated when the
    /// guest programs CC.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    /// Admin and IO queue processing backend.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
    /// Fault behavior applied at the PCI/register level (e.g. on the CC.EN
    /// transition and CAP reads).
    #[inspect(skip)]
    pci_fault_config: PciFaultConfig,
    /// Whether fault injection is currently active.
    #[inspect(skip)]
    fault_active: mesh::Cell<bool>,
    /// The NVMe fault controller is repurposed for use in TDISP tests.
    #[inspect(skip)]
    tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
}
72
/// Guest-visible BAR0 register state tracked directly by the controller.
#[derive(Inspect)]
struct RegState {
    /// Interrupt mask, set via INTMS and cleared via INTMC.
    #[inspect(hex)]
    interrupt_mask: u32,
    /// Controller Configuration (CC) register.
    cc: spec::Cc,
    /// Controller Status (CSTS) register.
    csts: spec::Csts,
    /// Admin Queue Attributes (AQA): zero-based admin queue sizes.
    aqa: spec::Aqa,
    /// Admin Submission Queue base address (ASQ).
    #[inspect(hex)]
    asq: u64,
    /// Admin Completion Queue base address (ACQ).
    #[inspect(hex)]
    acq: u64,
}
85
86impl RegState {
87    fn new() -> Self {
88        Self {
89            interrupt_mask: 0,
90            cc: spec::Cc::new(),
91            csts: spec::Csts::new(),
92            aqa: spec::Aqa::new(),
93            asq: 0,
94            acq: 0,
95        }
96    }
97}
98
/// The Controller Capabilities (CAP) register value advertised to the guest.
const CAP: spec::Cap = spec::Cap::new()
    // Doorbell stride; DSTRD is encoded so the stride is 4 << DSTRD bytes.
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    // Maximum queue entries supported, zero-based.
    .with_mqes_z(MAX_QES - 1)
    // Contiguous queues required.
    .with_cqr(true)
    // The NVM command set is supported.
    .with_css_nvm(true)
    // Worst-case enable/disable timeout (in 500ms units); advertise the max.
    .with_to(!0);
105
/// The NVMe controller's capabilities, fixed when the controller is
/// constructed.
#[derive(Debug, Copy, Clone)]
pub struct NvmeFaultControllerCaps {
    /// The number of entries in the MSI-X table.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
117
impl NvmeFaultController {
    /// Creates a new NVMe controller.
    ///
    /// * `driver_source` and `guest_memory` back the queue workers.
    /// * `msi_target` receives MSI-X interrupts.
    /// * `register_mmio` registers the BAR0 and MSI-X MMIO intercept regions.
    /// * `caps` fixes the MSI-X table size, IO queue limit, and subsystem ID.
    /// * `fault_configuration` supplies fault-injection behavior; the
    ///   PCI-level fault is split off and handled by this layer, and the
    ///   remainder is handed to the queue workers.
    /// * `tdisp_interface`, when present, makes the device report TDISP
    ///   support (used by TDISP tests).
    pub fn new(
        driver_source: &VmTaskDriverSource,
        guest_memory: GuestMemory,
        msi_target: &MsiTarget,
        register_mmio: &mut dyn RegisterMmioIntercept,
        caps: NvmeFaultControllerCaps,
        mut fault_configuration: FaultConfiguration,
        tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
    ) -> Self {
        // BAR0 holds the NVMe register file; BAR4 holds the MSI-X table.
        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
        let bars = DeviceBars::new()
            .bar0(
                BAR0_LEN,
                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
            )
            .bar4(
                msix.bar_len(),
                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
            );

        let cfg_space = ConfigSpaceType0Emulator::new(
            HardwareIds {
                vendor_id: VENDOR_ID,
                device_id: 0x00a9,
                revision_id: 0,
                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
                type0_sub_vendor_id: 0,
                type0_sub_system_id: 0,
            },
            vec![Box::new(msix_cap)],
            bars,
        );

        // One interrupt handle per MSI-X table entry, handed to the workers.
        let interrupts = (0..caps.msix_count)
            .map(|i| msix.interrupt(i).unwrap())
            .collect();

        // Split the PCI-level fault out of the configuration before the rest
        // of `fault_configuration` is moved into the workers below.
        let pci_fault_config = fault_configuration
            .pci_fault
            .take()
            .unwrap_or(PciFaultConfig::new());

        let fault_active = fault_configuration.fault_active.clone();

        // Shared with the workers; updated when the guest programs CC.
        let qe_sizes = Arc::new(Default::default());
        let admin = NvmeWorkers::new(
            driver_source,
            guest_memory,
            interrupts,
            caps.max_io_queues,
            caps.max_io_queues,
            Arc::clone(&qe_sizes),
            caps.subsystem_id,
            fault_configuration,
        );

        Self {
            cfg_space,
            msix,
            registers: RegState::new(),
            workers: admin,
            qe_sizes,
            pci_fault_config,
            fault_active,
            tdisp_interface,
        }
    }

    /// Returns a client for manipulating the NVMe controller at runtime.
    pub fn client(&self) -> NvmeFaultControllerClient {
        self.workers.client()
    }
194
195    /// Reads from the virtual BAR 0.
196    pub fn read_bar0(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
197        if data.len() < 4 {
198            return IoResult::Err(IoError::InvalidAccessSize);
199        }
200        if addr & (data.len() as u64 - 1) != 0 {
201            return IoResult::Err(IoError::UnalignedAccess);
202        }
203
204        // Check for 64-bit registers.
205        let d: Option<u64> = match spec::Register(addr & !7) {
206            spec::Register::CAP => {
207                if let Some(mqes) = self.pci_fault_config.max_queue_size {
208                    Some(CAP.with_mqes_z(mqes - 1).into())
209                } else {
210                    Some(CAP.into())
211                }
212            }
213            spec::Register::ASQ => Some(self.registers.asq),
214            spec::Register::ACQ => Some(self.registers.acq),
215            spec::Register::BPMBL => Some(0),
216            _ => None,
217        };
218        if let Some(d) = d {
219            if data.len() == 8 {
220                data.copy_from_slice(&d.to_ne_bytes());
221            } else if addr & 7 == 0 {
222                data.copy_from_slice(&(d as u32).to_ne_bytes());
223            } else {
224                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
225            }
226            return IoResult::Ok;
227        }
228
229        if data.len() != 4 {
230            return IoResult::Err(IoError::InvalidAccessSize);
231        }
232
233        // Handle 32-bit registers.
234        let d: u32 = match spec::Register(addr) {
235            spec::Register::VS => NVME_VERSION,
236            spec::Register::INTMS => self.registers.interrupt_mask,
237            spec::Register::INTMC => self.registers.interrupt_mask,
238            spec::Register::CC => self.registers.cc.into(),
239            spec::Register::RESERVED => 0,
240            spec::Register::CSTS => self.get_csts(),
241            spec::Register::NSSR => 0,
242            spec::Register::AQA => self.registers.aqa.into(),
243            spec::Register::CMBLOC => 0,
244            spec::Register::CMBSZ => 0,
245            spec::Register::BPINFO => 0,
246            spec::Register::BPRSEL => 0,
247            _ => return IoResult::Err(InvalidRegister),
248        };
249        data.copy_from_slice(&d.to_ne_bytes());
250        IoResult::Ok
251    }
252
    /// Writes to the virtual BAR 0.
    pub fn write_bar0(&mut self, addr: u64, data: &[u8]) -> IoResult {
        // Offsets 0x1000 and above are the queue doorbell registers.
        if addr >= 0x1000 {
            // Doorbell write.
            let base = addr - 0x1000;
            let db_id = base >> DOORBELL_STRIDE_BITS;
            // The write must land exactly on a doorbell slot boundary.
            if (db_id << DOORBELL_STRIDE_BITS) != base {
                return IoResult::Err(InvalidRegister);
            }
            let Ok(db_id) = u16::try_from(db_id) else {
                return IoResult::Err(InvalidRegister);
            };
            // Doorbell writes must be exactly 4 bytes.
            let Ok(data) = data.try_into() else {
                return IoResult::Err(IoError::InvalidAccessSize);
            };
            let value = u32::from_ne_bytes(data);
            self.workers.doorbell(db_id, value);
            return IoResult::Ok;
        }

        if data.len() < 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }
        // Accesses must be naturally aligned to their size.
        if addr & (data.len() as u64 - 1) != 0 {
            return IoResult::Err(IoError::UnalignedAccess);
        }

        // Merges `data` into the current 64-bit register value `x`: an 8-byte
        // write replaces the whole value, while a 4-byte write replaces only
        // the half selected by the low address bits.
        let update_reg = |x: u64| {
            if data.len() == 8 {
                u64::from_ne_bytes(data.try_into().unwrap())
            } else {
                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
                if addr & 7 == 0 {
                    (x & !(u32::MAX as u64)) | data
                } else {
                    (x & u32::MAX as u64) | (data << 32)
                }
            }
        };

        // Check for 64-bit registers. ASQ and ACQ may only be changed while
        // the controller is disabled; the addresses are page-aligned.
        let handled = match spec::Register(addr & !7) {
            spec::Register::ASQ => {
                if !self.registers.cc.en() {
                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
                }
                true
            }
            spec::Register::ACQ => {
                if !self.registers.cc.en() {
                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
                }
                true
            }
            _ => false,
        };
        if handled {
            return IoResult::Ok;
        }

        // Remaining registers are 32 bits wide.
        let Ok(data) = data.try_into() else {
            return IoResult::Err(IoError::InvalidAccessSize);
        };
        let data = u32::from_ne_bytes(data);

        // Handle 32-bit registers. INTMS sets mask bits; INTMC clears them.
        match spec::Register(addr) {
            spec::Register::INTMS => self.registers.interrupt_mask |= data,
            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
            spec::Register::CC => self.set_cc(data.into()),
            spec::Register::AQA => self.registers.aqa = data.into(),
            _ => return IoResult::Err(InvalidRegister),
        }
        IoResult::Ok
    }
332
    /// Handles a guest write to the Controller Configuration (CC) register,
    /// including enable/disable transitions and configured fault injection.
    fn set_cc(&mut self, cc: spec::Cc) {
        tracing::debug!(?cc, "set cc");

        // Only 4K memory pages are supported (MPS == 0).
        if cc.mps() != 0 {
            tracelimit::warn_ratelimited!(
                "This implementation only supports memory page sizes of 4K."
            );
            self.fatal_error();
            return;
        }

        // Only the NVM command set is supported (CSS == 0).
        if cc.css() != 0 {
            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
            self.fatal_error();
            return;
        }

        // AMS values 2..=6 are reserved.
        // NOTE(review): unlike the MPS/CSS checks above, this branch does not
        // return, so the rest of the write is still processed after CFS is
        // raised — confirm this asymmetry is intentional.
        if let 2..=6 = cc.ams() {
            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
            self.fatal_error();
        }

        // Mask off everything except the fields this model honors: EN, SHN,
        // IOSQES, and IOCQES.
        let mask: u32 = u32::from(
            spec::Cc::new()
                .with_en(true)
                .with_shn(0b11)
                .with_iosqes(0b1111)
                .with_iocqes(0b1111),
        );
        let mut cc: spec::Cc = (u32::from(cc) & mask).into();

        if cc.shn() != 0 {
            // It is unclear in the spec (to me) what guarantees a
            // controller is supposed to make after shutdown. For now, just
            // complete shutdown immediately.
            self.registers.csts.set_shst(0b10);
        }

        if cc.en() != self.registers.cc.en() {
            if cc.en() {
                // If any fault was configured for cc.en() process it here
                if self.fault_active.get() {
                    match &mut self.pci_fault_config.controller_management_fault_enable {
                        PciFaultBehavior::Delay(duration) => {
                            // Deliberately blocks the calling thread to
                            // simulate a slow controller enable.
                            std::thread::sleep(*duration);
                        }
                        PciFaultBehavior::Default => {}
                        PciFaultBehavior::Verify(send) => {
                            // Notify the test (at most once) that the enable
                            // path was reached.
                            if let Some(send) = send.take() {
                                send.send(());
                            }
                        }
                    }
                }

                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
                if cc.iocqes() == 0 {
                    cc.set_iocqes(IOCQES);
                } else if cc.iocqes() != IOCQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports CQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                if cc.iosqes() == 0 {
                    cc.set_iosqes(IOSQES);
                } else if cc.iosqes() != IOSQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports SQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                // A previous disable is still draining; ignore the enable.
                if self.registers.csts.rdy() {
                    tracelimit::warn_ratelimited!("enabling during reset");
                    return;
                }
                if cc.shn() == 0 {
                    self.registers.csts.set_shst(0);
                }

                // AQA sizes are zero-based; clamp so each admin queue has at
                // least two entries.
                self.workers.enable(
                    self.registers.asq,
                    self.registers.aqa.asqs_z().max(1) + 1,
                    self.registers.acq,
                    self.registers.aqa.acqs_z().max(1) + 1,
                );
            } else if self.registers.csts.rdy() {
                // Disable while ready: start a controller reset. CSTS.RDY is
                // cleared later, when `get_csts` observes the reset complete.
                self.workers.controller_reset();
            } else {
                tracelimit::warn_ratelimited!("disabling while not ready");
                return;
            }
        }

        self.registers.cc = cc;
        // Publish the (possibly defaulted) queue entry sizes to the workers.
        *self.qe_sizes.lock() = IoQueueEntrySizes {
            sqe_bits: cc.iosqes(),
            cqe_bits: cc.iocqes(),
        };
    }
437
438    fn get_csts(&mut self) -> u32 {
439        if !self.registers.cc.en() && self.registers.csts.rdy() {
440            // Keep trying to disable.
441            if self.workers.poll_controller_reset() {
442                // AQA, ASQ, and ACQ are not reset by controller reset.
443                self.registers.csts = 0.into();
444                self.registers.cc = 0.into();
445                self.registers.interrupt_mask = 0;
446            }
447        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
448            if self.workers.poll_enabled() {
449                self.registers.csts.set_rdy(true);
450            }
451        }
452
453        let csts = self.registers.csts;
454        tracing::debug!(?csts, "get csts");
455        csts.into()
456    }
457
458    /// Sets the CFS bit in the controller status register (CSTS), indicating
459    /// that the controller has experienced "undefined" behavior.
460    pub fn fatal_error(&mut self) {
461        self.registers.csts.set_cfs(true);
462    }
463}
464
impl ChangeDeviceState for NvmeFaultController {
    fn start(&mut self) {}

    async fn stop(&mut self) {}

    async fn reset(&mut self) {
        // Destructure exhaustively (no `..`) so that adding a field to the
        // struct forces this reset path to be revisited.
        let Self {
            cfg_space,
            msix: _,
            registers,
            qe_sizes,
            workers,
            pci_fault_config: _,
            fault_active: _,
            tdisp_interface: _,
        } = self;
        // Quiesce the queue workers first, then clear the register state.
        workers.reset().await;
        cfg_space.reset();
        *registers = RegState::new();
        *qe_sizes.lock() = Default::default();
    }
}
487
488impl ChipsetDevice for NvmeFaultController {
489    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
490        Some(self)
491    }
492
493    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
494        Some(self)
495    }
496
497    /// The NVMe fault controller is repurposed for use in TDISP tests.
498    fn supports_tdisp(&mut self) -> Option<&mut dyn TdispHostDeviceTarget> {
499        tracing::debug!(
500            supported = self.tdisp_interface.is_some(),
501            "fault controller TDISP support in ChipsetDevice"
502        );
503
504        match &mut self.tdisp_interface {
505            Some(tdisp) => Some(tdisp.as_mut()),
506            None => None,
507        }
508    }
509}
510
511impl MmioIntercept for NvmeFaultController {
512    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
513        match self.cfg_space.find_bar(addr) {
514            Some((0, offset)) => self.read_bar0(offset, data),
515            Some((4, offset)) => {
516                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
517                IoResult::Ok
518            }
519            _ => IoResult::Err(InvalidRegister),
520        }
521    }
522
523    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
524        match self.cfg_space.find_bar(addr) {
525            Some((0, offset)) => self.write_bar0(offset, data),
526            Some((4, offset)) => {
527                write_as_u32_chunks(offset, data, |offset, ty| match ty {
528                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
529                    ReadWriteRequestType::Write(val) => {
530                        self.msix.write_u32(offset, val);
531                        None
532                    }
533                });
534                IoResult::Ok
535            }
536            _ => IoResult::Err(InvalidRegister),
537        }
538    }
539}
540
541impl PciConfigSpace for NvmeFaultController {
542    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
543        self.cfg_space.read_u32(offset, value)
544    }
545
546    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
547        self.cfg_space.write_u32(offset, value)
548    }
549}
550
impl SaveRestore for NvmeFaultController {
    // Saved state is intentionally unsupported for this test device.
    type SavedState = SavedStateNotSupported;

    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        // `SavedStateNotSupported` is uninhabited, so restore can never
        // actually be invoked; the empty match proves that to the compiler.
        match state {}
    }
}