Skip to main content

nvme_test/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe (Fault Injection) PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DEVICE_ID;
8use crate::DOORBELL_STRIDE_BITS;
9use crate::IOCQES;
10use crate::IOSQES;
11use crate::MAX_QES;
12use crate::NVME_VERSION;
13use crate::NvmeFaultControllerClient;
14use crate::PAGE_MASK;
15use crate::VENDOR_ID;
16use crate::spec;
17use crate::workers::IoQueueEntrySizes;
18use crate::workers::NvmeWorkers;
19use chipset_device::ChipsetDevice;
20use chipset_device::io::IoError;
21use chipset_device::io::IoError::InvalidRegister;
22use chipset_device::io::IoResult;
23use chipset_device::mmio::MmioIntercept;
24use chipset_device::mmio::RegisterMmioIntercept;
25use chipset_device::pci::PciConfigSpace;
26use device_emulators::ReadWriteRequestType;
27use device_emulators::read_as_u32_chunks;
28use device_emulators::write_as_u32_chunks;
29use guestmem::GuestMemory;
30use guid::Guid;
31use inspect::Inspect;
32use inspect::InspectMut;
33use nvme_resources::fault::FaultConfiguration;
34use nvme_resources::fault::PciFaultBehavior;
35use nvme_resources::fault::PciFaultConfig;
36use parking_lot::Mutex;
37use pci_core::capabilities::msix::MsixEmulator;
38use pci_core::cfg_space_emu::BarMemoryKind;
39use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
40use pci_core::cfg_space_emu::DeviceBars;
41use pci_core::msi::MsiTarget;
42use pci_core::spec::hwid::ClassCode;
43use pci_core::spec::hwid::HardwareIds;
44use pci_core::spec::hwid::ProgrammingInterface;
45use pci_core::spec::hwid::Subclass;
46use std::sync::Arc;
47use tdisp::TdispHostDeviceTarget;
48use vmcore::device_state::ChangeDeviceState;
49use vmcore::save_restore::SaveError;
50use vmcore::save_restore::SaveRestore;
51use vmcore::save_restore::SavedStateNotSupported;
52use vmcore::vm_task::VmTaskDriverSource;
53
54/// An NVMe controller.
55#[derive(InspectMut)]
56pub struct NvmeFaultController {
57    cfg_space: ConfigSpaceType0Emulator,
58    #[inspect(skip)]
59    msix: MsixEmulator,
60    registers: RegState,
61    #[inspect(skip)]
62    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
63    #[inspect(flatten, mut)]
64    workers: NvmeWorkers,
65    #[inspect(skip)]
66    pci_fault_config: PciFaultConfig,
67    #[inspect(skip)]
68    fault_active: mesh::Cell<bool>,
69    /// The NVMe fault controller is repurposed for use in TDISP tests.
70    #[inspect(skip)]
71    tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
72}
73
74#[derive(Inspect)]
75struct RegState {
76    #[inspect(hex)]
77    interrupt_mask: u32,
78    cc: spec::Cc,
79    csts: spec::Csts,
80    aqa: spec::Aqa,
81    #[inspect(hex)]
82    asq: u64,
83    #[inspect(hex)]
84    acq: u64,
85}
86
87impl RegState {
88    fn new() -> Self {
89        Self {
90            interrupt_mask: 0,
91            cc: spec::Cc::new(),
92            csts: spec::Csts::new(),
93            aqa: spec::Aqa::new(),
94            asq: 0,
95            acq: 0,
96        }
97    }
98}
99
100const CAP: spec::Cap = spec::Cap::new()
101    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
102    .with_mqes_z(MAX_QES - 1)
103    .with_cqr(true)
104    .with_css_nvm(true)
105    .with_to(!0);
106
107/// The NVMe controller's capabilities.
108#[derive(Debug, Copy, Clone)]
109pub struct NvmeFaultControllerCaps {
110    /// The number of entries in the MSI-X table.
111    pub msix_count: u16,
112    /// The maximum number of IO submission and completion queues.
113    pub max_io_queues: u16,
114    /// The subsystem ID, used as part of the subnqn field of the identify
115    /// controller response.
116    pub subsystem_id: Guid,
117}
118
119impl NvmeFaultController {
120    /// Creates a new NVMe controller.
121    pub fn new(
122        driver_source: &VmTaskDriverSource,
123        guest_memory: GuestMemory,
124        msi_target: &MsiTarget,
125        register_mmio: &mut dyn RegisterMmioIntercept,
126        caps: NvmeFaultControllerCaps,
127        mut fault_configuration: FaultConfiguration,
128        tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
129    ) -> Self {
130        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
131        let bars = DeviceBars::new()
132            .bar0(
133                BAR0_LEN,
134                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
135            )
136            .bar4(
137                msix.bar_len(),
138                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
139            );
140
141        // Apply any hardware-config fault overrides for the IDs reported in
142        // PCI configuration space, falling back to the real values when no
143        // override is configured.
144        let hardware_config_fault = fault_configuration.hardware_config_fault.take();
145        let vendor_id = hardware_config_fault
146            .and_then(|f| f.vendor_id)
147            .unwrap_or(VENDOR_ID);
148        let device_id = hardware_config_fault
149            .and_then(|f| f.device_id)
150            .unwrap_or(DEVICE_ID);
151
152        let cfg_space = ConfigSpaceType0Emulator::new(
153            HardwareIds {
154                vendor_id,
155                device_id,
156                revision_id: 0,
157                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
158                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
159                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
160                type0_sub_vendor_id: 0,
161                type0_sub_system_id: 0,
162            },
163            vec![Box::new(msix_cap)],
164            Vec::new(),
165            bars,
166        );
167
168        let interrupts = (0..caps.msix_count)
169            .map(|i| msix.interrupt(i).unwrap())
170            .collect();
171
172        let pci_fault_config = fault_configuration
173            .pci_fault
174            .take()
175            .unwrap_or(PciFaultConfig::new());
176
177        let fault_active = fault_configuration.fault_active.clone();
178
179        let qe_sizes = Arc::new(Default::default());
180        let admin = NvmeWorkers::new(
181            driver_source,
182            guest_memory,
183            interrupts,
184            caps.max_io_queues,
185            caps.max_io_queues,
186            Arc::clone(&qe_sizes),
187            caps.subsystem_id,
188            fault_configuration,
189        );
190
191        Self {
192            cfg_space,
193            msix,
194            registers: RegState::new(),
195            workers: admin,
196            qe_sizes,
197            pci_fault_config,
198            fault_active,
199            tdisp_interface,
200        }
201    }
202
203    /// Returns a client for manipulating the NVMe controller at runtime.
204    pub fn client(&self) -> NvmeFaultControllerClient {
205        self.workers.client()
206    }
207
208    /// Reads from the virtual BAR 0.
209    pub fn read_bar0(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
210        if data.len() < 4 {
211            return IoResult::Err(IoError::InvalidAccessSize);
212        }
213        if addr & (data.len() as u64 - 1) != 0 {
214            return IoResult::Err(IoError::UnalignedAccess);
215        }
216
217        // Check for 64-bit registers.
218        let d: Option<u64> = match spec::Register(addr & !7) {
219            spec::Register::CAP => {
220                if let Some(mqes) = self.pci_fault_config.max_queue_size {
221                    Some(CAP.with_mqes_z(mqes - 1).into())
222                } else {
223                    Some(CAP.into())
224                }
225            }
226            spec::Register::ASQ => Some(self.registers.asq),
227            spec::Register::ACQ => Some(self.registers.acq),
228            spec::Register::BPMBL => Some(0),
229            _ => None,
230        };
231        if let Some(d) = d {
232            if data.len() == 8 {
233                data.copy_from_slice(&d.to_ne_bytes());
234            } else if addr & 7 == 0 {
235                data.copy_from_slice(&(d as u32).to_ne_bytes());
236            } else {
237                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
238            }
239            return IoResult::Ok;
240        }
241
242        if data.len() != 4 {
243            return IoResult::Err(IoError::InvalidAccessSize);
244        }
245
246        // Handle 32-bit registers.
247        let d: u32 = match spec::Register(addr) {
248            spec::Register::VS => NVME_VERSION,
249            spec::Register::INTMS => self.registers.interrupt_mask,
250            spec::Register::INTMC => self.registers.interrupt_mask,
251            spec::Register::CC => self.registers.cc.into(),
252            spec::Register::RESERVED => 0,
253            spec::Register::CSTS => self.get_csts(),
254            spec::Register::NSSR => 0,
255            spec::Register::AQA => self.registers.aqa.into(),
256            spec::Register::CMBLOC => 0,
257            spec::Register::CMBSZ => 0,
258            spec::Register::BPINFO => 0,
259            spec::Register::BPRSEL => 0,
260            _ => return IoResult::Err(InvalidRegister),
261        };
262        data.copy_from_slice(&d.to_ne_bytes());
263        IoResult::Ok
264    }
265
266    /// Writes to the virtual BAR 0.
267    pub fn write_bar0(&mut self, addr: u64, data: &[u8]) -> IoResult {
268        if addr >= 0x1000 {
269            // Doorbell write.
270            let base = addr - 0x1000;
271            let db_id = base >> DOORBELL_STRIDE_BITS;
272            if (db_id << DOORBELL_STRIDE_BITS) != base {
273                return IoResult::Err(InvalidRegister);
274            }
275            let Ok(db_id) = u16::try_from(db_id) else {
276                return IoResult::Err(InvalidRegister);
277            };
278            let Ok(data) = data.try_into() else {
279                return IoResult::Err(IoError::InvalidAccessSize);
280            };
281            let value = u32::from_ne_bytes(data);
282            self.workers.doorbell(db_id, value);
283            return IoResult::Ok;
284        }
285
286        if data.len() < 4 {
287            return IoResult::Err(IoError::InvalidAccessSize);
288        }
289        if addr & (data.len() as u64 - 1) != 0 {
290            return IoResult::Err(IoError::UnalignedAccess);
291        }
292
293        let update_reg = |x: u64| {
294            if data.len() == 8 {
295                u64::from_ne_bytes(data.try_into().unwrap())
296            } else {
297                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
298                if addr & 7 == 0 {
299                    (x & !(u32::MAX as u64)) | data
300                } else {
301                    (x & u32::MAX as u64) | (data << 32)
302                }
303            }
304        };
305
306        // Check for 64-bit registers.
307        let handled = match spec::Register(addr & !7) {
308            spec::Register::ASQ => {
309                if !self.registers.cc.en() {
310                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
311                } else {
312                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
313                }
314                true
315            }
316            spec::Register::ACQ => {
317                if !self.registers.cc.en() {
318                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
319                } else {
320                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
321                }
322                true
323            }
324            _ => false,
325        };
326        if handled {
327            return IoResult::Ok;
328        }
329
330        let Ok(data) = data.try_into() else {
331            return IoResult::Err(IoError::InvalidAccessSize);
332        };
333        let data = u32::from_ne_bytes(data);
334
335        // Handle 32-bit registers.
336        match spec::Register(addr) {
337            spec::Register::INTMS => self.registers.interrupt_mask |= data,
338            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
339            spec::Register::CC => self.set_cc(data.into()),
340            spec::Register::AQA => self.registers.aqa = data.into(),
341            _ => return IoResult::Err(InvalidRegister),
342        }
343        IoResult::Ok
344    }
345
346    fn set_cc(&mut self, cc: spec::Cc) {
347        tracing::debug!(?cc, "set cc");
348
349        if cc.mps() != 0 {
350            tracelimit::warn_ratelimited!(
351                "This implementation only supports memory page sizes of 4K."
352            );
353            self.fatal_error();
354            return;
355        }
356
357        if cc.css() != 0 {
358            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
359            self.fatal_error();
360            return;
361        }
362
363        if let 2..=6 = cc.ams() {
364            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
365            self.fatal_error();
366        }
367
368        let mask: u32 = u32::from(
369            spec::Cc::new()
370                .with_en(true)
371                .with_shn(0b11)
372                .with_iosqes(0b1111)
373                .with_iocqes(0b1111),
374        );
375        let mut cc: spec::Cc = (u32::from(cc) & mask).into();
376
377        if cc.shn() != 0 {
378            // It is unclear in the spec (to me) what guarantees a
379            // controller is supposed to make after shutdown. For now, just
380            // complete shutdown immediately.
381            self.registers.csts.set_shst(0b10);
382        }
383
384        if cc.en() != self.registers.cc.en() {
385            if cc.en() {
386                // If any fault was configured for cc.en() process it here
387                if self.fault_active.get() {
388                    match &mut self.pci_fault_config.controller_management_fault_enable {
389                        PciFaultBehavior::Delay(duration) => {
390                            std::thread::sleep(*duration);
391                        }
392                        PciFaultBehavior::Default => {}
393                        PciFaultBehavior::Verify(send) => {
394                            if let Some(send) = send.take() {
395                                send.send(());
396                            }
397                        }
398                    }
399                }
400
401                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
402                if cc.iocqes() == 0 {
403                    cc.set_iocqes(IOCQES);
404                } else if cc.iocqes() != IOCQES {
405                    tracelimit::warn_ratelimited!(
406                        "This implementation only supports CQEs of the default size."
407                    );
408                    self.fatal_error();
409                    return;
410                }
411
412                if cc.iosqes() == 0 {
413                    cc.set_iosqes(IOSQES);
414                } else if cc.iosqes() != IOSQES {
415                    tracelimit::warn_ratelimited!(
416                        "This implementation only supports SQEs of the default size."
417                    );
418                    self.fatal_error();
419                    return;
420                }
421
422                if self.registers.csts.rdy() {
423                    tracelimit::warn_ratelimited!("enabling during reset");
424                    return;
425                }
426                if cc.shn() == 0 {
427                    self.registers.csts.set_shst(0);
428                }
429
430                self.workers.enable(
431                    self.registers.asq,
432                    self.registers.aqa.asqs_z().max(1) + 1,
433                    self.registers.acq,
434                    self.registers.aqa.acqs_z().max(1) + 1,
435                );
436            } else if self.registers.csts.rdy() {
437                self.workers.controller_reset();
438            } else {
439                tracelimit::warn_ratelimited!("disabling while not ready");
440                return;
441            }
442        }
443
444        self.registers.cc = cc;
445        *self.qe_sizes.lock() = IoQueueEntrySizes {
446            sqe_bits: cc.iosqes(),
447            cqe_bits: cc.iocqes(),
448        };
449    }
450
451    fn get_csts(&mut self) -> u32 {
452        if !self.registers.cc.en() && self.registers.csts.rdy() {
453            // Keep trying to disable.
454            if self.workers.poll_controller_reset() {
455                // AQA, ASQ, and ACQ are not reset by controller reset.
456                self.registers.csts = 0.into();
457                self.registers.cc = 0.into();
458                self.registers.interrupt_mask = 0;
459            }
460        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
461            if self.workers.poll_enabled() {
462                self.registers.csts.set_rdy(true);
463            }
464        }
465
466        let csts = self.registers.csts;
467        tracing::debug!(?csts, "get csts");
468        csts.into()
469    }
470
471    /// Sets the CFS bit in the controller status register (CSTS), indicating
472    /// that the controller has experienced "undefined" behavior.
473    pub fn fatal_error(&mut self) {
474        self.registers.csts.set_cfs(true);
475    }
476}
477
478impl ChangeDeviceState for NvmeFaultController {
479    fn start(&mut self) {}
480
481    async fn stop(&mut self) {}
482
483    async fn reset(&mut self) {
484        let Self {
485            cfg_space,
486            msix: _,
487            registers,
488            qe_sizes,
489            workers,
490            pci_fault_config: _,
491            fault_active: _,
492            tdisp_interface: _,
493        } = self;
494        workers.reset().await;
495        cfg_space.reset();
496        *registers = RegState::new();
497        *qe_sizes.lock() = Default::default();
498    }
499}
500
501impl ChipsetDevice for NvmeFaultController {
502    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
503        Some(self)
504    }
505
506    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
507        Some(self)
508    }
509
510    /// The NVMe fault controller is repurposed for use in TDISP tests.
511    fn supports_tdisp(&mut self) -> Option<&mut dyn TdispHostDeviceTarget> {
512        tracing::debug!(
513            supported = self.tdisp_interface.is_some(),
514            "fault controller TDISP support in ChipsetDevice"
515        );
516
517        match &mut self.tdisp_interface {
518            Some(tdisp) => Some(tdisp.as_mut()),
519            None => None,
520        }
521    }
522}
523
524impl MmioIntercept for NvmeFaultController {
525    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
526        match self.cfg_space.find_bar(addr) {
527            Some((0, offset)) => self.read_bar0(offset, data),
528            Some((4, offset)) => {
529                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
530                IoResult::Ok
531            }
532            _ => IoResult::Err(InvalidRegister),
533        }
534    }
535
536    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
537        match self.cfg_space.find_bar(addr) {
538            Some((0, offset)) => self.write_bar0(offset, data),
539            Some((4, offset)) => {
540                write_as_u32_chunks(offset, data, |offset, ty| match ty {
541                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
542                    ReadWriteRequestType::Write(val) => {
543                        self.msix.write_u32(offset, val);
544                        None
545                    }
546                });
547                IoResult::Ok
548            }
549            _ => IoResult::Err(InvalidRegister),
550        }
551    }
552}
553
554impl PciConfigSpace for NvmeFaultController {
555    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
556        self.cfg_space.read_u32(offset, value)
557    }
558
559    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
560        self.cfg_space.write_u32(offset, value)
561    }
562}
563
564impl SaveRestore for NvmeFaultController {
565    type SavedState = SavedStateNotSupported;
566
567    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
568        Err(SaveError::NotSupported)
569    }
570
571    fn restore(
572        &mut self,
573        state: Self::SavedState,
574    ) -> Result<(), vmcore::save_restore::RestoreError> {
575        match state {}
576    }
577}