Skip to main content

nvme_test/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe (Fault Injection) PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DOORBELL_STRIDE_BITS;
8use crate::IOCQES;
9use crate::IOSQES;
10use crate::MAX_QES;
11use crate::NVME_VERSION;
12use crate::NvmeFaultControllerClient;
13use crate::PAGE_MASK;
14use crate::VENDOR_ID;
15use crate::spec;
16use crate::workers::IoQueueEntrySizes;
17use crate::workers::NvmeWorkers;
18use chipset_device::ChipsetDevice;
19use chipset_device::io::IoError;
20use chipset_device::io::IoError::InvalidRegister;
21use chipset_device::io::IoResult;
22use chipset_device::mmio::MmioIntercept;
23use chipset_device::mmio::RegisterMmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use device_emulators::ReadWriteRequestType;
26use device_emulators::read_as_u32_chunks;
27use device_emulators::write_as_u32_chunks;
28use guestmem::GuestMemory;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use nvme_resources::fault::FaultConfiguration;
33use nvme_resources::fault::PciFaultBehavior;
34use nvme_resources::fault::PciFaultConfig;
35use parking_lot::Mutex;
36use pci_core::capabilities::msix::MsixEmulator;
37use pci_core::cfg_space_emu::BarMemoryKind;
38use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
39use pci_core::cfg_space_emu::DeviceBars;
40use pci_core::msi::MsiTarget;
41use pci_core::spec::hwid::ClassCode;
42use pci_core::spec::hwid::HardwareIds;
43use pci_core::spec::hwid::ProgrammingInterface;
44use pci_core::spec::hwid::Subclass;
45use std::sync::Arc;
46use tdisp::TdispHostDeviceTarget;
47use vmcore::device_state::ChangeDeviceState;
48use vmcore::save_restore::SaveError;
49use vmcore::save_restore::SaveRestore;
50use vmcore::save_restore::SavedStateNotSupported;
51use vmcore::vm_task::VmTaskDriverSource;
52
/// An NVMe controller PCI device with fault-injection hooks.
///
/// BAR 0 exposes the controller registers and doorbells; BAR 4 is served by
/// the MSI-X emulator.
#[derive(InspectMut)]
pub struct NvmeFaultController {
    /// PCI type 0 configuration space emulator, built with the hardware IDs,
    /// the MSI-X capability and the two BARs.
    cfg_space: ConfigSpaceType0Emulator,
    /// MSI-X emulator; MMIO accesses that land in BAR 4 are forwarded to it.
    #[inspect(skip)]
    msix: MsixEmulator,
    /// Controller register values readable/writable through BAR 0.
    registers: RegState,
    /// IO queue entry sizes. The same `Arc` is handed to the workers at
    /// construction and is rewritten whenever CC is updated.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    /// The queue workers; receives doorbell writes and enable/reset requests.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
    /// PCI-level fault configuration (CAP.MQES override and the behavior to
    /// apply when the controller is enabled).
    #[inspect(skip)]
    pci_fault_config: PciFaultConfig,
    /// Flag consulted before applying the controller-enable fault; the fault is
    /// only processed while this reads `true`.
    #[inspect(skip)]
    fault_active: mesh::Cell<bool>,
    /// The NVMe fault controller is repurposed for use in TDISP tests.
    #[inspect(skip)]
    tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
}
72
/// The guest-visible register values kept by the controller for BAR 0.
#[derive(Inspect)]
struct RegState {
    /// Interrupt mask: bits are set by writes to INTMS and cleared by writes
    /// to INTMC; both registers read back this value.
    #[inspect(hex)]
    interrupt_mask: u32,
    /// Last accepted value of the CC register (after masking in `set_cc`).
    cc: spec::Cc,
    /// Value of the CSTS register (ready, shutdown status, fatal status).
    csts: spec::Csts,
    /// Value of the AQA register; supplies the admin queue sizes on enable.
    aqa: spec::Aqa,
    /// Value of the ASQ register; writes are masked with `PAGE_MASK`.
    #[inspect(hex)]
    asq: u64,
    /// Value of the ACQ register; writes are masked with `PAGE_MASK`.
    #[inspect(hex)]
    acq: u64,
}
85
86impl RegState {
87    fn new() -> Self {
88        Self {
89            interrupt_mask: 0,
90            cc: spec::Cc::new(),
91            csts: spec::Csts::new(),
92            aqa: spec::Aqa::new(),
93            asq: 0,
94            acq: 0,
95        }
96    }
97}
98
/// The value reported for the 64-bit CAP register.
///
/// `read_bar0` returns this value, replacing only the MQES field when the PCI
/// fault configuration supplies `max_queue_size`.
const CAP: spec::Cap = spec::Cap::new()
    // Doorbell stride field, stored as DOORBELL_STRIDE_BITS - 2.
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    // Maximum queue entries, stored zero-based (MAX_QES - 1).
    .with_mqes_z(MAX_QES - 1)
    // NOTE(review): CQR presumably means "contiguous queues required" per the
    // NVMe register definition -- confirm against `spec::Cap`.
    .with_cqr(true)
    // Advertise the NVM command set.
    .with_css_nvm(true)
    // Timeout field set to all ones (the largest encodable value).
    .with_to(!0);
105
/// The NVMe controller's capabilities, supplied by the caller of
/// [`NvmeFaultController::new`].
#[derive(Debug, Copy, Clone)]
pub struct NvmeFaultControllerCaps {
    /// The number of entries in the MSI-X table. One interrupt is created per
    /// entry and handed to the workers.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues. The same
    /// value is passed for both limits when the workers are created.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
117
118impl NvmeFaultController {
119    /// Creates a new NVMe controller.
120    pub fn new(
121        driver_source: &VmTaskDriverSource,
122        guest_memory: GuestMemory,
123        msi_target: &MsiTarget,
124        register_mmio: &mut dyn RegisterMmioIntercept,
125        caps: NvmeFaultControllerCaps,
126        mut fault_configuration: FaultConfiguration,
127        tdisp_interface: Option<Box<dyn TdispHostDeviceTarget>>,
128    ) -> Self {
129        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
130        let bars = DeviceBars::new()
131            .bar0(
132                BAR0_LEN,
133                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
134            )
135            .bar4(
136                msix.bar_len(),
137                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
138            );
139
140        // Apply any hardware-config fault overrides for the IDs reported in
141        // PCI configuration space, falling back to the real values when no
142        // override is configured.
143        let hardware_config_fault = fault_configuration.hardware_config_fault.take();
144        let vendor_id = hardware_config_fault
145            .and_then(|f| f.vendor_id)
146            .unwrap_or(VENDOR_ID);
147        let device_id = hardware_config_fault
148            .and_then(|f| f.device_id)
149            .unwrap_or(0x00a9);
150
151        let cfg_space = ConfigSpaceType0Emulator::new(
152            HardwareIds {
153                vendor_id,
154                device_id,
155                revision_id: 0,
156                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
157                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
158                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
159                type0_sub_vendor_id: 0,
160                type0_sub_system_id: 0,
161            },
162            vec![Box::new(msix_cap)],
163            Vec::new(),
164            bars,
165        );
166
167        let interrupts = (0..caps.msix_count)
168            .map(|i| msix.interrupt(i).unwrap())
169            .collect();
170
171        let pci_fault_config = fault_configuration
172            .pci_fault
173            .take()
174            .unwrap_or(PciFaultConfig::new());
175
176        let fault_active = fault_configuration.fault_active.clone();
177
178        let qe_sizes = Arc::new(Default::default());
179        let admin = NvmeWorkers::new(
180            driver_source,
181            guest_memory,
182            interrupts,
183            caps.max_io_queues,
184            caps.max_io_queues,
185            Arc::clone(&qe_sizes),
186            caps.subsystem_id,
187            fault_configuration,
188        );
189
190        Self {
191            cfg_space,
192            msix,
193            registers: RegState::new(),
194            workers: admin,
195            qe_sizes,
196            pci_fault_config,
197            fault_active,
198            tdisp_interface,
199        }
200    }
201
    /// Returns a client for manipulating the NVMe controller at runtime.
    ///
    /// The client is obtained from the controller's workers.
    pub fn client(&self) -> NvmeFaultControllerClient {
        self.workers.client()
    }
206
207    /// Reads from the virtual BAR 0.
208    pub fn read_bar0(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
209        if data.len() < 4 {
210            return IoResult::Err(IoError::InvalidAccessSize);
211        }
212        if addr & (data.len() as u64 - 1) != 0 {
213            return IoResult::Err(IoError::UnalignedAccess);
214        }
215
216        // Check for 64-bit registers.
217        let d: Option<u64> = match spec::Register(addr & !7) {
218            spec::Register::CAP => {
219                if let Some(mqes) = self.pci_fault_config.max_queue_size {
220                    Some(CAP.with_mqes_z(mqes - 1).into())
221                } else {
222                    Some(CAP.into())
223                }
224            }
225            spec::Register::ASQ => Some(self.registers.asq),
226            spec::Register::ACQ => Some(self.registers.acq),
227            spec::Register::BPMBL => Some(0),
228            _ => None,
229        };
230        if let Some(d) = d {
231            if data.len() == 8 {
232                data.copy_from_slice(&d.to_ne_bytes());
233            } else if addr & 7 == 0 {
234                data.copy_from_slice(&(d as u32).to_ne_bytes());
235            } else {
236                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
237            }
238            return IoResult::Ok;
239        }
240
241        if data.len() != 4 {
242            return IoResult::Err(IoError::InvalidAccessSize);
243        }
244
245        // Handle 32-bit registers.
246        let d: u32 = match spec::Register(addr) {
247            spec::Register::VS => NVME_VERSION,
248            spec::Register::INTMS => self.registers.interrupt_mask,
249            spec::Register::INTMC => self.registers.interrupt_mask,
250            spec::Register::CC => self.registers.cc.into(),
251            spec::Register::RESERVED => 0,
252            spec::Register::CSTS => self.get_csts(),
253            spec::Register::NSSR => 0,
254            spec::Register::AQA => self.registers.aqa.into(),
255            spec::Register::CMBLOC => 0,
256            spec::Register::CMBSZ => 0,
257            spec::Register::BPINFO => 0,
258            spec::Register::BPRSEL => 0,
259            _ => return IoResult::Err(InvalidRegister),
260        };
261        data.copy_from_slice(&d.to_ne_bytes());
262        IoResult::Ok
263    }
264
265    /// Writes to the virtual BAR 0.
266    pub fn write_bar0(&mut self, addr: u64, data: &[u8]) -> IoResult {
267        if addr >= 0x1000 {
268            // Doorbell write.
269            let base = addr - 0x1000;
270            let db_id = base >> DOORBELL_STRIDE_BITS;
271            if (db_id << DOORBELL_STRIDE_BITS) != base {
272                return IoResult::Err(InvalidRegister);
273            }
274            let Ok(db_id) = u16::try_from(db_id) else {
275                return IoResult::Err(InvalidRegister);
276            };
277            let Ok(data) = data.try_into() else {
278                return IoResult::Err(IoError::InvalidAccessSize);
279            };
280            let value = u32::from_ne_bytes(data);
281            self.workers.doorbell(db_id, value);
282            return IoResult::Ok;
283        }
284
285        if data.len() < 4 {
286            return IoResult::Err(IoError::InvalidAccessSize);
287        }
288        if addr & (data.len() as u64 - 1) != 0 {
289            return IoResult::Err(IoError::UnalignedAccess);
290        }
291
292        let update_reg = |x: u64| {
293            if data.len() == 8 {
294                u64::from_ne_bytes(data.try_into().unwrap())
295            } else {
296                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
297                if addr & 7 == 0 {
298                    (x & !(u32::MAX as u64)) | data
299                } else {
300                    (x & u32::MAX as u64) | (data << 32)
301                }
302            }
303        };
304
305        // Check for 64-bit registers.
306        let handled = match spec::Register(addr & !7) {
307            spec::Register::ASQ => {
308                if !self.registers.cc.en() {
309                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
310                } else {
311                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
312                }
313                true
314            }
315            spec::Register::ACQ => {
316                if !self.registers.cc.en() {
317                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
318                } else {
319                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
320                }
321                true
322            }
323            _ => false,
324        };
325        if handled {
326            return IoResult::Ok;
327        }
328
329        let Ok(data) = data.try_into() else {
330            return IoResult::Err(IoError::InvalidAccessSize);
331        };
332        let data = u32::from_ne_bytes(data);
333
334        // Handle 32-bit registers.
335        match spec::Register(addr) {
336            spec::Register::INTMS => self.registers.interrupt_mask |= data,
337            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
338            spec::Register::CC => self.set_cc(data.into()),
339            spec::Register::AQA => self.registers.aqa = data.into(),
340            _ => return IoResult::Err(InvalidRegister),
341        }
342        IoResult::Ok
343    }
344
    /// Handles a guest write to the CC register.
    ///
    /// Validates the requested configuration, drives the enable/disable
    /// transition on the workers when CC.EN changes, and finally stores the
    /// (masked) value together with the IO queue entry sizes.
    ///
    /// Several paths return early *without* storing the new CC value:
    /// unsupported MPS/CSS, unsupported IOCQES/IOSQES on enable, enabling while
    /// CSTS.RDY is still set, and disabling while CSTS.RDY is clear.
    fn set_cc(&mut self, cc: spec::Cc) {
        tracing::debug!(?cc, "set cc");

        if cc.mps() != 0 {
            tracelimit::warn_ratelimited!(
                "This implementation only supports memory page sizes of 4K."
            );
            self.fatal_error();
            return;
        }

        if cc.css() != 0 {
            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
            self.fatal_error();
            return;
        }

        // NOTE(review): unlike the MPS and CSS checks above, this path sets
        // the fatal status but does not return, so processing continues (AMS
        // itself is dropped by the mask below). Confirm this is intentional.
        if let 2..=6 = cc.ams() {
            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
            self.fatal_error();
        }

        // Only EN, SHN, IOSQES and IOCQES are retained from the written value.
        let mask: u32 = u32::from(
            spec::Cc::new()
                .with_en(true)
                .with_shn(0b11)
                .with_iosqes(0b1111)
                .with_iocqes(0b1111),
        );
        let mut cc: spec::Cc = (u32::from(cc) & mask).into();

        if cc.shn() != 0 {
            // It is unclear in the spec (to me) what guarantees a
            // controller is supposed to make after shutdown. For now, just
            // complete shutdown immediately.
            self.registers.csts.set_shst(0b10);
        }

        // Only act on the workers when the enable bit actually changes.
        if cc.en() != self.registers.cc.en() {
            if cc.en() {
                // If a fault was configured for the controller-enable
                // transition, process it here (only while faults are active).
                if self.fault_active.get() {
                    match &mut self.pci_fault_config.controller_management_fault_enable {
                        PciFaultBehavior::Delay(duration) => {
                            // Blocks the calling thread for the configured
                            // duration before the enable proceeds.
                            std::thread::sleep(*duration);
                        }
                        PciFaultBehavior::Default => {}
                        PciFaultBehavior::Verify(send) => {
                            // The sender is taken, so the notification is sent
                            // at most once.
                            if let Some(send) = send.take() {
                                send.send(());
                            }
                        }
                    }
                }

                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
                if cc.iocqes() == 0 {
                    cc.set_iocqes(IOCQES);
                } else if cc.iocqes() != IOCQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports CQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                if cc.iosqes() == 0 {
                    cc.set_iosqes(IOSQES);
                } else if cc.iosqes() != IOSQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports SQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                // CC.EN is clear but CSTS.RDY is still set: a previous reset
                // has not been observed as complete yet (see `get_csts`).
                if self.registers.csts.rdy() {
                    tracelimit::warn_ratelimited!("enabling during reset");
                    return;
                }
                if cc.shn() == 0 {
                    self.registers.csts.set_shst(0);
                }

                // Queue sizes in AQA are zero-based; clamp the zero-based
                // value to at least 1 before converting to an entry count.
                self.workers.enable(
                    self.registers.asq,
                    self.registers.aqa.asqs_z().max(1) + 1,
                    self.registers.acq,
                    self.registers.aqa.acqs_z().max(1) + 1,
                );
            } else if self.registers.csts.rdy() {
                // Disabling a ready controller starts a controller reset;
                // completion is observed by polling in `get_csts`.
                self.workers.controller_reset();
            } else {
                tracelimit::warn_ratelimited!("disabling while not ready");
                return;
            }
        }

        self.registers.cc = cc;
        // Publish the entry sizes through the shared `qe_sizes` cell.
        *self.qe_sizes.lock() = IoQueueEntrySizes {
            sqe_bits: cc.iosqes(),
            cqe_bits: cc.iocqes(),
        };
    }
449
450    fn get_csts(&mut self) -> u32 {
451        if !self.registers.cc.en() && self.registers.csts.rdy() {
452            // Keep trying to disable.
453            if self.workers.poll_controller_reset() {
454                // AQA, ASQ, and ACQ are not reset by controller reset.
455                self.registers.csts = 0.into();
456                self.registers.cc = 0.into();
457                self.registers.interrupt_mask = 0;
458            }
459        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
460            if self.workers.poll_enabled() {
461                self.registers.csts.set_rdy(true);
462            }
463        }
464
465        let csts = self.registers.csts;
466        tracing::debug!(?csts, "get csts");
467        csts.into()
468    }
469
    /// Sets the CFS bit in the controller status register (CSTS), indicating
    /// that the controller has experienced "undefined" behavior.
    ///
    /// Nothing else is changed here; the bit stays set until CSTS is
    /// overwritten (for example when a controller reset completes or the
    /// device is reset).
    pub fn fatal_error(&mut self) {
        self.registers.csts.set_cfs(true);
    }
475}
476
477impl ChangeDeviceState for NvmeFaultController {
478    fn start(&mut self) {}
479
480    async fn stop(&mut self) {}
481
482    async fn reset(&mut self) {
483        let Self {
484            cfg_space,
485            msix: _,
486            registers,
487            qe_sizes,
488            workers,
489            pci_fault_config: _,
490            fault_active: _,
491            tdisp_interface: _,
492        } = self;
493        workers.reset().await;
494        cfg_space.reset();
495        *registers = RegState::new();
496        *qe_sizes.lock() = Default::default();
497    }
498}
499
500impl ChipsetDevice for NvmeFaultController {
501    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
502        Some(self)
503    }
504
505    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
506        Some(self)
507    }
508
509    /// The NVMe fault controller is repurposed for use in TDISP tests.
510    fn supports_tdisp(&mut self) -> Option<&mut dyn TdispHostDeviceTarget> {
511        tracing::debug!(
512            supported = self.tdisp_interface.is_some(),
513            "fault controller TDISP support in ChipsetDevice"
514        );
515
516        match &mut self.tdisp_interface {
517            Some(tdisp) => Some(tdisp.as_mut()),
518            None => None,
519        }
520    }
521}
522
523impl MmioIntercept for NvmeFaultController {
524    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
525        match self.cfg_space.find_bar(addr) {
526            Some((0, offset)) => self.read_bar0(offset, data),
527            Some((4, offset)) => {
528                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
529                IoResult::Ok
530            }
531            _ => IoResult::Err(InvalidRegister),
532        }
533    }
534
535    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
536        match self.cfg_space.find_bar(addr) {
537            Some((0, offset)) => self.write_bar0(offset, data),
538            Some((4, offset)) => {
539                write_as_u32_chunks(offset, data, |offset, ty| match ty {
540                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
541                    ReadWriteRequestType::Write(val) => {
542                        self.msix.write_u32(offset, val);
543                        None
544                    }
545                });
546                IoResult::Ok
547            }
548            _ => IoResult::Err(InvalidRegister),
549        }
550    }
551}
552
/// PCI configuration space accesses are forwarded unchanged to the
/// configuration space emulator.
impl PciConfigSpace for NvmeFaultController {
    /// Reads the 32-bit configuration register at `offset` into `value`.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        self.cfg_space.read_u32(offset, value)
    }

    /// Writes `value` to the 32-bit configuration register at `offset`.
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        self.cfg_space.write_u32(offset, value)
    }
}
562
/// Save/restore is not supported by the fault controller.
impl SaveRestore for NvmeFaultController {
    type SavedState = SavedStateNotSupported;

    /// Always fails with `SaveError::NotSupported`.
    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    /// Cannot be reached with a real value: the empty `match` below compiles
    /// only because the saved-state type has no values to match on.
    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        match state {}
    }
}
576}