Skip to main content

nvme/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DEVICE_ID;
8use crate::DOORBELL_STRIDE_BITS;
9use crate::IOCQES;
10use crate::IOSQES;
11use crate::MAX_QES;
12use crate::NVME_VERSION;
13use crate::NvmeControllerClient;
14use crate::PAGE_MASK;
15use crate::VENDOR_ID;
16use crate::spec;
17use crate::workers::IoQueueEntrySizes;
18use crate::workers::NvmeWorkers;
19use chipset_device::ChipsetDevice;
20use chipset_device::io::IoError;
21use chipset_device::io::IoError::InvalidRegister;
22use chipset_device::io::IoResult;
23use chipset_device::mmio::MmioIntercept;
24use chipset_device::mmio::RegisterMmioIntercept;
25use chipset_device::pci::PciConfigSpace;
26use device_emulators::ReadWriteRequestType;
27use device_emulators::read_as_u32_chunks;
28use device_emulators::write_as_u32_chunks;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use parking_lot::Mutex;
33use pci_core::capabilities::msix::MsixEmulator;
34use pci_core::capabilities::pci_express::PciExpressCapability;
35use pci_core::cfg_space_emu::BarMemoryKind;
36use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
37use pci_core::cfg_space_emu::DeviceBars;
38use pci_core::dma::DmaTarget;
39use pci_core::spec::hwid::ClassCode;
40use pci_core::spec::hwid::HardwareIds;
41use pci_core::spec::hwid::ProgrammingInterface;
42use pci_core::spec::hwid::Subclass;
43use std::sync::Arc;
44use vmcore::device_state::ChangeDeviceState;
45use vmcore::save_restore::SaveError;
46use vmcore::save_restore::SaveRestore;
47use vmcore::save_restore::SavedStateNotSupported;
48use vmcore::vm_task::VmTaskDriverSource;
49
/// An NVMe controller.
///
/// Bundles the PCI configuration space emulator, the MSI-X emulator, the
/// register state served through BAR 0, and the queue workers.
#[derive(InspectMut)]
pub struct NvmeController {
    /// PCI type 0 configuration space; also used to map an MMIO address to a
    /// `(bar, offset)` pair.
    cfg_space: ConfigSpaceType0Emulator,
    /// MSI-X emulator, reachable by the guest through BAR 4.
    #[inspect(skip)]
    msix: MsixEmulator,

    /// Controller register values read and written through BAR 0.
    registers: RegState,
    /// I/O queue entry sizes. The same `Arc` is handed to the workers at
    /// construction, and the contents are rewritten whenever a CC write is
    /// accepted.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    /// Workers that receive doorbell, enable, and reset requests.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
}
63
/// Controller register values backing the BAR 0 register window.
#[derive(Inspect)]
struct RegState {
    /// Interrupt mask bits: writes to INTMS set bits, writes to INTMC clear
    /// them, and reads of either register return this value.
    #[inspect(hex)]
    interrupt_mask: u32,
    /// Controller configuration (CC) register.
    cc: spec::Cc,
    /// Controller status (CSTS) register.
    csts: spec::Csts,
    /// Admin queue attributes (AQA) register.
    aqa: spec::Aqa,
    /// Admin submission queue base address (ASQ); stored masked with
    /// `PAGE_MASK`.
    #[inspect(hex)]
    asq: u64,
    /// Admin completion queue base address (ACQ); stored masked with
    /// `PAGE_MASK`.
    #[inspect(hex)]
    acq: u64,
}
76
77impl RegState {
78    fn new() -> Self {
79        Self {
80            interrupt_mask: 0,
81            cc: spec::Cc::new(),
82            csts: spec::Csts::new(),
83            aqa: spec::Aqa::new(),
84            asq: 0,
85            acq: 0,
86        }
87    }
88}
89
/// The constant value returned for reads of the CAP register in `read_bar0`.
const CAP: spec::Cap = spec::Cap::new()
    // NOTE(review): the `- 2` presumably reflects the NVMe encoding in which
    // the doorbell stride is 2^(2 + DSTRD) bytes — confirm against the spec.
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    // Stored minus one; the `_z` suffix suggests a zero-based field, matching
    // how `asqs_z`/`acqs_z` are converted with `+ 1` in `set_cc`.
    .with_mqes_z(MAX_QES - 1)
    // NOTE(review): CQR is "contiguous queues required" in the NVMe spec —
    // confirm the workers rely on physically contiguous queues.
    .with_cqr(true)
    // Advertise the NVM command set; `set_cc` rejects any other CSS value.
    .with_css_nvm(true)
    // All bits set, i.e. the largest value the timeout field can hold.
    .with_to(!0);
96
/// The NVMe controller's capabilities.
///
/// Passed by value to [`NvmeController::new`].
#[derive(Debug, Copy, Clone)]
pub struct NvmeControllerCaps {
    /// The number of entries in the MSI-X table. One interrupt is requested
    /// from the MSI-X emulator for each index below this count.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues. The same
    /// value is passed to the workers for both limits.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
108
109impl NvmeController {
110    /// Creates a new NVMe controller.
111    pub fn new(
112        driver_source: &VmTaskDriverSource,
113        dma_target: &DmaTarget,
114        register_mmio: &mut dyn RegisterMmioIntercept,
115        caps: NvmeControllerCaps,
116    ) -> Self {
117        let msi_target = dma_target.msi_target();
118        let guest_memory = dma_target.guest_memory().clone();
119        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
120        let bars = DeviceBars::new()
121            .bar0(
122                BAR0_LEN,
123                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
124            )
125            .bar4(
126                msix.bar_len(),
127                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
128            );
129
130        let cfg_space = ConfigSpaceType0Emulator::new(
131            HardwareIds {
132                vendor_id: VENDOR_ID,
133                device_id: DEVICE_ID,
134                revision_id: 0,
135                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
136                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
137                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
138                type0_sub_vendor_id: 0,
139                type0_sub_system_id: 0,
140            },
141            vec![
142                Box::new(msix_cap),
143                Box::new(PciExpressCapability::new(
144                    pci_core::spec::caps::pci_express::DevicePortType::Endpoint,
145                    None,
146                )),
147            ],
148            Vec::new(),
149            bars,
150        );
151
152        let interrupts = (0..caps.msix_count)
153            .map(|i| msix.interrupt(i).unwrap())
154            .collect();
155
156        let qe_sizes = Arc::new(Default::default());
157        let admin = NvmeWorkers::new(
158            driver_source,
159            guest_memory,
160            interrupts,
161            caps.max_io_queues,
162            caps.max_io_queues,
163            Arc::clone(&qe_sizes),
164            caps.subsystem_id,
165        );
166
167        Self {
168            cfg_space,
169            msix,
170            registers: RegState::new(),
171            workers: admin,
172            qe_sizes,
173        }
174    }
175
    /// Returns a client for manipulating the NVMe controller at runtime.
    ///
    /// The client is obtained from the controller's workers.
    pub fn client(&self) -> NvmeControllerClient {
        self.workers.client()
    }
180
181    /// Reads from the virtual BAR 0.
182    pub fn read_bar0(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
183        if data.len() < 4 {
184            return IoResult::Err(IoError::InvalidAccessSize);
185        }
186        if addr & (data.len() as u64 - 1) != 0 {
187            return IoResult::Err(IoError::UnalignedAccess);
188        }
189
190        // Check for 64-bit registers.
191        let d: Option<u64> = match spec::Register(addr & !7) {
192            spec::Register::CAP => Some(CAP.into()),
193            spec::Register::ASQ => Some(self.registers.asq),
194            spec::Register::ACQ => Some(self.registers.acq),
195            spec::Register::BPMBL => Some(0),
196            _ => None,
197        };
198        if let Some(d) = d {
199            if data.len() == 8 {
200                data.copy_from_slice(&d.to_ne_bytes());
201            } else if addr & 7 == 0 {
202                data.copy_from_slice(&(d as u32).to_ne_bytes());
203            } else {
204                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
205            }
206            return IoResult::Ok;
207        }
208
209        if data.len() != 4 {
210            return IoResult::Err(IoError::InvalidAccessSize);
211        }
212
213        // Handle 32-bit registers.
214        let d: u32 = match spec::Register(addr) {
215            spec::Register::VS => NVME_VERSION,
216            spec::Register::INTMS => self.registers.interrupt_mask,
217            spec::Register::INTMC => self.registers.interrupt_mask,
218            spec::Register::CC => self.registers.cc.into(),
219            spec::Register::RESERVED => 0,
220            spec::Register::CSTS => self.get_csts(),
221            spec::Register::NSSR => 0,
222            spec::Register::AQA => self.registers.aqa.into(),
223            spec::Register::CMBLOC => 0,
224            spec::Register::CMBSZ => 0,
225            spec::Register::BPINFO => 0,
226            spec::Register::BPRSEL => 0,
227            _ => return IoResult::Err(InvalidRegister),
228        };
229        data.copy_from_slice(&d.to_ne_bytes());
230        IoResult::Ok
231    }
232
233    /// Writes to the virtual BAR 0.
234    pub fn write_bar0(&mut self, addr: u64, data: &[u8]) -> IoResult {
235        if addr >= 0x1000 {
236            // Doorbell write.
237            let base = addr - 0x1000;
238            let db_id = base >> DOORBELL_STRIDE_BITS;
239            if (db_id << DOORBELL_STRIDE_BITS) != base {
240                return IoResult::Err(InvalidRegister);
241            }
242            let Ok(data) = data.try_into() else {
243                return IoResult::Err(IoError::InvalidAccessSize);
244            };
245            let value = u32::from_ne_bytes(data);
246            let db_id = match u16::try_from(db_id) {
247                Ok(id) => id,
248                Err(_) => return IoResult::Err(InvalidRegister),
249            };
250            self.workers.doorbell(db_id, value);
251            return IoResult::Ok;
252        }
253
254        if data.len() < 4 {
255            return IoResult::Err(IoError::InvalidAccessSize);
256        }
257        if addr & (data.len() as u64 - 1) != 0 {
258            return IoResult::Err(IoError::UnalignedAccess);
259        }
260
261        let update_reg = |x: u64| {
262            if data.len() == 8 {
263                u64::from_ne_bytes(data.try_into().unwrap())
264            } else {
265                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
266                if addr & 7 == 0 {
267                    (x & !(u32::MAX as u64)) | data
268                } else {
269                    (x & u32::MAX as u64) | (data << 32)
270                }
271            }
272        };
273
274        // Check for 64-bit registers.
275        let handled = match spec::Register(addr & !7) {
276            spec::Register::ASQ => {
277                if !self.registers.cc.en() {
278                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
279                } else {
280                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
281                }
282                true
283            }
284            spec::Register::ACQ => {
285                if !self.registers.cc.en() {
286                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
287                } else {
288                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
289                }
290                true
291            }
292            _ => false,
293        };
294        if handled {
295            return IoResult::Ok;
296        }
297
298        let Ok(data) = data.try_into() else {
299            return IoResult::Err(IoError::InvalidAccessSize);
300        };
301        let data = u32::from_ne_bytes(data);
302
303        // Handle 32-bit registers.
304        match spec::Register(addr) {
305            spec::Register::INTMS => self.registers.interrupt_mask |= data,
306            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
307            spec::Register::CC => self.set_cc(data.into()),
308            spec::Register::AQA => self.registers.aqa = data.into(),
309            _ => return IoResult::Err(InvalidRegister),
310        }
311        IoResult::Ok
312    }
313
314    fn set_cc(&mut self, cc: spec::Cc) {
315        tracing::debug!(?cc, "set cc");
316
317        if cc.mps() != 0 {
318            tracelimit::warn_ratelimited!(
319                "This implementation only supports memory page sizes of 4K."
320            );
321            self.fatal_error();
322            return;
323        }
324
325        if cc.css() != 0 {
326            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
327            self.fatal_error();
328            return;
329        }
330
331        if let 2..=6 = cc.ams() {
332            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
333            self.fatal_error();
334        }
335
336        let mask: u32 = u32::from(
337            spec::Cc::new()
338                .with_en(true)
339                .with_shn(0b11)
340                .with_iosqes(0b1111)
341                .with_iocqes(0b1111),
342        );
343        let mut cc: spec::Cc = (u32::from(cc) & mask).into();
344
345        if cc.shn() != 0 {
346            // It is unclear in the spec (to me) what guarantees a
347            // controller is supposed to make after shutdown. For now, just
348            // complete shutdown immediately.
349            self.registers.csts.set_shst(0b10);
350        }
351
352        if cc.en() != self.registers.cc.en() {
353            if cc.en() {
354                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
355                if cc.iocqes() == 0 {
356                    cc.set_iocqes(IOCQES);
357                } else if cc.iocqes() != IOCQES {
358                    tracelimit::warn_ratelimited!(
359                        "This implementation only supports CQEs of the default size."
360                    );
361                    self.fatal_error();
362                    return;
363                }
364
365                if cc.iosqes() == 0 {
366                    cc.set_iosqes(IOSQES);
367                } else if cc.iosqes() != IOSQES {
368                    tracelimit::warn_ratelimited!(
369                        "This implementation only supports SQEs of the default size."
370                    );
371                    self.fatal_error();
372                    return;
373                }
374
375                if self.registers.csts.rdy() {
376                    tracelimit::warn_ratelimited!("enabling during reset");
377                    return;
378                }
379                if cc.shn() == 0 {
380                    self.registers.csts.set_shst(0);
381                }
382
383                self.workers.enable(
384                    self.registers.asq,
385                    self.registers.aqa.asqs_z().max(1) + 1,
386                    self.registers.acq,
387                    self.registers.aqa.acqs_z().max(1) + 1,
388                );
389            } else if self.registers.csts.rdy() {
390                self.workers.controller_reset();
391            } else {
392                tracelimit::warn_ratelimited!("disabling while not ready");
393                return;
394            }
395        }
396
397        self.registers.cc = cc;
398        *self.qe_sizes.lock() = IoQueueEntrySizes {
399            sqe_bits: cc.iosqes(),
400            cqe_bits: cc.iocqes(),
401        };
402    }
403
404    fn get_csts(&mut self) -> u32 {
405        if !self.registers.cc.en() && self.registers.csts.rdy() {
406            // Keep trying to disable.
407            if self.workers.poll_controller_reset() {
408                // AQA, ASQ, and ACQ are not reset by controller reset.
409                self.registers.csts = 0.into();
410                self.registers.cc = 0.into();
411                self.registers.interrupt_mask = 0;
412            }
413        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
414            if self.workers.poll_enabled() {
415                self.registers.csts.set_rdy(true);
416            }
417        }
418
419        let csts = self.registers.csts;
420        tracing::debug!(?csts, "get csts");
421        csts.into()
422    }
423
    /// Sets the CFS bit in the controller status register (CSTS), indicating
    /// that the controller has experienced "undefined" behavior.
    ///
    /// Nothing in this function clears the bit or touches any other state.
    pub fn fatal_error(&mut self) {
        self.registers.csts.set_cfs(true);
    }
429}
430
/// Device lifecycle hooks. Only `reset` does any work.
impl ChangeDeviceState for NvmeController {
    /// No-op.
    fn start(&mut self) {}

    /// No-op.
    async fn stop(&mut self) {}

    /// Resets the device: waits for the workers to reset, then resets the
    /// PCI configuration space, the controller registers, and the shared
    /// queue entry sizes.
    async fn reset(&mut self) {
        // Destructure every field so that adding a field to the struct forces
        // this function to be revisited.
        let Self {
            cfg_space,
            // NOTE(review): the MSI-X emulator is deliberately left untouched
            // here — confirm its state is reset elsewhere if it needs to be.
            msix: _,
            registers,
            qe_sizes,
            workers,
        } = self;
        workers.reset().await;
        cfg_space.reset();
        *registers = RegState::new();
        *qe_sizes.lock() = Default::default();
    }
}
450
/// Advertises the interfaces this device implements: MMIO intercepts and PCI
/// configuration space access.
impl ChipsetDevice for NvmeController {
    /// Returns `self` as the MMIO intercept handler.
    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
        Some(self)
    }

    /// Returns `self` as the PCI configuration space handler.
    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
        Some(self)
    }
}
460
461impl MmioIntercept for NvmeController {
462    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
463        match self.cfg_space.find_bar(addr) {
464            Some((0, offset)) => self.read_bar0(offset, data),
465            Some((4, offset)) => {
466                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
467                IoResult::Ok
468            }
469            _ => IoResult::Err(InvalidRegister),
470        }
471    }
472
473    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
474        match self.cfg_space.find_bar(addr) {
475            Some((0, offset)) => self.write_bar0(offset, data),
476            Some((4, offset)) => {
477                write_as_u32_chunks(offset, data, |offset, ty| match ty {
478                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
479                    ReadWriteRequestType::Write(val) => {
480                        self.msix.write_u32(offset, val);
481                        None
482                    }
483                });
484                IoResult::Ok
485            }
486            _ => IoResult::Err(InvalidRegister),
487        }
488    }
489}
490
/// PCI configuration space accesses are forwarded unchanged to the
/// configuration space emulator.
impl PciConfigSpace for NvmeController {
    /// Reads the 32-bit configuration register at `offset` into `value`.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        self.cfg_space.read_u32(offset, value)
    }

    /// Writes `value` to the 32-bit configuration register at `offset`.
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        self.cfg_space.write_u32(offset, value)
    }
}
500
/// Save/restore is not supported for this device.
impl SaveRestore for NvmeController {
    type SavedState = SavedStateNotSupported;

    /// Always fails with `SaveError::NotSupported`.
    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    /// Never runs: the empty `match` below only compiles because the saved
    /// state type has no values.
    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        match state {}
    }
}
514}