nvme/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DOORBELL_STRIDE_BITS;
8use crate::IOCQES;
9use crate::IOSQES;
10use crate::MAX_QES;
11use crate::NVME_VERSION;
12use crate::NvmeControllerClient;
13use crate::PAGE_MASK;
14use crate::VENDOR_ID;
15use crate::spec;
16use crate::workers::IoQueueEntrySizes;
17use crate::workers::NvmeWorkers;
18use chipset_device::ChipsetDevice;
19use chipset_device::io::IoError;
20use chipset_device::io::IoError::InvalidRegister;
21use chipset_device::io::IoResult;
22use chipset_device::mmio::MmioIntercept;
23use chipset_device::mmio::RegisterMmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use device_emulators::ReadWriteRequestType;
26use device_emulators::read_as_u32_chunks;
27use device_emulators::write_as_u32_chunks;
28use guestmem::GuestMemory;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use parking_lot::Mutex;
33use pci_core::capabilities::msix::MsixEmulator;
34use pci_core::cfg_space_emu::BarMemoryKind;
35use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
36use pci_core::cfg_space_emu::DeviceBars;
37use pci_core::msi::MsiTarget;
38use pci_core::spec::hwid::ClassCode;
39use pci_core::spec::hwid::HardwareIds;
40use pci_core::spec::hwid::ProgrammingInterface;
41use pci_core::spec::hwid::Subclass;
42use std::sync::Arc;
43use vmcore::device_state::ChangeDeviceState;
44use vmcore::save_restore::SaveError;
45use vmcore::save_restore::SaveRestore;
46use vmcore::save_restore::SavedStateNotSupported;
47use vmcore::vm_task::VmTaskDriverSource;
48
/// An NVMe controller.
#[derive(InspectMut)]
pub struct NvmeController {
    /// PCI type-0 configuration space emulator (hardware IDs, BARs,
    /// capabilities).
    cfg_space: ConfigSpaceType0Emulator,
    /// MSI-X table/PBA emulator, exposed through BAR 4.
    #[inspect(skip)]
    msix: MsixEmulator,

    /// Guest-visible BAR 0 register state tracked directly by this object
    /// (doorbells are handled by `workers` instead).
    registers: RegState,
    /// Queue entry sizes derived from CC.IOSQES/CC.IOCQES, shared with the
    /// queue workers so writes to CC take effect there.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    /// Asynchronous workers that service the admin and IO queues.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
}
62
/// Guest-visible NVMe controller register state.
#[derive(Inspect)]
struct RegState {
    /// Interrupt mask, set via writes to INTMS and cleared via INTMC.
    #[inspect(hex)]
    interrupt_mask: u32,
    /// Controller Configuration register.
    cc: spec::Cc,
    /// Controller Status register.
    csts: spec::Csts,
    /// Admin Queue Attributes (zero-based admin SQ/CQ sizes).
    aqa: spec::Aqa,
    /// Admin Submission Queue base address.
    #[inspect(hex)]
    asq: u64,
    /// Admin Completion Queue base address.
    #[inspect(hex)]
    acq: u64,
}
75
impl RegState {
    /// Returns the register state at controller reset: all registers zeroed.
    fn new() -> Self {
        Self {
            interrupt_mask: 0,
            cc: spec::Cc::new(),
            csts: spec::Csts::new(),
            aqa: spec::Aqa::new(),
            asq: 0,
            acq: 0,
        }
    }
}
88
/// The fixed Controller Capabilities (CAP) register value for this device:
/// the doorbell stride, the maximum queue entries supported (zero-based),
/// contiguous queues required, NVM command set supported, and the maximum
/// ready timeout value (all-ones).
const CAP: spec::Cap = spec::Cap::new()
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    .with_mqes_z(MAX_QES - 1)
    .with_cqr(true)
    .with_css_nvm(true)
    .with_to(!0);
95
/// The NVMe controller's capabilities.
#[derive(Debug, Copy, Clone)]
pub struct NvmeControllerCaps {
    /// The number of entries in the MSI-X table. One interrupt object per
    /// entry is handed to the queue workers.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
107
108impl NvmeController {
109    /// Creates a new NVMe controller.
110    pub fn new(
111        driver_source: &VmTaskDriverSource,
112        guest_memory: GuestMemory,
113        msi_target: &MsiTarget,
114        register_mmio: &mut dyn RegisterMmioIntercept,
115        caps: NvmeControllerCaps,
116    ) -> Self {
117        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
118        let bars = DeviceBars::new()
119            .bar0(
120                BAR0_LEN,
121                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
122            )
123            .bar4(
124                msix.bar_len(),
125                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
126            );
127
128        let cfg_space = ConfigSpaceType0Emulator::new(
129            HardwareIds {
130                vendor_id: VENDOR_ID,
131                device_id: 0x00a9,
132                revision_id: 0,
133                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
134                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
135                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
136                type0_sub_vendor_id: 0,
137                type0_sub_system_id: 0,
138            },
139            vec![Box::new(msix_cap)],
140            bars,
141        );
142
143        let interrupts = (0..caps.msix_count)
144            .map(|i| msix.interrupt(i).unwrap())
145            .collect();
146
147        let qe_sizes = Arc::new(Default::default());
148        let admin = NvmeWorkers::new(
149            driver_source,
150            guest_memory,
151            interrupts,
152            caps.max_io_queues,
153            caps.max_io_queues,
154            Arc::clone(&qe_sizes),
155            caps.subsystem_id,
156        );
157
158        Self {
159            cfg_space,
160            msix,
161            registers: RegState::new(),
162            workers: admin,
163            qe_sizes,
164        }
165    }
166
    /// Returns a client for manipulating the NVMe controller at runtime.
    ///
    /// The client is provided by the queue workers.
    pub fn client(&self) -> NvmeControllerClient {
        self.workers.client()
    }
171
    /// Reads from the virtual BAR 0.
    ///
    /// `addr` is the offset within BAR 0. Accesses must be at least 4 bytes
    /// and naturally aligned; 64-bit registers may also be read one 4-byte
    /// half at a time.
    pub fn read_bar0(&mut self, addr: u16, data: &mut [u8]) -> IoResult {
        if data.len() < 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }
        // Require natural alignment for the access size.
        if addr & (data.len() - 1) as u16 != 0 {
            return IoResult::Err(IoError::UnalignedAccess);
        }

        // Check for 64-bit registers. Masking off the low 3 bits maps a
        // 4-byte access to either half back onto the register's base offset.
        let d: Option<u64> = match spec::Register(addr & !7) {
            spec::Register::CAP => Some(CAP.into()),
            spec::Register::ASQ => Some(self.registers.asq),
            spec::Register::ACQ => Some(self.registers.acq),
            // Boot partitions are not supported; reads as zero.
            spec::Register::BPMBL => Some(0),
            _ => None,
        };
        if let Some(d) = d {
            // NOTE(review): assumes data.len() is 4 or 8 here; any other
            // size would panic in copy_from_slice — confirm the MMIO layer
            // only delivers power-of-two sizes up to 8.
            if data.len() == 8 {
                data.copy_from_slice(&d.to_ne_bytes());
            } else if addr & 7 == 0 {
                // Low half of the 64-bit register.
                data.copy_from_slice(&(d as u32).to_ne_bytes());
            } else {
                // High half of the 64-bit register.
                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
            }
            return IoResult::Ok;
        }

        if data.len() != 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }

        // Handle 32-bit registers. Unimplemented optional features (NSSR,
        // CMB, boot partitions) read as zero.
        let d: u32 = match spec::Register(addr) {
            spec::Register::VS => NVME_VERSION,
            // Reads of both INTMS and INTMC return the current mask.
            spec::Register::INTMS => self.registers.interrupt_mask,
            spec::Register::INTMC => self.registers.interrupt_mask,
            spec::Register::CC => self.registers.cc.into(),
            spec::Register::RESERVED => 0,
            // CSTS reads also advance the enable/reset state machine.
            spec::Register::CSTS => self.get_csts(),
            spec::Register::NSSR => 0,
            spec::Register::AQA => self.registers.aqa.into(),
            spec::Register::CMBLOC => 0,
            spec::Register::CMBSZ => 0,
            spec::Register::BPINFO => 0,
            spec::Register::BPRSEL => 0,
            _ => return IoResult::Err(InvalidRegister),
        };
        data.copy_from_slice(&d.to_ne_bytes());
        IoResult::Ok
    }
223
    /// Writes to the virtual BAR 0.
    ///
    /// Offsets at or above 0x1000 are queue doorbells; lower offsets are
    /// controller registers. Register writes must be at least 4 bytes and
    /// naturally aligned; 64-bit registers may be written one 4-byte half at
    /// a time.
    pub fn write_bar0(&mut self, addr: u16, data: &[u8]) -> IoResult {
        if addr >= 0x1000 {
            // Doorbell write.
            let base = addr - 0x1000;
            let db_id = base >> DOORBELL_STRIDE_BITS;
            // Reject writes not aligned to the doorbell stride.
            if (db_id << DOORBELL_STRIDE_BITS) != base {
                return IoResult::Err(InvalidRegister);
            }
            // Doorbell writes must be exactly 4 bytes.
            let Ok(data) = data.try_into() else {
                return IoResult::Err(IoError::InvalidAccessSize);
            };
            let value = u32::from_ne_bytes(data);
            self.workers.doorbell(db_id, value);
            return IoResult::Ok;
        }

        if data.len() < 4 {
            return IoResult::Err(IoError::InvalidAccessSize);
        }
        // Require natural alignment for the access size.
        if addr & (data.len() - 1) as u16 != 0 {
            return IoResult::Err(IoError::UnalignedAccess);
        }

        // Merges `data` into the 64-bit register value `x`: either a full
        // 8-byte write, or a 4-byte write to the low or high half.
        // NOTE(review): assumes data.len() is 4 or 8 by this point; other
        // sizes would panic in try_into().unwrap() — confirm the MMIO layer
        // only delivers power-of-two sizes up to 8.
        let update_reg = |x: u64| {
            if data.len() == 8 {
                u64::from_ne_bytes(data.try_into().unwrap())
            } else {
                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
                if addr & 7 == 0 {
                    (x & !(u32::MAX as u64)) | data
                } else {
                    (x & u32::MAX as u64) | (data << 32)
                }
            }
        };

        // Check for 64-bit registers. ASQ/ACQ are only writable while the
        // controller is disabled (CC.EN clear); the addresses are forced to
        // page alignment.
        let handled = match spec::Register(addr & !7) {
            spec::Register::ASQ => {
                if !self.registers.cc.en() {
                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
                }
                true
            }
            spec::Register::ACQ => {
                if !self.registers.cc.en() {
                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
                } else {
                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
                }
                true
            }
            _ => false,
        };
        if handled {
            return IoResult::Ok;
        }

        let Ok(data) = data.try_into() else {
            return IoResult::Err(IoError::InvalidAccessSize);
        };
        let data = u32::from_ne_bytes(data);

        // Handle 32-bit registers.
        match spec::Register(addr) {
            // INTMS sets mask bits; INTMC clears them.
            spec::Register::INTMS => self.registers.interrupt_mask |= data,
            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
            spec::Register::CC => self.set_cc(data.into()),
            spec::Register::AQA => self.registers.aqa = data.into(),
            _ => return IoResult::Err(InvalidRegister),
        }
        IoResult::Ok
    }
300
    /// Handles a guest write to the Controller Configuration (CC) register.
    ///
    /// Unsupported configurations flag a controller fatal error (CSTS.CFS).
    /// A change to the EN bit starts the asynchronous enable or reset of the
    /// queue workers; completion is observed via CSTS reads (`get_csts`).
    fn set_cc(&mut self, cc: spec::Cc) {
        tracing::debug!(?cc, "set cc");

        // Only 4K memory pages (MPS == 0) are supported.
        if cc.mps() != 0 {
            tracelimit::warn_ratelimited!(
                "This implementation only supports memory page sizes of 4K."
            );
            self.fatal_error();
            return;
        }

        // Only the NVM command set (CSS == 0) is supported.
        if cc.css() != 0 {
            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
            self.fatal_error();
            return;
        }

        // AMS values 2..=6 are undefined.
        // NOTE(review): unlike the MPS/CSS cases above, this branch does not
        // return after flagging the fatal error — confirm this is intentional.
        if let 2..=6 = cc.ams() {
            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
            self.fatal_error();
        }

        // Keep only the writable fields this implementation honors: EN, SHN,
        // IOSQES, and IOCQES.
        let mask: u32 = u32::from(
            spec::Cc::new()
                .with_en(true)
                .with_shn(0b11)
                .with_iosqes(0b1111)
                .with_iocqes(0b1111),
        );
        let mut cc: spec::Cc = (u32::from(cc) & mask).into();

        if cc.shn() != 0 {
            // It is unclear in the spec (to me) what guarantees a
            // controller is supposed to make after shutdown. For now, just
            // complete shutdown immediately (SHST = 0b10).
            self.registers.csts.set_shst(0b10);
        }

        if cc.en() != self.registers.cc.en() {
            if cc.en() {
                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
                if cc.iocqes() == 0 {
                    cc.set_iocqes(IOCQES);
                } else if cc.iocqes() != IOCQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports CQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                if cc.iosqes() == 0 {
                    cc.set_iosqes(IOSQES);
                } else if cc.iosqes() != IOSQES {
                    tracelimit::warn_ratelimited!(
                        "This implementation only supports SQEs of the default size."
                    );
                    self.fatal_error();
                    return;
                }

                // RDY still set means a previous disable is in flight;
                // ignore the enable until RDY drops.
                if self.registers.csts.rdy() {
                    tracelimit::warn_ratelimited!("enabling during reset");
                    return;
                }
                if cc.shn() == 0 {
                    self.registers.csts.set_shst(0);
                }

                // AQA sizes are zero-based; `.max(1) + 1` yields a minimum
                // of two entries per admin queue.
                self.workers.enable(
                    self.registers.asq,
                    self.registers.aqa.asqs_z().max(1) + 1,
                    self.registers.acq,
                    self.registers.aqa.acqs_z().max(1) + 1,
                );
            } else if self.registers.csts.rdy() {
                // EN cleared while ready: begin a controller reset. RDY is
                // cleared once the reset completes (polled in `get_csts`).
                self.workers.controller_reset();
            } else {
                tracelimit::warn_ratelimited!("disabling while not ready");
                return;
            }
        }

        self.registers.cc = cc;
        // Publish the (possibly defaulted) queue entry sizes to the workers.
        *self.qe_sizes.lock() = IoQueueEntrySizes {
            sqe_bits: cc.iosqes(),
            cqe_bits: cc.iocqes(),
        };
    }
390
    /// Reads the Controller Status (CSTS) register.
    ///
    /// CSTS reads also drive the pending enable/disable transition forward:
    /// the worker state is polled here, and RDY (plus, on reset completion,
    /// CC and the interrupt mask) is updated once the transition finishes.
    fn get_csts(&mut self) -> u32 {
        if !self.registers.cc.en() && self.registers.csts.rdy() {
            // Disable (controller reset) in progress. Keep trying to disable.
            if self.workers.poll_controller_reset() {
                // AQA, ASQ, and ACQ are not reset by controller reset.
                self.registers.csts = 0.into();
                self.registers.cc = 0.into();
                self.registers.interrupt_mask = 0;
            }
        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
            // Enable in progress; report ready once the workers are up.
            if self.workers.poll_enabled() {
                self.registers.csts.set_rdy(true);
            }
        }

        let csts = self.registers.csts;
        tracing::debug!(?csts, "get csts");
        csts.into()
    }
410
    /// Sets the CFS bit in the controller status register (CSTS), indicating
    /// that the controller has experienced "undefined" behavior.
    ///
    /// The bit remains set until CSTS is cleared by a controller reset.
    pub fn fatal_error(&mut self) {
        self.registers.csts.set_cfs(true);
    }
416}
417
impl ChangeDeviceState for NvmeController {
    fn start(&mut self) {}

    async fn stop(&mut self) {}

    async fn reset(&mut self) {
        // Destructure exhaustively so that adding a field to `Self` forces
        // this reset path to be revisited.
        // NOTE(review): `msix` is deliberately skipped here — confirm its
        // state is reset elsewhere.
        let Self {
            cfg_space,
            msix: _,
            registers,
            qe_sizes,
            workers,
        } = self;
        // Quiesce the queue workers before clearing register state.
        workers.reset().await;
        cfg_space.reset();
        *registers = RegState::new();
        *qe_sizes.lock() = Default::default();
    }
}
437
impl ChipsetDevice for NvmeController {
    // The controller handles MMIO (BAR accesses) and PCI config space; it
    // does not implement port IO, PIC, or poll-based interfaces.
    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
        Some(self)
    }

    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
        Some(self)
    }
}
447
448impl MmioIntercept for NvmeController {
449    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
450        match self.cfg_space.find_bar(addr) {
451            Some((0, offset)) => self.read_bar0(offset, data),
452            Some((4, offset)) => {
453                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
454                IoResult::Ok
455            }
456            _ => IoResult::Err(InvalidRegister),
457        }
458    }
459
460    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
461        match self.cfg_space.find_bar(addr) {
462            Some((0, offset)) => self.write_bar0(offset, data),
463            Some((4, offset)) => {
464                write_as_u32_chunks(offset, data, |offset, ty| match ty {
465                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
466                    ReadWriteRequestType::Write(val) => {
467                        self.msix.write_u32(offset, val);
468                        None
469                    }
470                });
471                IoResult::Ok
472            }
473            _ => IoResult::Err(InvalidRegister),
474        }
475    }
476}
477
impl PciConfigSpace for NvmeController {
    // Config space accesses are delegated entirely to the type-0 config
    // space emulator.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        self.cfg_space.read_u32(offset, value)
    }

    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        self.cfg_space.write_u32(offset, value)
    }
}
487
impl SaveRestore for NvmeController {
    // Saved-state support is not implemented for this device.
    type SavedState = SavedStateNotSupported;

    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        // `SavedStateNotSupported` is uninhabited, so this match is
        // vacuously exhaustive: restore can never actually be invoked.
        match state {}
    }
}
501}