Skip to main content

nvme/
pci.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! The NVMe PCI device implementation.
5
6use crate::BAR0_LEN;
7use crate::DOORBELL_STRIDE_BITS;
8use crate::IOCQES;
9use crate::IOSQES;
10use crate::MAX_QES;
11use crate::NVME_VERSION;
12use crate::NvmeControllerClient;
13use crate::PAGE_MASK;
14use crate::VENDOR_ID;
15use crate::spec;
16use crate::workers::IoQueueEntrySizes;
17use crate::workers::NvmeWorkers;
18use chipset_device::ChipsetDevice;
19use chipset_device::io::IoError;
20use chipset_device::io::IoError::InvalidRegister;
21use chipset_device::io::IoResult;
22use chipset_device::mmio::MmioIntercept;
23use chipset_device::mmio::RegisterMmioIntercept;
24use chipset_device::pci::PciConfigSpace;
25use device_emulators::ReadWriteRequestType;
26use device_emulators::read_as_u32_chunks;
27use device_emulators::write_as_u32_chunks;
28use guestmem::GuestMemory;
29use guid::Guid;
30use inspect::Inspect;
31use inspect::InspectMut;
32use parking_lot::Mutex;
33use pci_core::capabilities::msix::MsixEmulator;
34use pci_core::capabilities::pci_express::PciExpressCapability;
35use pci_core::cfg_space_emu::BarMemoryKind;
36use pci_core::cfg_space_emu::ConfigSpaceType0Emulator;
37use pci_core::cfg_space_emu::DeviceBars;
38use pci_core::msi::MsiTarget;
39use pci_core::spec::hwid::ClassCode;
40use pci_core::spec::hwid::HardwareIds;
41use pci_core::spec::hwid::ProgrammingInterface;
42use pci_core::spec::hwid::Subclass;
43use std::sync::Arc;
44use vmcore::device_state::ChangeDeviceState;
45use vmcore::save_restore::SaveError;
46use vmcore::save_restore::SaveRestore;
47use vmcore::save_restore::SavedStateNotSupported;
48use vmcore::vm_task::VmTaskDriverSource;
49
/// An NVMe controller.
///
/// This type owns the PCI-facing state of the device: the PCI configuration
/// space, the BAR 0 register file, and the MSI-X BAR. Doorbell writes,
/// enable requests, and controller resets are forwarded to `NvmeWorkers`.
#[derive(InspectMut)]
pub struct NvmeController {
    /// Emulated PCI configuration space. Also used to translate an MMIO
    /// address into a BAR index and offset.
    cfg_space: ConfigSpaceType0Emulator,
    /// MSI-X emulator; its registers are served through BAR 4.
    #[inspect(skip)]
    msix: MsixEmulator,

    /// Controller registers served through BAR 0.
    registers: RegState,
    /// IO queue entry sizes. A clone of this `Arc` is handed to the workers
    /// at construction, and the value is rewritten when a CC write is
    /// accepted and on device reset.
    #[inspect(skip)]
    qe_sizes: Arc<Mutex<IoQueueEntrySizes>>,
    /// The workers that receive doorbell, enable, and reset requests.
    #[inspect(flatten, mut)]
    workers: NvmeWorkers,
}
63
/// The guest-visible controller register state kept by the PCI front end.
#[derive(Inspect)]
struct RegState {
    /// Interrupt mask bits: set by writes to INTMS, cleared by writes to
    /// INTMC, and returned by reads of either register.
    #[inspect(hex)]
    interrupt_mask: u32,
    /// Last accepted value of the CC register.
    cc: spec::Cc,
    /// Current value of the CSTS register.
    csts: spec::Csts,
    /// Last written value of the AQA register; supplies the admin queue
    /// sizes when the controller is enabled.
    aqa: spec::Aqa,
    /// Admin submission queue address (ASQ), stored masked with `PAGE_MASK`.
    #[inspect(hex)]
    asq: u64,
    /// Admin completion queue address (ACQ), stored masked with `PAGE_MASK`.
    #[inspect(hex)]
    acq: u64,
}
76
77impl RegState {
78    fn new() -> Self {
79        Self {
80            interrupt_mask: 0,
81            cc: spec::Cc::new(),
82            csts: spec::Csts::new(),
83            aqa: spec::Aqa::new(),
84            asq: 0,
85            acq: 0,
86        }
87    }
88}
89
/// The constant value returned for reads of the 64-bit CAP register.
const CAP: spec::Cap = spec::Cap::new()
    // Doorbell stride, encoded as the stride bit count minus 2.
    // NOTE(review): presumably because the NVMe spec defines the stride as
    // 2^(2 + DSTRD) bytes -- confirm against the spec.
    .with_dstrd(DOORBELL_STRIDE_BITS - 2)
    // Maximum queue entries, stored as `MAX_QES - 1` (presumably zero-based,
    // per the `_z` suffix).
    .with_mqes_z(MAX_QES - 1)
    // CQR bit set.
    .with_cqr(true)
    // Advertise the NVM command set; `set_cc` rejects any other CSS value.
    .with_css_nvm(true)
    // TO field set to all ones.
    .with_to(!0);
96
/// The NVMe controller's capabilities.
///
/// Passed to `NvmeController::new` to size the device.
#[derive(Debug, Copy, Clone)]
pub struct NvmeControllerCaps {
    /// The number of entries in the MSI-X table. One interrupt is created
    /// per entry and handed to the workers.
    pub msix_count: u16,
    /// The maximum number of IO submission and completion queues.
    pub max_io_queues: u16,
    /// The subsystem ID, used as part of the subnqn field of the identify
    /// controller response.
    pub subsystem_id: Guid,
}
108
109impl NvmeController {
110    /// Creates a new NVMe controller.
111    pub fn new(
112        driver_source: &VmTaskDriverSource,
113        guest_memory: GuestMemory,
114        msi_target: &MsiTarget,
115        register_mmio: &mut dyn RegisterMmioIntercept,
116        caps: NvmeControllerCaps,
117    ) -> Self {
118        let (msix, msix_cap) = MsixEmulator::new(4, caps.msix_count, msi_target);
119        let bars = DeviceBars::new()
120            .bar0(
121                BAR0_LEN,
122                BarMemoryKind::Intercept(register_mmio.new_io_region("bar0", BAR0_LEN)),
123            )
124            .bar4(
125                msix.bar_len(),
126                BarMemoryKind::Intercept(register_mmio.new_io_region("msix", msix.bar_len())),
127            );
128
129        let cfg_space = ConfigSpaceType0Emulator::new(
130            HardwareIds {
131                vendor_id: VENDOR_ID,
132                device_id: 0x00a9,
133                revision_id: 0,
134                prog_if: ProgrammingInterface::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY_NVME,
135                sub_class: Subclass::MASS_STORAGE_CONTROLLER_NON_VOLATILE_MEMORY,
136                base_class: ClassCode::MASS_STORAGE_CONTROLLER,
137                type0_sub_vendor_id: 0,
138                type0_sub_system_id: 0,
139            },
140            vec![
141                Box::new(msix_cap),
142                Box::new(PciExpressCapability::new(
143                    pci_core::spec::caps::pci_express::DevicePortType::Endpoint,
144                    None,
145                )),
146            ],
147            Vec::new(),
148            bars,
149        );
150
151        let interrupts = (0..caps.msix_count)
152            .map(|i| msix.interrupt(i).unwrap())
153            .collect();
154
155        let qe_sizes = Arc::new(Default::default());
156        let admin = NvmeWorkers::new(
157            driver_source,
158            guest_memory,
159            interrupts,
160            caps.max_io_queues,
161            caps.max_io_queues,
162            Arc::clone(&qe_sizes),
163            caps.subsystem_id,
164        );
165
166        Self {
167            cfg_space,
168            msix,
169            registers: RegState::new(),
170            workers: admin,
171            qe_sizes,
172        }
173    }
174
    /// Returns a client for manipulating the NVMe controller at runtime.
    ///
    /// The client is obtained from the controller's workers.
    pub fn client(&self) -> NvmeControllerClient {
        self.workers.client()
    }
179
180    /// Reads from the virtual BAR 0.
181    pub fn read_bar0(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
182        if data.len() < 4 {
183            return IoResult::Err(IoError::InvalidAccessSize);
184        }
185        if addr & (data.len() as u64 - 1) != 0 {
186            return IoResult::Err(IoError::UnalignedAccess);
187        }
188
189        // Check for 64-bit registers.
190        let d: Option<u64> = match spec::Register(addr & !7) {
191            spec::Register::CAP => Some(CAP.into()),
192            spec::Register::ASQ => Some(self.registers.asq),
193            spec::Register::ACQ => Some(self.registers.acq),
194            spec::Register::BPMBL => Some(0),
195            _ => None,
196        };
197        if let Some(d) = d {
198            if data.len() == 8 {
199                data.copy_from_slice(&d.to_ne_bytes());
200            } else if addr & 7 == 0 {
201                data.copy_from_slice(&(d as u32).to_ne_bytes());
202            } else {
203                data.copy_from_slice(&((d >> 32) as u32).to_ne_bytes());
204            }
205            return IoResult::Ok;
206        }
207
208        if data.len() != 4 {
209            return IoResult::Err(IoError::InvalidAccessSize);
210        }
211
212        // Handle 32-bit registers.
213        let d: u32 = match spec::Register(addr) {
214            spec::Register::VS => NVME_VERSION,
215            spec::Register::INTMS => self.registers.interrupt_mask,
216            spec::Register::INTMC => self.registers.interrupt_mask,
217            spec::Register::CC => self.registers.cc.into(),
218            spec::Register::RESERVED => 0,
219            spec::Register::CSTS => self.get_csts(),
220            spec::Register::NSSR => 0,
221            spec::Register::AQA => self.registers.aqa.into(),
222            spec::Register::CMBLOC => 0,
223            spec::Register::CMBSZ => 0,
224            spec::Register::BPINFO => 0,
225            spec::Register::BPRSEL => 0,
226            _ => return IoResult::Err(InvalidRegister),
227        };
228        data.copy_from_slice(&d.to_ne_bytes());
229        IoResult::Ok
230    }
231
232    /// Writes to the virtual BAR 0.
233    pub fn write_bar0(&mut self, addr: u64, data: &[u8]) -> IoResult {
234        if addr >= 0x1000 {
235            // Doorbell write.
236            let base = addr - 0x1000;
237            let db_id = base >> DOORBELL_STRIDE_BITS;
238            if (db_id << DOORBELL_STRIDE_BITS) != base {
239                return IoResult::Err(InvalidRegister);
240            }
241            let Ok(data) = data.try_into() else {
242                return IoResult::Err(IoError::InvalidAccessSize);
243            };
244            let value = u32::from_ne_bytes(data);
245            let db_id = match u16::try_from(db_id) {
246                Ok(id) => id,
247                Err(_) => return IoResult::Err(InvalidRegister),
248            };
249            self.workers.doorbell(db_id, value);
250            return IoResult::Ok;
251        }
252
253        if data.len() < 4 {
254            return IoResult::Err(IoError::InvalidAccessSize);
255        }
256        if addr & (data.len() as u64 - 1) != 0 {
257            return IoResult::Err(IoError::UnalignedAccess);
258        }
259
260        let update_reg = |x: u64| {
261            if data.len() == 8 {
262                u64::from_ne_bytes(data.try_into().unwrap())
263            } else {
264                let data = u32::from_ne_bytes(data.try_into().unwrap()) as u64;
265                if addr & 7 == 0 {
266                    (x & !(u32::MAX as u64)) | data
267                } else {
268                    (x & u32::MAX as u64) | (data << 32)
269                }
270            }
271        };
272
273        // Check for 64-bit registers.
274        let handled = match spec::Register(addr & !7) {
275            spec::Register::ASQ => {
276                if !self.registers.cc.en() {
277                    self.registers.asq = update_reg(self.registers.asq) & PAGE_MASK;
278                } else {
279                    tracelimit::warn_ratelimited!("attempt to set asq while enabled");
280                }
281                true
282            }
283            spec::Register::ACQ => {
284                if !self.registers.cc.en() {
285                    self.registers.acq = update_reg(self.registers.acq) & PAGE_MASK;
286                } else {
287                    tracelimit::warn_ratelimited!("attempt to set acq while enabled");
288                }
289                true
290            }
291            _ => false,
292        };
293        if handled {
294            return IoResult::Ok;
295        }
296
297        let Ok(data) = data.try_into() else {
298            return IoResult::Err(IoError::InvalidAccessSize);
299        };
300        let data = u32::from_ne_bytes(data);
301
302        // Handle 32-bit registers.
303        match spec::Register(addr) {
304            spec::Register::INTMS => self.registers.interrupt_mask |= data,
305            spec::Register::INTMC => self.registers.interrupt_mask &= !data,
306            spec::Register::CC => self.set_cc(data.into()),
307            spec::Register::AQA => self.registers.aqa = data.into(),
308            _ => return IoResult::Err(InvalidRegister),
309        }
310        IoResult::Ok
311    }
312
313    fn set_cc(&mut self, cc: spec::Cc) {
314        tracing::debug!(?cc, "set cc");
315
316        if cc.mps() != 0 {
317            tracelimit::warn_ratelimited!(
318                "This implementation only supports memory page sizes of 4K."
319            );
320            self.fatal_error();
321            return;
322        }
323
324        if cc.css() != 0 {
325            tracelimit::warn_ratelimited!("This implementation only supports the NVM command set.");
326            self.fatal_error();
327            return;
328        }
329
330        if let 2..=6 = cc.ams() {
331            tracelimit::warn_ratelimited!("Undefined arbitration mechanism.");
332            self.fatal_error();
333        }
334
335        let mask: u32 = u32::from(
336            spec::Cc::new()
337                .with_en(true)
338                .with_shn(0b11)
339                .with_iosqes(0b1111)
340                .with_iocqes(0b1111),
341        );
342        let mut cc: spec::Cc = (u32::from(cc) & mask).into();
343
344        if cc.shn() != 0 {
345            // It is unclear in the spec (to me) what guarantees a
346            // controller is supposed to make after shutdown. For now, just
347            // complete shutdown immediately.
348            self.registers.csts.set_shst(0b10);
349        }
350
351        if cc.en() != self.registers.cc.en() {
352            if cc.en() {
353                // Some drivers will write zeros to IOSQES and IOCQES, assuming that the defaults will work.
354                if cc.iocqes() == 0 {
355                    cc.set_iocqes(IOCQES);
356                } else if cc.iocqes() != IOCQES {
357                    tracelimit::warn_ratelimited!(
358                        "This implementation only supports CQEs of the default size."
359                    );
360                    self.fatal_error();
361                    return;
362                }
363
364                if cc.iosqes() == 0 {
365                    cc.set_iosqes(IOSQES);
366                } else if cc.iosqes() != IOSQES {
367                    tracelimit::warn_ratelimited!(
368                        "This implementation only supports SQEs of the default size."
369                    );
370                    self.fatal_error();
371                    return;
372                }
373
374                if self.registers.csts.rdy() {
375                    tracelimit::warn_ratelimited!("enabling during reset");
376                    return;
377                }
378                if cc.shn() == 0 {
379                    self.registers.csts.set_shst(0);
380                }
381
382                self.workers.enable(
383                    self.registers.asq,
384                    self.registers.aqa.asqs_z().max(1) + 1,
385                    self.registers.acq,
386                    self.registers.aqa.acqs_z().max(1) + 1,
387                );
388            } else if self.registers.csts.rdy() {
389                self.workers.controller_reset();
390            } else {
391                tracelimit::warn_ratelimited!("disabling while not ready");
392                return;
393            }
394        }
395
396        self.registers.cc = cc;
397        *self.qe_sizes.lock() = IoQueueEntrySizes {
398            sqe_bits: cc.iosqes(),
399            cqe_bits: cc.iocqes(),
400        };
401    }
402
403    fn get_csts(&mut self) -> u32 {
404        if !self.registers.cc.en() && self.registers.csts.rdy() {
405            // Keep trying to disable.
406            if self.workers.poll_controller_reset() {
407                // AQA, ASQ, and ACQ are not reset by controller reset.
408                self.registers.csts = 0.into();
409                self.registers.cc = 0.into();
410                self.registers.interrupt_mask = 0;
411            }
412        } else if self.registers.cc.en() && !self.registers.csts.rdy() {
413            if self.workers.poll_enabled() {
414                self.registers.csts.set_rdy(true);
415            }
416        }
417
418        let csts = self.registers.csts;
419        tracing::debug!(?csts, "get csts");
420        csts.into()
421    }
422
    /// Sets the CFS bit in the controller status register (CSTS), indicating
    /// that the controller has experienced "undefined" behavior.
    ///
    /// Only the CFS bit is changed; no other register is touched and the
    /// workers are not notified. The bit remains set until CSTS is
    /// rewritten, which happens when a controller reset completes or the
    /// device is reset.
    pub fn fatal_error(&mut self) {
        self.registers.csts.set_cfs(true);
    }
428}
429
impl ChangeDeviceState for NvmeController {
    // Nothing is done here on start.
    fn start(&mut self) {}

    // Nothing is done here on stop.
    async fn stop(&mut self) {}

    /// Resets the device: the workers are reset first, then the PCI
    /// configuration space, the controller registers, and the shared queue
    /// entry sizes are returned to their initial values.
    async fn reset(&mut self) {
        // Destructure without `..` so that adding a field to the struct
        // fails to compile until this function is revisited.
        let Self {
            cfg_space,
            // The MSI-X emulator is not reset directly here.
            msix: _,
            registers,
            qe_sizes,
            workers,
        } = self;
        workers.reset().await;
        cfg_space.reset();
        *registers = RegState::new();
        *qe_sizes.lock() = Default::default();
    }
}
449
impl ChipsetDevice for NvmeController {
    /// The controller handles MMIO itself (see its `MmioIntercept` impl).
    fn supports_mmio(&mut self) -> Option<&mut dyn MmioIntercept> {
        Some(self)
    }

    /// The controller handles PCI configuration space accesses itself (see
    /// its `PciConfigSpace` impl).
    fn supports_pci(&mut self) -> Option<&mut dyn PciConfigSpace> {
        Some(self)
    }
}
459
460impl MmioIntercept for NvmeController {
461    fn mmio_read(&mut self, addr: u64, data: &mut [u8]) -> IoResult {
462        match self.cfg_space.find_bar(addr) {
463            Some((0, offset)) => self.read_bar0(offset, data),
464            Some((4, offset)) => {
465                read_as_u32_chunks(offset, data, |offset| self.msix.read_u32(offset));
466                IoResult::Ok
467            }
468            _ => IoResult::Err(InvalidRegister),
469        }
470    }
471
472    fn mmio_write(&mut self, addr: u64, data: &[u8]) -> IoResult {
473        match self.cfg_space.find_bar(addr) {
474            Some((0, offset)) => self.write_bar0(offset, data),
475            Some((4, offset)) => {
476                write_as_u32_chunks(offset, data, |offset, ty| match ty {
477                    ReadWriteRequestType::Read => Some(self.msix.read_u32(offset)),
478                    ReadWriteRequestType::Write(val) => {
479                        self.msix.write_u32(offset, val);
480                        None
481                    }
482                });
483                IoResult::Ok
484            }
485            _ => IoResult::Err(InvalidRegister),
486        }
487    }
488}
489
impl PciConfigSpace for NvmeController {
    /// Forwards a configuration space read to the config space emulator.
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> IoResult {
        self.cfg_space.read_u32(offset, value)
    }

    /// Forwards a configuration space write to the config space emulator.
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> IoResult {
        self.cfg_space.write_u32(offset, value)
    }
}
499
/// Save/restore is not supported for this device.
impl SaveRestore for NvmeController {
    type SavedState = SavedStateNotSupported;

    /// Always fails with `SaveError::NotSupported`.
    fn save(&mut self) -> Result<Self::SavedState, SaveError> {
        Err(SaveError::NotSupported)
    }

    /// Can never be called with a real state: the empty `match` below only
    /// compiles because the saved state type has no values.
    fn restore(
        &mut self,
        state: Self::SavedState,
    ) -> Result<(), vmcore::save_restore::RestoreError> {
        match state {}
    }
}
513}