// nvme/queue.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! NVMe submission and completion queue types.
5
6use crate::spec;
7use guestmem::GuestMemory;
8use guestmem::GuestMemoryError;
9use inspect::Inspect;
10use std::sync::Arc;
11use std::sync::atomic::AtomicU32;
12use std::sync::atomic::Ordering;
13use thiserror::Error;
14use vmcore::interrupt::Interrupt;
15
/// Sentinel value written to a freshly configured shadow doorbell buffer; the
/// guest overwrites it with a real queue index the first time it uses the
/// shadow, so readers treat this value as "not yet in use".
pub const ILLEGAL_DOORBELL_VALUE: u32 = 0xffffffff;
17
/// A doorbell register value that asynchronous tasks can wait on for changes.
#[derive(Default, Inspect)]
#[inspect(transparent)]
pub struct DoorbellRegister {
    // Last value written to the doorbell.
    #[inspect(hex)]
    value: AtomicU32,
    // Notified on every write so waiters re-read `value`.
    #[inspect(skip)]
    event: event_listener::Event,
}
26
27impl DoorbellRegister {
28    pub fn new() -> Self {
29        Self::default()
30    }
31
32    pub fn write(&self, value: u32) {
33        self.value.store(value, Ordering::SeqCst);
34        self.event.notify(usize::MAX);
35    }
36
37    pub fn read(&self) -> u32 {
38        self.value.load(Ordering::SeqCst)
39    }
40
41    pub async fn wait_read(&self, value: u32) -> u32 {
42        let v = self.read();
43        if value != v {
44            return v;
45        }
46        loop {
47            let listener = self.event.listen();
48            let v = self.read();
49            if value != v {
50                break v;
51            }
52            listener.await;
53        }
54    }
55}
56
/// Guest physical addresses for one queue's shadow doorbell machinery
/// (see NVMe spec 2.0a, section 5.8, Doorbell Buffer Config).
#[derive(Copy, Clone, Default, Inspect, Debug)]
pub struct ShadowDoorbell {
    /// GPA of the shadow doorbell entry; the guest writes its doorbell value
    /// here and the device reads it.
    #[inspect(hex)]
    pub shadow_db_gpa: u64,
    /// GPA of the event index entry; the device writes here to tell the guest
    /// which doorbell value it has consumed.
    #[inspect(hex)]
    pub event_idx_gpa: u64,
}
64
65impl ShadowDoorbell {
66    // See NVMe Spec version 2.0a, Section 5.8 -- Doorbell Buffer Config Command for
67    // an explanation of this math.
68    pub fn new(
69        shadow_db_evt_idx_base: ShadowDoorbell,
70        qid: u16,
71        is_sq: bool,
72        doorbell_stride_bits: usize,
73    ) -> ShadowDoorbell {
74        let offset = match is_sq {
75            true => 0u64,
76            false => 1u64,
77        };
78        let shadow_db_gpa = shadow_db_evt_idx_base.shadow_db_gpa
79            + (qid as u64 * 2 + offset) * (4 << (doorbell_stride_bits - 2));
80        let event_idx_gpa = shadow_db_evt_idx_base.event_idx_gpa
81            + (qid as u64 * 2 + offset) * (4 << (doorbell_stride_bits - 2));
82        ShadowDoorbell {
83            shadow_db_gpa,
84            event_idx_gpa,
85        }
86    }
87}
88
/// An NVMe submission queue.
#[derive(Inspect)]
pub struct SubmissionQueue {
    // Locally cached tail, refreshed from the doorbell (or shadow doorbell).
    #[inspect(hex)]
    cached_tail: u32,
    // Doorbell register the guest writes the new tail to.
    tail: Arc<DoorbellRegister>,
    // Index of the next entry to consume.
    #[inspect(hex)]
    head: u32,
    // Guest physical address of the queue base.
    #[inspect(hex)]
    gpa: u64,
    // Number of entries in the queue.
    #[inspect(hex)]
    len: u32,
    // Shadow doorbell/event index GPAs, when Doorbell Buffer Config is active.
    #[inspect(with = "Option::is_some")]
    shadow_db_evt_idx: Option<ShadowDoorbell>,
    // Last event index value published to the guest.
    #[inspect(hex)]
    evt_idx: u32,
}
105
/// Errors produced while processing a submission or completion queue.
#[derive(Debug, Error)]
pub enum QueueError {
    /// The guest supplied a tail index outside the queue.
    #[error("invalid doorbell tail {0:#x}")]
    InvalidTail(u32),
    /// The guest supplied a head index outside the queue.
    #[error("invalid doorbell head {0:#x}")]
    InvalidHead(u32),
    /// A guest memory access failed.
    #[error("queue access error")]
    Memory(#[source] GuestMemoryError),
}
115
impl SubmissionQueue {
    /// Creates a submission queue at guest address `gpa` with `len` entries,
    /// resetting the tail doorbell register to zero.
    pub fn new(
        tail: Arc<DoorbellRegister>,
        gpa: u64,
        len: u16,
        shadow_db_evt_idx: Option<ShadowDoorbell>,
    ) -> Self {
        tail.write(0);
        Self {
            cached_tail: 0,
            tail,
            head: 0,
            gpa,
            len: len.into(),
            shadow_db_evt_idx,
            evt_idx: 0,
        }
    }

    /// This function returns a future for the next entry in the submission queue.  It also
    /// has a side effect of updating the tail.
    ///
    /// Note that this function returns a future that must be cancellable, which means that the
    /// parts after an await may never run.  The tail update side effect is benign, so
    /// that can happen before the await.
    pub async fn next(&mut self, mem: &GuestMemory) -> Result<spec::Command, QueueError> {
        // If shadow doorbells are in use, use that instead of what was written to the doorbell
        // register, as it may be more current.
        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
            let shadow_tail = mem
                .read_plain(shadow_db_evt_idx.shadow_db_gpa)
                .map_err(QueueError::Memory)?;

            // ILLEGAL_DOORBELL_VALUE is the initial state.  The guest will overwrite
            // it when it first uses the shadow.
            if shadow_tail != ILLEGAL_DOORBELL_VALUE {
                self.cached_tail = shadow_tail;
                self.tail.write(self.cached_tail);
            }
        }
        // Queue is empty while head == tail; wait for the guest to ring the
        // doorbell with a new tail.
        while self.cached_tail == self.head {
            self.cached_tail = self.tail.wait_read(self.cached_tail).await;
        }
        // Validate the guest-controlled tail before trusting it.
        if self.cached_tail >= self.len {
            return Err(QueueError::InvalidTail(self.cached_tail));
        }
        // Each submission queue entry is 64 bytes.
        let command: spec::Command = mem
            .read_plain(self.gpa.wrapping_add(self.head as u64 * 64))
            .map_err(QueueError::Memory)?;

        self.head = advance(self.head, self.len);
        Ok(command)
    }

    /// Returns the current head index as a 16-bit SQHD value for completions.
    pub fn sqhd(&self) -> u16 {
        self.head as u16
    }

    /// This function lets the driver know what doorbell value we consumed, allowing
    /// it to elide the next ring, maybe.
    pub fn advance_evt_idx(&mut self, mem: &GuestMemory) -> Result<(), QueueError> {
        self.evt_idx = advance(self.evt_idx, self.len);
        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
            mem.write_plain(shadow_db_evt_idx.event_idx_gpa, &self.evt_idx)
                .map_err(QueueError::Memory)?;
        }
        Ok(())
    }

    /// This function updates the shadow doorbell values of a queue that is
    /// potentially already in use.
    pub fn update_shadow_db(&mut self, mem: &GuestMemory, sdb: ShadowDoorbell) {
        self.shadow_db_evt_idx = Some(sdb);
        // Start the event index from the current cached tail so the first
        // published value reflects what has already been consumed.
        self.evt_idx = self.cached_tail;
        // Write the illegal value out to the buffer, so that we can tell
        // if Linux has ever written a valid value.
        let _ = mem.write_plain(sdb.shadow_db_gpa, &ILLEGAL_DOORBELL_VALUE);
    }
}
195
/// An NVMe completion queue.
#[derive(Inspect)]
pub struct CompletionQueue {
    // Index of the next entry to write.
    #[inspect(hex)]
    tail: u32,
    // Locally cached head, refreshed from the doorbell (or shadow doorbell).
    #[inspect(hex)]
    cached_head: u32,
    // Doorbell register the guest writes the new head to.
    head: Arc<DoorbellRegister>,
    // Current phase bit; flips each time the tail wraps.
    phase: bool,
    // Guest physical address of the queue base.
    #[inspect(hex)]
    gpa: u64,
    // Number of entries in the queue.
    #[inspect(hex)]
    len: u32,
    // Interrupt delivered after each completion is written, if configured.
    #[inspect(with = "Option::is_some")]
    interrupt: Option<Interrupt>,
    // Shadow doorbell/event index GPAs, when Doorbell Buffer Config is active.
    shadow_db_evt_idx: Option<ShadowDoorbell>,
}
212
213impl CompletionQueue {
214    pub fn new(
215        head: Arc<DoorbellRegister>,
216        interrupt: Option<Interrupt>,
217        gpa: u64,
218        len: u16,
219        shadow_db_evt_idx: Option<ShadowDoorbell>,
220    ) -> Self {
221        head.write(0);
222        Self {
223            tail: 0,
224            cached_head: 0,
225            head,
226            phase: true,
227            gpa,
228            len: len.into(),
229            interrupt,
230            shadow_db_evt_idx,
231        }
232    }
233
234    /// Wait for free completions.
235    pub async fn wait_ready(&mut self, mem: &GuestMemory) -> Result<(), QueueError> {
236        let next_tail = advance(self.tail, self.len);
237        // If shadow doorbells are in use, use that instead of what was written to the doorbell
238        // register, as it may be more current.
239        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
240            let shadow_head = mem
241                .read_plain(shadow_db_evt_idx.shadow_db_gpa)
242                .map_err(QueueError::Memory)?;
243
244            // ILLEGAL_DOORBELL_VALUE is the initial state.  The guest will overwrite
245            // it when it first uses the shadow.
246            if shadow_head != ILLEGAL_DOORBELL_VALUE {
247                self.cached_head = shadow_head;
248                self.head.write(self.cached_head);
249            }
250        }
251        while self.cached_head == next_tail {
252            self.cached_head = self.head.wait_read(self.cached_head).await;
253        }
254        if self.cached_head >= self.len {
255            return Err(QueueError::InvalidHead(self.cached_head));
256        }
257        Ok(())
258    }
259
260    pub fn write(
261        &mut self,
262        mem: &GuestMemory,
263        mut data: spec::Completion,
264    ) -> Result<bool, QueueError> {
265        if self.cached_head == advance(self.tail, self.len) {
266            return Ok(false);
267        }
268        data.status.set_phase(self.phase);
269
270        // Atomically write the low part of the completion entry first, then the
271        // high part, using release fences to ensure ordering.
272        //
273        // This is necessary to ensure the guest can observe the full completion
274        // once it observes the phase bit change (which is in the high part).
275        let [low, high]: [u64; 2] = zerocopy::transmute!(data);
276        let gpa = self.gpa.wrapping_add(self.tail as u64 * 16);
277        mem.write_plain(gpa, &low).map_err(QueueError::Memory)?;
278        std::sync::atomic::fence(Ordering::Release);
279        mem.write_plain(gpa + 8, &high)
280            .map_err(QueueError::Memory)?;
281        std::sync::atomic::fence(Ordering::Release);
282
283        if let Some(interrupt) = &self.interrupt {
284            interrupt.deliver();
285        }
286        self.tail = advance(self.tail, self.len);
287        if self.tail == 0 {
288            self.phase = !self.phase;
289        }
290        Ok(true)
291    }
292
293    /// This method updates the EVT_IDX field to match the shadow doorbell
294    /// value, thus signalling to the guest driver that the next completion
295    /// removed should involve a doorbell ring.  In this emulator, such
296    /// a thing (the ring) is only necessary when the number of un-spoken-for
297    /// completion queue entries is getting small.  (Completion queue entries
298    /// are spoken for when a command is removed from the SQ).
299    pub fn catch_up_evt_idx(
300        &mut self,
301        force: bool,
302        io_outstanding: u32,
303        mem: &GuestMemory,
304    ) -> Result<(), QueueError> {
305        if let Some(shadow_db_evt_idx) = self.shadow_db_evt_idx {
306            if force | (io_outstanding >= self.len - 3) {
307                mem.write_plain(shadow_db_evt_idx.event_idx_gpa, &self.cached_head)
308                    .map_err(QueueError::Memory)?;
309            }
310        }
311        Ok(())
312    }
313
314    /// This function updates the shadow doorbell values of a queue that is
315    /// potentially already in use.
316    pub fn update_shadow_db(&mut self, mem: &GuestMemory, sdb: ShadowDoorbell) {
317        self.shadow_db_evt_idx = Some(sdb);
318        // Write the illegal value out to the buffer, so that we can tell
319        // if Linux has ever written a valid value.
320        let _ = mem.write_plain(sdb.shadow_db_gpa, &ILLEGAL_DOORBELL_VALUE);
321    }
322}
323
/// Advances queue index `n` by one entry, wrapping to zero when it would
/// reach queue length `l`.
fn advance(n: u32, l: u32) -> u32 {
    let next = n + 1;
    if next < l { next } else { 0 }
}