pci_bus/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Generic PCI Bus infrastructure.
5//!
6//! [`GenericPciBus`] is a [`ChipsetDevice`] that implements a chipset and
7//! architecture agnostic PCI bus.
8//!
9//! [`GenericPciBus`] can be configured to support various spec-compliant PCI
10//! configuration space access mechanisms, such as legacy port-io based
11//! configuration space access, ECAM (Enhanced Configuration Access Mechanism),
12//! etc...
13//!
14//! Incoming config space accesses are then routed to connected
15//! [`GenericPciBusDevice`] devices.
16
17#![forbid(unsafe_code)]
18
19use bitfield_struct::bitfield;
20use chipset_device::ChipsetDevice;
21use chipset_device::io::IoError;
22use chipset_device::io::IoResult;
23use chipset_device::io::deferred::DeferredRead;
24use chipset_device::io::deferred::DeferredToken;
25use chipset_device::io::deferred::DeferredWrite;
26use chipset_device::io::deferred::defer_read;
27use chipset_device::io::deferred::defer_write;
28use chipset_device::pio::ControlPortIoIntercept;
29use chipset_device::pio::PortIoIntercept;
30use chipset_device::pio::RegisterPortIoIntercept;
31use chipset_device::poll_device::PollDevice;
32use inspect::Inspect;
33use inspect::InspectMut;
34use std::collections::BTreeMap;
35use std::sync::Arc;
36use std::task::Context;
37use std::task::Poll;
38use vmcore::device_state::ChangeDeviceState;
39use zerocopy::FromZeros;
40use zerocopy::IntoBytes;
41
/// Standard x86 IO ports associated with PCI
///
/// Each register is described by an inclusive `*_START..=*_END` port range
/// that is 4 bytes wide.
#[expect(missing_docs)] // self explanatory constants
pub mod standard_x86_io_ports {
    // Address register: ports 0xCF8 through 0xCFB (inclusive).
    pub const ADDR_START: u16 = 0xCF8;
    pub const ADDR_END: u16 = 0xCFB;

    // Data register: ports 0xCFC through 0xCFF (inclusive).
    pub const DATA_START: u16 = 0xCFC;
    pub const DATA_END: u16 = 0xCFF;
}
51
/// An abstract interface for a PCI device accessed via the [`GenericPciBus`].
///
/// This trait is nearly identical to [`chipset_device::pci::PciConfigSpace`],
/// except for the fact that the return values are wrapped in an `Option`, where
/// `None` indicates that the backing device is no longer responding to
/// accesses.
///
/// e.g: a GenericPciBusDevice backed by a `Weak` pointer to a device could get
/// invalidated, in which case, these APIs would return `None`.
///
/// This trait decouples the PCI bus implementation from any concrete
/// `ChipsetDevice` ownership model being employed by upper-level code (i.e:
/// Arc/Weak + Mutex vs. Channels, etc...).
///
/// This is also the reason why the read/write methods are fallible: the PCI bus
/// should be resilient to backing devices unexpectedly going offline.
pub trait GenericPciBusDevice: 'static + Send {
    /// Dispatch a PCI config space read at `offset` to the device, with the
    /// result stored into `value`.
    ///
    /// Returns `None` if the backing device is no longer responding to
    /// accesses; otherwise returns the device's [`IoResult`] (which may be a
    /// deferred result).
    fn pci_cfg_read(&mut self, offset: u16, value: &mut u32) -> Option<IoResult>;

    /// Dispatch a PCI config space write of `value` at `offset` to the device.
    ///
    /// Returns `None` if the backing device is no longer responding to
    /// accesses; otherwise returns the device's [`IoResult`] (which may be a
    /// deferred result).
    fn pci_cfg_write(&mut self, offset: u16, value: u32) -> Option<IoResult>;
}
75
// The bus/device/function triple identifying a single PCI function on the
// bus. Used as the lookup key for attached devices, and rendered through its
// `Display` impl when inspected.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Inspect)]
#[inspect(display)]
struct PciAddr {
    // Bus number.
    bus: u8,
    // Device number on the bus.
    device: u8,
    // Function number within the device.
    function: u8,
}
83
84impl std::fmt::Display for PciAddr {
85    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
86        // Use standard-ish BDF notation (bb:dd.f).
87        write!(
88            f,
89            "{:02x}:{:02x}.{:x}",
90            self.bus, self.device, self.function
91        )
92    }
93}
94
// The bus's volatile state: this is what gets reinitialized on reset and what
// is captured / reinstated by save / restore.
#[derive(Inspect)]
struct GenericPciBusState {
    // Current contents of the port-io address register, which selects the
    // device and register that data-port accesses are routed to.
    pio_addr_reg: AddressRegister,
}
99
// This type is effectively two hand-rolled state machines combined into one, as
// only one action can be taking place at a time.
//
// When a read is issued and the device defers it, the result is a
// `DeferredAction::Read`, which will then be processed asynchronously.
//
// When a write is issued, if the write is undersized, we must first read the
// existing value on alignment before combining that with the new value and
// writing it. That read could be deferred, which will result in a
// `DeferredAction::ReadForWrite`. If the write after this read is deferred
// it will result in a `DeferredAction::Write`.
//
// If a fully sized write is issued and gets deferred, that does not result in a
// `DeferredAction::Write`. Instead it is simply returned up the stack to let our
// caller handle it, as we don't need to perform any extra work after completion.
#[derive(Inspect)]
#[inspect(tag = "kind")]
enum DeferredAction {
    // A guest read of the data port whose device-side read was deferred.
    Read {
        // Token used to poll the device for the (full dword) read result.
        #[inspect(skip)]
        deferred_device_read: DeferredToken,
        // Completion handle for the read we deferred back to our own caller.
        #[inspect(skip)]
        bus_read: DeferredRead,
        // Size in bytes of the guest's access.
        read_len: usize,
        // IO port the guest accessed; its low two bits select the byte lane.
        io_port: u16,
        // Address that was targeted when the access was issued.
        address: PciAddr,
    },
    // An undersized guest write that is waiting on a deferred read of the
    // existing dword so that old and new bytes can be merged.
    ReadForWrite {
        // Token used to poll the device for the existing dword.
        #[inspect(skip)]
        deferred_device_read: DeferredToken,
        // Completion handle for the write we deferred back to our own caller.
        #[inspect(skip)]
        bus_write: DeferredWrite,
        // Size in bytes of the guest's access.
        write_len: usize,
        // IO port the guest accessed; its low two bits select the byte lane.
        io_port: u16,
        // The (unshifted) bytes the guest wrote.
        new_value: u32,
        // Address that was targeted when the access was issued.
        address: PciAddr,
    },
    // The merged write that followed a `ReadForWrite`, itself deferred by the
    // device.
    Write {
        // Token used to poll the device for completion of the write.
        #[inspect(skip)]
        deferred_device_write: DeferredToken,
        // Completion handle for the write we deferred back to our own caller.
        #[inspect(skip)]
        bus_write: DeferredWrite,
        // The merged dword that was sent to the device.
        value: u32,
        // Address that was targeted when the access was issued.
        address: PciAddr,
    },
}
146
/// A generic PCI bus.
///
/// Config space accesses arrive through a pair of 4-byte IO port regions (an
/// address register and a data register) and are routed to whichever
/// [`GenericPciBusDevice`] is attached at the selected bus/device/function.
#[derive(InspectMut)]
pub struct GenericPciBus {
    // Runtime glue
    // Control handles for the address / data port-io regions.
    pio_addr: Box<dyn ControlPortIoIntercept>,
    pio_data: Box<dyn ControlPortIoIntercept>,
    // Attached devices (and their names), keyed by PCI address. Only the names
    // are exposed via inspect.
    #[inspect(with = "|x| inspect::iter_by_key(x).map_value(|(name, _)| name)")]
    pci_devices: BTreeMap<PciAddr, (Arc<str>, Box<dyn GenericPciBusDevice>)>,

    // Async bookkeeping
    // Waker captured by the most recent `poll_device` call; woken whenever a
    // new deferred action is stashed.
    #[inspect(with = "|x| x.is_some()")]
    waker: Option<std::task::Waker>,
    // The single in-flight deferred access, if any.
    deferred_action: Option<DeferredAction>,

    // Volatile state
    state: GenericPciBusState,
}
164
165impl GenericPciBus {
166    /// Create a new [`GenericPciBus`] with the specified (4-byte) IO ports.
167    pub fn new(
168        register_pio: &mut dyn RegisterPortIoIntercept,
169        pio_addr: u16,
170        pio_data: u16,
171    ) -> GenericPciBus {
172        let mut addr_control = register_pio.new_io_region("addr", 4);
173        let mut data_control = register_pio.new_io_region("data", 4);
174        addr_control.map(pio_addr);
175        data_control.map(pio_data);
176        GenericPciBus {
177            pio_addr: addr_control,
178            pio_data: data_control,
179            pci_devices: BTreeMap::new(),
180
181            waker: None,
182            deferred_action: None,
183
184            state: GenericPciBusState {
185                pio_addr_reg: AddressRegister::new(),
186            },
187        }
188    }
189
190    /// Try to add a PCI device, returning (device, existing_device_name) if the
191    /// slot is already occupied.
192    pub fn add_pci_device<D: GenericPciBusDevice>(
193        &mut self,
194        bus: u8,
195        device: u8,
196        function: u8,
197        name: impl AsRef<str>,
198        dev: D,
199    ) -> Result<(), (D, Arc<str>)> {
200        let key = PciAddr {
201            bus,
202            device,
203            function,
204        };
205
206        if let Some((name, _)) = self.pci_devices.get(&key) {
207            return Err((dev, name.clone()));
208        }
209
210        self.pci_devices
211            .insert(key, (name.as_ref().into(), Box::new(dev)));
212        Ok(())
213    }
214
215    /// Handle a read from the ADDR register
216    fn handle_addr_read(&self, value: &mut u32) -> IoResult {
217        *value = self.state.pio_addr_reg.0;
218        IoResult::Ok
219    }
220
221    /// Handle a write to the ADDR register
222    fn handle_addr_write(&mut self, addr: u32) -> IoResult {
223        let addr_fixup = {
224            let mut addr = AddressRegister(addr);
225            addr.fixup();
226            addr
227        };
228
229        self.state.pio_addr_reg = addr_fixup;
230        IoResult::Ok
231    }
232
233    /// Handle a read from the DATA register
234    fn handle_data_read(&mut self, value: &mut u32) -> IoResult {
235        tracing::trace!(%self.state.pio_addr_reg, "data read");
236
237        if !self.state.pio_addr_reg.enabled() {
238            tracelimit::warn_ratelimited!("addr enable bit is set to disabled");
239            *value = !0;
240            return IoResult::Ok;
241        }
242
243        let address = self.state.pio_addr_reg.address();
244
245        match self.pci_devices.get_mut(&address) {
246            Some((name, device)) => {
247                let offset = self.state.pio_addr_reg.register().into();
248                let res = device.pci_cfg_read(offset, value);
249                if let Some(result) = res {
250                    tracing::trace!(
251                        device = &**name,
252                        %address,
253                        offset,
254                        value,
255                        "cfg space read"
256                    );
257                    result
258                } else {
259                    // TODO: should probably unregister from bus?
260                    // but then again, shouldn't the device do that as part of
261                    // its destructor?
262                    tracelimit::warn_ratelimited!(
263                        device = &**name,
264                        %address,
265                        offset,
266                        "cfg space read failed, device went away"
267                    );
268                    *value = !0;
269                    IoResult::Ok
270                }
271            }
272            None => {
273                tracing::trace!(%address, "no device found - returning F's");
274                *value = !0;
275                IoResult::Ok
276            }
277        }
278    }
279
280    /// Handler a write to the DATA register
281    fn handle_data_write(&mut self, data: u32) -> IoResult {
282        tracing::trace!(%self.state.pio_addr_reg, "data write");
283
284        if !self.state.pio_addr_reg.enabled() {
285            tracelimit::warn_ratelimited!("addr enable bit is set to disabled");
286            return IoResult::Ok;
287        }
288
289        let address = self.state.pio_addr_reg.address();
290        match self.pci_devices.get_mut(&address) {
291            Some((name, device)) => {
292                let offset = self.state.pio_addr_reg.register().into();
293                let res = device.pci_cfg_write(offset, data);
294                if let Some(result) = res {
295                    tracing::trace!(
296                        device = &**name,
297                        %address,
298                        offset,
299                        data,
300                        "cfg space write"
301                    );
302                    result
303                } else {
304                    // TODO: should probably unregister from bus?
305                    // but then again, shouldn't the device do that as part of
306                    // its destructor?
307                    tracelimit::warn_ratelimited!(
308                        device = &**name,
309                        %address,
310                        offset,
311                        "cfg space write failed, device went away"
312                    );
313                    IoResult::Ok
314                }
315            }
316            None => {
317                tracing::debug!(%address, "no device found");
318                IoResult::Ok
319            }
320        }
321    }
322
323    fn trace_error(&self, e: IoError, operation: &'static str) {
324        let error = match e {
325            IoError::InvalidRegister => "offset not supported",
326            IoError::InvalidAccessSize => "invalid access size",
327            IoError::UnalignedAccess => "unaligned access",
328        };
329        tracelimit::warn_ratelimited!(
330            address = %self.state.pio_addr_reg.address(),
331            "pci config space {} operation error: {}",
332            operation,
333            error
334        );
335    }
336
337    fn trace_recv_error(&self, e: mesh::RecvError, operation: &'static str) {
338        tracelimit::warn_ratelimited!(
339            address = %self.state.pio_addr_reg.address(),
340            "pci config space {} operation recv error: {:?}",
341            operation,
342            e,
343        );
344    }
345}
346
347impl ChangeDeviceState for GenericPciBus {
348    fn start(&mut self) {}
349
350    async fn stop(&mut self) {}
351
352    async fn reset(&mut self) {
353        self.state.pio_addr_reg = AddressRegister::new();
354    }
355}
356
impl ChipsetDevice for GenericPciBus {
    /// The bus handles port IO (the address / data registers).
    fn supports_pio(&mut self) -> Option<&mut dyn PortIoIntercept> {
        Some(self)
    }

    /// The bus needs to be polled in order to drive deferred accesses to
    /// completion.
    fn supports_poll_device(&mut self) -> Option<&mut dyn PollDevice> {
        Some(self)
    }
}
366
/// Extract the bytes a guest asked for out of a full config space dword.
///
/// The low two bits of `io_port` pick the starting byte lane, and `len` (which
/// must be 1, 2, or 4) is the access width. 4-byte reads return `value`
/// unmodified.
///
/// # Panics
///
/// Panics if `len` is not 1, 2, or 4.
fn shift_read_value(io_port: u16, len: usize, value: u32) -> u32 {
    if len == 4 {
        return value;
    }

    let width_mask: u32 = match len {
        2 => 0xFFFF,
        1 => 0xFF,
        _ => unreachable!(),
    };
    let byte_lane = io_port & 0x3;
    (value >> (byte_lane * 8)) & width_mask
}
376
/// Merge a (possibly undersized) guest write into an existing config space
/// dword.
///
/// * `io_port` - port the guest wrote to; its low two bits select the byte
///   lane at which the new bytes are placed.
/// * `old_value` - the dword's current contents.
/// * `new_value` - the bytes being written, in the low `len` bytes.
/// * `len` - width of the write in bytes.
///
/// Returns `old_value` with the targeted bytes replaced by `new_value`. Any
/// bits of `new_value` above the access width are ignored, and a full-width
/// (`len >= 4`) write replaces every byte from the selected lane upwards.
fn combine_old_new_values(io_port: u16, old_value: u32, new_value: u32, len: usize) -> u32 {
    let shift = u32::from(io_port & 0x3) * 8;
    // `1 << 32` would overflow a u32, so a full-width mask must be special-cased.
    let mask: u32 = match len {
        0..=3 => (1u32 << (len * 8)) - 1,
        _ => u32::MAX,
    };
    // Mask the incoming value as well, so that stray high bits can't leak into
    // byte lanes that weren't part of the access.
    (old_value & !(mask << shift)) | ((new_value & mask) << shift)
}
382
383impl PortIoIntercept for GenericPciBus {
384    fn io_read(&mut self, io_port: u16, data: &mut [u8]) -> IoResult {
385        if !matches!(data.len(), 1 | 2 | 4) {
386            return IoResult::Err(IoError::InvalidAccessSize);
387        }
388
389        if !(data.len() == 4 && io_port & 3 == 0
390            || data.len() == 2 && io_port & 1 == 0
391            || data.len() == 1)
392        {
393            return IoResult::Err(IoError::UnalignedAccess);
394        }
395
396        let mut value = 0;
397        let res = match io_port {
398            _ if self.pio_addr.offset_of(io_port).is_some() => self.handle_addr_read(&mut value),
399            _ if self.pio_data.offset_of(io_port).is_some() => self.handle_data_read(&mut value),
400            _ => {
401                return IoResult::Err(IoError::InvalidRegister);
402            }
403        };
404
405        tracing::trace!(?io_port, ?res, ?data, "io port read");
406
407        match res {
408            IoResult::Ok => {
409                let value = shift_read_value(io_port, data.len(), value);
410                data.copy_from_slice(&value.as_bytes()[..data.len()]);
411                IoResult::Ok
412            }
413            IoResult::Err(e) => {
414                self.trace_error(e, "read");
415                // Regardless of the pci error that occurred we return all zeros.
416                // This is technically device-specific behavior, but it's what all
417                // hyper-v devices do and it's worked for us so far.
418                data.zero();
419                IoResult::Ok
420            }
421            IoResult::Defer(deferred_device_read) => {
422                let (bus_read, bus_token) = defer_read();
423                assert!(self.deferred_action.is_none());
424                self.deferred_action = Some(DeferredAction::Read {
425                    deferred_device_read,
426                    bus_read,
427                    read_len: data.len(),
428                    io_port,
429                    address: self.state.pio_addr_reg.address(),
430                });
431                if let Some(waker) = self.waker.take() {
432                    waker.wake();
433                }
434                IoResult::Defer(bus_token)
435            }
436        }
437    }
438
439    fn io_write(&mut self, io_port: u16, data: &[u8]) -> IoResult {
440        if !matches!(data.len(), 1 | 2 | 4) {
441            return IoResult::Err(IoError::InvalidAccessSize);
442        }
443
444        let new_value = {
445            let mut temp: u32 = 0;
446            temp.as_mut_bytes()[..data.len()].copy_from_slice(data);
447            temp
448        };
449
450        tracing::trace!(?io_port, data = ?new_value, "io port write");
451
452        match io_port {
453            _ if self.pio_addr.offset_of(io_port).is_some() => {
454                // In theory, only 4-byte accesses are valid here, but
455                // RedHat Linux modifies the bottom byte of the PCI
456                // configuration address by using a 1-byte access
457                let v = if data.len() == 4 {
458                    new_value
459                } else {
460                    let mut old_value = 0;
461                    self.handle_addr_read(&mut old_value).unwrap();
462                    match data.len() {
463                        2 => (old_value & 0xFFFF0000) | (new_value & 0xFFFF),
464                        1 => (old_value & 0xFFFFFF00) | (new_value & 0xFF),
465                        _ => unreachable!(),
466                    }
467                };
468
469                self.handle_addr_write(v)
470            }
471            _ if self.pio_data.offset_of(io_port).is_some() => {
472                let merged_value = if data.len() == 4 {
473                    new_value
474                } else {
475                    // If the access isn't a double word, read in the old data
476                    // to form a full word.
477                    //
478                    // Note that this isn't *really* correct, because reading
479                    // bits may have a side-effect. Also, writing to bits that
480                    // weren't actually written to may have side-effects...
481                    //
482                    // However, this technique appears to work fine for
483                    // everything we've encountered so far ¯\_(ツ)_/¯
484                    let mut old_value = 0;
485                    match self.handle_data_read(&mut old_value) {
486                        IoResult::Ok => {
487                            combine_old_new_values(io_port, old_value, new_value, data.len())
488                        }
489                        IoResult::Err(e) => {
490                            self.trace_error(e, "read for undersized write");
491                            // Regardless of the pci error that occurred, we return all zeros.
492                            // This is technically device-specific behavior, but it's what all
493                            // hyper-v devices do and it's worked for us so far.
494                            0
495                        }
496                        IoResult::Defer(deferred_device_read) => {
497                            let (bus_write, bus_token) = defer_write();
498                            assert!(self.deferred_action.is_none());
499                            self.deferred_action = Some(DeferredAction::ReadForWrite {
500                                deferred_device_read,
501                                bus_write,
502                                write_len: data.len(),
503                                io_port,
504                                new_value,
505                                address: self.state.pio_addr_reg.address(),
506                            });
507                            if let Some(waker) = self.waker.take() {
508                                waker.wake();
509                            }
510                            return IoResult::Defer(bus_token);
511                        }
512                    }
513                };
514
515                let write_result = self.handle_data_write(merged_value);
516                match write_result {
517                    IoResult::Err(e) => {
518                        self.trace_error(e, "write");
519                        IoResult::Ok
520                    }
521                    IoResult::Ok | IoResult::Defer(_) => {
522                        // If the write was successful we're all set.
523                        // If the write is deferred we have no extra work to do after
524                        // it resolves, unlike with read, so we can just return it and
525                        // let the motherboard poll.
526                        write_result
527                    }
528                }
529            }
530            _ => IoResult::Err(IoError::InvalidRegister),
531        }
532    }
533}
534
535impl PollDevice for GenericPciBus {
536    fn poll_device(&mut self, cx: &mut Context<'_>) {
537        self.waker = Some(cx.waker().clone());
538        if let Some(action) = self.deferred_action.take() {
539            match action {
540                DeferredAction::Read {
541                    mut deferred_device_read,
542                    bus_read,
543                    read_len,
544                    io_port,
545                    address,
546                } => {
547                    let mut buf = 0;
548                    if let Poll::Ready(res) = deferred_device_read.poll_read(cx, buf.as_mut_bytes())
549                    {
550                        let value = match res {
551                            Ok(()) => buf,
552                            Err(e) => {
553                                self.trace_recv_error(e, "deferred read");
554                                0
555                            }
556                        };
557                        let value = shift_read_value(io_port, read_len, value);
558                        bus_read.complete(&value.as_bytes()[..read_len]);
559                    } else {
560                        self.deferred_action = Some(DeferredAction::Read {
561                            deferred_device_read,
562                            bus_read,
563                            read_len,
564                            io_port,
565                            address,
566                        });
567                    }
568                }
569                DeferredAction::ReadForWrite {
570                    mut deferred_device_read,
571                    bus_write,
572                    write_len,
573                    io_port,
574                    new_value,
575                    address,
576                } => {
577                    let mut buf = 0;
578                    if let Poll::Ready(res) = deferred_device_read.poll_read(cx, buf.as_mut_bytes())
579                    {
580                        let old_value = match res {
581                            Ok(()) => buf,
582                            Err(e) => {
583                                self.trace_recv_error(e, "deferred read for write");
584                                0
585                            }
586                        };
587                        let merged_value =
588                            combine_old_new_values(io_port, old_value, new_value, write_len);
589                        match self.handle_data_write(merged_value) {
590                            IoResult::Ok => {
591                                bus_write.complete();
592                            }
593                            IoResult::Err(e) => {
594                                self.trace_error(e, "write");
595                                bus_write.complete();
596                            }
597                            IoResult::Defer(deferred_device_write) => {
598                                self.deferred_action = Some(DeferredAction::Write {
599                                    deferred_device_write,
600                                    bus_write,
601                                    value: merged_value,
602                                    address,
603                                });
604                                cx.waker().wake_by_ref();
605                            }
606                        }
607                    } else {
608                        self.deferred_action = Some(DeferredAction::ReadForWrite {
609                            deferred_device_read,
610                            bus_write,
611                            write_len,
612                            io_port,
613                            new_value,
614                            address,
615                        });
616                    }
617                }
618                DeferredAction::Write {
619                    mut deferred_device_write,
620                    bus_write,
621                    value,
622                    address,
623                } => {
624                    if let Poll::Ready(res) = deferred_device_write.poll_write(cx) {
625                        match res {
626                            Ok(()) => {}
627                            Err(e) => {
628                                self.trace_recv_error(e, "deferred write");
629                            }
630                        }
631                        bus_write.complete();
632                    } else {
633                        self.deferred_action = Some(DeferredAction::Write {
634                            deferred_device_write,
635                            bus_write,
636                            value,
637                            address,
638                        });
639                    }
640                }
641            }
642        }
643    }
644}
645
// The 32-bit port-io address register, which selects the bus / device /
// function and the config space register that data-port accesses target.
//
// NOTE(review): fields are presumably laid out least-significant-bit first
// (register in bits 0-7 ... enabled in bit 31) - confirm against the
// `bitfield_struct` documentation.
#[rustfmt::skip]
#[derive(Inspect)]
#[bitfield(u32)]
struct AddressRegister {
    // Config space register offset (the low two bits are cleared by `fixup`).
    #[bits(8)] register: u8,
    // Function number within the device.
    #[bits(3)] function: u8,
    // Device number on the bus.
    #[bits(5)] device: u8,
    // Bus number.
    #[bits(8)] bus: u8,
    // Reserved bits (cleared by `fixup`).
    #[bits(7)] reserved: u8,
    // When clear, data-port accesses are not forwarded to any device.
    #[bits(1)] enabled: bool,
}
657
658impl AddressRegister {
659    fn address(&self) -> PciAddr {
660        PciAddr {
661            bus: self.bus(),
662            device: self.device(),
663            function: self.function(),
664        }
665    }
666
667    /// Set all reserved / zero bits to zero
668    fn fixup(&mut self) {
669        // the register accessed is always DWORD aligned
670        // (the low two bits are hard-coded to 0)
671        self.set_register(self.register() & !0b11);
672        self.set_reserved(0);
673    }
674}
675
676impl core::fmt::Display for AddressRegister {
677    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
678        write!(f, "{}@{:04x}", self.address(), self.register())
679    }
680}
681
682mod save_restore {
683    use super::*;
684    use thiserror::Error;
685    use vmcore::save_restore::RestoreError;
686    use vmcore::save_restore::SaveError;
687    use vmcore::save_restore::SaveRestore;
688
689    mod state {
690        use mesh::payload::Protobuf;
691        use vmcore::save_restore::SavedStateRoot;
692
693        #[derive(Protobuf, SavedStateRoot)]
694        #[mesh(package = "pci.bus")]
695        pub struct SavedState {
696            #[mesh(1)]
697            pub pio_addr_reg: u32,
698        }
699    }
700
701    #[derive(Debug, Error)]
702    enum GenericPciBusRestoreError {
703        #[error("saved address contained non-zero reserved bits")]
704        AddressNonZeroReserved,
705        #[error("saved address contained non-dword aligned register bits")]
706        AddressNotDwordAligned,
707    }
708
709    impl SaveRestore for GenericPciBus {
710        type SavedState = state::SavedState;
711
712        fn save(&mut self) -> Result<Self::SavedState, SaveError> {
713            let GenericPciBusState { pio_addr_reg } = self.state;
714
715            let saved_state = state::SavedState {
716                pio_addr_reg: pio_addr_reg.into(),
717            };
718
719            Ok(saved_state)
720        }
721
722        fn restore(&mut self, state: Self::SavedState) -> Result<(), RestoreError> {
723            let state::SavedState { pio_addr_reg } = state;
724
725            self.state = GenericPciBusState {
726                pio_addr_reg: pio_addr_reg.into(),
727            };
728
729            // saved state sanity checks
730            {
731                if self.state.pio_addr_reg.reserved() != 0 {
732                    return Err(RestoreError::InvalidSavedState(
733                        GenericPciBusRestoreError::AddressNonZeroReserved.into(),
734                    ));
735                }
736
737                if self.state.pio_addr_reg.register() & 0b11 != 0 {
738                    return Err(RestoreError::InvalidSavedState(
739                        GenericPciBusRestoreError::AddressNotDwordAligned.into(),
740                    ));
741                }
742            }
743
744            Ok(())
745        }
746    }
747}