// membacking/mapping_manager/va_mapper.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Implements the VA mapper, which maintains a linear virtual address space for
5//! all memory mapped into a partition.
6//!
7//! The VA mapper sends messages to the mapping manager to request mappings for
8//! specific address ranges, on demand. The mapping manager later sends
9//! invalidation requests back when tearing down mappings, e.g. when some device
10//! memory is unmapped from the partition.
11//!
12//! This lazy approach is taken to avoid having to keep each VA mapper
13//! up-to-date with all mappings at all times.
14//!
15//! TODO: This is a bit dubious because the backing hypervisor will not
16//! necessarily propagate a page fault. E.g., KVM will just fail the VP. So at
17//! least for the mapper used by the partition itself, this optimization
18//! probably needs to be removed and replaced with a guarantee that replacement
19//! mappings are established immediately (and atomically?) instead of just by
20//! invalidating the existing mappings.
21
22// UNSAFETY: Implementing the unsafe GuestMemoryAccess trait by calling unsafe
23// low level memory manipulation functions.
24#![expect(unsafe_code)]
25
26use super::manager::DmaRegionProvider;
27use super::manager::MapperId;
28use super::manager::MapperRequest;
29use super::manager::MappingParams;
30use super::manager::MappingRequest;
31use crate::RemoteProcess;
32use futures::executor::block_on;
33use guestmem::GuestMemoryAccess;
34use guestmem::GuestMemorySharing;
35use guestmem::PageFaultAction;
36use guestmem::PageFaultError;
37use memory_range::MemoryRange;
38use mesh::rpc::RpcError;
39use mesh::rpc::RpcSend;
40use parking_lot::Mutex;
41use sparse_mmap::SparseMapping;
42use std::ptr::NonNull;
43use std::sync::Arc;
44use std::thread::JoinHandle;
45use thiserror::Error;
46
/// A linear virtual address space for all memory mapped into a partition.
///
/// Mappings are established lazily on demand by a background thread that
/// communicates with the mapping manager; see the module documentation.
pub struct VaMapper {
    /// State shared with the background mapper thread.
    inner: Arc<MapperInner>,
    /// The remote process hosting the VA reservation, if any.
    process: Option<RemoteProcess>,
    /// When true, the mapping is backed by private anonymous memory rather
    /// than file mappings provided by the mapping manager.
    private_ram: bool,
    /// The background thread servicing mapping/invalidation requests; kept
    /// alive for the lifetime of this object.
    _thread: JoinHandle<()>,
}
53
54impl std::fmt::Debug for VaMapper {
55    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56        f.debug_struct("VaMapper")
57            .field("inner", &self.inner)
58            .field("_thread", &self._thread)
59            .finish()
60    }
61}
62
/// State shared between the [`VaMapper`] handle and its background task.
#[derive(Debug)]
struct MapperInner {
    /// The reserved VA range into which mappings are installed.
    mapping: SparseMapping,
    /// Pending mapping waiters. Set to `None` once the mapper task exits,
    /// after which no new waiters are accepted.
    waiters: Mutex<Option<Vec<MapWaiter>>>,
    /// Channel for sending requests to the mapping manager.
    req_send: mesh::Sender<MappingRequest>,
    /// This mapper's identity as registered with the mapping manager.
    id: MapperId,
}
70
/// A pending wait for mappings covering a range, registered by
/// `MapperInner::request_mapping`.
#[derive(Debug)]
struct MapWaiter {
    /// The portion of the requested range not yet satisfied.
    range: MemoryRange,
    /// Whether the waiter requires writable mappings.
    writable: bool,
    /// Signaled with `true` on success, `false` if the range cannot be
    /// mapped (or not mapped writably when required).
    done: mesh::OneshotSender<bool>,
}
77
78impl MapWaiter {
79    fn complete(&mut self, range: MemoryRange, writable: Option<bool>) -> Option<bool> {
80        if range.contains_addr(self.range.start()) {
81            if writable.is_none() || (self.writable && writable == Some(false)) {
82                return Some(false);
83            }
84            let new_start = self.range.end().min(range.end());
85            let remaining = MemoryRange::new(new_start..self.range.end());
86            if remaining.is_empty() {
87                return Some(true);
88            }
89            tracing::debug!(%remaining, "waiting for more");
90            self.range = remaining;
91        }
92        None
93    }
94}
95
/// The background task that services mapping and invalidation requests sent
/// by the mapping manager.
struct MapperTask {
    // Shared with the owning `VaMapper`.
    inner: Arc<MapperInner>,
}
99
impl MapperTask {
    /// Runs the request loop until the mapping manager drops its end of the
    /// channel, then tears down all waiters and mappings.
    async fn run(mut self, mut req_recv: mesh::Receiver<MapperRequest>) {
        while let Ok(req) = req_recv.recv().await {
            match req {
                // Tear down an existing mapping; the manager guarantees the
                // range is within the reservation.
                MapperRequest::Unmap(rpc) => rpc.handle_sync(|range| {
                    tracing::debug!(%range, "invalidate received");
                    self.inner
                        .mapping
                        .unmap(range.start() as usize, range.len() as usize)
                        .expect("invalidate request should be valid");
                }),
                // Install a new mapping and wake anyone waiting on it.
                MapperRequest::Map(MappingParams {
                    range,
                    mappable,
                    writable,
                    file_offset,
                    ..
                }) => {
                    tracing::debug!(%range, "mapping received for range");

                    self.inner
                        .mapping
                        .map_file(
                            range.start() as usize,
                            range.len() as usize,
                            &mappable,
                            file_offset,
                            writable,
                        )
                        .expect("oom mapping file");

                    self.wake_waiters(range, Some(writable));
                }
                MapperRequest::NoMapping(range) => {
                    // Wake up waiters. They'll see a failure when they try to
                    // access the VA.
                    tracing::debug!(%range, "no mapping received for range");
                    self.wake_waiters(range, None);
                }
            }
        }
        // Don't allow more waiters.
        *self.inner.waiters.lock() = None;
        // Invalidate everything.
        let _ = self.inner.mapping.unmap(0, self.inner.mapping.len());
    }

    /// Completes any waiters that overlap `range`. `writable` is `Some` with
    /// the new mapping's writability, or `None` if the range is unmappable.
    fn wake_waiters(&mut self, range: MemoryRange, writable: Option<bool>) {
        let mut waiters = self.inner.waiters.lock();
        let waiters = waiters.as_mut().unwrap();

        // Finished waiters are removed via swap_remove, so only advance the
        // index when the current waiter is retained.
        let mut i = 0;
        while i < waiters.len() {
            if let Some(success) = waiters[i].complete(range, writable) {
                waiters.swap_remove(i).done.send(success);
            } else {
                i += 1;
            }
        }
    }
}
161
/// Errors that can occur while creating a [`VaMapper`].
#[derive(Debug, Error)]
pub enum VaMapperError {
    #[error("failed to communicate with the memory manager")]
    MemoryManagerGone(#[source] RpcError),
    #[error("failed to reserve address space")]
    Reserve(#[source] std::io::Error),
    #[error("remote mappers are not supported in private memory mode")]
    RemoteWithPrivateMemory,
}
171
/// Error returned when no mapping could be established for the given range.
#[derive(Debug, Error)]
#[error("no mapping for {0}")]
pub struct NoMapping(MemoryRange);
175
176impl MapperInner {
177    async fn request_mapping(&self, range: MemoryRange, writable: bool) -> Result<(), NoMapping> {
178        let (send, recv) = mesh::oneshot();
179        self.waiters
180            .lock()
181            .as_mut()
182            .ok_or(NoMapping(range))?
183            .push(MapWaiter {
184                range,
185                writable,
186                done: send,
187            });
188
189        tracing::debug!(%range, "waiting for mappings");
190        self.req_send
191            .send(MappingRequest::SendMappings(self.id, range));
192        match recv.await {
193            Ok(true) => Ok(()),
194            Ok(false) | Err(_) => Err(NoMapping(range)),
195        }
196    }
197}
198
impl VaMapper {
    /// Creates a new VA mapper with a `len`-byte reservation, registering it
    /// with the mapping manager reachable via `req_send`.
    ///
    /// If `remote_process` is set, the reservation is created in that process
    /// (Windows only; unreachable on other platforms).
    pub(crate) async fn new(
        req_send: mesh::Sender<MappingRequest>,
        len: u64,
        remote_process: Option<RemoteProcess>,
        private_ram: bool,
    ) -> Result<Self, VaMapperError> {
        let mapping = match &remote_process {
            None => SparseMapping::new(len as usize),
            Some(process) => match process {
                #[cfg(not(windows))]
                _ => unreachable!(),
                #[cfg(windows)]
                process => SparseMapping::new_remote(
                    process.as_handle().try_clone_to_owned().unwrap().into(),
                    None,
                    len as usize,
                ),
            },
        }
        .map_err(VaMapperError::Reserve)?;

        // Name the VA reservation so it's identifiable in /proc/{pid}/smaps.
        mapping.set_name(0, mapping.len(), "guest-memory");

        // Register with the mapping manager to get an id for future requests.
        let (send, req_recv) = mesh::channel();
        let id = req_send
            .call(MappingRequest::AddMapper, send)
            .await
            .map_err(VaMapperError::MemoryManagerGone)?;

        let inner = Arc::new(MapperInner {
            mapping,
            waiters: Mutex::new(Some(Vec::new())),
            req_send,
            id,
        });

        // FUTURE: use a task once we resolve the block_ons in the
        // GuestMemoryAccess implementation.
        let thread = std::thread::Builder::new()
            .name("mapper".to_owned())
            .spawn({
                let runner = MapperTask {
                    inner: inner.clone(),
                };
                || block_on(runner.run(req_recv))
            })
            .unwrap();

        Ok(VaMapper {
            inner,
            process: remote_process,
            private_ram,
            _thread: thread,
        })
    }

    /// Ensures a mapping has been established for the given range.
    pub async fn ensure_mapped(&self, range: MemoryRange) -> Result<(), NoMapping> {
        self.inner.request_mapping(range, false).await
    }

    /// Returns the base pointer of the VA reservation.
    ///
    /// For a remote mapping, this address is only meaningful in the remote
    /// process's address space.
    pub fn as_ptr(&self) -> *mut u8 {
        self.inner.mapping.as_ptr().cast()
    }

    /// Returns the length of the VA reservation in bytes.
    pub fn len(&self) -> usize {
        self.inner.mapping.len()
    }

    /// Returns the remote process hosting the mapping, if any.
    pub fn process(&self) -> Option<&RemoteProcess> {
        self.process.as_ref()
    }

    /// Allocates private anonymous memory for a range within the mapping.
    ///
    /// This replaces the placeholder at the given offset with committed
    /// anonymous memory. Only valid when private_ram mode is enabled.
    pub fn alloc_range(&self, offset: usize, len: usize) -> Result<(), std::io::Error> {
        assert!(self.private_ram, "alloc_range requires private RAM mode");
        self.inner.mapping.alloc(offset, len)
    }

    /// Names a range within the mapping for debugging (visible in smaps).
    pub fn set_range_name(&self, offset: usize, len: usize, name: &str) {
        self.inner.mapping.set_name(offset, len, name);
    }

    /// Marks a range as eligible for Transparent Huge Pages.
    ///
    /// Only valid when private_ram mode is enabled.
    #[cfg(target_os = "linux")]
    pub fn madvise_hugepage(&self, offset: usize, len: usize) -> Result<(), std::io::Error> {
        assert!(
            self.private_ram,
            "madvise_hugepage requires private RAM mode"
        );
        self.inner.mapping.madvise_hugepage(offset, len)
    }

    /// Decommits a range of private RAM, releasing physical pages back to the
    /// host.
    ///
    /// Only valid when private_ram mode is enabled.
    #[expect(dead_code)] // Will be used by ballooning / memory hot-remove.
    pub fn decommit(&self, offset: usize, len: usize) -> Result<(), std::io::Error> {
        assert!(self.private_ram, "decommit requires private RAM mode");
        self.inner.mapping.decommit(offset, len)
    }
}
310
/// SAFETY: the underlying VA mapping is guaranteed to be valid for the lifetime
/// of this object.
unsafe impl GuestMemoryAccess for VaMapper {
    fn mapping(&self) -> Option<NonNull<u8>> {
        // No one should be using this as a GuestMemoryAccess for remote
        // mappings, but it's convenient to have the same type for both local
        // and remote mappings for the sake of simplicity in
        // `PartitionRegionMapper`.
        assert!(self.inner.mapping.is_local());

        NonNull::new(self.inner.mapping.as_ptr().cast())
    }

    fn max_address(&self) -> u64 {
        self.inner.mapping.len() as u64
    }

    /// Handles a page fault on the mapping, either by committing private RAM
    /// (Windows private-RAM mode) or by requesting mappings from the mapping
    /// manager (file-backed mode).
    fn page_fault(
        &self,
        address: u64,
        len: usize,
        write: bool,
        bitmap_failure: bool,
    ) -> PageFaultAction {
        assert!(!bitmap_failure, "bitmaps are not used");

        if self.private_ram {
            // Private RAM mode: commit the page(s) directly.
            #[cfg(windows)]
            {
                // Commit in 64KB-aligned chunks to amortize overhead.
                let commit_start = address & !0xFFFF; // round down to 64KB
                let commit_end = ((address + len as u64) + 0xFFFF) & !0xFFFF; // round up
                // Don't commit past the end of the reservation.
                let commit_end = commit_end.min(self.inner.mapping.len() as u64);
                let commit_len = (commit_end - commit_start) as usize;

                if let Err(err) = self.inner.mapping.commit(commit_start as usize, commit_len) {
                    return PageFaultAction::Fail(PageFaultError::new(
                        guestmem::GuestMemoryErrorKind::Other,
                        err,
                    ));
                }
                return PageFaultAction::Retry;
            }
            #[cfg(unix)]
            {
                // On Linux, the kernel handles page faults transparently.
                // If we get here, something is wrong.
                return PageFaultAction::Fail(PageFaultError::new(
                    guestmem::GuestMemoryErrorKind::Other,
                    std::io::Error::other("unexpected page fault in private RAM mode on Linux"),
                ));
            }
        }

        // File-backed path: request mapping from MappingManager.
        // `block_on` is OK to call here (will not deadlock) because this is
        // never called from the page fault handler thread or any threads it
        // depends on.
        //
        // Removing this `block_on` would make all guest memory access `async`,
        // which would be a difficult change.
        if let Err(err) = block_on(
            self.inner
                .request_mapping(MemoryRange::bounding(address..address + len as u64), write),
        ) {
            return PageFaultAction::Fail(PageFaultError::new(
                guestmem::GuestMemoryErrorKind::OutOfRange,
                err,
            ));
        }
        PageFaultAction::Retry
    }

    /// Returns the sharing mechanism for DMA regions, or `None` in private
    /// RAM mode (private memory cannot be shared via the mapping manager).
    fn sharing(&self) -> Option<GuestMemorySharing> {
        if self.private_ram {
            return None;
        }
        Some(GuestMemorySharing::new(DmaRegionProvider {
            req_send: self.inner.req_send.clone(),
        }))
    }
}
394
#[cfg(test)]
mod tests {
    // These tests exercise the SparseMapping commit/decommit primitives that
    // the private-RAM mode of VaMapper builds on, using local mappings only.

    use sparse_mmap::SparseMapping;

    /// Tests that private RAM pages can be allocated, written to, and read from.
    #[test]
    fn test_private_ram_alloc_write_read() {
        let page_size = SparseMapping::page_size();
        let mapping = SparseMapping::new(4 * page_size).unwrap();

        // Allocate (commit) the first two pages.
        mapping.alloc(0, 2 * page_size).unwrap();

        // Write and read through SparseMapping methods.
        let data = [0xABu8; 128];
        mapping.write_at(0, &data).unwrap();

        let mut buf = [0u8; 128];
        mapping.read_at(0, &mut buf).unwrap();
        assert_eq!(buf, data);

        // Verify zeros at an untouched offset within committed range.
        let mut zero_buf = [0xFFu8; 64];
        mapping.read_at(page_size, &mut zero_buf).unwrap();
        assert!(
            zero_buf.iter().all(|&b| b == 0),
            "untouched committed memory should be zeros"
        );
    }

    /// Tests that decommitting pages releases their contents (zeros on re-read on Linux).
    #[test]
    fn test_private_ram_decommit_zeros() {
        let page_size = SparseMapping::page_size();
        let mapping = SparseMapping::new(4 * page_size).unwrap();

        // Commit and write data.
        mapping.alloc(0, 2 * page_size).unwrap();
        let pattern = vec![0xABu8; 64];
        mapping.write_at(0, &pattern).unwrap();
        mapping.write_at(page_size, &pattern).unwrap();

        // Decommit first page.
        mapping.decommit(0, page_size).unwrap();

        // On Linux, decommitted pages read as zeros.
        #[cfg(unix)]
        {
            let mut buf = vec![0xFFu8; 64];
            mapping.read_at(0, &mut buf).unwrap();
            assert!(
                buf.iter().all(|&b| b == 0),
                "decommitted page should be zeros on Linux"
            );
        }

        // Second page should still have its data.
        let mut buf2 = vec![0u8; 64];
        mapping.read_at(page_size, &mut buf2).unwrap();
        assert_eq!(buf2, pattern);
    }

    /// Tests that recommitting pages after decommit provides zeroed memory.
    #[test]
    fn test_private_ram_recommit_after_decommit() {
        let page_size = SparseMapping::page_size();
        let mapping = SparseMapping::new(4 * page_size).unwrap();

        // Commit, write, decommit, recommit.
        mapping.alloc(0, page_size).unwrap();
        let pattern = vec![0xCDu8; 64];
        mapping.write_at(0, &pattern).unwrap();

        mapping.decommit(0, page_size).unwrap();
        mapping.commit(0, page_size).unwrap();

        // After recommit, the page should be zeros (old data is gone).
        let mut buf = vec![0xFFu8; 64];
        mapping.read_at(0, &mut buf).unwrap();
        assert!(
            buf.iter().all(|&b| b == 0),
            "recommitted page should be zeros"
        );

        // Can write and read new data.
        let new_data = vec![0xEFu8; 64];
        mapping.write_at(0, &new_data).unwrap();
        let mut buf2 = vec![0u8; 64];
        mapping.read_at(0, &mut buf2).unwrap();
        assert_eq!(buf2, new_data);
    }

    /// Tests that commit is idempotent (committing already-committed pages is
    /// a no-op).
    #[test]
    fn test_private_ram_commit_idempotent() {
        let page_size = SparseMapping::page_size();
        let mapping = SparseMapping::new(4 * page_size).unwrap();

        // Alloc then commit the same range again.
        mapping.alloc(0, 2 * page_size).unwrap();
        mapping.commit(0, 2 * page_size).unwrap();
        mapping.commit(0, page_size).unwrap();

        // Write and read should work.
        let pattern = vec![0xEFu8; 64];
        mapping.write_at(0, &pattern).unwrap();
        let mut buf = vec![0u8; 64];
        mapping.read_at(0, &mut buf).unwrap();
        assert_eq!(buf, pattern);
    }
}
507}