membacking/memory_manager/
mod.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! OpenVMM's memory manager.
5
6mod device_memory;
7
8pub use device_memory::DeviceMemoryMapper;
9
10use crate::RemoteProcess;
11use crate::mapping_manager::Mappable;
12use crate::mapping_manager::MappingManager;
13use crate::mapping_manager::MappingManagerClient;
14use crate::mapping_manager::VaMapper;
15use crate::mapping_manager::VaMapperError;
16use crate::partition_mapper::PartitionMapper;
17use crate::region_manager::MapParams;
18use crate::region_manager::RegionHandle;
19use crate::region_manager::RegionManager;
20use guestmem::GuestMemory;
21use hvdef::Vtl;
22use inspect::Inspect;
23use memory_range::MemoryRange;
24use mesh::MeshPayload;
25use pal_async::DefaultPool;
26use std::sync::Arc;
27use std::thread::JoinHandle;
28use thiserror::Error;
29use vm_topology::memory::MemoryLayout;
30
/// The OpenVMM memory manager.
#[derive(Debug, Inspect)]
pub struct GuestMemoryManager {
    /// Guest RAM allocation. None in private memory mode.
    #[inspect(skip)]
    guest_ram: Option<Mappable>,

    /// The RAM regions created at build time, shared with
    /// [`RamVisibilityControl`] so visibility can be changed later.
    #[inspect(skip)]
    ram_regions: Arc<Vec<RamRegion>>,

    /// Manages VA mappers for this and remote processes.
    #[inspect(flatten)]
    mapping_manager: MappingManager,

    /// Manages the regions (RAM, device memory) of the guest address space.
    #[inspect(flatten)]
    region_manager: RegionManager,

    /// The local process's VA mapper, reused when attaching a partition
    /// without a remote process.
    #[inspect(skip)]
    va_mapper: Arc<VaMapper>,

    /// The thread servicing memory requests; held so it lives as long as the
    /// manager.
    #[inspect(skip)]
    _thread: JoinHandle<()>,

    /// Offset of the VTL0 alias map, if one was configured at build time.
    vtl0_alias_map_offset: Option<u64>,
    /// Whether partition mappings should be pinned in memory.
    pin_mappings: bool,
}
56
/// A RAM range paired with its region handle, kept so the region's mapping
/// can be changed (remapped/unmapped) after construction.
#[derive(Debug)]
struct RamRegion {
    /// The guest physical address range this region covers.
    range: MemoryRange,
    /// Handle used to map/unmap the region via the region manager.
    handle: RegionHandle,
}
62
/// Errors when attaching a partition to a [`GuestMemoryManager`].
#[derive(Error, Debug)]
pub enum PartitionAttachError {
    /// Failure to allocate a VA mapper.
    ///
    /// Returned when a VA range could not be reserved in the (possibly
    /// remote) process that will map the partition's memory.
    #[error("failed to reserve VA range for partition mapping")]
    VaMapper(#[source] VaMapperError),
    /// Failure to map memory into a partition.
    #[error("failed to attach partition to memory manager")]
    PartitionMapper(#[source] crate::partition_mapper::PartitionMapperError),
}
73
/// Errors creating a [`GuestMemoryManager`].
#[derive(Error, Debug)]
pub enum MemoryBuildError {
    /// RAM too large.
    ///
    /// The requested RAM size does not fit the allocator's size type.
    #[error("ram size {0} is too large")]
    RamTooLarge(u64),
    /// Couldn't allocate RAM.
    #[error("failed to allocate memory")]
    AllocationFailed(#[source] std::io::Error),
    /// Couldn't allocate VA mapper.
    #[error("failed to create VA mapper")]
    VaMapper(#[source] VaMapperError),
    /// Memory layout incompatible with VTL0 alias map.
    ///
    /// RAM extends above the configured alias map offset.
    #[error("not enough guest address space available for the vtl0 alias map")]
    AliasMapWontFit,
    /// Memory layout incompatible with x86 legacy support.
    #[error("x86 support requires RAM to start at 0 and contain at least 1MB")]
    InvalidRamForX86,
    /// Private memory is incompatible with x86 legacy support.
    #[error("private memory is incompatible with x86 legacy support")]
    PrivateMemoryWithLegacy,
    /// Private memory is incompatible with existing memory backing.
    #[error("private memory is incompatible with existing memory backing")]
    PrivateMemoryWithExistingBacking,
    /// Failed to allocate private RAM range.
    #[error("failed to allocate private RAM range {1}")]
    PrivateRamAlloc(#[source] std::io::Error, MemoryRange),
    /// THP requires private memory mode.
    #[error("transparent huge pages requires private memory mode")]
    ThpWithoutPrivateMemory,
    /// THP is only supported on Linux.
    #[error("transparent huge pages is only supported on Linux")]
    ThpUnsupportedPlatform,
}
108
/// A builder for [`GuestMemoryManager`].
pub struct GuestMemoryBuilder {
    /// Existing backing to reuse instead of allocating fresh guest RAM.
    existing_mapping: Option<SharedMemoryBacking>,
    /// Offset of the VTL0 alias map, if enabled.
    vtl0_alias_map: Option<u64>,
    /// Whether to prefetch RAM mappings when mapping regions.
    prefetch_ram: bool,
    /// Whether to pin mappings in memory (for device assignment).
    pin_mappings: bool,
    /// Whether to split low memory into separately controllable RAM regions
    /// (VGA VRAM, PAM ranges).
    x86_legacy_support: bool,
    /// Whether to back guest RAM with private anonymous memory.
    private_memory: bool,
    /// Whether to mark private RAM as eligible for transparent huge pages.
    transparent_hugepages: bool,
}
119
120impl GuestMemoryBuilder {
121    /// Returns a new builder.
122    pub fn new() -> Self {
123        Self {
124            existing_mapping: None,
125            vtl0_alias_map: None,
126            pin_mappings: false,
127            prefetch_ram: false,
128            x86_legacy_support: false,
129            private_memory: false,
130            transparent_hugepages: false,
131        }
132    }
133
134    /// Specifies an existing memory backing to use.
135    pub fn existing_backing(mut self, mapping: Option<SharedMemoryBacking>) -> Self {
136        self.existing_mapping = mapping;
137        self
138    }
139
140    /// Specifies the offset of the VTL0 alias map, if enabled for VTL2. This is
141    /// a mirror of VTL0 memory into a high portion of the VM's physical address
142    /// space.
143    pub fn vtl0_alias_map(mut self, offset: Option<u64>) -> Self {
144        self.vtl0_alias_map = offset;
145        self
146    }
147
148    /// Specify whether to pin mappings in memory. This is used to support
149    /// device assignment for devices that require the IOMMU to be programmed
150    /// for all addresses.
151    pub fn pin_mappings(mut self, enable: bool) -> Self {
152        self.pin_mappings = enable;
153        self
154    }
155
156    /// Specify whether to prefetch RAM mappings. This improves boot performance
157    /// by reducing memory intercepts at the cost of pre-allocating all of RAM.
158    pub fn prefetch_ram(mut self, enable: bool) -> Self {
159        self.prefetch_ram = enable;
160        self
161    }
162
163    /// Enables legacy x86 support.
164    ///
165    /// When set, create separate RAM regions for the various low memory ranges
166    /// that are special on x86 platforms. Specifically:
167    ///
168    /// 1. Create a separate RAM region for the VGA VRAM window:
169    ///    0xa0000-0xbffff.
170    /// 2. Create separate RAM regions within 0xc0000-0xfffff for control by PAM
171    ///    registers.
172    ///
173    /// The caller can use [`RamVisibilityControl`] to adjust the visibility of
174    /// these ranges.
175    pub fn x86_legacy_support(mut self, enable: bool) -> Self {
176        self.x86_legacy_support = enable;
177        self
178    }
179
180    /// Enables private anonymous memory for guest RAM.
181    ///
182    /// When set, guest RAM is backed by anonymous pages (`mmap
183    /// MAP_ANONYMOUS` on Linux, `VirtualAlloc` on Windows) rather than
184    /// shared file-backed sections. This supports decommit to release
185    /// physical pages back to the host.
186    ///
187    /// This is incompatible with [`x86_legacy_support`](Self::x86_legacy_support)
188    /// and [`existing_backing`](Self::existing_backing).
189    pub fn private_memory(mut self, enable: bool) -> Self {
190        self.private_memory = enable;
191        self
192    }
193
194    /// Enables Transparent Huge Pages for guest RAM.
195    ///
196    /// When set, `madvise(MADV_HUGEPAGE)` is called on private RAM allocations
197    /// to allow khugepaged to collapse 4K pages into 2MB huge pages.
198    /// Requires [`private_memory`](Self::private_memory) and Linux; `build()`
199    /// will return an error if either condition is not met.
200    pub fn transparent_hugepages(mut self, enable: bool) -> Self {
201        self.transparent_hugepages = enable;
202        self
203    }
204
    /// Builds the memory backing, allocating memory if existing memory was not
    /// provided by [`existing_backing`](Self::existing_backing).
    pub async fn build(
        self,
        mem_layout: &MemoryLayout,
    ) -> Result<GuestMemoryManager, MemoryBuildError> {
        // Validate private memory constraints.
        if self.private_memory {
            if self.x86_legacy_support {
                return Err(MemoryBuildError::PrivateMemoryWithLegacy);
            }
            if self.existing_mapping.is_some() {
                return Err(MemoryBuildError::PrivateMemoryWithExistingBacking);
            }
        }

        // Validate THP constraints.
        if self.transparent_hugepages {
            if !self.private_memory {
                return Err(MemoryBuildError::ThpWithoutPrivateMemory);
            }
            if !cfg!(target_os = "linux") {
                return Err(MemoryBuildError::ThpUnsupportedPlatform);
            }
        }

        // Total RAM size includes the optional VTL2 range beyond the base
        // layout.
        let ram_size = mem_layout.ram_size() + mem_layout.vtl2_range().map_or(0, |r| r.len());

        let memory: Option<Mappable> = if self.private_memory {
            // Private memory mode: no shared file-backed allocation.
            // RAM will be backed by anonymous pages in the VaMapper's SparseMapping.
            None
        } else if let Some(memory) = self.existing_mapping {
            Some(memory.guest_ram)
        } else {
            Some(
                sparse_mmap::alloc_shared_memory(
                    ram_size
                        .try_into()
                        .map_err(|_| MemoryBuildError::RamTooLarge(ram_size))?,
                    "guest-ram",
                )
                .map_err(MemoryBuildError::AllocationFailed)?
                .into(),
            )
        };

        // Spawn a thread to handle memory requests.
        //
        // FUTURE: move this to a task once the GuestMemory deadlocks are resolved.
        let (thread, spawner) = DefaultPool::spawn_on_thread("memory_manager");

        // The highest guest physical address the mapping manager must cover,
        // including the optional VTL2 range.
        let max_addr =
            (mem_layout.end_of_layout()).max(mem_layout.vtl2_range().map_or(0, |r| r.end()));

        // The alias map mirrors memory starting at `offset`, so all of guest
        // memory must fit below that offset.
        let vtl0_alias_map_offset = if let Some(offset) = self.vtl0_alias_map {
            if max_addr > offset {
                return Err(MemoryBuildError::AliasMapWontFit);
            }
            Some(offset)
        } else {
            None
        };

        let mapping_manager = MappingManager::new(&spawner, max_addr, self.private_memory);
        let va_mapper = mapping_manager
            .client()
            .new_mapper()
            .await
            .map_err(MemoryBuildError::VaMapper)?;

        let region_manager = RegionManager::new(&spawner, mapping_manager.client().clone());

        // One RAM range per layout entry, plus the VTL2 range if present.
        let mut ram_ranges = mem_layout
            .ram()
            .iter()
            .map(|x| x.range)
            .chain(mem_layout.vtl2_range())
            .collect::<Vec<_>>();

        if self.x86_legacy_support {
            if ram_ranges[0].start() != 0 || ram_ranges[0].end() < 0x100000 {
                return Err(MemoryBuildError::InvalidRamForX86);
            }

            // Split RAM ranges to support PAM registers and VGA RAM.
            let range_starts = [
                0,
                0xa0000,
                0xc0000,
                0xc4000,
                0xc8000,
                0xcc000,
                0xd0000,
                0xd4000,
                0xd8000,
                0xdc000,
                0xe0000,
                0xe4000,
                0xe8000,
                0xec000,
                0xf0000,
                0x100000,
                ram_ranges[0].end(),
            ];

            // Replace the first range with one subrange per consecutive pair
            // of boundaries above; the rest of the ranges are unchanged.
            ram_ranges.splice(
                0..1,
                range_starts
                    .iter()
                    .zip(range_starts.iter().skip(1))
                    .map(|(&start, &end)| MemoryRange::new(start..end)),
            );
        }

        // In private memory mode, eagerly commit all RAM ranges with
        // anonymous memory. alloc_range() handles both Linux (mmap MAP_FIXED)
        // and Windows (MEM_REPLACE_PLACEHOLDER).
        if self.private_memory {
            for range in &ram_ranges {
                va_mapper
                    .alloc_range(range.start() as usize, range.len() as usize)
                    .map_err(|e| MemoryBuildError::PrivateRamAlloc(e, *range))?;
                va_mapper.set_range_name(
                    range.start() as usize,
                    range.len() as usize,
                    "guest-ram-private",
                );
            }

            // Mark private RAM as THP-eligible so khugepaged can collapse
            // 4K pages into 2MB huge pages.
            #[cfg(target_os = "linux")]
            if self.transparent_hugepages {
                for range in &ram_ranges {
                    if let Err(e) =
                        va_mapper.madvise_hugepage(range.start() as usize, range.len() as usize)
                    {
                        // Best-effort: THP is an optimization, so log and
                        // continue rather than failing the build.
                        tracing::warn!(
                            error = &e as &dyn std::error::Error,
                            range = %range,
                            "failed to mark RAM as THP eligible"
                        );
                    }
                }
            }
        }

        // Create one region per RAM range. `start` tracks each range's offset
        // into the shared memory object (used in file-backed mode only).
        let mut ram_regions = Vec::new();
        let mut start = 0;
        for range in &ram_ranges {
            let region = region_manager
                .client()
                .new_region("ram".into(), *range, RAM_PRIORITY, true)
                .await
                .expect("regions cannot overlap yet");

            if let Some(ref memory) = memory {
                // File-backed mode: add mapping for this RAM range.
                region
                    .add_mapping(
                        MemoryRange::new(0..range.len()),
                        memory.clone(),
                        start,
                        true,
                    )
                    .await;
            }
            // In private_memory mode, skip add_mapping — no file-backed RAM.
            // The SparseMapping VA is already committed via alloc_range() above.

            region
                .map(MapParams {
                    writable: true,
                    executable: true,
                    // Prefetch only makes sense for file-backed RAM; private
                    // RAM was already committed above.
                    prefetch: self.prefetch_ram && !self.private_memory,
                })
                .await;

            ram_regions.push(RamRegion {
                range: *range,
                handle: region,
            });
            start += range.len();
        }

        let gm = GuestMemoryManager {
            guest_ram: memory,
            _thread: thread,
            ram_regions: Arc::new(ram_regions),
            mapping_manager,
            region_manager,
            va_mapper,
            vtl0_alias_map_offset,
            pin_mappings: self.pin_mappings,
        };
        Ok(gm)
    }
403}
404
/// The backing objects used to transfer guest memory between processes.
#[derive(Debug, MeshPayload)]
pub struct SharedMemoryBacking {
    /// The mappable handle/fd backing guest RAM.
    guest_ram: Mappable,
}
410
411impl SharedMemoryBacking {
412    /// Create a SharedMemoryBacking from a mappable handle/fd.
413    pub fn from_mappable(guest_ram: Mappable) -> Self {
414        Self { guest_ram }
415    }
416}
417
/// A mesh-serializable object for providing access to guest memory.
#[derive(Debug, MeshPayload)]
pub struct GuestMemoryClient {
    /// Client used to allocate a VA mapper in the receiving process.
    mapping_manager: MappingManagerClient,
}
423
424impl GuestMemoryClient {
425    /// Retrieves a [`GuestMemory`] object to access guest memory from this
426    /// process.
427    ///
428    /// This call will ensure only one VA mapper is allocated per process, so
429    /// this is safe to call many times without allocating tons of virtual
430    /// address space.
431    pub async fn guest_memory(&self) -> Result<GuestMemory, VaMapperError> {
432        Ok(GuestMemory::new(
433            "ram",
434            self.mapping_manager.new_mapper().await?,
435        ))
436    }
437}
438
// The region priority for RAM. Overrides anything else: 255 is the maximum
// `u8` priority, so RAM wins over any overlapping region.
const RAM_PRIORITY: u8 = 255;

// The region priority for device memory. The minimum priority, so device
// memory yields to overlapping higher-priority regions such as RAM.
const DEVICE_PRIORITY: u8 = 0;
444
445impl GuestMemoryManager {
446    /// Returns an object to access guest memory.
447    pub fn client(&self) -> GuestMemoryClient {
448        GuestMemoryClient {
449            mapping_manager: self.mapping_manager.client().clone(),
450        }
451    }
452
453    /// Returns an object to map device memory into the VM.
454    pub fn device_memory_mapper(&self) -> DeviceMemoryMapper {
455        DeviceMemoryMapper::new(self.region_manager.client().clone())
456    }
457
458    /// Returns an object for manipulating the visibility state of different RAM
459    /// regions.
460    pub fn ram_visibility_control(&self) -> RamVisibilityControl {
461        RamVisibilityControl {
462            regions: self.ram_regions.clone(),
463        }
464    }
465
466    /// Returns the shared memory resources that can be used to reconstruct the
467    /// memory backing.
468    ///
469    /// This can be used with [`GuestMemoryBuilder::existing_backing`] to create a
470    /// new memory manager with the same memory state. Only one instance of this
471    /// type should be managing a given memory backing at a time, though, or the
472    /// guest may see unpredictable results.
473    ///
474    /// Returns `None` in private memory mode, where there is no shared
475    /// file-backed allocation.
476    pub fn shared_memory_backing(&self) -> Option<SharedMemoryBacking> {
477        let guest_ram = self.guest_ram.clone()?;
478        Some(SharedMemoryBacking { guest_ram })
479    }
480
    /// Attaches the guest memory to a partition, mapping it to the guest
    /// physical address space.
    ///
    /// If `process` is provided, then allocate a VA range in that process for
    /// the guest memory, and map the memory into the partition from that
    /// process. This is necessary to work around WHP's lack of support for
    /// mapping multiple partitions from a single process.
    ///
    /// TODO: currently, all VTLs will get the same mappings--no support for
    /// per-VTL memory protections is supported.
    pub async fn attach_partition(
        &mut self,
        vtl: Vtl,
        partition: &Arc<dyn virt::PartitionMemoryMap>,
        process: Option<RemoteProcess>,
    ) -> Result<(), PartitionAttachError> {
        // Choose the VA mapper: allocate a new mapper in the remote process
        // if one was provided, otherwise use the local mapper created at
        // build time.
        let va_mapper = if let Some(process) = process {
            self.mapping_manager
                .client()
                .new_remote_mapper(process)
                .await
                .map_err(PartitionAttachError::VaMapper)?
        } else {
            self.va_mapper.clone()
        };

        // For VTL2 with an alias map configured, additionally mirror memory
        // at the alias map offset before the identity mapping below.
        if vtl == Vtl::Vtl2 {
            if let Some(offset) = self.vtl0_alias_map_offset {
                let partition =
                    PartitionMapper::new(partition, va_mapper.clone(), offset, self.pin_mappings);
                self.region_manager
                    .client()
                    .add_partition(partition)
                    .await
                    .map_err(PartitionAttachError::PartitionMapper)?;
            }
        }

        // Map guest memory at its identity location (offset 0).
        let partition = PartitionMapper::new(partition, va_mapper, 0, self.pin_mappings);
        self.region_manager
            .client()
            .add_partition(partition)
            .await
            .map_err(PartitionAttachError::PartitionMapper)?;
        Ok(())
    }
527}
528
/// A client to the [`GuestMemoryManager`] used to control the visibility of
/// RAM regions.
pub struct RamVisibilityControl {
    /// The RAM regions created at build time, shared with the manager.
    regions: Arc<Vec<RamRegion>>,
}
534
/// The RAM visibility for use with [`RamVisibilityControl::set_ram_visibility`].
///
/// Used, e.g., to emulate PAM register and VGA VRAM window behavior when
/// [`GuestMemoryBuilder::x86_legacy_support`] is enabled.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum RamVisibility {
    /// RAM is unmapped, so reads and writes will go to device memory or MMIO.
    Unmapped,
    /// RAM is read-only. Writes will go to device memory or MMIO.
    ///
    /// Note that writes will take exits even if there is mapped device memory.
    ReadOnly,
    /// RAM is read-write by the guest.
    ReadWrite,
}
547
/// An error returned by [`RamVisibilityControl::set_ram_visibility`] when the
/// given range does not exactly match a controllable RAM region.
#[derive(Debug, Error)]
#[error("{0} is not a controllable RAM range")]
pub struct InvalidRamRegion(MemoryRange);
552
553impl RamVisibilityControl {
554    /// Sets the visibility of a RAM region.
555    ///
556    /// A whole region's visibility must be controlled at once, or an error will
557    /// be returned. [`GuestMemoryBuilder::x86_legacy_support`] can be used to
558    /// ensure that there are RAM regions corresponding to x86 memory ranges
559    /// that need to be controlled.
560    pub async fn set_ram_visibility(
561        &self,
562        range: MemoryRange,
563        visibility: RamVisibility,
564    ) -> Result<(), InvalidRamRegion> {
565        let region = self
566            .regions
567            .iter()
568            .find(|region| region.range == range)
569            .ok_or(InvalidRamRegion(range))?;
570
571        match visibility {
572            RamVisibility::ReadWrite | RamVisibility::ReadOnly => {
573                region
574                    .handle
575                    .map(MapParams {
576                        writable: matches!(visibility, RamVisibility::ReadWrite),
577                        executable: true,
578                        prefetch: false,
579                    })
580                    .await
581            }
582            RamVisibility::Unmapped => region.handle.unmap().await,
583        }
584        Ok(())
585    }
586}