virtiofs/
lib.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4#![expect(missing_docs)]
5#![cfg(any(windows, target_os = "linux"))]
6
7mod file;
8mod inode;
9#[cfg(test)]
10mod integration_tests;
11pub mod resolver;
12#[cfg(windows)]
13mod section;
14mod util;
15pub mod virtio;
16mod virtio_util;
17
18#[cfg(windows)]
19pub use section::SectionFs;
20
21use file::VirtioFsFile;
22use fuse::protocol::*;
23use fuse::*;
24use inode::VirtioFsInode;
25pub use lxutil::LxVolumeOptions;
26use parking_lot::RwLock;
27use std::collections::HashMap;
28use std::collections::hash_map::Entry;
29use std::path::Path;
30use std::path::PathBuf;
31use std::sync::Arc;
32use std::time::Duration;
33
// TODO: Make both timeouts configurable.

// How long attributes returned to FUSE stay valid. FUSE issues getattr very frequently, so even
// a tiny timeout avoids a flood of calls, and it lets a lookup immediately followed by a stat
// reuse the attributes that the lookup already returned instead of issuing a separate getattr.
const ATTRIBUTE_TIMEOUT: Duration = Duration::from_millis(1);

// How long directory entries stay valid. This has to be zero: a rename does not update entries
// that already exist for the renamed child, so they would stop working. A zero timeout forces a
// fresh lookup, which updates the path.
const ENTRY_TIMEOUT: Duration = Duration::ZERO;
44
/// Implementation of the virtio-fs file system.
///
/// The FUSE callbacks only receive `&self`, so the mutable state lives behind reader/writer
/// locks.
pub struct VirtioFs {
    /// Inodes currently known to the file system, addressed by FUSE node ID.
    inodes: RwLock<InodeMap>,
    /// Open files, addressed by the file handle value handed out by `open`/`create`.
    files: RwLock<HandleMap<Arc<VirtioFsFile>>>,
    /// When `true`, requests that would modify the file system fail with `EROFS`.
    readonly: bool,
}
51
52impl Fuse for VirtioFs {
53    fn init(&self, info: &mut SessionInfo) {
54        // Indicate we support both readdir and readdirplus.
55        if info.capable() & FUSE_DO_READDIRPLUS != 0 {
56            info.want |= FUSE_DO_READDIRPLUS;
57        }
58
59        // Using "auto" lets FUSE pick whether to use readdir or readdirplus, which can be
60        // beneficial since readdirplus needs to query every file and is therefore more expensive.
61        if info.capable() & FUSE_READDIRPLUS_AUTO != 0 {
62            info.want |= FUSE_READDIRPLUS_AUTO;
63        }
64    }
65
66    fn get_attr(&self, request: &Request, flags: u32, fh: u64) -> lx::Result<fuse_attr_out> {
67        let node_id = request.node_id();
68        // If a file handle is specified, get the attributes from the open file. This is faster on
69        // Windows and works if the file was deleted.
70        let attr = if flags & FUSE_GETATTR_FH != 0 {
71            let file = self.get_file(fh)?;
72            file.get_attr()?
73        } else {
74            let inode = self.get_inode(node_id)?;
75            inode.get_attr()?
76        };
77
78        Ok(fuse_attr_out::new(ATTRIBUTE_TIMEOUT, attr))
79    }
80
81    fn get_statx(
82        &self,
83        request: &Request,
84        fh: u64,
85        getattr_flags: u32,
86        flags: StatxFlags,
87        _mask: lx::StatExMask,
88    ) -> lx::Result<fuse_statx_out> {
89        let node_id = request.node_id();
90        // If a file handle is specified, get the attributes from the open file. This is faster on
91        // Windows and works if the file was deleted.
92        let statx = if getattr_flags & FUSE_GETATTR_FH != 0 {
93            let file = self.get_file(fh)?;
94            file.get_statx()?
95        } else {
96            let inode = self.get_inode(node_id)?;
97            inode.get_statx()?
98        };
99
100        Ok(fuse_statx_out::new(ATTRIBUTE_TIMEOUT, flags, statx))
101    }
102
103    fn set_attr(&self, request: &Request, arg: &fuse_setattr_in) -> lx::Result<fuse_attr_out> {
104        let node_id = request.node_id();
105
106        // If a file handle is specified, set the attributes on the open file. This is faster on
107        // Windows and works if the file was deleted.
108        let attr = if arg.valid & FATTR_FH != 0 {
109            let file = self.get_file(arg.fh)?;
110            // Block truncation and other modifications on readonly filesystems
111            if arg.valid & !(FATTR_FH | FATTR_LOCKOWNER) != 0 {
112                self.check_writable()?;
113            }
114            file.set_attr(arg, request.uid())?;
115            file.get_attr()?
116        } else {
117            let inode = self.get_inode(node_id)?;
118            // Block truncation and other modifications on readonly filesystems
119            if arg.valid & !(FATTR_FH | FATTR_LOCKOWNER) != 0 {
120                self.check_writable()?;
121            }
122            inode.set_attr(arg, request.uid())?
123        };
124
125        Ok(fuse_attr_out::new(ATTRIBUTE_TIMEOUT, attr))
126    }
127
128    fn lookup(&self, request: &Request, name: &lx::LxStr) -> lx::Result<fuse_entry_out> {
129        let inode = self.get_inode(request.node_id())?;
130        self.lookup_helper(&inode, name)
131    }
132
133    fn forget(&self, node_id: u64, lookup_count: u64) {
134        // This must be done under lock so an inode can't be resurrected between the lookup count
135        // reaching zero and removing it from the list.
136        let mut inodes = self.inodes.write();
137        if let Some(inode) = inodes.get(node_id) {
138            if inode.forget(node_id, lookup_count) == 0 {
139                tracing::trace!(node_id, "Removing inode");
140                inodes.remove(node_id);
141            }
142        }
143    }
144
145    fn open(&self, request: &Request, flags: u32) -> lx::Result<fuse_open_out> {
146        let inode = self.get_inode(request.node_id())?;
147        self.check_open_readonly(&inode, flags)?;
148        let file = inode.open(flags)?;
149        let fh = self.insert_file(file);
150
151        // TODO: Optionally allow caching.
152        Ok(fuse_open_out::new(fh, FOPEN_DIRECT_IO))
153    }
154
155    fn create(
156        &self,
157        request: &Request,
158        name: &lx::LxStr,
159        arg: &fuse_create_in,
160    ) -> lx::Result<CreateOut> {
161        let inode = self.get_inode(request.node_id())?;
162        self.check_writable()?;
163        let (new_inode, attr, file) =
164            inode.create(name, arg.flags, arg.mode, request.uid(), request.gid())?;
165
166        // Insert the newly created inode; this can return an existing inode if it found a match
167        // on the inode number (if this is a non-exclusive create), so make sure to associate the
168        // file with the returned inode.
169        let (new_inode, node_id) = self.insert_inode(new_inode);
170        let file = VirtioFsFile::new(file, new_inode);
171        let fh = self.insert_file(file);
172        Ok(CreateOut {
173            entry: fuse_entry_out::new(node_id, ENTRY_TIMEOUT, ATTRIBUTE_TIMEOUT, attr),
174            open: fuse_open_out::new(fh, FOPEN_DIRECT_IO),
175        })
176    }
177
178    fn mkdir(
179        &self,
180        request: &Request,
181        name: &lx::LxStr,
182        arg: &fuse_mkdir_in,
183    ) -> lx::Result<fuse_entry_out> {
184        let inode = self.get_inode(request.node_id())?;
185        self.check_writable()?;
186        let (new_inode, attr) = inode.mkdir(name, arg.mode, request.uid(), request.gid())?;
187        let (_, node_id) = self.insert_inode(new_inode);
188        Ok(fuse_entry_out::new(
189            node_id,
190            ENTRY_TIMEOUT,
191            ATTRIBUTE_TIMEOUT,
192            attr,
193        ))
194    }
195
196    fn mknod(
197        &self,
198        request: &Request,
199        name: &lx::LxStr,
200        arg: &fuse_mknod_in,
201    ) -> lx::Result<fuse_entry_out> {
202        let inode = self.get_inode(request.node_id())?;
203        self.check_writable()?;
204        let (new_inode, attr) =
205            inode.mknod(name, arg.mode, request.uid(), request.gid(), arg.rdev)?;
206
207        let (_, node_id) = self.insert_inode(new_inode);
208        Ok(fuse_entry_out::new(
209            node_id,
210            ENTRY_TIMEOUT,
211            ATTRIBUTE_TIMEOUT,
212            attr,
213        ))
214    }
215
216    fn symlink(
217        &self,
218        request: &Request,
219        name: &lx::LxStr,
220        target: &lx::LxStr,
221    ) -> lx::Result<fuse_entry_out> {
222        let inode = self.get_inode(request.node_id())?;
223        self.check_writable()?;
224        let (new_inode, attr) = inode.symlink(name, target, request.uid(), request.gid())?;
225
226        let (_, node_id) = self.insert_inode(new_inode);
227        Ok(fuse_entry_out::new(
228            node_id,
229            ENTRY_TIMEOUT,
230            ATTRIBUTE_TIMEOUT,
231            attr,
232        ))
233    }
234
235    fn link(&self, request: &Request, name: &lx::LxStr, target: u64) -> lx::Result<fuse_entry_out> {
236        let inode = self.get_inode(request.node_id())?;
237        let target_inode = self.get_inode(target)?;
238        self.check_writable()?;
239        let attr = inode.link(name, &target_inode)?;
240
241        // Increment the lookup count since we're returning an entry for this inode.
242        // The kernel will send a forget for this entry later.
243        target_inode.inc_lookup();
244
245        // Use the target inode as the reply, with refreshed attributes.
246        Ok(fuse_entry_out::new(
247            target,
248            ENTRY_TIMEOUT,
249            ATTRIBUTE_TIMEOUT,
250            attr,
251        ))
252    }
253
254    fn read_link(&self, request: &Request) -> lx::Result<lx::LxString> {
255        let inode = self.get_inode(request.node_id())?;
256        inode.read_link()
257    }
258
259    fn read(&self, _request: &Request, arg: &fuse_read_in) -> lx::Result<Vec<u8>> {
260        let file = self.get_file(arg.fh)?;
261        let mut buffer = vec![0u8; arg.size as usize];
262        let size = file.read(&mut buffer, arg.offset)?;
263        buffer.truncate(size);
264        Ok(buffer)
265    }
266
267    fn write(&self, request: &Request, arg: &fuse_write_in, data: &[u8]) -> lx::Result<usize> {
268        let file = self.get_file(arg.fh)?;
269        self.check_writable()?;
270        file.write(data, arg.offset, request.uid())
271    }
272
273    fn release(&self, _request: &Request, arg: &fuse_release_in) -> lx::Result<()> {
274        self.remove_file(arg.fh);
275        Ok(())
276    }
277
278    fn open_dir(&self, request: &Request, flags: u32) -> lx::Result<fuse_open_out> {
279        // There is no special handling for directories, so just call open.
280        self.open(request, flags)
281    }
282
283    fn read_dir(&self, _request: &Request, arg: &fuse_read_in) -> lx::Result<Vec<u8>> {
284        let file = self.get_file(arg.fh)?;
285        file.read_dir(self, arg.offset, arg.size, false)
286    }
287
288    fn read_dir_plus(&self, _request: &Request, arg: &fuse_read_in) -> lx::Result<Vec<u8>> {
289        let file = self.get_file(arg.fh)?;
290        file.read_dir(self, arg.offset, arg.size, true)
291    }
292
293    fn release_dir(&self, request: &Request, arg: &fuse_release_in) -> lx::Result<()> {
294        self.release(request, arg)
295    }
296
297    fn unlink(&self, request: &Request, name: &lx::LxStr) -> lx::Result<()> {
298        self.unlink_helper(request, name, 0)
299    }
300
301    fn rmdir(&self, request: &Request, name: &lx::LxStr) -> lx::Result<()> {
302        self.unlink_helper(request, name, lx::AT_REMOVEDIR)
303    }
304
305    fn rename(
306        &self,
307        request: &Request,
308        name: &lx::LxStr,
309        new_dir: u64,
310        new_name: &lx::LxStr,
311        flags: u32,
312    ) -> lx::Result<()> {
313        let inode = self.get_inode(request.node_id())?;
314        let new_inode = self.get_inode(new_dir)?;
315        self.check_writable()?;
316        inode.rename(name, &new_inode, new_name, flags)
317    }
318
319    fn statfs(&self, request: &Request) -> lx::Result<fuse_kstatfs> {
320        let inode = self.get_inode(request.node_id())?;
321        inode.stat_fs()
322    }
323
324    fn fsync(&self, _request: &Request, fh: u64, flags: u32) -> lx::Result<()> {
325        let file = self.get_file(fh)?;
326        let data_only = flags & FUSE_FSYNC_FDATASYNC != 0;
327        file.fsync(data_only)
328    }
329
330    fn fsync_dir(&self, request: &Request, fh: u64, flags: u32) -> lx::Result<()> {
331        self.fsync(request, fh, flags)
332    }
333
334    fn get_xattr(&self, request: &Request, name: &lx::LxStr, size: u32) -> lx::Result<Vec<u8>> {
335        let inode = self.get_inode(request.node_id())?;
336        let mut value = vec![0u8; size as usize];
337        let size = inode.get_xattr(name, Some(&mut value))?;
338        value.truncate(size);
339        Ok(value)
340    }
341
342    fn get_xattr_size(&self, request: &Request, name: &lx::LxStr) -> lx::Result<u32> {
343        let inode = self.get_inode(request.node_id())?;
344        let size = inode.get_xattr(name, None)?;
345        let size = size.try_into().map_err(|_| lx::Error::E2BIG)?;
346        Ok(size)
347    }
348
349    fn set_xattr(
350        &self,
351        request: &Request,
352        name: &lx::LxStr,
353        value: &[u8],
354        flags: u32,
355    ) -> lx::Result<()> {
356        let inode = self.get_inode(request.node_id())?;
357        self.check_writable()?;
358        inode.set_xattr(name, value, flags)
359    }
360
361    fn list_xattr(&self, request: &Request, size: u32) -> lx::Result<Vec<u8>> {
362        let inode = self.get_inode(request.node_id())?;
363        let mut list = vec![0u8; size as usize];
364        let size = inode.list_xattr(Some(&mut list))?;
365        list.truncate(size);
366        Ok(list)
367    }
368
369    fn list_xattr_size(&self, request: &Request) -> lx::Result<u32> {
370        let inode = self.get_inode(request.node_id())?;
371        let size = inode.list_xattr(None)?;
372        let size = size.try_into().map_err(|_| lx::Error::E2BIG)?;
373        Ok(size)
374    }
375
376    fn remove_xattr(&self, request: &Request, name: &lx::LxStr) -> lx::Result<()> {
377        let inode = self.get_inode(request.node_id())?;
378        self.check_writable()?;
379        inode.remove_xattr(name)
380    }
381
382    fn destroy(&self) {
383        // To get the file system ready for re-mount, clean out any open files and leaked inodes.
384        self.files.write().clear();
385        self.inodes.write().clear();
386    }
387}
388
389impl VirtioFs {
390    /// Check if the filesystem is readonly and return EROFS if so.
391    fn check_writable(&self) -> lx::Result<()> {
392        if self.readonly {
393            Err(lx::Error::EROFS)
394        } else {
395            Ok(())
396        }
397    }
398
399    /// Check whether the open flags are permitted on a read-only filesystem.
400    fn check_open_readonly(&self, inode: &VirtioFsInode, flags: u32) -> lx::Result<()> {
401        if !self.readonly {
402            return Ok(());
403        }
404
405        // This section exists to superceed error codes when various combination of flags
406        // are passed to the open() call. This helps maintain POSIX compatibility
407        // If O_CREAT | O_EXCL && file_exists => EEXIST
408        // If O_CREAT && file_exists => fallthrough to check other checks
409        // If O_CREAT && !file_exists => EROFS
410        // Other errors that occur while checking file_exists should bubble up
411        if flags & lx::O_CREAT as u32 != 0 {
412            match inode.get_attr() {
413                Ok(_) if flags & lx::O_EXCL as u32 != 0 => return Err(lx::Error::EEXIST),
414                Ok(_) => {}
415                Err(e) if e == lx::Error::ENOENT => return Err(lx::Error::EROFS),
416                Err(e) => return Err(e),
417            }
418        } else {
419            inode.get_attr()?;
420        }
421
422        let access_mode = (flags & lx::O_ACCESS_MASK as u32) as i32;
423        if matches!(access_mode, lx::O_WRONLY | lx::O_RDWR) || flags & lx::O_TRUNC as u32 != 0 {
424            return Err(lx::Error::EROFS);
425        }
426
427        Ok(())
428    }
429
430    /// Create a new virtio-fs for the specified root path.
431    pub fn new(
432        root_path: impl AsRef<Path>,
433        mount_options: Option<&LxVolumeOptions>,
434    ) -> lx::Result<Self> {
435        let readonly = mount_options.is_some_and(|o| o.is_readonly());
436        let volume = if let Some(mount_options) = mount_options {
437            mount_options.new_volume(root_path)
438        } else {
439            lxutil::LxVolume::new(root_path)
440        }?;
441        let mut inodes = InodeMap::new(volume.supports_stable_file_id());
442        let (root_inode, _) = VirtioFsInode::new(Arc::new(volume), PathBuf::new())?;
443        assert!(inodes.insert(root_inode).1 == FUSE_ROOT_ID);
444        Ok(Self {
445            inodes: RwLock::new(inodes),
446            files: RwLock::new(HandleMap::new()),
447            readonly,
448        })
449    }
450
451    /// Perform lookup on a specified directory inode.
452    fn lookup_helper(&self, inode: &VirtioFsInode, name: &lx::LxStr) -> lx::Result<fuse_entry_out> {
453        let (new_inode, attr) = inode.lookup_child(name)?;
454        let (_, new_inode_nr) = self.insert_inode(new_inode);
455        Ok(fuse_entry_out::new(
456            new_inode_nr,
457            ENTRY_TIMEOUT,
458            ATTRIBUTE_TIMEOUT,
459            attr,
460        ))
461    }
462
463    /// Removes a file or directory.
464    fn unlink_helper(&self, request: &Request, name: &lx::LxStr, flags: i32) -> lx::Result<()> {
465        let inode = self.get_inode(request.node_id())?;
466        self.check_writable()?;
467        inode.unlink(name, flags)
468    }
469
470    /// Retrieve the inode with the specified node ID.
471    fn get_inode(&self, node_id: u64) -> lx::Result<Arc<VirtioFsInode>> {
472        self.inodes.read().get(node_id).ok_or_else(|| {
473            tracing::warn!(node_id, "request for unknown inode");
474            lx::Error::EINVAL
475        })
476    }
477
478    /// Insert a new inode, and returns the assigned node ID as well as a reference to the inode.
479    ///
480    /// If the file system supports stable inode numbers and an inode already existed with this
481    /// number, the existing inode is returned, not the passed in one.
482    fn insert_inode(&self, inode: VirtioFsInode) -> (Arc<VirtioFsInode>, u64) {
483        self.inodes.write().insert(inode)
484    }
485
486    /// Retrieve the file object with the specified file handle.
487    fn get_file(&self, fh: u64) -> lx::Result<Arc<VirtioFsFile>> {
488        let files = self.files.read();
489        let file = files.get(fh).ok_or_else(|| {
490            tracing::warn!(fh, "Request for unknown file");
491            lx::Error::EBADF
492        })?;
493
494        Ok(Arc::clone(file))
495    }
496
497    /// Insert a new file object, and return the assigned file handle.
498    fn insert_file(&self, file: VirtioFsFile) -> u64 {
499        self.files.write().insert(Arc::new(file))
500    }
501
502    /// Remove the file with the specified node ID.
503    fn remove_file(&self, fh: u64) {
504        self.files.write().remove(fh);
505    }
506}
507
/// A key/value map where the keys are automatically incremented identifiers.
struct HandleMap<T> {
    /// Stored values, keyed by the handle assigned on insertion.
    values: HashMap<u64, T>,
    /// Handle that the next call to `insert` will assign.
    next_handle: u64,
    /// Handle the map was created with; `clear` rewinds `next_handle` to this value.
    first_handle: u64,
}

impl<T> HandleMap<T> {
    /// Create a new `HandleMap` whose first assigned handle is 1.
    pub fn new() -> Self {
        Self::starting_at(1)
    }

    /// Create a new `HandleMap` starting with handle value `next_handle`.
    pub fn starting_at(next_handle: u64) -> Self {
        Self {
            values: HashMap::new(),
            next_handle,
            first_handle: next_handle,
        }
    }

    /// Inserts an item into the map, and returns the assigned handle.
    ///
    /// # Panics
    ///
    /// Panics if the handle about to be assigned is already present in the map.
    pub fn insert(&mut self, value: T) -> u64 {
        let handle = self.next_handle;
        if self.values.insert(handle, value).is_some() {
            // This map is generic (it also stores open files), so don't call this an inode.
            panic!("Handle reused.");
        }

        self.next_handle += 1;
        handle
    }

    /// Retrieves a value from the map.
    pub fn get(&self, handle: u64) -> Option<&T> {
        self.values.get(&handle)
    }

    /// Retrieves a mutable reference to a value from the map.
    #[cfg_attr(not(windows), expect(dead_code))]
    pub fn get_mut(&mut self, handle: u64) -> Option<&mut T> {
        self.values.get_mut(&handle)
    }

    /// Removes a value from the map, returning it if it was present.
    pub fn remove(&mut self, handle: u64) -> Option<T> {
        self.values.remove(&handle)
    }

    /// Clears the map and rewinds handle assignment to the value the map was created with.
    ///
    /// Rewinding to the creation value (rather than a fixed 1) keeps a map created with
    /// `starting_at` from ever handing out handles below its starting value.
    pub fn clear(&mut self) {
        self.values.clear();
        self.next_handle = self.first_handle;
    }
}
561
562/// Assigns node IDs to inodes, and keeps track of in-use inodes by their actual inode number.
563///
564/// We cannot use the real inode number as the FUSE node ID:
565/// - FUSE node ID 1 is reserved for the root, so this would break if a file system used that inode
566///   number.
567/// - When we want to support multiple volumes in a single file system, node IDs still need to be
568///   globally unique, whereas inode numbers are per-volume.
569struct InodeMap {
570    inodes_by_node_id: HandleMap<Arc<VirtioFsInode>>,
571    inodes_by_inode_nr: Option<HashMap<lx::ino_t, (Arc<VirtioFsInode>, u64)>>,
572}
573
574impl InodeMap {
575    /// Create a new `InodeMap`.
576    pub fn new(supports_stable_file_id: bool) -> Self {
577        // TODO: Once multiple volumes are supported, the inodes_by_inode_nr map should be per
578        // volume.
579        Self {
580            inodes_by_node_id: HandleMap::new(),
581            inodes_by_inode_nr: if supports_stable_file_id {
582                Some(HashMap::new())
583            } else {
584                None
585            },
586        }
587    }
588
589    /// Get an inode with the specified FUSE node ID.
590    pub fn get(&self, node_id: u64) -> Option<Arc<VirtioFsInode>> {
591        let inode = self.inodes_by_node_id.get(node_id)?;
592        Some(Arc::clone(inode))
593    }
594
595    /// Insert an inode into the map, returning its node ID.
596    pub fn insert(&mut self, inode: VirtioFsInode) -> (Arc<VirtioFsInode>, u64) {
597        // If stable inode numbers are supported, look for the inode by its number.
598        if let Some(inodes_by_inode_nr) = self.inodes_by_inode_nr.as_mut() {
599            match inodes_by_inode_nr.entry(inode.inode_nr()) {
600                Entry::Occupied(entry) => {
601                    // Inode found; increment its count and return the existing FUSE node ID.
602                    let new_path = inode.clone_path();
603                    let (inode, node_id) = entry.get();
604                    inode.lookup(new_path);
605                    return (Arc::clone(inode), *node_id);
606                }
607                Entry::Vacant(entry) => {
608                    // Inode not found, so insert it into both maps.
609                    let inode = Arc::new(inode);
610                    let node_id = self.inodes_by_node_id.insert(Arc::clone(&inode));
611                    entry.insert((Arc::clone(&inode), node_id));
612                    return (inode, node_id);
613                }
614            }
615        }
616
617        // No support for stable inode numbers, so just use node ID.
618        let inode = Arc::new(inode);
619        let node_id = self.inodes_by_node_id.insert(Arc::clone(&inode));
620        (inode, node_id)
621    }
622
623    /// Remove an inode with the specified FUSE node ID from the map.
624    pub fn remove(&mut self, node_id: u64) {
625        let inode = self.inodes_by_node_id.remove(node_id).unwrap();
626        if let Some(inodes_by_inode_nr) = self.inodes_by_inode_nr.as_mut() {
627            inodes_by_inode_nr.remove(&inode.inode_nr());
628        }
629    }
630
631    /// Clears the map, preserving the root inode.
632    pub fn clear(&mut self) {
633        let root_inode = Arc::clone(self.inodes_by_node_id.get(FUSE_ROOT_ID).unwrap());
634        self.inodes_by_node_id.clear();
635
636        // Re-insert the root inode.
637        assert!(self.inodes_by_node_id.insert(Arc::clone(&root_inode)) == FUSE_ROOT_ID);
638
639        // Clear the inode number map if it's supported.
640        if let Some(inodes_by_inode_nr) = self.inodes_by_inode_nr.as_mut() {
641            inodes_by_inode_nr.clear();
642            inodes_by_inode_nr.insert(root_inode.inode_nr(), (root_inode, FUSE_ROOT_ID));
643        }
644    }
645}